{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 250, "global_step": 2675, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05009345794392523, "grad_norm": 0.6554287672042847, "learning_rate": 5e-07, "loss": 0.4929, "step": 134 }, { "epoch": 0.09345794392523364, "eval_sft_sg_values_eval_loss": 0.4027545750141144, "eval_sft_sg_values_eval_runtime": 28.7907, "eval_sft_sg_values_eval_samples_per_second": 17.367, "eval_sft_sg_values_eval_steps_per_second": 8.683, "step": 250 }, { "epoch": 0.10018691588785046, "grad_norm": 0.877953827381134, "learning_rate": 1e-06, "loss": 0.4862, "step": 268 }, { "epoch": 0.1502803738317757, "grad_norm": 1.139445185661316, "learning_rate": 9.443290402991276e-07, "loss": 0.4112, "step": 402 }, { "epoch": 0.18691588785046728, "eval_sft_sg_values_eval_loss": 0.35132163763046265, "eval_sft_sg_values_eval_runtime": 28.7753, "eval_sft_sg_values_eval_samples_per_second": 17.376, "eval_sft_sg_values_eval_steps_per_second": 8.688, "step": 500 }, { "epoch": 0.20037383177570092, "grad_norm": 1.0352528095245361, "learning_rate": 8.886580805982551e-07, "loss": 0.3246, "step": 536 }, { "epoch": 0.2504672897196262, "grad_norm": 1.3118387460708618, "learning_rate": 8.329871208973826e-07, "loss": 0.2797, "step": 670 }, { "epoch": 0.2803738317757009, "eval_sft_sg_values_eval_loss": 0.2957317531108856, "eval_sft_sg_values_eval_runtime": 28.7598, "eval_sft_sg_values_eval_samples_per_second": 17.385, "eval_sft_sg_values_eval_steps_per_second": 8.693, "step": 750 }, { "epoch": 0.3005607476635514, "grad_norm": 1.1230024099349976, "learning_rate": 7.773161611965101e-07, "loss": 0.2643, "step": 804 }, { "epoch": 0.3506542056074766, "grad_norm": 1.5997352600097656, "learning_rate": 7.216452014956377e-07, "loss": 0.2602, "step": 938 }, { "epoch": 0.37383177570093457, "eval_sft_sg_values_eval_loss": 0.26124340295791626, "eval_sft_sg_values_eval_runtime": 28.7839, "eval_sft_sg_values_eval_samples_per_second": 17.371, "eval_sft_sg_values_eval_steps_per_second": 8.685, "step": 1000 }, { "epoch": 0.40074766355140184, "grad_norm": 2.0844686031341553, "learning_rate": 6.659742417947652e-07, "loss": 0.2542, "step": 1072 }, { "epoch": 0.4508411214953271, "grad_norm": 1.0873792171478271, "learning_rate": 6.103032820938928e-07, "loss": 0.2503, "step": 1206 }, { "epoch": 0.4672897196261682, "eval_sft_sg_values_eval_loss": 0.23845288157463074, "eval_sft_sg_values_eval_runtime": 28.7889, "eval_sft_sg_values_eval_samples_per_second": 17.368, "eval_sft_sg_values_eval_steps_per_second": 8.684, "step": 1250 }, { "epoch": 0.5009345794392523, "grad_norm": 1.5952402353286743, "learning_rate": 5.546323223930204e-07, "loss": 0.2441, "step": 1340 }, { "epoch": 0.5510280373831775, "grad_norm": 1.4540523290634155, "learning_rate": 4.989613626921479e-07, "loss": 0.2416, "step": 1474 }, { "epoch": 0.5607476635514018, "eval_sft_sg_values_eval_loss": 0.22509750723838806, "eval_sft_sg_values_eval_runtime": 28.785, "eval_sft_sg_values_eval_samples_per_second": 17.37, "eval_sft_sg_values_eval_steps_per_second": 8.685, "step": 1500 }, { "epoch": 0.6011214953271028, "grad_norm": 1.5883971452713013, "learning_rate": 4.4329040299127543e-07, "loss": 0.2381, "step": 1608 }, { "epoch": 0.6512149532710281, "grad_norm": 1.1363577842712402, "learning_rate": 3.87619443290403e-07, "loss": 0.238, "step": 1742 }, { "epoch": 0.6542056074766355, "eval_sft_sg_values_eval_loss": 0.2166362851858139, "eval_sft_sg_values_eval_runtime": 28.8027, "eval_sft_sg_values_eval_samples_per_second": 17.359, "eval_sft_sg_values_eval_steps_per_second": 8.68, "step": 1750 }, { "epoch": 0.7013084112149532, "grad_norm": 1.0956478118896484, "learning_rate": 3.3194848358953054e-07, "loss": 0.2332, "step": 1876 }, { "epoch": 0.7476635514018691, "eval_sft_sg_values_eval_loss": 0.21183913946151733, "eval_sft_sg_values_eval_runtime": 28.802, "eval_sft_sg_values_eval_samples_per_second": 17.36, "eval_sft_sg_values_eval_steps_per_second": 8.68, "step": 2000 }, { "epoch": 0.7514018691588785, "grad_norm": 1.294016718864441, "learning_rate": 2.7627752388865805e-07, "loss": 0.2282, "step": 2010 }, { "epoch": 0.8014953271028037, "grad_norm": 1.3785642385482788, "learning_rate": 2.206065641877856e-07, "loss": 0.2335, "step": 2144 }, { "epoch": 0.8411214953271028, "eval_sft_sg_values_eval_loss": 0.20833030343055725, "eval_sft_sg_values_eval_runtime": 28.8236, "eval_sft_sg_values_eval_samples_per_second": 17.347, "eval_sft_sg_values_eval_steps_per_second": 8.673, "step": 2250 }, { "epoch": 0.851588785046729, "grad_norm": 1.1724706888198853, "learning_rate": 1.6493560448691317e-07, "loss": 0.2307, "step": 2278 }, { "epoch": 0.9016822429906542, "grad_norm": 2.404283285140991, "learning_rate": 1.0926464478604071e-07, "loss": 0.2359, "step": 2412 }, { "epoch": 0.9345794392523364, "eval_sft_sg_values_eval_loss": 0.20680196583271027, "eval_sft_sg_values_eval_runtime": 28.7876, "eval_sft_sg_values_eval_samples_per_second": 17.369, "eval_sft_sg_values_eval_steps_per_second": 8.684, "step": 2500 }, { "epoch": 0.9517757009345794, "grad_norm": 1.2437078952789307, "learning_rate": 5.3593685085168255e-08, "loss": 0.2313, "step": 2546 }, { "epoch": 1.0, "step": 2675, "total_flos": 3.769390263140352e+17, "train_loss": 0.280462795507128, "train_runtime": 7587.9769, "train_samples_per_second": 2.82, "train_steps_per_second": 0.353 } ], "logging_steps": 134, "max_steps": 2675, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.769390263140352e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }