{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 228, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 3.949260765314102, "epoch": 0.06611570247933884, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 233.2270751953125, "mean_token_accuracy": 0.026110622729174793, "num_tokens": 30341.0, "step": 5 }, { "entropy": 3.926559180021286, "epoch": 0.1322314049586777, "grad_norm": 0.0, "learning_rate": 2.25e-05, "loss": 233.90556640625, "mean_token_accuracy": 0.025600465817842633, "num_tokens": 60184.0, "step": 10 }, { "entropy": 3.9399050384759904, "epoch": 0.19834710743801653, "grad_norm": 0.0, "learning_rate": 3.5e-05, "loss": 233.5798828125, "mean_token_accuracy": 0.026415328145958483, "num_tokens": 90535.0, "step": 15 }, { "entropy": 3.9297859936952593, "epoch": 0.2644628099173554, "grad_norm": 0.0, "learning_rate": 4.75e-05, "loss": 233.126611328125, "mean_token_accuracy": 0.02820728103397414, "num_tokens": 116077.0, "step": 20 }, { "entropy": 3.962813687324524, "epoch": 0.3305785123966942, "grad_norm": 0.0, "learning_rate": 4.9038461538461536e-05, "loss": 232.579443359375, "mean_token_accuracy": 0.026059218565933406, "num_tokens": 147898.0, "step": 25 }, { "entropy": 3.959607166051865, "epoch": 0.39669421487603307, "grad_norm": 0.0, "learning_rate": 4.783653846153847e-05, "loss": 233.8760986328125, "mean_token_accuracy": 0.02393958643078804, "num_tokens": 179807.0, "step": 30 }, { "entropy": 3.942630410194397, "epoch": 0.4628099173553719, "grad_norm": 0.0, "learning_rate": 4.6634615384615384e-05, "loss": 232.322216796875, "mean_token_accuracy": 0.026402945548761637, "num_tokens": 207991.0, "step": 35 }, { "entropy": 3.926275482773781, "epoch": 0.5289256198347108, "grad_norm": 0.0, "learning_rate": 4.543269230769231e-05, "loss": 232.013916015625, "mean_token_accuracy": 0.02679328005760908, "num_tokens": 236639.0, "step": 40 }, { "entropy": 3.9455344051122667, "epoch": 0.5950413223140496, "grad_norm": 0.0, "learning_rate": 4.423076923076923e-05, "loss": 233.5448974609375, "mean_token_accuracy": 0.028373240097425877, "num_tokens": 263099.0, "step": 45 }, { "entropy": 3.938176819682121, "epoch": 0.6611570247933884, "grad_norm": 0.0, "learning_rate": 4.302884615384616e-05, "loss": 231.9595703125, "mean_token_accuracy": 0.024394837359432132, "num_tokens": 294450.0, "step": 50 }, { "entropy": 3.945117491483688, "epoch": 0.7272727272727273, "grad_norm": 0.0, "learning_rate": 4.182692307692308e-05, "loss": 234.357421875, "mean_token_accuracy": 0.026401228457689285, "num_tokens": 323322.0, "step": 55 }, { "entropy": 3.958406376838684, "epoch": 0.7933884297520661, "grad_norm": 0.0, "learning_rate": 4.0625000000000005e-05, "loss": 234.3664306640625, "mean_token_accuracy": 0.02587177858222276, "num_tokens": 353600.0, "step": 60 }, { "entropy": 3.9519979149103164, "epoch": 0.859504132231405, "grad_norm": 0.0, "learning_rate": 3.942307692307692e-05, "loss": 232.3470947265625, "mean_token_accuracy": 0.027900667279027402, "num_tokens": 382910.0, "step": 65 }, { "entropy": 3.937412214279175, "epoch": 0.9256198347107438, "grad_norm": 0.0, "learning_rate": 3.8221153846153846e-05, "loss": 230.767626953125, "mean_token_accuracy": 0.028427720058243722, "num_tokens": 410956.0, "step": 70 }, { "entropy": 3.942065638303757, "epoch": 0.9917355371900827, "grad_norm": 0.0, "learning_rate": 3.701923076923077e-05, "loss": 233.8431396484375, "mean_token_accuracy": 0.02679448336130008, "num_tokens": 436905.0, "step": 75 }, { "entropy": 3.9374109313294694, "epoch": 1.052892561983471, "grad_norm": 0.0, "learning_rate": 3.5817307692307695e-05, "loss": 215.2375, "mean_token_accuracy": 0.02448466032542087, "num_tokens": 465647.0, "step": 80 }, { "entropy": 3.953054502606392, "epoch": 1.1190082644628099, "grad_norm": 0.0, "learning_rate": 3.461538461538462e-05, "loss": 234.6711181640625, "mean_token_accuracy": 0.02631221548654139, "num_tokens": 492908.0, "step": 85 }, { "entropy": 3.94879230260849, "epoch": 1.1851239669421487, "grad_norm": 0.0, "learning_rate": 3.3413461538461536e-05, "loss": 230.2646240234375, "mean_token_accuracy": 0.026181722863111646, "num_tokens": 521382.0, "step": 90 }, { "entropy": 3.936227795481682, "epoch": 1.2512396694214876, "grad_norm": 0.0, "learning_rate": 3.221153846153847e-05, "loss": 232.071826171875, "mean_token_accuracy": 0.025906398333609103, "num_tokens": 549041.0, "step": 95 }, { "entropy": 3.930509549379349, "epoch": 1.3173553719008264, "grad_norm": 0.0, "learning_rate": 3.1009615384615384e-05, "loss": 232.701025390625, "mean_token_accuracy": 0.027634602575562894, "num_tokens": 578290.0, "step": 100 }, { "entropy": 3.9412417501211165, "epoch": 1.3834710743801653, "grad_norm": 0.0, "learning_rate": 2.9807692307692308e-05, "loss": 233.9791259765625, "mean_token_accuracy": 0.026698815589770674, "num_tokens": 607499.0, "step": 105 }, { "entropy": 3.957675591111183, "epoch": 1.449586776859504, "grad_norm": 0.0, "learning_rate": 2.860576923076923e-05, "loss": 232.7337158203125, "mean_token_accuracy": 0.026053250976838173, "num_tokens": 637795.0, "step": 110 }, { "entropy": 3.9587122440338134, "epoch": 1.515702479338843, "grad_norm": 0.0, "learning_rate": 2.7403846153846156e-05, "loss": 232.01123046875, "mean_token_accuracy": 0.026154680154286326, "num_tokens": 666226.0, "step": 115 }, { "entropy": 3.9355483412742616, "epoch": 1.5818181818181818, "grad_norm": 0.0, "learning_rate": 2.620192307692308e-05, "loss": 233.4347900390625, "mean_token_accuracy": 0.027279651118442418, "num_tokens": 694227.0, "step": 120 }, { "entropy": 3.9471412718296053, "epoch": 1.6479338842975206, "grad_norm": 0.0, "learning_rate": 2.5e-05, "loss": 233.40400390625, "mean_token_accuracy": 0.027405926771461964, "num_tokens": 722605.0, "step": 125 }, { "entropy": 3.948015907406807, "epoch": 1.7140495867768595, "grad_norm": 0.0, "learning_rate": 2.3798076923076922e-05, "loss": 233.548486328125, "mean_token_accuracy": 0.028720574569888413, "num_tokens": 750872.0, "step": 130 }, { "entropy": 3.9404551655054094, "epoch": 1.7801652892561983, "grad_norm": 0.0, "learning_rate": 2.2596153846153846e-05, "loss": 235.551025390625, "mean_token_accuracy": 0.0262312832986936, "num_tokens": 778888.0, "step": 135 }, { "entropy": 3.9351486653089522, "epoch": 1.8462809917355372, "grad_norm": 0.0, "learning_rate": 2.139423076923077e-05, "loss": 231.918896484375, "mean_token_accuracy": 0.025760800496209414, "num_tokens": 810828.0, "step": 140 }, { "entropy": 3.948241111636162, "epoch": 1.912396694214876, "grad_norm": 0.0, "learning_rate": 2.0192307692307694e-05, "loss": 233.4085205078125, "mean_token_accuracy": 0.02667535947402939, "num_tokens": 843110.0, "step": 145 }, { "entropy": 3.94561530649662, "epoch": 1.9785123966942149, "grad_norm": 0.0, "learning_rate": 1.8990384615384615e-05, "loss": 234.3532958984375, "mean_token_accuracy": 0.025313910108525305, "num_tokens": 871729.0, "step": 150 }, { "entropy": 3.943527849944862, "epoch": 2.0396694214876034, "grad_norm": 0.0, "learning_rate": 1.778846153846154e-05, "loss": 215.9062255859375, "mean_token_accuracy": 0.025429070713250217, "num_tokens": 900426.0, "step": 155 }, { "entropy": 3.94203040599823, "epoch": 2.105785123966942, "grad_norm": 0.0, "learning_rate": 1.6586538461538463e-05, "loss": 230.911865234375, "mean_token_accuracy": 0.02837887905770913, "num_tokens": 928864.0, "step": 160 }, { "entropy": 3.9532357156276703, "epoch": 2.171900826446281, "grad_norm": 0.0, "learning_rate": 1.5384615384615387e-05, "loss": 234.5886962890625, "mean_token_accuracy": 0.02579844092251733, "num_tokens": 957054.0, "step": 165 }, { "entropy": 3.938724213838577, "epoch": 2.2380165289256198, "grad_norm": 0.0, "learning_rate": 1.4182692307692308e-05, "loss": 233.734033203125, "mean_token_accuracy": 0.026959190622437745, "num_tokens": 986375.0, "step": 170 }, { "entropy": 3.9513088554143905, "epoch": 2.304132231404959, "grad_norm": 0.0, "learning_rate": 1.2980769230769232e-05, "loss": 234.519482421875, "mean_token_accuracy": 0.02572885201079771, "num_tokens": 1016379.0, "step": 175 }, { "entropy": 3.9490306466817855, "epoch": 2.3702479338842974, "grad_norm": 0.0, "learning_rate": 1.1778846153846154e-05, "loss": 232.4193115234375, "mean_token_accuracy": 0.02734465190442279, "num_tokens": 1045506.0, "step": 180 }, { "entropy": 3.9409770250320433, "epoch": 2.4363636363636365, "grad_norm": 0.0, "learning_rate": 1.0576923076923077e-05, "loss": 234.274267578125, "mean_token_accuracy": 0.025954201573040337, "num_tokens": 1075203.0, "step": 185 }, { "entropy": 3.945644071698189, "epoch": 2.502479338842975, "grad_norm": 0.0, "learning_rate": 9.375000000000001e-06, "loss": 233.009033203125, "mean_token_accuracy": 0.025436178874224426, "num_tokens": 1105840.0, "step": 190 }, { "entropy": 3.9378468334674834, "epoch": 2.568595041322314, "grad_norm": 0.0, "learning_rate": 8.173076923076923e-06, "loss": 231.7622802734375, "mean_token_accuracy": 0.02464213842758909, "num_tokens": 1136555.0, "step": 195 }, { "entropy": 3.9481330841779707, "epoch": 2.634710743801653, "grad_norm": 0.0, "learning_rate": 6.9711538461538465e-06, "loss": 234.6160888671875, "mean_token_accuracy": 0.024175548332277686, "num_tokens": 1166980.0, "step": 200 }, { "entropy": 3.942607444524765, "epoch": 2.700826446280992, "grad_norm": 0.0, "learning_rate": 5.76923076923077e-06, "loss": 232.27705078125, "mean_token_accuracy": 0.02845959346741438, "num_tokens": 1191975.0, "step": 205 }, { "entropy": 3.94253448843956, "epoch": 2.7669421487603305, "grad_norm": 0.0, "learning_rate": 4.567307692307692e-06, "loss": 231.2766357421875, "mean_token_accuracy": 0.026302760490216313, "num_tokens": 1222197.0, "step": 210 }, { "entropy": 3.9235304445028305, "epoch": 2.8330578512396696, "grad_norm": 0.0, "learning_rate": 3.3653846153846154e-06, "loss": 233.171337890625, "mean_token_accuracy": 0.02858651557471603, "num_tokens": 1250391.0, "step": 215 }, { "entropy": 3.9577153235673905, "epoch": 2.899173553719008, "grad_norm": 0.0, "learning_rate": 2.1634615384615387e-06, "loss": 233.0808349609375, "mean_token_accuracy": 0.025711765605956317, "num_tokens": 1281477.0, "step": 220 }, { "entropy": 3.941769191622734, "epoch": 2.9652892561983473, "grad_norm": 0.0, "learning_rate": 9.615384615384617e-07, "loss": 232.8261962890625, "mean_token_accuracy": 0.02683557631680742, "num_tokens": 1310280.0, "step": 225 } ], "logging_steps": 5, "max_steps": 228, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.582163831136659e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }