{ "best_metric": 1.1622273921966553, "best_model_checkpoint": "./outputs/instruct-lora-8b-aplly_chat_template-land/checkpoint-740", "epoch": 1.0652463382157125, "eval_steps": 20, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013315579227696406, "eval_loss": 1.4733461141586304, "eval_runtime": 59.4361, "eval_samples_per_second": 22.461, "eval_steps_per_second": 5.619, "step": 1 }, { "epoch": 0.02663115845539281, "grad_norm": 0.7614122629165649, "learning_rate": 2.666666666666667e-06, "loss": 1.4194, "step": 20 }, { "epoch": 0.02663115845539281, "eval_loss": 1.471280813217163, "eval_runtime": 57.1574, "eval_samples_per_second": 23.357, "eval_steps_per_second": 5.844, "step": 20 }, { "epoch": 0.05326231691078562, "grad_norm": 0.7800308465957642, "learning_rate": 5.333333333333334e-06, "loss": 1.376, "step": 40 }, { "epoch": 0.05326231691078562, "eval_loss": 1.4474345445632935, "eval_runtime": 57.2352, "eval_samples_per_second": 23.325, "eval_steps_per_second": 5.836, "step": 40 }, { "epoch": 0.07989347536617843, "grad_norm": 0.8508164286613464, "learning_rate": 8e-06, "loss": 1.3563, "step": 60 }, { "epoch": 0.07989347536617843, "eval_loss": 1.3645858764648438, "eval_runtime": 57.1364, "eval_samples_per_second": 23.365, "eval_steps_per_second": 5.846, "step": 60 }, { "epoch": 0.10652463382157124, "grad_norm": 0.8896499276161194, "learning_rate": 1.0666666666666667e-05, "loss": 1.2653, "step": 80 }, { "epoch": 0.10652463382157124, "eval_loss": 1.303858757019043, "eval_runtime": 57.1088, "eval_samples_per_second": 23.376, "eval_steps_per_second": 5.848, "step": 80 }, { "epoch": 0.13315579227696406, "grad_norm": 0.9267684817314148, "learning_rate": 1.3333333333333333e-05, "loss": 1.2094, "step": 100 }, { "epoch": 0.13315579227696406, "eval_loss": 1.279226541519165, "eval_runtime": 59.6928, "eval_samples_per_second": 22.365, "eval_steps_per_second": 5.595, "step": 100 }, { "epoch": 0.15978695073235685, "grad_norm": 1.0457453727722168, "learning_rate": 1.6e-05, "loss": 1.1917, "step": 120 }, { "epoch": 0.15978695073235685, "eval_loss": 1.2594722509384155, "eval_runtime": 57.1101, "eval_samples_per_second": 23.376, "eval_steps_per_second": 5.848, "step": 120 }, { "epoch": 0.18641810918774968, "grad_norm": 1.1883381605148315, "learning_rate": 1.866666666666667e-05, "loss": 1.2034, "step": 140 }, { "epoch": 0.18641810918774968, "eval_loss": 1.2453105449676514, "eval_runtime": 57.085, "eval_samples_per_second": 23.386, "eval_steps_per_second": 5.851, "step": 140 }, { "epoch": 0.21304926764314247, "grad_norm": 1.2522987127304077, "learning_rate": 2.1333333333333335e-05, "loss": 1.1147, "step": 160 }, { "epoch": 0.21304926764314247, "eval_loss": 1.2352497577667236, "eval_runtime": 59.6977, "eval_samples_per_second": 22.363, "eval_steps_per_second": 5.595, "step": 160 }, { "epoch": 0.2396804260985353, "grad_norm": 1.3950749635696411, "learning_rate": 2.4e-05, "loss": 1.1172, "step": 180 }, { "epoch": 0.2396804260985353, "eval_loss": 1.2247178554534912, "eval_runtime": 57.1298, "eval_samples_per_second": 23.368, "eval_steps_per_second": 5.846, "step": 180 }, { "epoch": 0.2663115845539281, "grad_norm": 1.3889997005462646, "learning_rate": 2.6666666666666667e-05, "loss": 1.1148, "step": 200 }, { "epoch": 0.2663115845539281, "eval_loss": 1.2236417531967163, "eval_runtime": 57.1101, "eval_samples_per_second": 23.376, "eval_steps_per_second": 5.848, "step": 200 }, { "epoch": 0.2929427430093209, "grad_norm": 1.4289050102233887, "learning_rate": 2.9333333333333333e-05, "loss": 1.0828, "step": 220 }, { "epoch": 0.2929427430093209, "eval_loss": 1.217771291732788, "eval_runtime": 57.09, "eval_samples_per_second": 23.384, "eval_steps_per_second": 5.85, "step": 220 }, { "epoch": 0.3195739014647137, "grad_norm": 1.48817777633667, "learning_rate": 2.9995950624188135e-05, "loss": 1.0756, "step": 240 }, { "epoch": 0.3195739014647137, "eval_loss": 1.2135677337646484, "eval_runtime": 57.0597, "eval_samples_per_second": 23.397, "eval_steps_per_second": 5.854, "step": 240 }, { "epoch": 0.34620505992010653, "grad_norm": 1.4912829399108887, "learning_rate": 2.9977957806883764e-05, "loss": 1.0463, "step": 260 }, { "epoch": 0.34620505992010653, "eval_loss": 1.207130789756775, "eval_runtime": 57.0489, "eval_samples_per_second": 23.401, "eval_steps_per_second": 5.855, "step": 260 }, { "epoch": 0.37283621837549935, "grad_norm": 1.4056388139724731, "learning_rate": 2.99455888692835e-05, "loss": 1.0452, "step": 280 }, { "epoch": 0.37283621837549935, "eval_loss": 1.2046023607254028, "eval_runtime": 57.0853, "eval_samples_per_second": 23.386, "eval_steps_per_second": 5.851, "step": 280 }, { "epoch": 0.3994673768308921, "grad_norm": 1.4942606687545776, "learning_rate": 2.989887487969095e-05, "loss": 1.0261, "step": 300 }, { "epoch": 0.3994673768308921, "eval_loss": 1.1982561349868774, "eval_runtime": 57.1051, "eval_samples_per_second": 23.378, "eval_steps_per_second": 5.849, "step": 300 }, { "epoch": 0.42609853528628494, "grad_norm": 1.6378928422927856, "learning_rate": 2.983786067505537e-05, "loss": 1.0198, "step": 320 }, { "epoch": 0.42609853528628494, "eval_loss": 1.197502851486206, "eval_runtime": 59.5901, "eval_samples_per_second": 22.403, "eval_steps_per_second": 5.605, "step": 320 }, { "epoch": 0.45272969374167776, "grad_norm": 1.569143533706665, "learning_rate": 2.9762604817936267e-05, "loss": 1.0101, "step": 340 }, { "epoch": 0.45272969374167776, "eval_loss": 1.197273850440979, "eval_runtime": 57.1144, "eval_samples_per_second": 23.374, "eval_steps_per_second": 5.848, "step": 340 }, { "epoch": 0.4793608521970706, "grad_norm": 1.6125699281692505, "learning_rate": 2.9673179540294035e-05, "loss": 1.0121, "step": 360 }, { "epoch": 0.4793608521970706, "eval_loss": 1.1948621273040771, "eval_runtime": 57.1203, "eval_samples_per_second": 23.372, "eval_steps_per_second": 5.847, "step": 360 }, { "epoch": 0.5059920106524634, "grad_norm": 1.5121594667434692, "learning_rate": 2.9569670674160343e-05, "loss": 1.0169, "step": 380 }, { "epoch": 0.5059920106524634, "eval_loss": 1.1911152601242065, "eval_runtime": 60.0674, "eval_samples_per_second": 22.225, "eval_steps_per_second": 5.56, "step": 380 }, { "epoch": 0.5326231691078562, "grad_norm": 1.5439465045928955, "learning_rate": 2.945217756925498e-05, "loss": 0.9799, "step": 400 }, { "epoch": 0.5326231691078562, "eval_loss": 1.1894199848175049, "eval_runtime": 57.1247, "eval_samples_per_second": 23.37, "eval_steps_per_second": 5.847, "step": 400 }, { "epoch": 0.559254327563249, "grad_norm": 1.857911229133606, "learning_rate": 2.9320812997628184e-05, "loss": 0.9872, "step": 420 }, { "epoch": 0.559254327563249, "eval_loss": 1.1862047910690308, "eval_runtime": 57.1282, "eval_samples_per_second": 23.368, "eval_steps_per_second": 5.847, "step": 420 }, { "epoch": 0.5858854860186418, "grad_norm": 1.6074450016021729, "learning_rate": 2.9175703045419906e-05, "loss": 0.988, "step": 440 }, { "epoch": 0.5858854860186418, "eval_loss": 1.184722661972046, "eval_runtime": 57.1666, "eval_samples_per_second": 23.353, "eval_steps_per_second": 5.843, "step": 440 }, { "epoch": 0.6125166444740346, "grad_norm": 1.587011456489563, "learning_rate": 2.9016986991840035e-05, "loss": 0.9861, "step": 460 }, { "epoch": 0.6125166444740346, "eval_loss": 1.1814427375793457, "eval_runtime": 57.1111, "eval_samples_per_second": 23.375, "eval_steps_per_second": 5.848, "step": 460 }, { "epoch": 0.6391478029294274, "grad_norm": 1.6503058671951294, "learning_rate": 2.8844817175485628e-05, "loss": 0.9997, "step": 480 }, { "epoch": 0.6391478029294274, "eval_loss": 1.1827510595321655, "eval_runtime": 59.6344, "eval_samples_per_second": 22.386, "eval_steps_per_second": 5.601, "step": 480 }, { "epoch": 0.6657789613848203, "grad_norm": 1.4606473445892334, "learning_rate": 2.865935884812353e-05, "loss": 0.9756, "step": 500 }, { "epoch": 0.6657789613848203, "eval_loss": 1.177931785583496, "eval_runtime": 57.1613, "eval_samples_per_second": 23.355, "eval_steps_per_second": 5.843, "step": 500 }, { "epoch": 0.6924101198402131, "grad_norm": 1.6386032104492188, "learning_rate": 2.8460790016078664e-05, "loss": 0.9704, "step": 520 }, { "epoch": 0.6924101198402131, "eval_loss": 1.1767512559890747, "eval_runtime": 57.128, "eval_samples_per_second": 23.369, "eval_steps_per_second": 5.847, "step": 520 }, { "epoch": 0.7190412782956058, "grad_norm": 1.5629956722259521, "learning_rate": 2.824930126938027e-05, "loss": 0.9575, "step": 540 }, { "epoch": 0.7190412782956058, "eval_loss": 1.1756982803344727, "eval_runtime": 59.3596, "eval_samples_per_second": 22.49, "eval_steps_per_second": 5.627, "step": 540 }, { "epoch": 0.7456724367509987, "grad_norm": 1.9192149639129639, "learning_rate": 2.8025095598830108e-05, "loss": 0.9845, "step": 560 }, { "epoch": 0.7456724367509987, "eval_loss": 1.1744287014007568, "eval_runtime": 57.1096, "eval_samples_per_second": 23.376, "eval_steps_per_second": 5.848, "step": 560 }, { "epoch": 0.7723035952063915, "grad_norm": 1.5297322273254395, "learning_rate": 2.7788388201168096e-05, "loss": 0.9635, "step": 580 }, { "epoch": 0.7723035952063915, "eval_loss": 1.1726077795028687, "eval_runtime": 57.1106, "eval_samples_per_second": 23.376, "eval_steps_per_second": 5.848, "step": 580 }, { "epoch": 0.7989347536617842, "grad_norm": 1.5995993614196777, "learning_rate": 2.7539406272522557e-05, "loss": 1.0019, "step": 600 }, { "epoch": 0.7989347536617842, "eval_loss": 1.1684755086898804, "eval_runtime": 59.1165, "eval_samples_per_second": 22.583, "eval_steps_per_second": 5.65, "step": 600 }, { "epoch": 0.8255659121171771, "grad_norm": 1.989475131034851, "learning_rate": 2.7278388790343133e-05, "loss": 0.965, "step": 620 }, { "epoch": 0.8255659121171771, "eval_loss": 1.16959547996521, "eval_runtime": 57.5389, "eval_samples_per_second": 23.202, "eval_steps_per_second": 5.805, "step": 620 }, { "epoch": 0.8521970705725699, "grad_norm": 1.581007719039917, "learning_rate": 2.7005586284025857e-05, "loss": 0.9521, "step": 640 }, { "epoch": 0.8521970705725699, "eval_loss": 1.1685765981674194, "eval_runtime": 57.0994, "eval_samples_per_second": 23.38, "eval_steps_per_second": 5.849, "step": 640 }, { "epoch": 0.8788282290279628, "grad_norm": 1.8926242589950562, "learning_rate": 2.6721260594450408e-05, "loss": 0.9714, "step": 660 }, { "epoch": 0.8788282290279628, "eval_loss": 1.1654787063598633, "eval_runtime": 57.0989, "eval_samples_per_second": 23.38, "eval_steps_per_second": 5.849, "step": 660 }, { "epoch": 0.9054593874833555, "grad_norm": 1.7182027101516724, "learning_rate": 2.6425684622660387e-05, "loss": 0.9893, "step": 680 }, { "epoch": 0.9054593874833555, "eval_loss": 1.1642155647277832, "eval_runtime": 57.0492, "eval_samples_per_second": 23.401, "eval_steps_per_second": 5.855, "step": 680 }, { "epoch": 0.9320905459387483, "grad_norm": 1.7494959831237793, "learning_rate": 2.6119142067927872e-05, "loss": 0.9581, "step": 700 }, { "epoch": 0.9320905459387483, "eval_loss": 1.164635419845581, "eval_runtime": 59.4597, "eval_samples_per_second": 22.452, "eval_steps_per_second": 5.617, "step": 700 }, { "epoch": 0.9587217043941412, "grad_norm": 1.9605196714401245, "learning_rate": 2.5801927155453614e-05, "loss": 0.9165, "step": 720 }, { "epoch": 0.9587217043941412, "eval_loss": 1.164476752281189, "eval_runtime": 59.4987, "eval_samples_per_second": 22.437, "eval_steps_per_second": 5.614, "step": 720 }, { "epoch": 0.9853528628495339, "grad_norm": 1.636960744857788, "learning_rate": 2.5474344353964275e-05, "loss": 0.9849, "step": 740 }, { "epoch": 0.9853528628495339, "eval_loss": 1.1622273921966553, "eval_runtime": 57.4882, "eval_samples_per_second": 23.222, "eval_steps_per_second": 5.81, "step": 740 }, { "epoch": 1.0119840213049267, "grad_norm": 1.6740643978118896, "learning_rate": 2.513670808347771e-05, "loss": 0.905, "step": 760 }, { "epoch": 1.0119840213049267, "eval_loss": 1.1645617485046387, "eval_runtime": 57.4263, "eval_samples_per_second": 23.247, "eval_steps_per_second": 5.816, "step": 760 }, { "epoch": 1.0386151797603196, "grad_norm": 1.7723573446273804, "learning_rate": 2.4789342413516838e-05, "loss": 0.8868, "step": 780 }, { "epoch": 1.0386151797603196, "eval_loss": 1.1635513305664062, "eval_runtime": 57.091, "eval_samples_per_second": 23.384, "eval_steps_per_second": 5.85, "step": 780 }, { "epoch": 1.0652463382157125, "grad_norm": 1.7861186265945435, "learning_rate": 2.4432580752061735e-05, "loss": 0.8853, "step": 800 }, { "epoch": 1.0652463382157125, "eval_loss": 1.1627150774002075, "eval_runtime": 57.0672, "eval_samples_per_second": 23.393, "eval_steps_per_second": 5.853, "step": 800 } ], "logging_steps": 20, "max_steps": 2253, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 3 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.1795547152069427e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }