{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3816793893129771, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003816793893129771, "grad_norm": 2.560382604598999, "learning_rate": 1.3740458015267178e-06, "loss": 0.1604, "step": 10 }, { "epoch": 0.007633587786259542, "grad_norm": 3.7791059017181396, "learning_rate": 2.900763358778626e-06, "loss": 0.2, "step": 20 }, { "epoch": 0.011450381679389313, "grad_norm": 3.4597890377044678, "learning_rate": 4.427480916030535e-06, "loss": 0.1753, "step": 30 }, { "epoch": 0.015267175572519083, "grad_norm": 2.7554614543914795, "learning_rate": 5.9541984732824435e-06, "loss": 0.1171, "step": 40 }, { "epoch": 0.019083969465648856, "grad_norm": 2.082803726196289, "learning_rate": 7.480916030534352e-06, "loss": 0.0549, "step": 50 }, { "epoch": 0.022900763358778626, "grad_norm": 2.8653404712677, "learning_rate": 9.007633587786259e-06, "loss": 0.0671, "step": 60 }, { "epoch": 0.026717557251908396, "grad_norm": 1.3137264251708984, "learning_rate": 1.0534351145038168e-05, "loss": 0.0492, "step": 70 }, { "epoch": 0.030534351145038167, "grad_norm": 1.7409602403640747, "learning_rate": 1.2061068702290078e-05, "loss": 0.0343, "step": 80 }, { "epoch": 0.03435114503816794, "grad_norm": 0.6623567342758179, "learning_rate": 1.3587786259541987e-05, "loss": 0.0439, "step": 90 }, { "epoch": 0.03816793893129771, "grad_norm": 1.5286792516708374, "learning_rate": 1.5114503816793895e-05, "loss": 0.0377, "step": 100 }, { "epoch": 0.04198473282442748, "grad_norm": 0.9364567399024963, "learning_rate": 1.6641221374045802e-05, "loss": 0.019, "step": 110 }, { "epoch": 0.04580152671755725, "grad_norm": 0.9754840135574341, "learning_rate": 1.816793893129771e-05, "loss": 0.0245, "step": 120 }, { "epoch": 0.04961832061068702, "grad_norm": 0.17266954481601715, "learning_rate": 1.969465648854962e-05, "loss": 0.0259, "step": 130 }, { "epoch": 0.05343511450381679, "grad_norm": 1.5330302715301514, "learning_rate": 2.122137404580153e-05, "loss": 0.0268, "step": 140 }, { "epoch": 0.05725190839694656, "grad_norm": 1.4104247093200684, "learning_rate": 2.2748091603053437e-05, "loss": 0.0211, "step": 150 }, { "epoch": 0.061068702290076333, "grad_norm": 0.004101096652448177, "learning_rate": 2.4274809160305344e-05, "loss": 0.0263, "step": 160 }, { "epoch": 0.0648854961832061, "grad_norm": 0.6165310144424438, "learning_rate": 2.5801526717557254e-05, "loss": 0.0138, "step": 170 }, { "epoch": 0.06870229007633588, "grad_norm": 0.37463024258613586, "learning_rate": 2.732824427480916e-05, "loss": 0.0322, "step": 180 }, { "epoch": 0.07251908396946564, "grad_norm": 0.620210587978363, "learning_rate": 2.885496183206107e-05, "loss": 0.0188, "step": 190 }, { "epoch": 0.07633587786259542, "grad_norm": 0.758702278137207, "learning_rate": 3.038167938931298e-05, "loss": 0.0139, "step": 200 }, { "epoch": 0.08015267175572519, "grad_norm": 1.3074781894683838, "learning_rate": 3.190839694656489e-05, "loss": 0.0306, "step": 210 }, { "epoch": 0.08396946564885496, "grad_norm": 0.6031699776649475, "learning_rate": 3.3435114503816796e-05, "loss": 0.0175, "step": 220 }, { "epoch": 0.08778625954198473, "grad_norm": 1.136716365814209, "learning_rate": 3.4961832061068706e-05, "loss": 0.0242, "step": 230 }, { "epoch": 0.0916030534351145, "grad_norm": 0.5463850498199463, "learning_rate": 3.6488549618320616e-05, "loss": 0.0218, "step": 240 }, { "epoch": 0.09541984732824428, "grad_norm": 0.1893869787454605, "learning_rate": 3.801526717557252e-05, "loss": 0.0171, "step": 250 }, { "epoch": 0.09923664122137404, "grad_norm": 0.05841643363237381, "learning_rate": 3.954198473282443e-05, "loss": 0.0183, "step": 260 }, { "epoch": 0.10305343511450382, "grad_norm": 0.5519092082977295, "learning_rate": 3.999913022869401e-05, "loss": 0.0171, "step": 270 }, { "epoch": 0.10687022900763359, "grad_norm": 0.75583815574646, "learning_rate": 3.999487030645216e-05, "loss": 0.0269, "step": 280 }, { "epoch": 0.11068702290076336, "grad_norm": 0.34393128752708435, "learning_rate": 3.998706123456487e-05, "loss": 0.021, "step": 290 }, { "epoch": 0.11450381679389313, "grad_norm": 0.741912841796875, "learning_rate": 3.997570439916593e-05, "loss": 0.0139, "step": 300 }, { "epoch": 0.1183206106870229, "grad_norm": 0.42950552701950073, "learning_rate": 3.996080181612776e-05, "loss": 0.01, "step": 310 }, { "epoch": 0.12213740458015267, "grad_norm": 0.8363562822341919, "learning_rate": 3.9942356130703715e-05, "loss": 0.0119, "step": 320 }, { "epoch": 0.12595419847328243, "grad_norm": 0.2727142572402954, "learning_rate": 3.992037061705845e-05, "loss": 0.0127, "step": 330 }, { "epoch": 0.1297709923664122, "grad_norm": 1.3103034496307373, "learning_rate": 3.98948491776868e-05, "loss": 0.0167, "step": 340 }, { "epoch": 0.13358778625954199, "grad_norm": 0.1281624436378479, "learning_rate": 3.986579634272105e-05, "loss": 0.0178, "step": 350 }, { "epoch": 0.13740458015267176, "grad_norm": 0.22181691229343414, "learning_rate": 3.9833217269126826e-05, "loss": 0.0213, "step": 360 }, { "epoch": 0.14122137404580154, "grad_norm": 0.3226180970668793, "learning_rate": 3.979711773978772e-05, "loss": 0.0177, "step": 370 }, { "epoch": 0.1450381679389313, "grad_norm": 0.15748214721679688, "learning_rate": 3.9757504162478827e-05, "loss": 0.0161, "step": 380 }, { "epoch": 0.14885496183206107, "grad_norm": 0.7109506726264954, "learning_rate": 3.971438356872931e-05, "loss": 0.0186, "step": 390 }, { "epoch": 0.15267175572519084, "grad_norm": 0.1298808604478836, "learning_rate": 3.966776361257431e-05, "loss": 0.0068, "step": 400 }, { "epoch": 0.15648854961832062, "grad_norm": 0.5112190842628479, "learning_rate": 3.961765256919633e-05, "loss": 0.0175, "step": 410 }, { "epoch": 0.16030534351145037, "grad_norm": 0.12070173770189285, "learning_rate": 3.956405933345637e-05, "loss": 0.0138, "step": 420 }, { "epoch": 0.16412213740458015, "grad_norm": 0.009782848879694939, "learning_rate": 3.950699341831502e-05, "loss": 0.0087, "step": 430 }, { "epoch": 0.16793893129770993, "grad_norm": 0.49791470170021057, "learning_rate": 3.944646495314395e-05, "loss": 0.0153, "step": 440 }, { "epoch": 0.1717557251908397, "grad_norm": 0.5504122376441956, "learning_rate": 3.938248468192785e-05, "loss": 0.0202, "step": 450 }, { "epoch": 0.17557251908396945, "grad_norm": 1.6467090845108032, "learning_rate": 3.931506396135738e-05, "loss": 0.0206, "step": 460 }, { "epoch": 0.17938931297709923, "grad_norm": 0.4959694445133209, "learning_rate": 3.924421475881333e-05, "loss": 0.0146, "step": 470 }, { "epoch": 0.183206106870229, "grad_norm": 0.6547468304634094, "learning_rate": 3.916994965024234e-05, "loss": 0.014, "step": 480 }, { "epoch": 0.18702290076335878, "grad_norm": 0.37912440299987793, "learning_rate": 3.909228181792464e-05, "loss": 0.0149, "step": 490 }, { "epoch": 0.19083969465648856, "grad_norm": 0.17722374200820923, "learning_rate": 3.901122504813418e-05, "loss": 0.0142, "step": 500 }, { "epoch": 0.19083969465648856, "eval_loss": 0.03738304600119591, "eval_runtime": 140.2925, "eval_samples_per_second": 56.895, "eval_steps_per_second": 0.299, "step": 500 }, { "epoch": 0.1946564885496183, "grad_norm": 0.012979953549802303, "learning_rate": 3.89267937286915e-05, "loss": 0.0095, "step": 510 }, { "epoch": 0.1984732824427481, "grad_norm": 0.30195748805999756, "learning_rate": 3.883900284640987e-05, "loss": 0.0075, "step": 520 }, { "epoch": 0.20229007633587787, "grad_norm": 0.7806116938591003, "learning_rate": 3.874786798443505e-05, "loss": 0.0169, "step": 530 }, { "epoch": 0.20610687022900764, "grad_norm": 0.17662391066551208, "learning_rate": 3.8653405319479285e-05, "loss": 0.0135, "step": 540 }, { "epoch": 0.2099236641221374, "grad_norm": 0.12067825347185135, "learning_rate": 3.855563161894985e-05, "loss": 0.0104, "step": 550 }, { "epoch": 0.21374045801526717, "grad_norm": 0.5847358107566833, "learning_rate": 3.8454564237972786e-05, "loss": 0.0175, "step": 560 }, { "epoch": 0.21755725190839695, "grad_norm": 0.2978436350822449, "learning_rate": 3.835022111631235e-05, "loss": 0.0093, "step": 570 }, { "epoch": 0.22137404580152673, "grad_norm": 0.07228351384401321, "learning_rate": 3.8242620775186596e-05, "loss": 0.0111, "step": 580 }, { "epoch": 0.22519083969465647, "grad_norm": 0.33176591992378235, "learning_rate": 3.8131782313979864e-05, "loss": 0.0087, "step": 590 }, { "epoch": 0.22900763358778625, "grad_norm": 0.08005018532276154, "learning_rate": 3.801772540685253e-05, "loss": 0.011, "step": 600 }, { "epoch": 0.23282442748091603, "grad_norm": 0.09141408652067184, "learning_rate": 3.7900470299248824e-05, "loss": 0.0111, "step": 610 }, { "epoch": 0.2366412213740458, "grad_norm": 0.21106335520744324, "learning_rate": 3.778003780430317e-05, "loss": 0.0091, "step": 620 }, { "epoch": 0.24045801526717558, "grad_norm": 0.8840196132659912, "learning_rate": 3.7656449299145814e-05, "loss": 0.0096, "step": 630 }, { "epoch": 0.24427480916030533, "grad_norm": 0.09080683439970016, "learning_rate": 3.7529726721108305e-05, "loss": 0.0113, "step": 640 }, { "epoch": 0.2480916030534351, "grad_norm": 0.026247017085552216, "learning_rate": 3.739989256382957e-05, "loss": 0.0117, "step": 650 }, { "epoch": 0.25190839694656486, "grad_norm": 0.116756871342659, "learning_rate": 3.726696987326319e-05, "loss": 0.0097, "step": 660 }, { "epoch": 0.25572519083969464, "grad_norm": 0.23034921288490295, "learning_rate": 3.713098224358673e-05, "loss": 0.0072, "step": 670 }, { "epoch": 0.2595419847328244, "grad_norm": 0.03517864644527435, "learning_rate": 3.699195381301364e-05, "loss": 0.0048, "step": 680 }, { "epoch": 0.2633587786259542, "grad_norm": 0.23623879253864288, "learning_rate": 3.684990925950871e-05, "loss": 0.0103, "step": 690 }, { "epoch": 0.26717557251908397, "grad_norm": 0.043465759605169296, "learning_rate": 3.6704873796407605e-05, "loss": 0.0118, "step": 700 }, { "epoch": 0.27099236641221375, "grad_norm": 0.2958603799343109, "learning_rate": 3.655687316794144e-05, "loss": 0.0092, "step": 710 }, { "epoch": 0.2748091603053435, "grad_norm": 0.20194493234157562, "learning_rate": 3.640593364466712e-05, "loss": 0.0097, "step": 720 }, { "epoch": 0.2786259541984733, "grad_norm": 1.3390525579452515, "learning_rate": 3.625208201880421e-05, "loss": 0.0155, "step": 730 }, { "epoch": 0.2824427480916031, "grad_norm": 0.41017207503318787, "learning_rate": 3.609534559947926e-05, "loss": 0.0122, "step": 740 }, { "epoch": 0.2862595419847328, "grad_norm": 0.30380892753601074, "learning_rate": 3.5935752207878315e-05, "loss": 0.0112, "step": 750 }, { "epoch": 0.2900763358778626, "grad_norm": 0.3873494863510132, "learning_rate": 3.5773330172308627e-05, "loss": 0.0067, "step": 760 }, { "epoch": 0.29389312977099236, "grad_norm": 0.4039303958415985, "learning_rate": 3.5608108323170246e-05, "loss": 0.0109, "step": 770 }, { "epoch": 0.29770992366412213, "grad_norm": 0.5778370499610901, "learning_rate": 3.544011598783855e-05, "loss": 0.0177, "step": 780 }, { "epoch": 0.3015267175572519, "grad_norm": 0.032092221081256866, "learning_rate": 3.526938298545858e-05, "loss": 0.0144, "step": 790 }, { "epoch": 0.3053435114503817, "grad_norm": 0.06190834194421768, "learning_rate": 3.509593962165199e-05, "loss": 0.0117, "step": 800 }, { "epoch": 0.30916030534351147, "grad_norm": 0.609595000743866, "learning_rate": 3.491981668313776e-05, "loss": 0.0141, "step": 810 }, { "epoch": 0.31297709923664124, "grad_norm": 0.14446590840816498, "learning_rate": 3.474104543226746e-05, "loss": 0.0125, "step": 820 }, { "epoch": 0.31679389312977096, "grad_norm": 0.23196451365947723, "learning_rate": 3.455965760147604e-05, "loss": 0.0068, "step": 830 }, { "epoch": 0.32061068702290074, "grad_norm": 0.4062262177467346, "learning_rate": 3.4375685387649266e-05, "loss": 0.0157, "step": 840 }, { "epoch": 0.3244274809160305, "grad_norm": 0.16551926732063293, "learning_rate": 3.4189161446408706e-05, "loss": 0.0048, "step": 850 }, { "epoch": 0.3282442748091603, "grad_norm": 0.1871049553155899, "learning_rate": 3.4000118886315173e-05, "loss": 0.0082, "step": 860 }, { "epoch": 0.3320610687022901, "grad_norm": 0.437691330909729, "learning_rate": 3.3808591262991935e-05, "loss": 0.0133, "step": 870 }, { "epoch": 0.33587786259541985, "grad_norm": 0.24410566687583923, "learning_rate": 3.361461257316845e-05, "loss": 0.007, "step": 880 }, { "epoch": 0.33969465648854963, "grad_norm": 0.532948911190033, "learning_rate": 3.341821724864585e-05, "loss": 0.0097, "step": 890 }, { "epoch": 0.3435114503816794, "grad_norm": 0.1803872287273407, "learning_rate": 3.321944015018522e-05, "loss": 0.0074, "step": 900 }, { "epoch": 0.3473282442748092, "grad_norm": 0.019201336428523064, "learning_rate": 3.3018316561319667e-05, "loss": 0.0195, "step": 910 }, { "epoch": 0.3511450381679389, "grad_norm": 0.38370391726493835, "learning_rate": 3.281488218209141e-05, "loss": 0.0081, "step": 920 }, { "epoch": 0.3549618320610687, "grad_norm": 1.1995326280593872, "learning_rate": 3.260917312271491e-05, "loss": 0.0181, "step": 930 }, { "epoch": 0.35877862595419846, "grad_norm": 0.1598449945449829, "learning_rate": 3.2401225897167193e-05, "loss": 0.0127, "step": 940 }, { "epoch": 0.36259541984732824, "grad_norm": 0.15862394869327545, "learning_rate": 3.219107741670653e-05, "loss": 0.0096, "step": 950 }, { "epoch": 0.366412213740458, "grad_norm": 0.02166523039340973, "learning_rate": 3.1978764983320556e-05, "loss": 0.0135, "step": 960 }, { "epoch": 0.3702290076335878, "grad_norm": 0.05919814854860306, "learning_rate": 3.176432628310506e-05, "loss": 0.0087, "step": 970 }, { "epoch": 0.37404580152671757, "grad_norm": 0.11933150142431259, "learning_rate": 3.1547799379574593e-05, "loss": 0.0048, "step": 980 }, { "epoch": 0.37786259541984735, "grad_norm": 0.044685620814561844, "learning_rate": 3.132922270690607e-05, "loss": 0.0121, "step": 990 }, { "epoch": 0.3816793893129771, "grad_norm": 0.5684109330177307, "learning_rate": 3.1108635063116604e-05, "loss": 0.0107, "step": 1000 }, { "epoch": 0.3816793893129771, "eval_loss": 0.03127766400575638, "eval_runtime": 140.1532, "eval_samples_per_second": 56.952, "eval_steps_per_second": 0.3, "step": 1000 } ], "logging_steps": 10, "max_steps": 2620, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.056287130325811e+17, "train_batch_size": 192, "trial_name": null, "trial_params": null }