{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 795, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012586532410320957, "grad_norm": 0.662326455116272, "learning_rate": 2.25e-05, "loss": 0.4395, "step": 10 }, { "epoch": 0.025173064820641914, "grad_norm": 0.6513380408287048, "learning_rate": 4.75e-05, "loss": 0.2944, "step": 20 }, { "epoch": 0.03775959723096287, "grad_norm": 0.8022783994674683, "learning_rate": 7.25e-05, "loss": 0.2625, "step": 30 }, { "epoch": 0.05034612964128383, "grad_norm": 0.873383104801178, "learning_rate": 9.75e-05, "loss": 0.2161, "step": 40 }, { "epoch": 0.06293266205160478, "grad_norm": 0.6631338000297546, "learning_rate": 9.99649425670309e-05, "loss": 0.225, "step": 50 }, { "epoch": 0.07551919446192575, "grad_norm": 1.2419400215148926, "learning_rate": 9.984381949287127e-05, "loss": 0.1729, "step": 60 }, { "epoch": 0.0881057268722467, "grad_norm": 0.8251152038574219, "learning_rate": 9.9636407606048e-05, "loss": 0.2406, "step": 70 }, { "epoch": 0.10069225928256766, "grad_norm": 0.8475081324577332, "learning_rate": 9.93430659746588e-05, "loss": 0.1807, "step": 80 }, { "epoch": 0.11327879169288861, "grad_norm": 0.5358186960220337, "learning_rate": 9.896430242698277e-05, "loss": 0.2198, "step": 90 }, { "epoch": 0.12586532410320955, "grad_norm": 0.6192899942398071, "learning_rate": 9.850077267233636e-05, "loss": 0.208, "step": 100 }, { "epoch": 0.13845185651353054, "grad_norm": 0.9918169379234314, "learning_rate": 9.795327916592004e-05, "loss": 0.194, "step": 110 }, { "epoch": 0.1510383889238515, "grad_norm": 0.46744799613952637, "learning_rate": 9.732276971962082e-05, "loss": 0.1777, "step": 120 }, { "epoch": 0.16362492133417245, "grad_norm": 1.0237959623336792, "learning_rate": 9.661033586117569e-05, "loss": 0.1703, "step": 130 }, { "epoch": 0.1762114537444934, "grad_norm": 0.7563631534576416, "learning_rate": 9.581721094453653e-05, "loss": 0.2294, "step": 140 }, { "epoch": 0.18879798615481436, "grad_norm": 0.6137734651565552, "learning_rate": 9.494476801470761e-05, "loss": 0.1962, "step": 150 }, { "epoch": 0.2013845185651353, "grad_norm": 0.521679699420929, "learning_rate": 9.399451743075247e-05, "loss": 0.1464, "step": 160 }, { "epoch": 0.21397105097545627, "grad_norm": 0.7511913776397705, "learning_rate": 9.296810425108472e-05, "loss": 0.2017, "step": 170 }, { "epoch": 0.22655758338577722, "grad_norm": 0.43685382604599, "learning_rate": 9.186730538556954e-05, "loss": 0.189, "step": 180 }, { "epoch": 0.23914411579609818, "grad_norm": 0.9906732439994812, "learning_rate": 9.069402651936615e-05, "loss": 0.1537, "step": 190 }, { "epoch": 0.2517306482064191, "grad_norm": 1.0969626903533936, "learning_rate": 8.945029881383655e-05, "loss": 0.1995, "step": 200 }, { "epoch": 0.2643171806167401, "grad_norm": 0.756263792514801, "learning_rate": 8.813827539023187e-05, "loss": 0.1897, "step": 210 }, { "epoch": 0.27690371302706107, "grad_norm": 0.8148688673973083, "learning_rate": 8.676022760224371e-05, "loss": 0.1644, "step": 220 }, { "epoch": 0.289490245437382, "grad_norm": 0.5143342018127441, "learning_rate": 8.531854110387351e-05, "loss": 0.1941, "step": 230 }, { "epoch": 0.302076777847703, "grad_norm": 0.7203834652900696, "learning_rate": 8.381571171942692e-05, "loss": 0.1517, "step": 240 }, { "epoch": 0.3146633102580239, "grad_norm": 0.45761600136756897, "learning_rate": 8.225434112278328e-05, "loss": 0.1531, "step": 250 }, { "epoch": 0.3272498426683449, "grad_norm": 0.785302996635437, "learning_rate": 8.063713233341985e-05, "loss": 0.1662, "step": 260 }, { "epoch": 0.3398363750786658, "grad_norm": 0.6176906228065491, "learning_rate": 7.896688503698841e-05, "loss": 0.1392, "step": 270 }, { "epoch": 0.3524229074889868, "grad_norm": 0.6485356092453003, "learning_rate": 7.724649073854477e-05, "loss": 0.1623, "step": 280 }, { "epoch": 0.36500943989930773, "grad_norm": 0.5827588438987732, "learning_rate": 7.547892775682206e-05, "loss": 0.1666, "step": 290 }, { "epoch": 0.3775959723096287, "grad_norm": 0.5967457890510559, "learning_rate": 7.366725606821356e-05, "loss": 0.1494, "step": 300 }, { "epoch": 0.39018250471994964, "grad_norm": 0.8609017133712769, "learning_rate": 7.181461200939127e-05, "loss": 0.1374, "step": 310 }, { "epoch": 0.4027690371302706, "grad_norm": 0.6973686218261719, "learning_rate": 6.992420284773055e-05, "loss": 0.1385, "step": 320 }, { "epoch": 0.41535556954059155, "grad_norm": 0.7819589972496033, "learning_rate": 6.799930122894089e-05, "loss": 0.1876, "step": 330 }, { "epoch": 0.42794210195091253, "grad_norm": 0.7068976163864136, "learning_rate": 6.604323951151472e-05, "loss": 0.1737, "step": 340 }, { "epoch": 0.44052863436123346, "grad_norm": 0.3449420630931854, "learning_rate": 6.405940399780222e-05, "loss": 0.1392, "step": 350 }, { "epoch": 0.45311516677155445, "grad_norm": 0.66518634557724, "learning_rate": 6.205122907169974e-05, "loss": 0.1841, "step": 360 }, { "epoch": 0.4657016991818754, "grad_norm": 0.7556864619255066, "learning_rate": 6.0022191253099955e-05, "loss": 0.1665, "step": 370 }, { "epoch": 0.47828823159219636, "grad_norm": 0.41134288907051086, "learning_rate": 5.7975803179397034e-05, "loss": 0.1281, "step": 380 }, { "epoch": 0.4908747640025173, "grad_norm": 0.5263403654098511, "learning_rate": 5.591560752446554e-05, "loss": 0.1648, "step": 390 }, { "epoch": 0.5034612964128382, "grad_norm": 0.6483963131904602, "learning_rate": 5.384517086564104e-05, "loss": 0.161, "step": 400 }, { "epoch": 0.5160478288231592, "grad_norm": 0.7022745013237, "learning_rate": 5.1768077509318994e-05, "loss": 0.154, "step": 410 }, { "epoch": 0.5286343612334802, "grad_norm": 0.835760235786438, "learning_rate": 4.968792328586181e-05, "loss": 0.1691, "step": 420 }, { "epoch": 0.5412208936438011, "grad_norm": 0.7210333943367004, "learning_rate": 4.76083093245554e-05, "loss": 0.1475, "step": 430 }, { "epoch": 0.5538074260541221, "grad_norm": 0.7896990180015564, "learning_rate": 4.5532835819392664e-05, "loss": 0.1688, "step": 440 }, { "epoch": 0.5663939584644431, "grad_norm": 0.6531348824501038, "learning_rate": 4.346509579647588e-05, "loss": 0.1487, "step": 450 }, { "epoch": 0.578980490874764, "grad_norm": 0.5440480709075928, "learning_rate": 4.140866889382814e-05, "loss": 0.1527, "step": 460 }, { "epoch": 0.5915670232850849, "grad_norm": 0.6104904413223267, "learning_rate": 3.936711516438211e-05, "loss": 0.1514, "step": 470 }, { "epoch": 0.604153555695406, "grad_norm": 1.1999554634094238, "learning_rate": 3.7343968912873936e-05, "loss": 0.1741, "step": 480 }, { "epoch": 0.6167400881057269, "grad_norm": 0.6821284294128418, "learning_rate": 3.5342732577312304e-05, "loss": 0.1558, "step": 490 }, { "epoch": 0.6293266205160478, "grad_norm": 0.7797755599021912, "learning_rate": 3.3366870665614345e-05, "loss": 0.1665, "step": 500 }, { "epoch": 0.6419131529263687, "grad_norm": 0.6217632293701172, "learning_rate": 3.141980375790578e-05, "loss": 0.1336, "step": 510 }, { "epoch": 0.6544996853366898, "grad_norm": 0.6567578911781311, "learning_rate": 2.9504902584868032e-05, "loss": 0.1438, "step": 520 }, { "epoch": 0.6670862177470107, "grad_norm": 1.0686393976211548, "learning_rate": 2.762548219238381e-05, "loss": 0.1896, "step": 530 }, { "epoch": 0.6796727501573316, "grad_norm": 1.048843502998352, "learning_rate": 2.5784796202583518e-05, "loss": 0.1269, "step": 540 }, { "epoch": 0.6922592825676526, "grad_norm": 0.46409177780151367, "learning_rate": 2.3986031181227146e-05, "loss": 0.1432, "step": 550 }, { "epoch": 0.7048458149779736, "grad_norm": 0.73060142993927, "learning_rate": 2.223230112117348e-05, "loss": 0.1346, "step": 560 }, { "epoch": 0.7174323473882945, "grad_norm": 1.0117199420928955, "learning_rate": 2.0526642051485982e-05, "loss": 0.1332, "step": 570 }, { "epoch": 0.7300188797986155, "grad_norm": 0.6381825804710388, "learning_rate": 1.8872006781508516e-05, "loss": 0.1376, "step": 580 }, { "epoch": 0.7426054122089364, "grad_norm": 0.7147350311279297, "learning_rate": 1.727125978900964e-05, "loss": 0.1171, "step": 590 }, { "epoch": 0.7551919446192574, "grad_norm": 0.775750994682312, "learning_rate": 1.5727172261245148e-05, "loss": 0.1547, "step": 600 }, { "epoch": 0.7677784770295784, "grad_norm": 0.4995289444923401, "learning_rate": 1.4242417297523486e-05, "loss": 0.1865, "step": 610 }, { "epoch": 0.7803650094398993, "grad_norm": 0.5421444773674011, "learning_rate": 1.2819565281579748e-05, "loss": 0.141, "step": 620 }, { "epoch": 0.7929515418502202, "grad_norm": 0.5107913017272949, "learning_rate": 1.1461079431768745e-05, "loss": 0.1374, "step": 630 }, { "epoch": 0.8055380742605412, "grad_norm": 0.37916406989097595, "learning_rate": 1.0169311536781551e-05, "loss": 0.1211, "step": 640 }, { "epoch": 0.8181246066708622, "grad_norm": 0.5083438158035278, "learning_rate": 8.94649788426682e-06, "loss": 0.1325, "step": 650 }, { "epoch": 0.8307111390811831, "grad_norm": 0.621924102306366, "learning_rate": 7.794755389405973e-06, "loss": 0.1356, "step": 660 }, { "epoch": 0.8432976714915041, "grad_norm": 0.9067131280899048, "learning_rate": 6.71607793014391e-06, "loss": 0.1628, "step": 670 }, { "epoch": 0.8558842039018251, "grad_norm": 0.4797362983226776, "learning_rate": 5.712332895419797e-06, "loss": 0.1316, "step": 680 }, { "epoch": 0.868470736312146, "grad_norm": 0.6007776260375977, "learning_rate": 4.7852579523735785e-06, "loss": 0.1478, "step": 690 }, { "epoch": 0.8810572687224669, "grad_norm": 0.40377601981163025, "learning_rate": 3.936458038124874e-06, "loss": 0.117, "step": 700 }, { "epoch": 0.893643801132788, "grad_norm": 0.8351572751998901, "learning_rate": 3.1674025813316967e-06, "loss": 0.133, "step": 710 }, { "epoch": 0.9062303335431089, "grad_norm": 0.5732216238975525, "learning_rate": 2.4794229583395e-06, "loss": 0.1511, "step": 720 }, { "epoch": 0.9188168659534298, "grad_norm": 1.0418872833251953, "learning_rate": 1.8737101883239571e-06, "loss": 0.1491, "step": 730 }, { "epoch": 0.9314033983637507, "grad_norm": 0.657448410987854, "learning_rate": 1.3513128714178757e-06, "loss": 0.1167, "step": 740 }, { "epoch": 0.9439899307740718, "grad_norm": 0.5969143509864807, "learning_rate": 9.131353733916437e-07, "loss": 0.1305, "step": 750 }, { "epoch": 0.9565764631843927, "grad_norm": 0.8757323622703552, "learning_rate": 5.599362600298308e-07, "loss": 0.12, "step": 760 }, { "epoch": 0.9691629955947136, "grad_norm": 0.4313090145587921, "learning_rate": 2.923269839144649e-07, "loss": 0.1533, "step": 770 }, { "epoch": 0.9817495280050346, "grad_norm": 0.8322012424468994, "learning_rate": 1.107708258881457e-07, "loss": 0.1334, "step": 780 }, { "epoch": 0.9943360604153556, "grad_norm": 0.7973638772964478, "learning_rate": 1.5582093029814504e-08, "loss": 0.1526, "step": 790 }, { "epoch": 1.0, "step": 795, "total_flos": 1.1946958975929385e+18, "train_loss": 0.16642639535777973, "train_runtime": 8286.6819, "train_samples_per_second": 1.533, "train_steps_per_second": 0.096 } ], "logging_steps": 10, "max_steps": 795, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1946958975929385e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }