{ "best_global_step": 750, "best_metric": 0.7697240710258484, "best_model_checkpoint": "/home/tkwang/scratch/SecSteer-v2/axolotl-outputs/lora/Qwen2.5-Coder-7B-evol-stage1/checkpoint-750", "epoch": 0.7976601967561818, "eval_steps": 150, "global_step": 750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_loss": 0.804951012134552, "eval_ppl": 2.23659, "eval_runtime": 237.4236, "eval_samples_per_second": 28.161, "eval_steps_per_second": 1.761, "memory/device_reserved (GiB)": 41.82, "memory/max_active (GiB)": 37.85, "memory/max_allocated (GiB)": 37.85, "step": 0 }, { "epoch": 0.0010635469290082426, "grad_norm": 0.0461450070142746, "learning_rate": 0.0, "loss": 0.7683508396148682, "memory/device_reserved (GiB)": 51.3, "memory/max_active (GiB)": 45.83, "memory/max_allocated (GiB)": 45.83, "ppl": 2.15621, "step": 1, "tokens/total": 262144, "tokens/train_per_sec_per_gpu": 143.71, "tokens/trainable": 21968 }, { "epoch": 0.002127093858016485, "grad_norm": 0.053678449243307114, "learning_rate": 4.2553191489361704e-07, "loss": 0.8233645558357239, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.27815, "step": 2, "tokens/total": 524288, "tokens/train_per_sec_per_gpu": 206.75, "tokens/trainable": 44841 }, { "epoch": 0.0031906407870247273, "grad_norm": 0.05005470663309097, "learning_rate": 8.510638297872341e-07, "loss": 0.8364737033843994, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.30821, "step": 3, "tokens/total": 786432, "tokens/train_per_sec_per_gpu": 190.31, "tokens/trainable": 67815 }, { "epoch": 0.00425418771603297, "grad_norm": 0.04871873930096626, "learning_rate": 1.276595744680851e-06, "loss": 0.7923524975776672, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.20859, "step": 4, "tokens/total": 1048576, "tokens/train_per_sec_per_gpu": 172.64, "tokens/trainable": 89239 }, { "epoch": 0.0053177346450412125, "grad_norm": 0.052344731986522675, "learning_rate": 1.7021276595744682e-06, "loss": 0.7642413973808289, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.14736, "step": 5, "tokens/total": 1310720, "tokens/train_per_sec_per_gpu": 185.1, "tokens/trainable": 109336 }, { "epoch": 0.006381281574049455, "grad_norm": 0.04826882481575012, "learning_rate": 2.1276595744680853e-06, "loss": 0.8659416437149048, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.37724, "step": 6, "tokens/total": 1572864, "tokens/train_per_sec_per_gpu": 183.2, "tokens/trainable": 130625 }, { "epoch": 0.007444828503057698, "grad_norm": 0.05040327087044716, "learning_rate": 2.553191489361702e-06, "loss": 0.8249338865280151, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.28173, "step": 7, "tokens/total": 1835008, "tokens/train_per_sec_per_gpu": 209.68, "tokens/trainable": 153061 }, { "epoch": 0.00850837543206594, "grad_norm": 0.05111980810761452, "learning_rate": 2.978723404255319e-06, "loss": 0.7967497706413269, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.21832, "step": 8, "tokens/total": 2097152, "tokens/train_per_sec_per_gpu": 185.83, "tokens/trainable": 173852 }, { "epoch": 0.009571922361074183, "grad_norm": 0.04268274083733559, "learning_rate": 3.4042553191489363e-06, "loss": 0.7484039068222046, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11362, "step": 9, "tokens/total": 2359296, "tokens/train_per_sec_per_gpu": 214.69, "tokens/trainable": 196255 }, { "epoch": 0.010635469290082425, "grad_norm": 0.04418900981545448, "learning_rate": 3.8297872340425535e-06, "loss": 0.8072069883346558, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.24164, "step": 10, "tokens/total": 2621440, "tokens/train_per_sec_per_gpu": 168.89, "tokens/trainable": 217227 }, { "epoch": 0.011699016219090667, "grad_norm": 0.04787033051252365, "learning_rate": 4.255319148936171e-06, "loss": 0.7684656381607056, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15645, "step": 11, "tokens/total": 2883584, "tokens/train_per_sec_per_gpu": 187.36, "tokens/trainable": 240990 }, { "epoch": 0.01276256314809891, "grad_norm": 0.04942560940980911, "learning_rate": 4.680851063829788e-06, "loss": 0.8147498369216919, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.25861, "step": 12, "tokens/total": 3145728, "tokens/train_per_sec_per_gpu": 146.78, "tokens/trainable": 263455 }, { "epoch": 0.013826110077107153, "grad_norm": 0.05288751795887947, "learning_rate": 5.106382978723404e-06, "loss": 0.8472910523414612, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.33332, "step": 13, "tokens/total": 3407872, "tokens/train_per_sec_per_gpu": 179.19, "tokens/trainable": 286052 }, { "epoch": 0.014889657006115395, "grad_norm": 0.05669346824288368, "learning_rate": 5.531914893617022e-06, "loss": 0.7845062017440796, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19132, "step": 14, "tokens/total": 3670016, "tokens/train_per_sec_per_gpu": 147.62, "tokens/trainable": 305257 }, { "epoch": 0.015953203935123637, "grad_norm": 0.04507856070995331, "learning_rate": 5.957446808510638e-06, "loss": 0.7491350769996643, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11517, "step": 15, "tokens/total": 3932160, "tokens/train_per_sec_per_gpu": 254.85, "tokens/trainable": 328628 }, { "epoch": 0.01701675086413188, "grad_norm": 0.04711790010333061, "learning_rate": 6.382978723404256e-06, "loss": 0.7844012975692749, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19109, "step": 16, "tokens/total": 4194304, "tokens/train_per_sec_per_gpu": 210.21, "tokens/trainable": 351162 }, { "epoch": 0.01808029779314012, "grad_norm": 0.04914192110300064, "learning_rate": 6.808510638297873e-06, "loss": 0.8149253129959106, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.25901, "step": 17, "tokens/total": 4456448, "tokens/train_per_sec_per_gpu": 172.85, "tokens/trainable": 373818 }, { "epoch": 0.019143844722148366, "grad_norm": 0.057912107557058334, "learning_rate": 7.234042553191491e-06, "loss": 0.7840430736541748, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19031, "step": 18, "tokens/total": 4718592, "tokens/train_per_sec_per_gpu": 154.54, "tokens/trainable": 393528 }, { "epoch": 0.020207391651156606, "grad_norm": 0.05160650238394737, "learning_rate": 7.659574468085107e-06, "loss": 0.8152034282684326, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.25964, "step": 19, "tokens/total": 4980736, "tokens/train_per_sec_per_gpu": 188.44, "tokens/trainable": 415305 }, { "epoch": 0.02127093858016485, "grad_norm": 0.052523426711559296, "learning_rate": 8.085106382978723e-06, "loss": 0.816782534122467, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.26321, "step": 20, "tokens/total": 5242880, "tokens/train_per_sec_per_gpu": 223.4, "tokens/trainable": 438205 }, { "epoch": 0.022334485509173094, "grad_norm": 0.06776182353496552, "learning_rate": 8.510638297872341e-06, "loss": 0.8410882949829102, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.31889, "step": 21, "tokens/total": 5505024, "tokens/train_per_sec_per_gpu": 177.12, "tokens/trainable": 457521 }, { "epoch": 0.023398032438181334, "grad_norm": 0.05495529994368553, "learning_rate": 8.936170212765958e-06, "loss": 0.7947180271148682, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.21382, "step": 22, "tokens/total": 5767168, "tokens/train_per_sec_per_gpu": 175.57, "tokens/trainable": 478475 }, { "epoch": 0.024461579367189578, "grad_norm": 0.061899591237306595, "learning_rate": 9.361702127659576e-06, "loss": 0.8852798342704773, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.42366, "step": 23, "tokens/total": 6029312, "tokens/train_per_sec_per_gpu": 142.65, "tokens/trainable": 498659 }, { "epoch": 0.02552512629619782, "grad_norm": 0.0498763844370842, "learning_rate": 9.787234042553192e-06, "loss": 0.672831654548645, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 1.95978, "step": 24, "tokens/total": 6291456, "tokens/train_per_sec_per_gpu": 214.6, "tokens/trainable": 521341 }, { "epoch": 0.026588673225206062, "grad_norm": 0.056778695434331894, "learning_rate": 1.0212765957446808e-05, "loss": 0.7445163726806641, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10542, "step": 25, "tokens/total": 6553600, "tokens/train_per_sec_per_gpu": 191.7, "tokens/trainable": 542902 }, { "epoch": 0.027652220154214306, "grad_norm": 0.05652826279401779, "learning_rate": 1.0638297872340426e-05, "loss": 0.7965201139450073, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.21781, "step": 26, "tokens/total": 6815744, "tokens/train_per_sec_per_gpu": 173.66, "tokens/trainable": 565280 }, { "epoch": 0.028715767083222547, "grad_norm": 0.05963267385959625, "learning_rate": 1.1063829787234044e-05, "loss": 0.8641867637634277, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.37308, "step": 27, "tokens/total": 7077888, "tokens/train_per_sec_per_gpu": 197.23, "tokens/trainable": 587539 }, { "epoch": 0.02977931401223079, "grad_norm": 0.06136506423354149, "learning_rate": 1.1489361702127662e-05, "loss": 0.8187180161476135, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.26759, "step": 28, "tokens/total": 7340032, "tokens/train_per_sec_per_gpu": 222.81, "tokens/trainable": 609839 }, { "epoch": 0.03084286094123903, "grad_norm": 0.0565866194665432, "learning_rate": 1.1914893617021277e-05, "loss": 0.8024689555168152, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.23104, "step": 29, "tokens/total": 7602176, "tokens/train_per_sec_per_gpu": 204.38, "tokens/trainable": 631280 }, { "epoch": 0.031906407870247275, "grad_norm": 0.06343540549278259, "learning_rate": 1.2340425531914895e-05, "loss": 0.7662019729614258, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15158, "step": 30, "tokens/total": 7864320, "tokens/train_per_sec_per_gpu": 228.13, "tokens/trainable": 652766 }, { "epoch": 0.03296995479925552, "grad_norm": 0.056531310081481934, "learning_rate": 1.2765957446808513e-05, "loss": 0.8442375659942627, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.3262, "step": 31, "tokens/total": 8126464, "tokens/train_per_sec_per_gpu": 190.64, "tokens/trainable": 674612 }, { "epoch": 0.03403350172826376, "grad_norm": 0.06304491311311722, "learning_rate": 1.3191489361702127e-05, "loss": 0.8129785060882568, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.25461, "step": 32, "tokens/total": 8388608, "tokens/train_per_sec_per_gpu": 214.77, "tokens/trainable": 695779 }, { "epoch": 0.035097048657272, "grad_norm": 0.05908438190817833, "learning_rate": 1.3617021276595745e-05, "loss": 0.7475928068161011, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11191, "step": 33, "tokens/total": 8650752, "tokens/train_per_sec_per_gpu": 177.7, "tokens/trainable": 716063 }, { "epoch": 0.03616059558628024, "grad_norm": 0.049326092004776, "learning_rate": 1.4042553191489363e-05, "loss": 0.7267792224884033, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.06841, "step": 34, "tokens/total": 8912896, "tokens/train_per_sec_per_gpu": 201.94, "tokens/trainable": 737753 }, { "epoch": 0.03722414251528849, "grad_norm": 0.04339035972952843, "learning_rate": 1.4468085106382981e-05, "loss": 0.8321285247802734, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.29821, "step": 35, "tokens/total": 9175040, "tokens/train_per_sec_per_gpu": 238.51, "tokens/trainable": 761270 }, { "epoch": 0.03828768944429673, "grad_norm": 0.040435630828142166, "learning_rate": 1.4893617021276596e-05, "loss": 0.7482062578201294, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11321, "step": 36, "tokens/total": 9437184, "tokens/train_per_sec_per_gpu": 235.13, "tokens/trainable": 783478 }, { "epoch": 0.039351236373304975, "grad_norm": 0.04054463654756546, "learning_rate": 1.5319148936170214e-05, "loss": 0.7387034296989441, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09322, "step": 37, "tokens/total": 9699328, "tokens/train_per_sec_per_gpu": 173.41, "tokens/trainable": 805224 }, { "epoch": 0.04041478330231321, "grad_norm": 0.03981109336018562, "learning_rate": 1.5744680851063832e-05, "loss": 0.7435116767883301, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10331, "step": 38, "tokens/total": 9961472, "tokens/train_per_sec_per_gpu": 185.32, "tokens/trainable": 828977 }, { "epoch": 0.041478330231321456, "grad_norm": 0.04639929160475731, "learning_rate": 1.6170212765957446e-05, "loss": 0.8203743696212769, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.27135, "step": 39, "tokens/total": 10223616, "tokens/train_per_sec_per_gpu": 183.52, "tokens/trainable": 850008 }, { "epoch": 0.0425418771603297, "grad_norm": 0.04261818155646324, "learning_rate": 1.6595744680851064e-05, "loss": 0.8682478070259094, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.38273, "step": 40, "tokens/total": 10485760, "tokens/train_per_sec_per_gpu": 219.05, "tokens/trainable": 872900 }, { "epoch": 0.043605424089337944, "grad_norm": 0.04111519455909729, "learning_rate": 1.7021276595744682e-05, "loss": 0.7811744213104248, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.18404, "step": 41, "tokens/total": 10747904, "tokens/train_per_sec_per_gpu": 204.48, "tokens/trainable": 895505 }, { "epoch": 0.04466897101834619, "grad_norm": 0.03414495289325714, "learning_rate": 1.74468085106383e-05, "loss": 0.8004995584487915, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.22665, "step": 42, "tokens/total": 11010048, "tokens/train_per_sec_per_gpu": 198.04, "tokens/trainable": 917814 }, { "epoch": 0.045732517947354424, "grad_norm": 0.030925795435905457, "learning_rate": 1.7872340425531915e-05, "loss": 0.7756137251853943, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.17192, "step": 43, "tokens/total": 11272192, "tokens/train_per_sec_per_gpu": 195.24, "tokens/trainable": 939811 }, { "epoch": 0.04679606487636267, "grad_norm": 0.026804521679878235, "learning_rate": 1.8297872340425533e-05, "loss": 0.6872485876083374, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 1.98824, "step": 44, "tokens/total": 11534336, "tokens/train_per_sec_per_gpu": 228.04, "tokens/trainable": 963796 }, { "epoch": 0.04785961180537091, "grad_norm": 0.02924325503408909, "learning_rate": 1.872340425531915e-05, "loss": 0.7919371128082275, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.20767, "step": 45, "tokens/total": 11796480, "tokens/train_per_sec_per_gpu": 193.49, "tokens/trainable": 984430 }, { "epoch": 0.048923158734379156, "grad_norm": 0.030018026009202003, "learning_rate": 1.914893617021277e-05, "loss": 0.7972186803817749, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.21936, "step": 46, "tokens/total": 12058624, "tokens/train_per_sec_per_gpu": 180.38, "tokens/trainable": 1004277 }, { "epoch": 0.0499867056633874, "grad_norm": 0.030266476795077324, "learning_rate": 1.9574468085106384e-05, "loss": 0.7901904582977295, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.20382, "step": 47, "tokens/total": 12320768, "tokens/train_per_sec_per_gpu": 185.52, "tokens/trainable": 1026347 }, { "epoch": 0.05105025259239564, "grad_norm": 0.028692839667201042, "learning_rate": 2e-05, "loss": 0.7853357791900635, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19314, "step": 48, "tokens/total": 12582912, "tokens/train_per_sec_per_gpu": 167.93, "tokens/trainable": 1048721 }, { "epoch": 0.05211379952140388, "grad_norm": 0.026348290964961052, "learning_rate": 2.0425531914893616e-05, "loss": 0.8077329397201538, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.24282, "step": 49, "tokens/total": 12845056, "tokens/train_per_sec_per_gpu": 185.28, "tokens/trainable": 1071780 }, { "epoch": 0.053177346450412125, "grad_norm": 0.029625559225678444, "learning_rate": 2.0851063829787238e-05, "loss": 0.7733415365219116, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.167, "step": 50, "tokens/total": 13107200, "tokens/train_per_sec_per_gpu": 189.79, "tokens/trainable": 1091760 }, { "epoch": 0.05424089337942037, "grad_norm": 0.027405593544244766, "learning_rate": 2.1276595744680852e-05, "loss": 0.7173340916633606, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.04896, "step": 51, "tokens/total": 13369344, "tokens/train_per_sec_per_gpu": 188.87, "tokens/trainable": 1113687 }, { "epoch": 0.05530444030842861, "grad_norm": 0.02946804091334343, "learning_rate": 2.1702127659574467e-05, "loss": 0.7727050185203552, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.16562, "step": 52, "tokens/total": 13631488, "tokens/train_per_sec_per_gpu": 209.22, "tokens/trainable": 1135391 }, { "epoch": 0.05636798723743685, "grad_norm": 0.02892529033124447, "learning_rate": 2.2127659574468088e-05, "loss": 0.8007056713104248, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.22711, "step": 53, "tokens/total": 13893632, "tokens/train_per_sec_per_gpu": 202.13, "tokens/trainable": 1159518 }, { "epoch": 0.05743153416644509, "grad_norm": 0.031362369656562805, "learning_rate": 2.2553191489361703e-05, "loss": 0.7692879438400269, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15823, "step": 54, "tokens/total": 14155776, "tokens/train_per_sec_per_gpu": 151.27, "tokens/trainable": 1178964 }, { "epoch": 0.05849508109545334, "grad_norm": 0.027873003855347633, "learning_rate": 2.2978723404255324e-05, "loss": 0.7864505052566528, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19559, "step": 55, "tokens/total": 14417920, "tokens/train_per_sec_per_gpu": 213.4, "tokens/trainable": 1201830 }, { "epoch": 0.05955862802446158, "grad_norm": 0.030442189425230026, "learning_rate": 2.340425531914894e-05, "loss": 0.7171883583068848, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.04866, "step": 56, "tokens/total": 14680064, "tokens/train_per_sec_per_gpu": 163.06, "tokens/trainable": 1221040 }, { "epoch": 0.060622174953469825, "grad_norm": 0.030432693660259247, "learning_rate": 2.3829787234042553e-05, "loss": 0.8123354911804199, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.25316, "step": 57, "tokens/total": 14942208, "tokens/train_per_sec_per_gpu": 191.04, "tokens/trainable": 1245037 }, { "epoch": 0.06168572188247806, "grad_norm": 0.030456526204943657, "learning_rate": 2.4255319148936175e-05, "loss": 0.7400133609771729, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09596, "step": 58, "tokens/total": 15204352, "tokens/train_per_sec_per_gpu": 173.09, "tokens/trainable": 1265410 }, { "epoch": 0.0627492688114863, "grad_norm": 0.028698932379484177, "learning_rate": 2.468085106382979e-05, "loss": 0.7431353330612183, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10252, "step": 59, "tokens/total": 15466496, "tokens/train_per_sec_per_gpu": 204.25, "tokens/trainable": 1287471 }, { "epoch": 0.06381281574049455, "grad_norm": 0.029805311933159828, "learning_rate": 2.5106382978723404e-05, "loss": 0.785997748374939, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.1946, "step": 60, "tokens/total": 15728640, "tokens/train_per_sec_per_gpu": 224.54, "tokens/trainable": 1309972 }, { "epoch": 0.0648763626695028, "grad_norm": 0.03344248980283737, "learning_rate": 2.5531914893617025e-05, "loss": 0.7236359119415283, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.06192, "step": 61, "tokens/total": 15990784, "tokens/train_per_sec_per_gpu": 162.36, "tokens/trainable": 1329223 }, { "epoch": 0.06593990959851104, "grad_norm": 0.03220194950699806, "learning_rate": 2.595744680851064e-05, "loss": 0.7307531833648682, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.07664, "step": 62, "tokens/total": 16252928, "tokens/train_per_sec_per_gpu": 161.66, "tokens/trainable": 1350124 }, { "epoch": 0.06700345652751928, "grad_norm": 0.032156504690647125, "learning_rate": 2.6382978723404255e-05, "loss": 0.8302059173583984, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.29379, "step": 63, "tokens/total": 16515072, "tokens/train_per_sec_per_gpu": 174.14, "tokens/trainable": 1372459 }, { "epoch": 0.06806700345652753, "grad_norm": 0.031544484198093414, "learning_rate": 2.6808510638297876e-05, "loss": 0.8317389488220215, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.29731, "step": 64, "tokens/total": 16777216, "tokens/train_per_sec_per_gpu": 203.61, "tokens/trainable": 1394135 }, { "epoch": 0.06913055038553576, "grad_norm": 0.028723040595650673, "learning_rate": 2.723404255319149e-05, "loss": 0.7596557140350342, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13754, "step": 65, "tokens/total": 17039360, "tokens/train_per_sec_per_gpu": 175.65, "tokens/trainable": 1416653 }, { "epoch": 0.070194097314544, "grad_norm": 0.03393164649605751, "learning_rate": 2.7659574468085112e-05, "loss": 0.8652482032775879, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.3756, "step": 66, "tokens/total": 17301504, "tokens/train_per_sec_per_gpu": 195.94, "tokens/trainable": 1437390 }, { "epoch": 0.07125764424355224, "grad_norm": 0.030688602477312088, "learning_rate": 2.8085106382978727e-05, "loss": 0.7935420274734497, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.21121, "step": 67, "tokens/total": 17563648, "tokens/train_per_sec_per_gpu": 220.73, "tokens/trainable": 1461784 }, { "epoch": 0.07232119117256049, "grad_norm": 0.03269756957888603, "learning_rate": 2.851063829787234e-05, "loss": 0.7466378211975098, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10989, "step": 68, "tokens/total": 17825792, "tokens/train_per_sec_per_gpu": 172.64, "tokens/trainable": 1482296 }, { "epoch": 0.07338473810156873, "grad_norm": 0.03261660039424896, "learning_rate": 2.8936170212765963e-05, "loss": 0.7684181928634644, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15635, "step": 69, "tokens/total": 18087936, "tokens/train_per_sec_per_gpu": 153.82, "tokens/trainable": 1503447 }, { "epoch": 0.07444828503057697, "grad_norm": 0.029810158535838127, "learning_rate": 2.9361702127659577e-05, "loss": 0.745851993560791, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10824, "step": 70, "tokens/total": 18350080, "tokens/train_per_sec_per_gpu": 176.59, "tokens/trainable": 1526617 }, { "epoch": 0.07551183195958522, "grad_norm": 0.03327067568898201, "learning_rate": 2.9787234042553192e-05, "loss": 0.8320407867431641, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.298, "step": 71, "tokens/total": 18612224, "tokens/train_per_sec_per_gpu": 172.54, "tokens/trainable": 1549503 }, { "epoch": 0.07657537888859346, "grad_norm": 0.030294055119156837, "learning_rate": 3.0212765957446813e-05, "loss": 0.7923359870910645, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.20855, "step": 72, "tokens/total": 18874368, "tokens/train_per_sec_per_gpu": 186.06, "tokens/trainable": 1572991 }, { "epoch": 0.0776389258176017, "grad_norm": 0.03210108354687691, "learning_rate": 3.063829787234043e-05, "loss": 0.746250569820404, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10908, "step": 73, "tokens/total": 19136512, "tokens/train_per_sec_per_gpu": 140.36, "tokens/trainable": 1594140 }, { "epoch": 0.07870247274660995, "grad_norm": 0.030283037573099136, "learning_rate": 3.1063829787234046e-05, "loss": 0.7411618232727051, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09837, "step": 74, "tokens/total": 19398656, "tokens/train_per_sec_per_gpu": 208.45, "tokens/trainable": 1616672 }, { "epoch": 0.07976601967561818, "grad_norm": 0.03764800727367401, "learning_rate": 3.1489361702127664e-05, "loss": 0.7435256242752075, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10334, "step": 75, "tokens/total": 19660800, "tokens/train_per_sec_per_gpu": 167.25, "tokens/trainable": 1635193 }, { "epoch": 0.08082956660462642, "grad_norm": 0.03491177409887314, "learning_rate": 3.191489361702128e-05, "loss": 0.7323366403579712, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.07993, "step": 76, "tokens/total": 19922944, "tokens/train_per_sec_per_gpu": 214.58, "tokens/trainable": 1656375 }, { "epoch": 0.08189311353363467, "grad_norm": 0.03278028592467308, "learning_rate": 3.234042553191489e-05, "loss": 0.7500340938568115, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11707, "step": 77, "tokens/total": 20185088, "tokens/train_per_sec_per_gpu": 187.04, "tokens/trainable": 1678155 }, { "epoch": 0.08295666046264291, "grad_norm": 0.032096248120069504, "learning_rate": 3.276595744680851e-05, "loss": 0.7423413991928101, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10085, "step": 78, "tokens/total": 20447232, "tokens/train_per_sec_per_gpu": 209.49, "tokens/trainable": 1701373 }, { "epoch": 0.08402020739165116, "grad_norm": 0.03354285657405853, "learning_rate": 3.319148936170213e-05, "loss": 0.8002707958221436, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.22614, "step": 79, "tokens/total": 20709376, "tokens/train_per_sec_per_gpu": 178.24, "tokens/trainable": 1721916 }, { "epoch": 0.0850837543206594, "grad_norm": 0.03523889556527138, "learning_rate": 3.361702127659575e-05, "loss": 0.7944124937057495, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.21314, "step": 80, "tokens/total": 20971520, "tokens/train_per_sec_per_gpu": 221.61, "tokens/trainable": 1744950 }, { "epoch": 0.08614730124966764, "grad_norm": 0.03261874243617058, "learning_rate": 3.4042553191489365e-05, "loss": 0.7720386981964111, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.16417, "step": 81, "tokens/total": 21233664, "tokens/train_per_sec_per_gpu": 167.21, "tokens/trainable": 1765422 }, { "epoch": 0.08721084817867589, "grad_norm": 0.033395156264305115, "learning_rate": 3.446808510638298e-05, "loss": 0.779296875, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.17994, "step": 82, "tokens/total": 21495808, "tokens/train_per_sec_per_gpu": 217.82, "tokens/trainable": 1787402 }, { "epoch": 0.08827439510768413, "grad_norm": 0.032813675701618195, "learning_rate": 3.48936170212766e-05, "loss": 0.7013646364212036, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.0165, "step": 83, "tokens/total": 21757952, "tokens/train_per_sec_per_gpu": 165.9, "tokens/trainable": 1807444 }, { "epoch": 0.08933794203669237, "grad_norm": 0.035501375794410706, "learning_rate": 3.531914893617022e-05, "loss": 0.7295072674751282, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.07406, "step": 84, "tokens/total": 22020096, "tokens/train_per_sec_per_gpu": 182.67, "tokens/trainable": 1827505 }, { "epoch": 0.0904014889657006, "grad_norm": 0.0353703536093235, "learning_rate": 3.574468085106383e-05, "loss": 0.7775543928146362, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.17614, "step": 85, "tokens/total": 22282240, "tokens/train_per_sec_per_gpu": 188.9, "tokens/trainable": 1847467 }, { "epoch": 0.09146503589470885, "grad_norm": 0.03491484373807907, "learning_rate": 3.617021276595745e-05, "loss": 0.8319449424743652, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.29778, "step": 86, "tokens/total": 22544384, "tokens/train_per_sec_per_gpu": 214.28, "tokens/trainable": 1868200 }, { "epoch": 0.09252858282371709, "grad_norm": 0.032434333115816116, "learning_rate": 3.6595744680851066e-05, "loss": 0.8519909977912903, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.34431, "step": 87, "tokens/total": 22806528, "tokens/train_per_sec_per_gpu": 259.84, "tokens/trainable": 1893531 }, { "epoch": 0.09359212975272534, "grad_norm": 0.04186626523733139, "learning_rate": 3.7021276595744684e-05, "loss": 0.8195874094963074, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.26956, "step": 88, "tokens/total": 23068672, "tokens/train_per_sec_per_gpu": 178.33, "tokens/trainable": 1914870 }, { "epoch": 0.09465567668173358, "grad_norm": 0.03298460692167282, "learning_rate": 3.74468085106383e-05, "loss": 0.7469631433486938, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11058, "step": 89, "tokens/total": 23330816, "tokens/train_per_sec_per_gpu": 200.02, "tokens/trainable": 1937963 }, { "epoch": 0.09571922361074182, "grad_norm": 0.03386974707245827, "learning_rate": 3.787234042553192e-05, "loss": 0.7484230399131775, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11366, "step": 90, "tokens/total": 23592960, "tokens/train_per_sec_per_gpu": 193.44, "tokens/trainable": 1961198 }, { "epoch": 0.09678277053975007, "grad_norm": 0.039303258061409, "learning_rate": 3.829787234042554e-05, "loss": 0.75224769115448, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12176, "step": 91, "tokens/total": 23855104, "tokens/train_per_sec_per_gpu": 176.98, "tokens/trainable": 1982479 }, { "epoch": 0.09784631746875831, "grad_norm": 0.03529525175690651, "learning_rate": 3.872340425531915e-05, "loss": 0.7571150064468384, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13212, "step": 92, "tokens/total": 24117248, "tokens/train_per_sec_per_gpu": 227.43, "tokens/trainable": 2004994 }, { "epoch": 0.09890986439776656, "grad_norm": 0.034970078617334366, "learning_rate": 3.914893617021277e-05, "loss": 0.7881733179092407, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19938, "step": 93, "tokens/total": 24379392, "tokens/train_per_sec_per_gpu": 180.97, "tokens/trainable": 2028105 }, { "epoch": 0.0999734113267748, "grad_norm": 0.036846473813056946, "learning_rate": 3.9574468085106385e-05, "loss": 0.7719080448150635, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.16389, "step": 94, "tokens/total": 24641536, "tokens/train_per_sec_per_gpu": 186.35, "tokens/trainable": 2051020 }, { "epoch": 0.10103695825578303, "grad_norm": 0.03919777274131775, "learning_rate": 4e-05, "loss": 0.7920703887939453, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.20796, "step": 95, "tokens/total": 24903680, "tokens/train_per_sec_per_gpu": 147.2, "tokens/trainable": 2070277 }, { "epoch": 0.10210050518479127, "grad_norm": 0.03782414644956589, "learning_rate": 3.9999862427247416e-05, "loss": 0.8352775573730469, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.30545, "step": 96, "tokens/total": 25165824, "tokens/train_per_sec_per_gpu": 181.92, "tokens/trainable": 2090839 }, { "epoch": 0.10316405211379952, "grad_norm": 0.03704574331641197, "learning_rate": 3.999944971088228e-05, "loss": 0.761550784111023, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.14159, "step": 97, "tokens/total": 25427968, "tokens/train_per_sec_per_gpu": 206.15, "tokens/trainable": 2112256 }, { "epoch": 0.10422759904280776, "grad_norm": 0.03385859355330467, "learning_rate": 3.999876185658244e-05, "loss": 0.7179139852523804, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.05015, "step": 98, "tokens/total": 25690112, "tokens/train_per_sec_per_gpu": 187.02, "tokens/trainable": 2136667 }, { "epoch": 0.105291145971816, "grad_norm": 0.03678734600543976, "learning_rate": 3.99977988738109e-05, "loss": 0.7438210248947144, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10396, "step": 99, "tokens/total": 25952256, "tokens/train_per_sec_per_gpu": 178.09, "tokens/trainable": 2157962 }, { "epoch": 0.10635469290082425, "grad_norm": 0.03856838122010231, "learning_rate": 3.999656077581569e-05, "loss": 0.7466105222702026, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10984, "step": 100, "tokens/total": 26214400, "tokens/train_per_sec_per_gpu": 184.35, "tokens/trainable": 2178396 }, { "epoch": 0.1074182398298325, "grad_norm": 0.03785452991724014, "learning_rate": 3.9995047579629654e-05, "loss": 0.7475836873054504, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11189, "step": 101, "tokens/total": 26476544, "tokens/train_per_sec_per_gpu": 201.31, "tokens/trainable": 2201039 }, { "epoch": 0.10848178675884074, "grad_norm": 0.03467912971973419, "learning_rate": 3.9993259306070256e-05, "loss": 0.7737405300140381, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.16786, "step": 102, "tokens/total": 26738688, "tokens/train_per_sec_per_gpu": 185.99, "tokens/trainable": 2223019 }, { "epoch": 0.10954533368784898, "grad_norm": 0.04308745265007019, "learning_rate": 3.999119597973925e-05, "loss": 0.8207772970199585, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.27227, "step": 103, "tokens/total": 27000832, "tokens/train_per_sec_per_gpu": 147.51, "tokens/trainable": 2242785 }, { "epoch": 0.11060888061685722, "grad_norm": 0.036378778517246246, "learning_rate": 3.998885762902241e-05, "loss": 0.7338327169418335, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.08305, "step": 104, "tokens/total": 27262976, "tokens/train_per_sec_per_gpu": 179.24, "tokens/trainable": 2264578 }, { "epoch": 0.11167242754586545, "grad_norm": 0.04125402122735977, "learning_rate": 3.998624428608906e-05, "loss": 0.8683584332466125, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.383, "step": 105, "tokens/total": 27525120, "tokens/train_per_sec_per_gpu": 195.93, "tokens/trainable": 2285624 }, { "epoch": 0.1127359744748737, "grad_norm": 0.03740216791629791, "learning_rate": 3.9983355986891664e-05, "loss": 0.7756333947181702, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.17197, "step": 106, "tokens/total": 27787264, "tokens/train_per_sec_per_gpu": 246.08, "tokens/trainable": 2308345 }, { "epoch": 0.11379952140388194, "grad_norm": 0.04036470502614975, "learning_rate": 3.9980192771165364e-05, "loss": 0.7976692914962769, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.22036, "step": 107, "tokens/total": 28049408, "tokens/train_per_sec_per_gpu": 215.98, "tokens/trainable": 2328867 }, { "epoch": 0.11486306833289019, "grad_norm": 0.03836773335933685, "learning_rate": 3.997675468242738e-05, "loss": 0.7081190943717957, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.03017, "step": 108, "tokens/total": 28311552, "tokens/train_per_sec_per_gpu": 191.07, "tokens/trainable": 2349916 }, { "epoch": 0.11592661526189843, "grad_norm": 0.035974569618701935, "learning_rate": 3.9973041767976466e-05, "loss": 0.7658606767654419, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15084, "step": 109, "tokens/total": 28573696, "tokens/train_per_sec_per_gpu": 200.69, "tokens/trainable": 2374197 }, { "epoch": 0.11699016219090667, "grad_norm": 0.0417025052011013, "learning_rate": 3.9969054078892185e-05, "loss": 0.8230124711990356, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.27735, "step": 110, "tokens/total": 28835840, "tokens/train_per_sec_per_gpu": 195.73, "tokens/trainable": 2395345 }, { "epoch": 0.11805370911991492, "grad_norm": 0.03637392073869705, "learning_rate": 3.996479167003428e-05, "loss": 0.7655156254768372, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.1501, "step": 111, "tokens/total": 29097984, "tokens/train_per_sec_per_gpu": 214.49, "tokens/trainable": 2418145 }, { "epoch": 0.11911725604892316, "grad_norm": 0.04198000580072403, "learning_rate": 3.996025460004189e-05, "loss": 0.8185654878616333, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.26725, "step": 112, "tokens/total": 29360128, "tokens/train_per_sec_per_gpu": 236.86, "tokens/trainable": 2439031 }, { "epoch": 0.1201808029779314, "grad_norm": 0.041592370718717575, "learning_rate": 3.995544293133273e-05, "loss": 0.8015573024749756, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.22901, "step": 113, "tokens/total": 29622272, "tokens/train_per_sec_per_gpu": 186.73, "tokens/trainable": 2458819 }, { "epoch": 0.12124434990693965, "grad_norm": 0.04079896956682205, "learning_rate": 3.995035673010225e-05, "loss": 0.7219120264053345, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.05837, "step": 114, "tokens/total": 29884416, "tokens/train_per_sec_per_gpu": 184.15, "tokens/trainable": 2480736 }, { "epoch": 0.12230789683594788, "grad_norm": 0.041768353432416916, "learning_rate": 3.994499606632272e-05, "loss": 0.8270866870880127, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.28665, "step": 115, "tokens/total": 30146560, "tokens/train_per_sec_per_gpu": 193.64, "tokens/trainable": 2504003 }, { "epoch": 0.12337144376495612, "grad_norm": 0.04556523263454437, "learning_rate": 3.9939361013742275e-05, "loss": 0.7384425401687622, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09267, "step": 116, "tokens/total": 30408704, "tokens/train_per_sec_per_gpu": 213.94, "tokens/trainable": 2524722 }, { "epoch": 0.12443499069396437, "grad_norm": 0.03868886083364487, "learning_rate": 3.9933451649883866e-05, "loss": 0.709857165813446, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.0337, "step": 117, "tokens/total": 30670848, "tokens/train_per_sec_per_gpu": 231.95, "tokens/trainable": 2546913 }, { "epoch": 0.1254985376229726, "grad_norm": 0.04056168347597122, "learning_rate": 3.9927268056044266e-05, "loss": 0.7398765087127686, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09568, "step": 118, "tokens/total": 30932992, "tokens/train_per_sec_per_gpu": 183.97, "tokens/trainable": 2568114 }, { "epoch": 0.12656208455198087, "grad_norm": 0.04197125881910324, "learning_rate": 3.992081031729285e-05, "loss": 0.7923115491867065, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.2085, "step": 119, "tokens/total": 31195136, "tokens/train_per_sec_per_gpu": 199.13, "tokens/trainable": 2588738 }, { "epoch": 0.1276256314809891, "grad_norm": 0.037061259150505066, "learning_rate": 3.9914078522470526e-05, "loss": 0.8101846575737, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.24832, "step": 120, "tokens/total": 31457280, "tokens/train_per_sec_per_gpu": 193.31, "tokens/trainable": 2613736 }, { "epoch": 0.12868917840999733, "grad_norm": 0.04570484906435013, "learning_rate": 3.9907072764188435e-05, "loss": 0.7499140501022339, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11682, "step": 121, "tokens/total": 31719424, "tokens/train_per_sec_per_gpu": 180.47, "tokens/trainable": 2634223 }, { "epoch": 0.1297527253390056, "grad_norm": 0.04544052109122276, "learning_rate": 3.9899793138826736e-05, "loss": 0.8020647764205933, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.23014, "step": 122, "tokens/total": 31981568, "tokens/train_per_sec_per_gpu": 198.92, "tokens/trainable": 2655869 }, { "epoch": 0.13081627226801382, "grad_norm": 0.04469464346766472, "learning_rate": 3.989223974653323e-05, "loss": 0.7518518567085266, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12092, "step": 123, "tokens/total": 32243712, "tokens/train_per_sec_per_gpu": 208.31, "tokens/trainable": 2679088 }, { "epoch": 0.13187981919702207, "grad_norm": 0.0395895391702652, "learning_rate": 3.9884412691222016e-05, "loss": 0.7855230569839478, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19355, "step": 124, "tokens/total": 32505856, "tokens/train_per_sec_per_gpu": 195.72, "tokens/trainable": 2702201 }, { "epoch": 0.1329433661260303, "grad_norm": 0.04236849397420883, "learning_rate": 3.987631208057205e-05, "loss": 0.775454044342041, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.17158, "step": 125, "tokens/total": 32768000, "tokens/train_per_sec_per_gpu": 169.99, "tokens/trainable": 2724593 }, { "epoch": 0.13400691305503856, "grad_norm": 0.04403228312730789, "learning_rate": 3.986793802602566e-05, "loss": 0.7912722229957581, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.2062, "step": 126, "tokens/total": 33030144, "tokens/train_per_sec_per_gpu": 178.51, "tokens/trainable": 2747546 }, { "epoch": 0.1350704599840468, "grad_norm": 0.0415693037211895, "learning_rate": 3.985929064278701e-05, "loss": 0.803294837474823, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.23289, "step": 127, "tokens/total": 33292288, "tokens/train_per_sec_per_gpu": 199.75, "tokens/trainable": 2769876 }, { "epoch": 0.13613400691305505, "grad_norm": 0.043098073452711105, "learning_rate": 3.985037004982056e-05, "loss": 0.8380795121192932, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.31192, "step": 128, "tokens/total": 33554432, "tokens/train_per_sec_per_gpu": 197.32, "tokens/trainable": 2791990 }, { "epoch": 0.13719755384206328, "grad_norm": 0.04367615282535553, "learning_rate": 3.984117636984933e-05, "loss": 0.7381528615951538, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09207, "step": 129, "tokens/total": 33816576, "tokens/train_per_sec_per_gpu": 157.27, "tokens/trainable": 2812157 }, { "epoch": 0.1382611007710715, "grad_norm": 0.03902239724993706, "learning_rate": 3.983170972935333e-05, "loss": 0.6622740030288696, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 1.9392, "step": 130, "tokens/total": 34078720, "tokens/train_per_sec_per_gpu": 199.7, "tokens/trainable": 2834450 }, { "epoch": 0.13932464770007977, "grad_norm": 0.04401889070868492, "learning_rate": 3.982197025856772e-05, "loss": 0.8131764531135559, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.25506, "step": 131, "tokens/total": 34340864, "tokens/train_per_sec_per_gpu": 210.08, "tokens/trainable": 2857868 }, { "epoch": 0.140388194629088, "grad_norm": 0.055351078510284424, "learning_rate": 3.98119580914811e-05, "loss": 0.8648597002029419, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.37467, "step": 132, "tokens/total": 34603008, "tokens/train_per_sec_per_gpu": 164.9, "tokens/trainable": 2877451 }, { "epoch": 0.14145174155809626, "grad_norm": 0.04292495548725128, "learning_rate": 3.980167336583359e-05, "loss": 0.8202415108680725, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.27105, "step": 133, "tokens/total": 34865152, "tokens/train_per_sec_per_gpu": 179.62, "tokens/trainable": 2898969 }, { "epoch": 0.14251528848710449, "grad_norm": 0.039902154356241226, "learning_rate": 3.979111622311501e-05, "loss": 0.7659401893615723, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15102, "step": 134, "tokens/total": 35127296, "tokens/train_per_sec_per_gpu": 204.35, "tokens/trainable": 2922244 }, { "epoch": 0.14357883541611274, "grad_norm": 0.04879293963313103, "learning_rate": 3.978028680856286e-05, "loss": 0.7666274309158325, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15249, "step": 135, "tokens/total": 35389440, "tokens/train_per_sec_per_gpu": 203.21, "tokens/trainable": 2941098 }, { "epoch": 0.14464238234512097, "grad_norm": 0.04211945831775665, "learning_rate": 3.97691852711604e-05, "loss": 0.7596578001976013, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13754, "step": 136, "tokens/total": 35651584, "tokens/train_per_sec_per_gpu": 182.99, "tokens/trainable": 2962728 }, { "epoch": 0.14570592927412923, "grad_norm": 0.05592913180589676, "learning_rate": 3.975781176363451e-05, "loss": 0.8827542066574097, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.41755, "step": 137, "tokens/total": 35913728, "tokens/train_per_sec_per_gpu": 167.07, "tokens/trainable": 2983012 }, { "epoch": 0.14676947620313746, "grad_norm": 0.042312368750572205, "learning_rate": 3.9746166442453667e-05, "loss": 0.7679699659347534, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15539, "step": 138, "tokens/total": 36175872, "tokens/train_per_sec_per_gpu": 173.16, "tokens/trainable": 3005801 }, { "epoch": 0.14783302313214572, "grad_norm": 0.043086566030979156, "learning_rate": 3.973424946782578e-05, "loss": 0.7419267892837524, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09998, "step": 139, "tokens/total": 36438016, "tokens/train_per_sec_per_gpu": 179.27, "tokens/trainable": 3027970 }, { "epoch": 0.14889657006115395, "grad_norm": 0.044038690626621246, "learning_rate": 3.972206100369594e-05, "loss": 0.8171659111976624, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.26407, "step": 140, "tokens/total": 36700160, "tokens/train_per_sec_per_gpu": 156.51, "tokens/trainable": 3051271 }, { "epoch": 0.14996011699016218, "grad_norm": 0.04058285430073738, "learning_rate": 3.970960121774419e-05, "loss": 0.8079518675804138, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.24331, "step": 141, "tokens/total": 36962304, "tokens/train_per_sec_per_gpu": 167.22, "tokens/trainable": 3074209 }, { "epoch": 0.15102366391917044, "grad_norm": 0.04522034898400307, "learning_rate": 3.9696870281383255e-05, "loss": 0.8723236322402954, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.39246, "step": 142, "tokens/total": 37224448, "tokens/train_per_sec_per_gpu": 158.71, "tokens/trainable": 3095651 }, { "epoch": 0.15208721084817867, "grad_norm": 0.04753715172410011, "learning_rate": 3.968386836975611e-05, "loss": 0.7392692565917969, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.0944, "step": 143, "tokens/total": 37486592, "tokens/train_per_sec_per_gpu": 189.77, "tokens/trainable": 3118220 }, { "epoch": 0.15315075777718692, "grad_norm": 0.045635782182216644, "learning_rate": 3.9670595661733654e-05, "loss": 0.8149927854537964, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.25916, "step": 144, "tokens/total": 37748736, "tokens/train_per_sec_per_gpu": 209.61, "tokens/trainable": 3143149 }, { "epoch": 0.15421430470619515, "grad_norm": 0.049446720629930496, "learning_rate": 3.9657052339912166e-05, "loss": 0.8048349022865295, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.23633, "step": 145, "tokens/total": 38010880, "tokens/train_per_sec_per_gpu": 200.16, "tokens/trainable": 3164817 }, { "epoch": 0.1552778516352034, "grad_norm": 0.04811964929103851, "learning_rate": 3.9643238590610864e-05, "loss": 0.7713128328323364, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.1626, "step": 146, "tokens/total": 38273024, "tokens/train_per_sec_per_gpu": 210.98, "tokens/trainable": 3185683 }, { "epoch": 0.15634139856421164, "grad_norm": 0.04874229058623314, "learning_rate": 3.9629154603869294e-05, "loss": 0.7867254614830017, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19619, "step": 147, "tokens/total": 38535168, "tokens/train_per_sec_per_gpu": 161.76, "tokens/trainable": 3207493 }, { "epoch": 0.1574049454932199, "grad_norm": 0.04509029909968376, "learning_rate": 3.961480057344474e-05, "loss": 0.8230168223381042, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.27736, "step": 148, "tokens/total": 38797312, "tokens/train_per_sec_per_gpu": 165.98, "tokens/trainable": 3227269 }, { "epoch": 0.15846849242222813, "grad_norm": 0.048180241137742996, "learning_rate": 3.9600176696809555e-05, "loss": 0.7925543785095215, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.20903, "step": 149, "tokens/total": 39059456, "tokens/train_per_sec_per_gpu": 178.87, "tokens/trainable": 3247693 }, { "epoch": 0.15953203935123636, "grad_norm": 0.044965874403715134, "learning_rate": 3.9585283175148425e-05, "loss": 0.7305552959442139, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.07623, "step": 150, "tokens/total": 39321600, "tokens/train_per_sec_per_gpu": 185.73, "tokens/trainable": 3271323 }, { "epoch": 0.15953203935123636, "eval_loss": 0.7756121754646301, "eval_ppl": 2.17192, "eval_runtime": 237.3162, "eval_samples_per_second": 28.173, "eval_steps_per_second": 1.761, "memory/device_reserved (GiB)": 51.31, "memory/max_active (GiB)": 38.19, "memory/max_allocated (GiB)": 38.19, "step": 150 }, { "epoch": 0.16059558628024462, "grad_norm": 0.05009883642196655, "learning_rate": 3.9570120213355636e-05, "loss": 0.7295466661453247, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.07414, "step": 151, "tokens/total": 39583744, "tokens/train_per_sec_per_gpu": 212.53, "tokens/trainable": 3293376 }, { "epoch": 0.16165913320925285, "grad_norm": 0.05454389378428459, "learning_rate": 3.955468802003222e-05, "loss": 0.8171148300170898, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.26396, "step": 152, "tokens/total": 39845888, "tokens/train_per_sec_per_gpu": 160.49, "tokens/trainable": 3314003 }, { "epoch": 0.1627226801382611, "grad_norm": 0.05227701738476753, "learning_rate": 3.953898680748311e-05, "loss": 0.8315908908843994, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.29697, "step": 153, "tokens/total": 40108032, "tokens/train_per_sec_per_gpu": 178.92, "tokens/trainable": 3335783 }, { "epoch": 0.16378622706726934, "grad_norm": 0.04624287411570549, "learning_rate": 3.952301679171421e-05, "loss": 0.7561501860618591, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13006, "step": 154, "tokens/total": 40370176, "tokens/train_per_sec_per_gpu": 164.87, "tokens/trainable": 3355886 }, { "epoch": 0.1648497739962776, "grad_norm": 0.04997319355607033, "learning_rate": 3.950677819242943e-05, "loss": 0.788512110710144, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.20012, "step": 155, "tokens/total": 40632320, "tokens/train_per_sec_per_gpu": 188.8, "tokens/trainable": 3378024 }, { "epoch": 0.16591332092528582, "grad_norm": 0.05008501932024956, "learning_rate": 3.949027123302764e-05, "loss": 0.8327994346618652, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.29975, "step": 156, "tokens/total": 40894464, "tokens/train_per_sec_per_gpu": 163.0, "tokens/trainable": 3399925 }, { "epoch": 0.16697686785429408, "grad_norm": 0.05233265459537506, "learning_rate": 3.9473496140599626e-05, "loss": 0.8238826394081116, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.27933, "step": 157, "tokens/total": 41156608, "tokens/train_per_sec_per_gpu": 182.71, "tokens/trainable": 3420640 }, { "epoch": 0.1680404147833023, "grad_norm": 0.05217234417796135, "learning_rate": 3.945645314592495e-05, "loss": 0.7473776340484619, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11146, "step": 158, "tokens/total": 41418752, "tokens/train_per_sec_per_gpu": 223.76, "tokens/trainable": 3442988 }, { "epoch": 0.16910396171231057, "grad_norm": 0.04567525163292885, "learning_rate": 3.943914248346879e-05, "loss": 0.7227488160133362, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.06009, "step": 159, "tokens/total": 41680896, "tokens/train_per_sec_per_gpu": 177.03, "tokens/trainable": 3464101 }, { "epoch": 0.1701675086413188, "grad_norm": 0.05548242852091789, "learning_rate": 3.9421564391378685e-05, "loss": 0.7955631017684937, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.21569, "step": 160, "tokens/total": 41943040, "tokens/train_per_sec_per_gpu": 152.75, "tokens/trainable": 3483419 }, { "epoch": 0.17123105557032703, "grad_norm": 0.04398762434720993, "learning_rate": 3.9403719111481295e-05, "loss": 0.7600826025009155, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13845, "step": 161, "tokens/total": 42205184, "tokens/train_per_sec_per_gpu": 205.76, "tokens/trainable": 3504602 }, { "epoch": 0.1722946024993353, "grad_norm": 0.04506729915738106, "learning_rate": 3.9385606889279035e-05, "loss": 0.7480685114860535, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11292, "step": 162, "tokens/total": 42467328, "tokens/train_per_sec_per_gpu": 227.39, "tokens/trainable": 3526406 }, { "epoch": 0.17335814942834352, "grad_norm": 0.044185835868120193, "learning_rate": 3.9367227973946745e-05, "loss": 0.7433359026908875, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10294, "step": 163, "tokens/total": 42729472, "tokens/train_per_sec_per_gpu": 182.07, "tokens/trainable": 3548313 }, { "epoch": 0.17442169635735177, "grad_norm": 0.05059230327606201, "learning_rate": 3.934858261832822e-05, "loss": 0.6835325956344604, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 1.98086, "step": 164, "tokens/total": 42991616, "tokens/train_per_sec_per_gpu": 169.05, "tokens/trainable": 3568190 }, { "epoch": 0.17548524328636, "grad_norm": 0.050207290798425674, "learning_rate": 3.932967107893274e-05, "loss": 0.7899980545043945, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.20339, "step": 165, "tokens/total": 43253760, "tokens/train_per_sec_per_gpu": 205.5, "tokens/trainable": 3588948 }, { "epoch": 0.17654879021536826, "grad_norm": 0.04827320948243141, "learning_rate": 3.931049361593157e-05, "loss": 0.7980469465255737, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.2212, "step": 166, "tokens/total": 43515904, "tokens/train_per_sec_per_gpu": 173.52, "tokens/trainable": 3609829 }, { "epoch": 0.1776123371443765, "grad_norm": 0.048085663467645645, "learning_rate": 3.9291050493154336e-05, "loss": 0.7630643844604492, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.14484, "step": 167, "tokens/total": 43778048, "tokens/train_per_sec_per_gpu": 200.45, "tokens/trainable": 3631205 }, { "epoch": 0.17867588407338475, "grad_norm": 0.05210770294070244, "learning_rate": 3.927134197808544e-05, "loss": 0.7417425513267517, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09959, "step": 168, "tokens/total": 44040192, "tokens/train_per_sec_per_gpu": 138.92, "tokens/trainable": 3651025 }, { "epoch": 0.17973943100239298, "grad_norm": 0.04291163384914398, "learning_rate": 3.9251368341860343e-05, "loss": 0.7509276866912842, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11896, "step": 169, "tokens/total": 44302336, "tokens/train_per_sec_per_gpu": 189.85, "tokens/trainable": 3674343 }, { "epoch": 0.1808029779314012, "grad_norm": 0.05076931044459343, "learning_rate": 3.923112985926185e-05, "loss": 0.7864252328872681, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19553, "step": 170, "tokens/total": 44564480, "tokens/train_per_sec_per_gpu": 173.58, "tokens/trainable": 3694079 }, { "epoch": 0.18186652486040947, "grad_norm": 0.046801142394542694, "learning_rate": 3.921062680871635e-05, "loss": 0.6877319812774658, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 1.9892, "step": 171, "tokens/total": 44826624, "tokens/train_per_sec_per_gpu": 176.69, "tokens/trainable": 3715203 }, { "epoch": 0.1829300717894177, "grad_norm": 0.05101482570171356, "learning_rate": 3.9189859472289956e-05, "loss": 0.800000786781311, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.22554, "step": 172, "tokens/total": 45088768, "tokens/train_per_sec_per_gpu": 150.34, "tokens/trainable": 3737632 }, { "epoch": 0.18399361871842596, "grad_norm": 0.04480605199933052, "learning_rate": 3.916882813568461e-05, "loss": 0.7518149614334106, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12085, "step": 173, "tokens/total": 45350912, "tokens/train_per_sec_per_gpu": 216.08, "tokens/trainable": 3761754 }, { "epoch": 0.18505716564743419, "grad_norm": 0.05060945823788643, "learning_rate": 3.914753308823422e-05, "loss": 0.8080068230628967, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.24343, "step": 174, "tokens/total": 45613056, "tokens/train_per_sec_per_gpu": 147.42, "tokens/trainable": 3784061 }, { "epoch": 0.18612071257644244, "grad_norm": 0.04640955105423927, "learning_rate": 3.9125974622900596e-05, "loss": 0.8282898664474487, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.2894, "step": 175, "tokens/total": 45875200, "tokens/train_per_sec_per_gpu": 208.37, "tokens/trainable": 3808216 }, { "epoch": 0.18718425950545067, "grad_norm": 0.04194442555308342, "learning_rate": 3.91041530362695e-05, "loss": 0.7835493087768555, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.18923, "step": 176, "tokens/total": 46137344, "tokens/train_per_sec_per_gpu": 198.59, "tokens/trainable": 3832481 }, { "epoch": 0.18824780643445893, "grad_norm": 0.049947503954172134, "learning_rate": 3.90820686285465e-05, "loss": 0.8008949756622314, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.22753, "step": 177, "tokens/total": 46399488, "tokens/train_per_sec_per_gpu": 151.75, "tokens/trainable": 3852693 }, { "epoch": 0.18931135336346716, "grad_norm": 0.04894804581999779, "learning_rate": 3.905972170355286e-05, "loss": 0.711793065071106, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.03764, "step": 178, "tokens/total": 46661632, "tokens/train_per_sec_per_gpu": 178.97, "tokens/trainable": 3873846 }, { "epoch": 0.19037490029247542, "grad_norm": 0.04731186851859093, "learning_rate": 3.903711256872139e-05, "loss": 0.7765140533447266, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.17388, "step": 179, "tokens/total": 46923776, "tokens/train_per_sec_per_gpu": 206.26, "tokens/trainable": 3898641 }, { "epoch": 0.19143844722148365, "grad_norm": 0.055336493998765945, "learning_rate": 3.901424153509218e-05, "loss": 0.8252753019332886, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.28251, "step": 180, "tokens/total": 47185920, "tokens/train_per_sec_per_gpu": 170.2, "tokens/trainable": 3919863 }, { "epoch": 0.19250199415049188, "grad_norm": 0.04609975218772888, "learning_rate": 3.899110891730834e-05, "loss": 0.7351381778717041, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.08577, "step": 181, "tokens/total": 47448064, "tokens/train_per_sec_per_gpu": 177.94, "tokens/trainable": 3942514 }, { "epoch": 0.19356554107950014, "grad_norm": 0.05186214670538902, "learning_rate": 3.896771503361165e-05, "loss": 0.7917764186859131, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.20731, "step": 182, "tokens/total": 47710208, "tokens/train_per_sec_per_gpu": 200.82, "tokens/trainable": 3964229 }, { "epoch": 0.19462908800850837, "grad_norm": 0.049349937587976456, "learning_rate": 3.8944060205838204e-05, "loss": 0.7680513858795166, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15556, "step": 183, "tokens/total": 47972352, "tokens/train_per_sec_per_gpu": 190.57, "tokens/trainable": 3983779 }, { "epoch": 0.19569263493751662, "grad_norm": 0.053863126784563065, "learning_rate": 3.892014475941399e-05, "loss": 0.7624801397323608, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.14359, "step": 184, "tokens/total": 48234496, "tokens/train_per_sec_per_gpu": 165.8, "tokens/trainable": 4004641 }, { "epoch": 0.19675618186652485, "grad_norm": 0.04809142276644707, "learning_rate": 3.8895969023350384e-05, "loss": 0.8235425353050232, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.27856, "step": 185, "tokens/total": 48496640, "tokens/train_per_sec_per_gpu": 198.93, "tokens/trainable": 4025071 }, { "epoch": 0.1978197287955331, "grad_norm": 0.04834391921758652, "learning_rate": 3.8871533330239646e-05, "loss": 0.7338411211967468, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.08307, "step": 186, "tokens/total": 48758784, "tokens/train_per_sec_per_gpu": 185.51, "tokens/trainable": 4048197 }, { "epoch": 0.19888327572454134, "grad_norm": 0.052105050534009933, "learning_rate": 3.884683801625032e-05, "loss": 0.7286123633384705, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.0722, "step": 187, "tokens/total": 49020928, "tokens/train_per_sec_per_gpu": 175.75, "tokens/trainable": 4069621 }, { "epoch": 0.1999468226535496, "grad_norm": 0.04482823610305786, "learning_rate": 3.8821883421122645e-05, "loss": 0.7568373680114746, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13152, "step": 188, "tokens/total": 49283072, "tokens/train_per_sec_per_gpu": 203.94, "tokens/trainable": 4093663 }, { "epoch": 0.20101036958255783, "grad_norm": 0.0523945651948452, "learning_rate": 3.879666988816386e-05, "loss": 0.755517840385437, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12871, "step": 189, "tokens/total": 49545216, "tokens/train_per_sec_per_gpu": 160.35, "tokens/trainable": 4113650 }, { "epoch": 0.20207391651156606, "grad_norm": 0.04911473020911217, "learning_rate": 3.877119776424347e-05, "loss": 0.7191125154495239, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.05261, "step": 190, "tokens/total": 49807360, "tokens/train_per_sec_per_gpu": 221.01, "tokens/trainable": 4136190 }, { "epoch": 0.20313746344057432, "grad_norm": 0.053953029215335846, "learning_rate": 3.8745467399788506e-05, "loss": 0.7884220480918884, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19992, "step": 191, "tokens/total": 50069504, "tokens/train_per_sec_per_gpu": 154.41, "tokens/trainable": 4158226 }, { "epoch": 0.20420101036958255, "grad_norm": 0.05675153061747551, "learning_rate": 3.871947914877866e-05, "loss": 0.7461360692977905, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10884, "step": 192, "tokens/total": 50331648, "tokens/train_per_sec_per_gpu": 179.47, "tokens/trainable": 4179916 }, { "epoch": 0.2052645572985908, "grad_norm": 0.05521610751748085, "learning_rate": 3.869323336874146e-05, "loss": 0.7196043729782104, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.05362, "step": 193, "tokens/total": 50593792, "tokens/train_per_sec_per_gpu": 201.18, "tokens/trainable": 4200779 }, { "epoch": 0.20632810422759904, "grad_norm": 0.056510064750909805, "learning_rate": 3.8666730420747336e-05, "loss": 0.8237625360488892, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.27906, "step": 194, "tokens/total": 50855936, "tokens/train_per_sec_per_gpu": 171.77, "tokens/trainable": 4222217 }, { "epoch": 0.2073916511566073, "grad_norm": 0.0467703752219677, "learning_rate": 3.863997066940463e-05, "loss": 0.7923108339309692, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.20849, "step": 195, "tokens/total": 51118080, "tokens/train_per_sec_per_gpu": 193.25, "tokens/trainable": 4244857 }, { "epoch": 0.20845519808561552, "grad_norm": 0.04992164671421051, "learning_rate": 3.8612954482854606e-05, "loss": 0.7622380256652832, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.14307, "step": 196, "tokens/total": 51380224, "tokens/train_per_sec_per_gpu": 216.77, "tokens/trainable": 4267572 }, { "epoch": 0.20951874501462378, "grad_norm": 0.049939945340156555, "learning_rate": 3.8585682232766385e-05, "loss": 0.7296082973480225, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.07427, "step": 197, "tokens/total": 51642368, "tokens/train_per_sec_per_gpu": 214.99, "tokens/trainable": 4290348 }, { "epoch": 0.210582291943632, "grad_norm": 0.05264829471707344, "learning_rate": 3.8558154294331807e-05, "loss": 0.7297487258911133, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.07456, "step": 198, "tokens/total": 51904512, "tokens/train_per_sec_per_gpu": 178.25, "tokens/trainable": 4310984 }, { "epoch": 0.21164583887264027, "grad_norm": 0.05678649619221687, "learning_rate": 3.853037104626031e-05, "loss": 0.8022236227989197, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.2305, "step": 199, "tokens/total": 52166656, "tokens/train_per_sec_per_gpu": 212.29, "tokens/trainable": 4332640 }, { "epoch": 0.2127093858016485, "grad_norm": 0.04627032205462456, "learning_rate": 3.8502332870773675e-05, "loss": 0.8206828832626343, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.27205, "step": 200, "tokens/total": 52428800, "tokens/train_per_sec_per_gpu": 173.49, "tokens/trainable": 4356101 }, { "epoch": 0.21377293273065673, "grad_norm": 0.04949687048792839, "learning_rate": 3.847404015360081e-05, "loss": 0.7801845073699951, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.18187, "step": 201, "tokens/total": 52690944, "tokens/train_per_sec_per_gpu": 202.37, "tokens/trainable": 4378733 }, { "epoch": 0.214836479659665, "grad_norm": 0.05199093371629715, "learning_rate": 3.8445493283972414e-05, "loss": 0.7552693486213684, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12818, "step": 202, "tokens/total": 52953088, "tokens/train_per_sec_per_gpu": 165.43, "tokens/trainable": 4401167 }, { "epoch": 0.21590002658867322, "grad_norm": 0.060018111020326614, "learning_rate": 3.841669265461562e-05, "loss": 0.8178205490112305, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.26556, "step": 203, "tokens/total": 53215232, "tokens/train_per_sec_per_gpu": 192.79, "tokens/trainable": 4421155 }, { "epoch": 0.21696357351768147, "grad_norm": 0.0454607792198658, "learning_rate": 3.838763866174862e-05, "loss": 0.798674464225769, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.22259, "step": 204, "tokens/total": 53477376, "tokens/train_per_sec_per_gpu": 181.78, "tokens/trainable": 4443402 }, { "epoch": 0.2180271204466897, "grad_norm": 0.05475523695349693, "learning_rate": 3.835833170507519e-05, "loss": 0.7501173615455627, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11725, "step": 205, "tokens/total": 53739520, "tokens/train_per_sec_per_gpu": 174.06, "tokens/trainable": 4465823 }, { "epoch": 0.21909066737569796, "grad_norm": 0.04717683792114258, "learning_rate": 3.8328772187779196e-05, "loss": 0.7843440771102905, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19097, "step": 206, "tokens/total": 54001664, "tokens/train_per_sec_per_gpu": 172.19, "tokens/trainable": 4490092 }, { "epoch": 0.2201542143047062, "grad_norm": 0.04792032018303871, "learning_rate": 3.829896051651907e-05, "loss": 0.7995268702507019, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.22449, "step": 207, "tokens/total": 54263808, "tokens/train_per_sec_per_gpu": 198.34, "tokens/trainable": 4513112 }, { "epoch": 0.22121776123371445, "grad_norm": 0.05625506862998009, "learning_rate": 3.8268897101422154e-05, "loss": 0.7814656496047974, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.18467, "step": 208, "tokens/total": 54525952, "tokens/train_per_sec_per_gpu": 178.5, "tokens/trainable": 4533887 }, { "epoch": 0.22228130816272268, "grad_norm": 0.05093343183398247, "learning_rate": 3.823858235607915e-05, "loss": 0.8371798396110535, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.30984, "step": 209, "tokens/total": 54788096, "tokens/train_per_sec_per_gpu": 179.89, "tokens/trainable": 4557864 }, { "epoch": 0.2233448550917309, "grad_norm": 0.051436666399240494, "learning_rate": 3.820801669753833e-05, "loss": 0.8444880247116089, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.32679, "step": 210, "tokens/total": 55050240, "tokens/train_per_sec_per_gpu": 208.06, "tokens/trainable": 4579529 }, { "epoch": 0.22440840202073917, "grad_norm": 0.046146344393491745, "learning_rate": 3.8177200546299894e-05, "loss": 0.7421606779098511, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10047, "step": 211, "tokens/total": 55312384, "tokens/train_per_sec_per_gpu": 198.81, "tokens/trainable": 4602512 }, { "epoch": 0.2254719489497474, "grad_norm": 0.06141940504312515, "learning_rate": 3.81461343263101e-05, "loss": 0.7741906642913818, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.16884, "step": 212, "tokens/total": 55574528, "tokens/train_per_sec_per_gpu": 196.43, "tokens/trainable": 4623604 }, { "epoch": 0.22653549587875565, "grad_norm": 0.061457011848688126, "learning_rate": 3.81148184649555e-05, "loss": 0.7711528539657593, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.16226, "step": 213, "tokens/total": 55836672, "tokens/train_per_sec_per_gpu": 148.28, "tokens/trainable": 4645354 }, { "epoch": 0.22759904280776388, "grad_norm": 0.05289280042052269, "learning_rate": 3.8083253393057006e-05, "loss": 0.7671029567718506, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15352, "step": 214, "tokens/total": 56098816, "tokens/train_per_sec_per_gpu": 198.49, "tokens/trainable": 4666565 }, { "epoch": 0.22866258973677214, "grad_norm": 0.05258488655090332, "learning_rate": 3.805143954486401e-05, "loss": 0.7617560029029846, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.14203, "step": 215, "tokens/total": 56360960, "tokens/train_per_sec_per_gpu": 177.48, "tokens/trainable": 4688241 }, { "epoch": 0.22972613666578037, "grad_norm": 0.05477464199066162, "learning_rate": 3.801937735804838e-05, "loss": 0.736034095287323, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.08764, "step": 216, "tokens/total": 56623104, "tokens/train_per_sec_per_gpu": 164.79, "tokens/trainable": 4710328 }, { "epoch": 0.23078968359478863, "grad_norm": 0.058859411627054214, "learning_rate": 3.798706727369845e-05, "loss": 0.7572994232177734, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13251, "step": 217, "tokens/total": 56885248, "tokens/train_per_sec_per_gpu": 155.41, "tokens/trainable": 4730784 }, { "epoch": 0.23185323052379686, "grad_norm": 0.055905554443597794, "learning_rate": 3.795450973631293e-05, "loss": 0.7484654188156128, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11375, "step": 218, "tokens/total": 57147392, "tokens/train_per_sec_per_gpu": 195.23, "tokens/trainable": 4752708 }, { "epoch": 0.23291677745280512, "grad_norm": 0.05177122727036476, "learning_rate": 3.792170519379482e-05, "loss": 0.6985906958580017, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.01092, "step": 219, "tokens/total": 57409536, "tokens/train_per_sec_per_gpu": 151.36, "tokens/trainable": 4776585 }, { "epoch": 0.23398032438181335, "grad_norm": 0.05102040618658066, "learning_rate": 3.788865409744527e-05, "loss": 0.7429978847503662, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10223, "step": 220, "tokens/total": 57671680, "tokens/train_per_sec_per_gpu": 161.92, "tokens/trainable": 4798099 }, { "epoch": 0.23504387131082158, "grad_norm": 0.04876242205500603, "learning_rate": 3.785535690195728e-05, "loss": 0.8290475010871887, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.29114, "step": 221, "tokens/total": 57933824, "tokens/train_per_sec_per_gpu": 215.34, "tokens/trainable": 4821746 }, { "epoch": 0.23610741823982984, "grad_norm": 0.04731612280011177, "learning_rate": 3.782181406540954e-05, "loss": 0.7661755084991455, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15152, "step": 222, "tokens/total": 58195968, "tokens/train_per_sec_per_gpu": 213.65, "tokens/trainable": 4845142 }, { "epoch": 0.23717096516883807, "grad_norm": 0.058613162487745285, "learning_rate": 3.77880260492601e-05, "loss": 0.7483052015304565, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11342, "step": 223, "tokens/total": 58458112, "tokens/train_per_sec_per_gpu": 219.8, "tokens/trainable": 4866360 }, { "epoch": 0.23823451209784632, "grad_norm": 0.05162626504898071, "learning_rate": 3.775399331833998e-05, "loss": 0.798062264919281, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.22123, "step": 224, "tokens/total": 58720256, "tokens/train_per_sec_per_gpu": 144.23, "tokens/trainable": 4886502 }, { "epoch": 0.23929805902685455, "grad_norm": 0.04998771846294403, "learning_rate": 3.7719716340846845e-05, "loss": 0.8206250667572021, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.27192, "step": 225, "tokens/total": 58982400, "tokens/train_per_sec_per_gpu": 195.94, "tokens/trainable": 4910020 }, { "epoch": 0.2403616059558628, "grad_norm": 0.056194525212049484, "learning_rate": 3.768519558833849e-05, "loss": 0.878259539604187, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.40671, "step": 226, "tokens/total": 59244544, "tokens/train_per_sec_per_gpu": 157.32, "tokens/trainable": 4931232 }, { "epoch": 0.24142515288487104, "grad_norm": 0.0521056093275547, "learning_rate": 3.765043153572643e-05, "loss": 0.759453535079956, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13711, "step": 227, "tokens/total": 59506688, "tokens/train_per_sec_per_gpu": 193.4, "tokens/trainable": 4954744 }, { "epoch": 0.2424886998138793, "grad_norm": 0.04911046847701073, "learning_rate": 3.761542466126929e-05, "loss": 0.7336410284042358, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.08265, "step": 228, "tokens/total": 59768832, "tokens/train_per_sec_per_gpu": 207.37, "tokens/trainable": 4975992 }, { "epoch": 0.24355224674288753, "grad_norm": 0.06128966435790062, "learning_rate": 3.758017544656628e-05, "loss": 0.7942535877227783, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.21279, "step": 229, "tokens/total": 60030976, "tokens/train_per_sec_per_gpu": 185.07, "tokens/trainable": 4996744 }, { "epoch": 0.24461579367189576, "grad_norm": 0.057662662118673325, "learning_rate": 3.754468437655056e-05, "loss": 0.7774747014045715, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.17597, "step": 230, "tokens/total": 60293120, "tokens/train_per_sec_per_gpu": 144.76, "tokens/trainable": 5015784 }, { "epoch": 0.24567934060090402, "grad_norm": 0.0500815324485302, "learning_rate": 3.7508951939482543e-05, "loss": 0.701805591583252, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.01739, "step": 231, "tokens/total": 60555264, "tokens/train_per_sec_per_gpu": 198.76, "tokens/trainable": 5037125 }, { "epoch": 0.24674288752991225, "grad_norm": 0.05346173420548439, "learning_rate": 3.74729786269432e-05, "loss": 0.7507196068763733, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11852, "step": 232, "tokens/total": 60817408, "tokens/train_per_sec_per_gpu": 197.05, "tokens/trainable": 5059497 }, { "epoch": 0.2478064344589205, "grad_norm": 0.05386090278625488, "learning_rate": 3.7436764933827284e-05, "loss": 0.7338147163391113, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.08301, "step": 233, "tokens/total": 61079552, "tokens/train_per_sec_per_gpu": 157.02, "tokens/trainable": 5080212 }, { "epoch": 0.24886998138792873, "grad_norm": 0.05847088247537613, "learning_rate": 3.7400311358336555e-05, "loss": 0.7185083627700806, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.05137, "step": 234, "tokens/total": 61341696, "tokens/train_per_sec_per_gpu": 178.9, "tokens/trainable": 5101917 }, { "epoch": 0.249933528316937, "grad_norm": 0.05917196720838547, "learning_rate": 3.736361840197288e-05, "loss": 0.7877013087272644, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19834, "step": 235, "tokens/total": 61603840, "tokens/train_per_sec_per_gpu": 172.85, "tokens/trainable": 5122988 }, { "epoch": 0.2509970752459452, "grad_norm": 0.05370993912220001, "learning_rate": 3.732668656953136e-05, "loss": 0.7264862656593323, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.0678, "step": 236, "tokens/total": 61865984, "tokens/train_per_sec_per_gpu": 215.97, "tokens/trainable": 5145194 }, { "epoch": 0.25206062217495345, "grad_norm": 0.05220884829759598, "learning_rate": 3.728951636909338e-05, "loss": 0.7533116340637207, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12402, "step": 237, "tokens/total": 62128128, "tokens/train_per_sec_per_gpu": 211.82, "tokens/trainable": 5167987 }, { "epoch": 0.25312416910396174, "grad_norm": 0.054722413420677185, "learning_rate": 3.725210831201961e-05, "loss": 0.7575439810752869, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13303, "step": 238, "tokens/total": 62390272, "tokens/train_per_sec_per_gpu": 154.07, "tokens/trainable": 5189725 }, { "epoch": 0.25418771603296997, "grad_norm": 0.05407283455133438, "learning_rate": 3.721446291294301e-05, "loss": 0.7419638633728027, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10006, "step": 239, "tokens/total": 62652416, "tokens/train_per_sec_per_gpu": 241.74, "tokens/trainable": 5213041 }, { "epoch": 0.2552512629619782, "grad_norm": 0.05353840813040733, "learning_rate": 3.717658068976168e-05, "loss": 0.7568483352661133, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13155, "step": 240, "tokens/total": 62914560, "tokens/train_per_sec_per_gpu": 170.27, "tokens/trainable": 5232929 }, { "epoch": 0.25631480989098643, "grad_norm": 0.05174530670046806, "learning_rate": 3.713846216363179e-05, "loss": 0.6827611923217773, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 1.97934, "step": 241, "tokens/total": 63176704, "tokens/train_per_sec_per_gpu": 212.44, "tokens/trainable": 5253894 }, { "epoch": 0.25737835681999466, "grad_norm": 0.050874270498752594, "learning_rate": 3.7100107858960404e-05, "loss": 0.7131105661392212, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.04033, "step": 242, "tokens/total": 63438848, "tokens/train_per_sec_per_gpu": 167.64, "tokens/trainable": 5275682 }, { "epoch": 0.25844190374900294, "grad_norm": 0.06289295852184296, "learning_rate": 3.7061518303398244e-05, "loss": 0.814382791519165, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.25778, "step": 243, "tokens/total": 63700992, "tokens/train_per_sec_per_gpu": 181.58, "tokens/trainable": 5296823 }, { "epoch": 0.2595054506780112, "grad_norm": 0.05226941406726837, "learning_rate": 3.7022694027832456e-05, "loss": 0.716779887676239, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.04783, "step": 244, "tokens/total": 63963136, "tokens/train_per_sec_per_gpu": 202.14, "tokens/trainable": 5318716 }, { "epoch": 0.2605689976070194, "grad_norm": 0.04937303066253662, "learning_rate": 3.698363556637927e-05, "loss": 0.7856250405311584, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19378, "step": 245, "tokens/total": 64225280, "tokens/train_per_sec_per_gpu": 229.48, "tokens/trainable": 5344987 }, { "epoch": 0.26163254453602763, "grad_norm": 0.049020808190107346, "learning_rate": 3.694434345637671e-05, "loss": 0.7429791688919067, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10219, "step": 246, "tokens/total": 64487424, "tokens/train_per_sec_per_gpu": 226.82, "tokens/trainable": 5369834 }, { "epoch": 0.2626960914650359, "grad_norm": 0.052014704793691635, "learning_rate": 3.690481823837714e-05, "loss": 0.7686535120010376, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15686, "step": 247, "tokens/total": 64749568, "tokens/train_per_sec_per_gpu": 199.34, "tokens/trainable": 5392410 }, { "epoch": 0.26375963839404415, "grad_norm": 0.060707978904247284, "learning_rate": 3.686506045613986e-05, "loss": 0.7760209441184998, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.17281, "step": 248, "tokens/total": 65011712, "tokens/train_per_sec_per_gpu": 157.58, "tokens/trainable": 5414349 }, { "epoch": 0.2648231853230524, "grad_norm": 0.06462915241718292, "learning_rate": 3.6825070656623626e-05, "loss": 0.7831145524978638, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.18828, "step": 249, "tokens/total": 65273856, "tokens/train_per_sec_per_gpu": 159.08, "tokens/trainable": 5433875 }, { "epoch": 0.2658867322520606, "grad_norm": 0.057429276406764984, "learning_rate": 3.678484938997912e-05, "loss": 0.6994718909263611, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.01269, "step": 250, "tokens/total": 65536000, "tokens/train_per_sec_per_gpu": 166.54, "tokens/trainable": 5454358 }, { "epoch": 0.26695027918106884, "grad_norm": 0.05674952268600464, "learning_rate": 3.674439720954138e-05, "loss": 0.7693842649459839, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15844, "step": 251, "tokens/total": 65798144, "tokens/train_per_sec_per_gpu": 199.24, "tokens/trainable": 5476733 }, { "epoch": 0.2680138261100771, "grad_norm": 0.06037837266921997, "learning_rate": 3.670371467182219e-05, "loss": 0.7689650058746338, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15753, "step": 252, "tokens/total": 66060288, "tokens/train_per_sec_per_gpu": 184.65, "tokens/trainable": 5498250 }, { "epoch": 0.26907737303908535, "grad_norm": 0.055501531809568405, "learning_rate": 3.66628023365024e-05, "loss": 0.7238588333129883, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.06238, "step": 253, "tokens/total": 66322432, "tokens/train_per_sec_per_gpu": 227.0, "tokens/trainable": 5520172 }, { "epoch": 0.2701409199680936, "grad_norm": 0.05900080129504204, "learning_rate": 3.66216607664243e-05, "loss": 0.7000092267990112, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.01377, "step": 254, "tokens/total": 66584576, "tokens/train_per_sec_per_gpu": 153.45, "tokens/trainable": 5539604 }, { "epoch": 0.2712044668971018, "grad_norm": 0.054649997502565384, "learning_rate": 3.658029052758377e-05, "loss": 0.7665979266166687, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15243, "step": 255, "tokens/total": 66846720, "tokens/train_per_sec_per_gpu": 173.48, "tokens/trainable": 5561266 }, { "epoch": 0.2722680138261101, "grad_norm": 0.05519971251487732, "learning_rate": 3.653869218912258e-05, "loss": 0.7569284439086914, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13172, "step": 256, "tokens/total": 67108864, "tokens/train_per_sec_per_gpu": 171.2, "tokens/trainable": 5583034 }, { "epoch": 0.27333156075511833, "grad_norm": 0.055965524166822433, "learning_rate": 3.649686632332052e-05, "loss": 0.7072951793670654, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.0285, "step": 257, "tokens/total": 67371008, "tokens/train_per_sec_per_gpu": 175.76, "tokens/trainable": 5604839 }, { "epoch": 0.27439510768412656, "grad_norm": 0.05398353934288025, "learning_rate": 3.645481350558754e-05, "loss": 0.7658364176750183, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15079, "step": 258, "tokens/total": 67633152, "tokens/train_per_sec_per_gpu": 153.96, "tokens/trainable": 5626766 }, { "epoch": 0.2754586546131348, "grad_norm": 0.04775004833936691, "learning_rate": 3.6412534314455836e-05, "loss": 0.6940434575080872, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.00179, "step": 259, "tokens/total": 67895296, "tokens/train_per_sec_per_gpu": 142.3, "tokens/trainable": 5647569 }, { "epoch": 0.276522201542143, "grad_norm": 0.05857420340180397, "learning_rate": 3.637002933157187e-05, "loss": 0.8444321155548096, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.32666, "step": 260, "tokens/total": 68157440, "tokens/train_per_sec_per_gpu": 169.5, "tokens/trainable": 5667784 }, { "epoch": 0.2775857484711513, "grad_norm": 0.05263550207018852, "learning_rate": 3.6327299141688396e-05, "loss": 0.8108729720115662, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.24987, "step": 261, "tokens/total": 68419584, "tokens/train_per_sec_per_gpu": 143.48, "tokens/trainable": 5689804 }, { "epoch": 0.27864929540015954, "grad_norm": 0.05478999391198158, "learning_rate": 3.6284344332656396e-05, "loss": 0.8421679735183716, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.32139, "step": 262, "tokens/total": 68681728, "tokens/train_per_sec_per_gpu": 246.12, "tokens/trainable": 5714024 }, { "epoch": 0.27971284232916777, "grad_norm": 0.06489334255456924, "learning_rate": 3.6241165495417006e-05, "loss": 0.8724418878555298, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.39275, "step": 263, "tokens/total": 68943872, "tokens/train_per_sec_per_gpu": 166.84, "tokens/trainable": 5734370 }, { "epoch": 0.280776389258176, "grad_norm": 0.06463494151830673, "learning_rate": 3.619776322399336e-05, "loss": 0.7543226480484009, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12617, "step": 264, "tokens/total": 69206016, "tokens/train_per_sec_per_gpu": 214.95, "tokens/trainable": 5757030 }, { "epoch": 0.2818399361871843, "grad_norm": 0.05572579428553581, "learning_rate": 3.615413811548247e-05, "loss": 0.7700116634368896, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15979, "step": 265, "tokens/total": 69468160, "tokens/train_per_sec_per_gpu": 187.77, "tokens/trainable": 5777899 }, { "epoch": 0.2829034831161925, "grad_norm": 0.05551740154623985, "learning_rate": 3.6110290770046954e-05, "loss": 0.7438211441040039, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10396, "step": 266, "tokens/total": 69730304, "tokens/train_per_sec_per_gpu": 154.69, "tokens/trainable": 5798783 }, { "epoch": 0.28396703004520074, "grad_norm": 0.06058730185031891, "learning_rate": 3.606622179090682e-05, "loss": 0.7585455179214478, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13517, "step": 267, "tokens/total": 69992448, "tokens/train_per_sec_per_gpu": 180.57, "tokens/trainable": 5819730 }, { "epoch": 0.28503057697420897, "grad_norm": 0.056012120097875595, "learning_rate": 3.6021931784331136e-05, "loss": 0.7486584186553955, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11416, "step": 268, "tokens/total": 70254592, "tokens/train_per_sec_per_gpu": 156.75, "tokens/trainable": 5842124 }, { "epoch": 0.28609412390321726, "grad_norm": 0.06160522252321243, "learning_rate": 3.5977421359629715e-05, "loss": 0.6910536289215088, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 1.99582, "step": 269, "tokens/total": 70516736, "tokens/train_per_sec_per_gpu": 171.0, "tokens/trainable": 5863426 }, { "epoch": 0.2871576708322255, "grad_norm": 0.061156366020441055, "learning_rate": 3.593269112914472e-05, "loss": 0.7586344480514526, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13536, "step": 270, "tokens/total": 70778880, "tokens/train_per_sec_per_gpu": 165.2, "tokens/trainable": 5884974 }, { "epoch": 0.2882212177612337, "grad_norm": 0.061479438096284866, "learning_rate": 3.588774170824225e-05, "loss": 0.8126254677772522, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.25382, "step": 271, "tokens/total": 71041024, "tokens/train_per_sec_per_gpu": 213.13, "tokens/trainable": 5906918 }, { "epoch": 0.28928476469024195, "grad_norm": 0.05486295372247696, "learning_rate": 3.584257371530386e-05, "loss": 0.7505637407302856, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11819, "step": 272, "tokens/total": 71303168, "tokens/train_per_sec_per_gpu": 202.73, "tokens/trainable": 5929901 }, { "epoch": 0.2903483116192502, "grad_norm": 0.05428226664662361, "learning_rate": 3.579718777171806e-05, "loss": 0.7298543453216553, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.07478, "step": 273, "tokens/total": 71565312, "tokens/train_per_sec_per_gpu": 187.93, "tokens/trainable": 5949798 }, { "epoch": 0.29141185854825846, "grad_norm": 0.06736662238836288, "learning_rate": 3.5751584501871766e-05, "loss": 0.7698936462402344, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15954, "step": 274, "tokens/total": 71827456, "tokens/train_per_sec_per_gpu": 179.77, "tokens/trainable": 5971772 }, { "epoch": 0.2924754054772667, "grad_norm": 0.05799878388643265, "learning_rate": 3.570576453314172e-05, "loss": 0.7739330530166626, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.16828, "step": 275, "tokens/total": 72089600, "tokens/train_per_sec_per_gpu": 179.84, "tokens/trainable": 5994035 }, { "epoch": 0.2935389524062749, "grad_norm": 0.051827434450387955, "learning_rate": 3.565972849588584e-05, "loss": 0.7918999791145325, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.20759, "step": 276, "tokens/total": 72351744, "tokens/train_per_sec_per_gpu": 219.16, "tokens/trainable": 6017969 }, { "epoch": 0.29460249933528315, "grad_norm": 0.05260028690099716, "learning_rate": 3.561347702343456e-05, "loss": 0.7098696827888489, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.03373, "step": 277, "tokens/total": 72613888, "tokens/train_per_sec_per_gpu": 208.13, "tokens/trainable": 6042126 }, { "epoch": 0.29566604626429144, "grad_norm": 0.05379115045070648, "learning_rate": 3.556701075208213e-05, "loss": 0.7800576090812683, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.1816, "step": 278, "tokens/total": 72876032, "tokens/train_per_sec_per_gpu": 207.75, "tokens/trainable": 6065523 }, { "epoch": 0.29672959319329967, "grad_norm": 0.05740763247013092, "learning_rate": 3.5520330321077815e-05, "loss": 0.8003265857696533, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.22627, "step": 279, "tokens/total": 73138176, "tokens/train_per_sec_per_gpu": 204.22, "tokens/trainable": 6088540 }, { "epoch": 0.2977931401223079, "grad_norm": 0.05483856424689293, "learning_rate": 3.547343637261717e-05, "loss": 0.7692792415618896, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15821, "step": 280, "tokens/total": 73400320, "tokens/train_per_sec_per_gpu": 227.55, "tokens/trainable": 6111049 }, { "epoch": 0.29885668705131613, "grad_norm": 0.06599462777376175, "learning_rate": 3.5426329551833145e-05, "loss": 0.7712551355361938, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.16248, "step": 281, "tokens/total": 73662464, "tokens/train_per_sec_per_gpu": 194.72, "tokens/trainable": 6131721 }, { "epoch": 0.29992023398032436, "grad_norm": 0.055346276611089706, "learning_rate": 3.537901050678724e-05, "loss": 0.7576462030410767, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13325, "step": 282, "tokens/total": 73924608, "tokens/train_per_sec_per_gpu": 200.29, "tokens/trainable": 6154147 }, { "epoch": 0.30098378090933264, "grad_norm": 0.06298188120126724, "learning_rate": 3.533147988846059e-05, "loss": 0.7945112586021423, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.21336, "step": 283, "tokens/total": 74186752, "tokens/train_per_sec_per_gpu": 228.94, "tokens/trainable": 6178704 }, { "epoch": 0.3020473278383409, "grad_norm": 0.05894119665026665, "learning_rate": 3.5283738350744986e-05, "loss": 0.7516214847564697, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12044, "step": 284, "tokens/total": 74448896, "tokens/train_per_sec_per_gpu": 212.98, "tokens/trainable": 6200161 }, { "epoch": 0.3031108747673491, "grad_norm": 0.05261611193418503, "learning_rate": 3.5235786550433906e-05, "loss": 0.7629417777061462, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.14458, "step": 285, "tokens/total": 74711040, "tokens/train_per_sec_per_gpu": 219.51, "tokens/trainable": 6222600 }, { "epoch": 0.30417442169635733, "grad_norm": 0.059760384261608124, "learning_rate": 3.518762514721346e-05, "loss": 0.7162789106369019, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.0468, "step": 286, "tokens/total": 74973184, "tokens/train_per_sec_per_gpu": 235.97, "tokens/trainable": 6246346 }, { "epoch": 0.3052379686253656, "grad_norm": 0.06057070195674896, "learning_rate": 3.5139254803653346e-05, "loss": 0.7634356021881104, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.14564, "step": 287, "tokens/total": 75235328, "tokens/train_per_sec_per_gpu": 152.93, "tokens/trainable": 6267246 }, { "epoch": 0.30630151555437385, "grad_norm": 0.05959314480423927, "learning_rate": 3.509067618519768e-05, "loss": 0.8070247173309326, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.24123, "step": 288, "tokens/total": 75497472, "tokens/train_per_sec_per_gpu": 186.48, "tokens/trainable": 6289062 }, { "epoch": 0.3073650624833821, "grad_norm": 0.06659425795078278, "learning_rate": 3.5041889960155895e-05, "loss": 0.6915267705917358, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 1.99676, "step": 289, "tokens/total": 75759616, "tokens/train_per_sec_per_gpu": 194.38, "tokens/trainable": 6310916 }, { "epoch": 0.3084286094123903, "grad_norm": 0.05240656062960625, "learning_rate": 3.499289679969351e-05, "loss": 0.8021942377090454, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.23043, "step": 290, "tokens/total": 76021760, "tokens/train_per_sec_per_gpu": 207.96, "tokens/trainable": 6334881 }, { "epoch": 0.30949215634139854, "grad_norm": 0.06096180900931358, "learning_rate": 3.494369737782293e-05, "loss": 0.7638937830924988, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.14662, "step": 291, "tokens/total": 76283904, "tokens/train_per_sec_per_gpu": 205.29, "tokens/trainable": 6357654 }, { "epoch": 0.3105557032704068, "grad_norm": 0.062094077467918396, "learning_rate": 3.489429237139414e-05, "loss": 0.7336180210113525, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.0826, "step": 292, "tokens/total": 76546048, "tokens/train_per_sec_per_gpu": 178.46, "tokens/trainable": 6377841 }, { "epoch": 0.31161925019941505, "grad_norm": 0.055953506380319595, "learning_rate": 3.48446824600854e-05, "loss": 0.7609624862670898, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.14034, "step": 293, "tokens/total": 76808192, "tokens/train_per_sec_per_gpu": 168.65, "tokens/trainable": 6400076 }, { "epoch": 0.3126827971284233, "grad_norm": 0.05196288600564003, "learning_rate": 3.4794868326393935e-05, "loss": 0.7413825988769531, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09884, "step": 294, "tokens/total": 77070336, "tokens/train_per_sec_per_gpu": 172.78, "tokens/trainable": 6422449 }, { "epoch": 0.3137463440574315, "grad_norm": 0.05353325977921486, "learning_rate": 3.474485065562648e-05, "loss": 0.7769887447357178, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.17491, "step": 295, "tokens/total": 77332480, "tokens/train_per_sec_per_gpu": 221.08, "tokens/trainable": 6445481 }, { "epoch": 0.3148098909864398, "grad_norm": 0.054653119295835495, "learning_rate": 3.469463013588991e-05, "loss": 0.7785749435424805, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.17837, "step": 296, "tokens/total": 77594624, "tokens/train_per_sec_per_gpu": 210.56, "tokens/trainable": 6467586 }, { "epoch": 0.31587343791544803, "grad_norm": 0.06027977168560028, "learning_rate": 3.4644207458081735e-05, "loss": 0.7743946313858032, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.16928, "step": 297, "tokens/total": 77856768, "tokens/train_per_sec_per_gpu": 204.77, "tokens/trainable": 6489041 }, { "epoch": 0.31693698484445626, "grad_norm": 0.05732357129454613, "learning_rate": 3.45935833158806e-05, "loss": 0.8058252334594727, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.23854, "step": 298, "tokens/total": 78118912, "tokens/train_per_sec_per_gpu": 171.13, "tokens/trainable": 6510717 }, { "epoch": 0.3180005317734645, "grad_norm": 0.05821244791150093, "learning_rate": 3.454275840573679e-05, "loss": 0.7749941945075989, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.17058, "step": 299, "tokens/total": 78381056, "tokens/train_per_sec_per_gpu": 223.81, "tokens/trainable": 6533545 }, { "epoch": 0.3190640787024727, "grad_norm": 0.05597531050443649, "learning_rate": 3.4491733426862556e-05, "loss": 0.7812941670417786, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.1843, "step": 300, "tokens/total": 78643200, "tokens/train_per_sec_per_gpu": 185.6, "tokens/trainable": 6556460 }, { "epoch": 0.3190640787024727, "eval_loss": 0.7725370526313782, "eval_ppl": 2.16525, "eval_runtime": 237.2784, "eval_samples_per_second": 28.178, "eval_steps_per_second": 1.762, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 38.19, "memory/max_allocated (GiB)": 38.19, "step": 300 }, { "epoch": 0.320127625631481, "grad_norm": 0.054633188992738724, "learning_rate": 3.44405090812226e-05, "loss": 0.7383944988250732, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09257, "step": 301, "tokens/total": 78905344, "tokens/train_per_sec_per_gpu": 192.32, "tokens/trainable": 6579346 }, { "epoch": 0.32119117256048924, "grad_norm": 0.05523587390780449, "learning_rate": 3.438908607352433e-05, "loss": 0.6943072080612183, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.00232, "step": 302, "tokens/total": 79167488, "tokens/train_per_sec_per_gpu": 187.81, "tokens/trainable": 6599223 }, { "epoch": 0.32225471948949747, "grad_norm": 0.058179691433906555, "learning_rate": 3.433746511120823e-05, "loss": 0.7615541815757751, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.1416, "step": 303, "tokens/total": 79429632, "tokens/train_per_sec_per_gpu": 207.76, "tokens/trainable": 6622095 }, { "epoch": 0.3233182664185057, "grad_norm": 0.06251095235347748, "learning_rate": 3.428564690443807e-05, "loss": 0.7761749029159546, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.17314, "step": 304, "tokens/total": 79691776, "tokens/train_per_sec_per_gpu": 212.28, "tokens/trainable": 6643820 }, { "epoch": 0.324381813347514, "grad_norm": 0.06721773743629456, "learning_rate": 3.4233632166091205e-05, "loss": 0.7868590354919434, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19649, "step": 305, "tokens/total": 79953920, "tokens/train_per_sec_per_gpu": 171.72, "tokens/trainable": 6664523 }, { "epoch": 0.3254453602765222, "grad_norm": 0.05866523087024689, "learning_rate": 3.41814216117487e-05, "loss": 0.7207814455032349, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.05604, "step": 306, "tokens/total": 80216064, "tokens/train_per_sec_per_gpu": 128.88, "tokens/trainable": 6685386 }, { "epoch": 0.32650890720553044, "grad_norm": 0.05545097589492798, "learning_rate": 3.412901595968551e-05, "loss": 0.7210918068885803, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.05668, "step": 307, "tokens/total": 80478208, "tokens/train_per_sec_per_gpu": 193.0, "tokens/trainable": 6706345 }, { "epoch": 0.32757245413453867, "grad_norm": 0.063844695687294, "learning_rate": 3.407641593086063e-05, "loss": 0.7825930118560791, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.18714, "step": 308, "tokens/total": 80740352, "tokens/train_per_sec_per_gpu": 201.52, "tokens/trainable": 6728441 }, { "epoch": 0.32863600106354696, "grad_norm": 0.05788377299904823, "learning_rate": 3.4023622248907134e-05, "loss": 0.7766852974891663, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.17425, "step": 309, "tokens/total": 81002496, "tokens/train_per_sec_per_gpu": 240.97, "tokens/trainable": 6754793 }, { "epoch": 0.3296995479925552, "grad_norm": 0.060298677533864975, "learning_rate": 3.397063564012223e-05, "loss": 0.7567377090454102, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13131, "step": 310, "tokens/total": 81264640, "tokens/train_per_sec_per_gpu": 192.69, "tokens/trainable": 6776213 }, { "epoch": 0.3307630949215634, "grad_norm": 0.05781788378953934, "learning_rate": 3.391745683345729e-05, "loss": 0.8195350766181946, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.26944, "step": 311, "tokens/total": 81526784, "tokens/train_per_sec_per_gpu": 182.74, "tokens/trainable": 6798309 }, { "epoch": 0.33182664185057165, "grad_norm": 0.05640830472111702, "learning_rate": 3.3864086560507785e-05, "loss": 0.8057565689086914, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.23839, "step": 312, "tokens/total": 81788928, "tokens/train_per_sec_per_gpu": 297.01, "tokens/trainable": 6823595 }, { "epoch": 0.3328901887795799, "grad_norm": 0.0735592171549797, "learning_rate": 3.3810525555503254e-05, "loss": 0.8060101270675659, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.23896, "step": 313, "tokens/total": 82051072, "tokens/train_per_sec_per_gpu": 196.76, "tokens/trainable": 6845473 }, { "epoch": 0.33395373570858816, "grad_norm": 0.06115385517477989, "learning_rate": 3.3756774555297186e-05, "loss": 0.782099723815918, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.18606, "step": 314, "tokens/total": 82313216, "tokens/train_per_sec_per_gpu": 159.99, "tokens/trainable": 6866224 }, { "epoch": 0.3350172826375964, "grad_norm": 0.0663752555847168, "learning_rate": 3.3702834299356885e-05, "loss": 0.7522889375686646, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12185, "step": 315, "tokens/total": 82575360, "tokens/train_per_sec_per_gpu": 157.51, "tokens/trainable": 6885515 }, { "epoch": 0.3360808295666046, "grad_norm": 0.05871765688061714, "learning_rate": 3.3648705529753306e-05, "loss": 0.7267637252807617, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.06838, "step": 316, "tokens/total": 82837504, "tokens/train_per_sec_per_gpu": 176.94, "tokens/trainable": 6906219 }, { "epoch": 0.33714437649561285, "grad_norm": 0.05956491827964783, "learning_rate": 3.3594388991150825e-05, "loss": 0.821346640586853, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.27356, "step": 317, "tokens/total": 83099648, "tokens/train_per_sec_per_gpu": 222.29, "tokens/trainable": 6929025 }, { "epoch": 0.33820792342462114, "grad_norm": 0.059474050998687744, "learning_rate": 3.353988543079702e-05, "loss": 0.8273679614067078, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.28729, "step": 318, "tokens/total": 83361792, "tokens/train_per_sec_per_gpu": 228.13, "tokens/trainable": 6951873 }, { "epoch": 0.33927147035362937, "grad_norm": 0.06504053622484207, "learning_rate": 3.3485195598512365e-05, "loss": 0.7731481790542603, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.16658, "step": 319, "tokens/total": 83623936, "tokens/train_per_sec_per_gpu": 183.37, "tokens/trainable": 6973435 }, { "epoch": 0.3403350172826376, "grad_norm": 0.0615144707262516, "learning_rate": 3.343032024667994e-05, "loss": 0.7920888662338257, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.208, "step": 320, "tokens/total": 83886080, "tokens/train_per_sec_per_gpu": 168.97, "tokens/trainable": 6993689 }, { "epoch": 0.3413985642116458, "grad_norm": 0.058340758085250854, "learning_rate": 3.337526013023507e-05, "loss": 0.7120217084884644, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.03811, "step": 321, "tokens/total": 84148224, "tokens/train_per_sec_per_gpu": 174.54, "tokens/trainable": 7015311 }, { "epoch": 0.34246211114065406, "grad_norm": 0.06086525321006775, "learning_rate": 3.332001600665494e-05, "loss": 0.8457379341125488, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.3297, "step": 322, "tokens/total": 84410368, "tokens/train_per_sec_per_gpu": 205.39, "tokens/trainable": 7038147 }, { "epoch": 0.34352565806966234, "grad_norm": 0.06164320930838585, "learning_rate": 3.326458863594814e-05, "loss": 0.8334550261497498, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.30126, "step": 323, "tokens/total": 84672512, "tokens/train_per_sec_per_gpu": 198.45, "tokens/trainable": 7059704 }, { "epoch": 0.3445892049986706, "grad_norm": 0.057081304490566254, "learning_rate": 3.320897878064428e-05, "loss": 0.838066041469574, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.31189, "step": 324, "tokens/total": 84934656, "tokens/train_per_sec_per_gpu": 209.4, "tokens/trainable": 7083870 }, { "epoch": 0.3456527519276788, "grad_norm": 0.05457906052470207, "learning_rate": 3.3153187205783454e-05, "loss": 0.7599472403526306, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13816, "step": 325, "tokens/total": 85196800, "tokens/train_per_sec_per_gpu": 186.44, "tokens/trainable": 7108954 }, { "epoch": 0.34671629885668703, "grad_norm": 0.061806946992874146, "learning_rate": 3.309721467890571e-05, "loss": 0.7315384149551392, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.07828, "step": 326, "tokens/total": 85458944, "tokens/train_per_sec_per_gpu": 169.83, "tokens/trainable": 7130972 }, { "epoch": 0.3477798457856953, "grad_norm": 0.05989091843366623, "learning_rate": 3.3041061970040486e-05, "loss": 0.7060387134552002, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.02595, "step": 327, "tokens/total": 85721088, "tokens/train_per_sec_per_gpu": 191.48, "tokens/trainable": 7152519 }, { "epoch": 0.34884339271470355, "grad_norm": 0.06088387221097946, "learning_rate": 3.298472985169609e-05, "loss": 0.7296034097671509, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.07426, "step": 328, "tokens/total": 85983232, "tokens/train_per_sec_per_gpu": 237.52, "tokens/trainable": 7173660 }, { "epoch": 0.3499069396437118, "grad_norm": 0.059276383370161057, "learning_rate": 3.2928219098848955e-05, "loss": 0.7553350329399109, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12832, "step": 329, "tokens/total": 86245376, "tokens/train_per_sec_per_gpu": 148.74, "tokens/trainable": 7196866 }, { "epoch": 0.35097048657272, "grad_norm": 0.06169452145695686, "learning_rate": 3.287153048893307e-05, "loss": 0.8173863887786865, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.26457, "step": 330, "tokens/total": 86507520, "tokens/train_per_sec_per_gpu": 198.89, "tokens/trainable": 7218044 }, { "epoch": 0.35203403350172824, "grad_norm": 0.06478651612997055, "learning_rate": 3.281466480182925e-05, "loss": 0.7393308877944946, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09453, "step": 331, "tokens/total": 86769664, "tokens/train_per_sec_per_gpu": 189.28, "tokens/trainable": 7240085 }, { "epoch": 0.3530975804307365, "grad_norm": 0.0714036375284195, "learning_rate": 3.27576228198544e-05, "loss": 0.7848330140113831, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19204, "step": 332, "tokens/total": 87031808, "tokens/train_per_sec_per_gpu": 180.05, "tokens/trainable": 7263289 }, { "epoch": 0.35416112735974475, "grad_norm": 0.059255450963974, "learning_rate": 3.270040532775077e-05, "loss": 0.7574397921562195, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13281, "step": 333, "tokens/total": 87293952, "tokens/train_per_sec_per_gpu": 169.3, "tokens/trainable": 7283911 }, { "epoch": 0.355224674288753, "grad_norm": 0.0516078807413578, "learning_rate": 3.264301311267515e-05, "loss": 0.7163474559783936, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.04694, "step": 334, "tokens/total": 87556096, "tokens/train_per_sec_per_gpu": 223.05, "tokens/trainable": 7307672 }, { "epoch": 0.3562882212177612, "grad_norm": 0.06229817494750023, "learning_rate": 3.2585446964188026e-05, "loss": 0.715316116809845, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.04483, "step": 335, "tokens/total": 87818240, "tokens/train_per_sec_per_gpu": 195.29, "tokens/trainable": 7330420 }, { "epoch": 0.3573517681467695, "grad_norm": 0.06110849231481552, "learning_rate": 3.252770767424277e-05, "loss": 0.6982554197311401, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.01024, "step": 336, "tokens/total": 88080384, "tokens/train_per_sec_per_gpu": 207.65, "tokens/trainable": 7351877 }, { "epoch": 0.35841531507577773, "grad_norm": 0.05932268872857094, "learning_rate": 3.246979603717467e-05, "loss": 0.7535157203674316, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12446, "step": 337, "tokens/total": 88342528, "tokens/train_per_sec_per_gpu": 239.51, "tokens/trainable": 7377554 }, { "epoch": 0.35947886200478596, "grad_norm": 0.06964396685361862, "learning_rate": 3.2411712849690076e-05, "loss": 0.802024781703949, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.23005, "step": 338, "tokens/total": 88604672, "tokens/train_per_sec_per_gpu": 223.69, "tokens/trainable": 7399496 }, { "epoch": 0.3605424089337942, "grad_norm": 0.06441716849803925, "learning_rate": 3.235345891085536e-05, "loss": 0.7387241125106812, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09326, "step": 339, "tokens/total": 88866816, "tokens/train_per_sec_per_gpu": 195.97, "tokens/trainable": 7421514 }, { "epoch": 0.3616059558628024, "grad_norm": 0.05845661088824272, "learning_rate": 3.229503502208602e-05, "loss": 0.7207450866699219, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.05596, "step": 340, "tokens/total": 89128960, "tokens/train_per_sec_per_gpu": 171.19, "tokens/trainable": 7442572 }, { "epoch": 0.3626695027918107, "grad_norm": 0.06204557791352272, "learning_rate": 3.2236441987135565e-05, "loss": 0.757001519203186, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13187, "step": 341, "tokens/total": 89391104, "tokens/train_per_sec_per_gpu": 177.89, "tokens/trainable": 7463164 }, { "epoch": 0.36373304972081894, "grad_norm": 0.05783051997423172, "learning_rate": 3.2177680612084494e-05, "loss": 0.7507032752037048, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11849, "step": 342, "tokens/total": 89653248, "tokens/train_per_sec_per_gpu": 187.61, "tokens/trainable": 7485037 }, { "epoch": 0.36479659664982717, "grad_norm": 0.06585251539945602, "learning_rate": 3.211875170532924e-05, "loss": 0.7124658823013306, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.03901, "step": 343, "tokens/total": 89915392, "tokens/train_per_sec_per_gpu": 169.85, "tokens/trainable": 7505178 }, { "epoch": 0.3658601435788354, "grad_norm": 0.06433013081550598, "learning_rate": 3.205965607757097e-05, "loss": 0.755608081817627, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12891, "step": 344, "tokens/total": 90177536, "tokens/train_per_sec_per_gpu": 186.29, "tokens/trainable": 7526734 }, { "epoch": 0.3669236905078437, "grad_norm": 0.06355367600917816, "learning_rate": 3.200039454180452e-05, "loss": 0.8150879740715027, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.25937, "step": 345, "tokens/total": 90439680, "tokens/train_per_sec_per_gpu": 179.78, "tokens/trainable": 7548681 }, { "epoch": 0.3679872374368519, "grad_norm": 0.058385591953992844, "learning_rate": 3.1940967913307144e-05, "loss": 0.6909693479537964, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 1.99565, "step": 346, "tokens/total": 90701824, "tokens/train_per_sec_per_gpu": 181.15, "tokens/trainable": 7569898 }, { "epoch": 0.36905078436586014, "grad_norm": 0.055889613926410675, "learning_rate": 3.188137700962733e-05, "loss": 0.7137737274169922, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.04168, "step": 347, "tokens/total": 90963968, "tokens/train_per_sec_per_gpu": 225.36, "tokens/trainable": 7592560 }, { "epoch": 0.37011433129486837, "grad_norm": 0.07224101573228836, "learning_rate": 3.1821622650573536e-05, "loss": 0.7986935973167419, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.22264, "step": 348, "tokens/total": 91226112, "tokens/train_per_sec_per_gpu": 204.66, "tokens/trainable": 7614191 }, { "epoch": 0.37117787822387666, "grad_norm": 0.06277446448802948, "learning_rate": 3.176170565820293e-05, "loss": 0.7567167282104492, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13127, "step": 349, "tokens/total": 91488256, "tokens/train_per_sec_per_gpu": 165.46, "tokens/trainable": 7634778 }, { "epoch": 0.3722414251528849, "grad_norm": 0.06595855206251144, "learning_rate": 3.170162685681007e-05, "loss": 0.8263660073280334, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.285, "step": 350, "tokens/total": 91750400, "tokens/train_per_sec_per_gpu": 180.47, "tokens/trainable": 7655328 }, { "epoch": 0.3733049720818931, "grad_norm": 0.053407274186611176, "learning_rate": 3.1641387072915574e-05, "loss": 0.7117317318916321, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.03752, "step": 351, "tokens/total": 92012544, "tokens/train_per_sec_per_gpu": 209.35, "tokens/trainable": 7678377 }, { "epoch": 0.37436851901090135, "grad_norm": 0.0668676570057869, "learning_rate": 3.1580987135254715e-05, "loss": 0.7980103492736816, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.22112, "step": 352, "tokens/total": 92274688, "tokens/train_per_sec_per_gpu": 227.01, "tokens/trainable": 7700711 }, { "epoch": 0.3754320659399096, "grad_norm": 0.06859572231769562, "learning_rate": 3.1520427874766064e-05, "loss": 0.7606133818626404, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13959, "step": 353, "tokens/total": 92536832, "tokens/train_per_sec_per_gpu": 178.09, "tokens/trainable": 7720570 }, { "epoch": 0.37649561286891786, "grad_norm": 0.06196228042244911, "learning_rate": 3.145971012458005e-05, "loss": 0.7603709101676941, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13907, "step": 354, "tokens/total": 92798976, "tokens/train_per_sec_per_gpu": 174.2, "tokens/trainable": 7743422 }, { "epoch": 0.3775591597979261, "grad_norm": 0.05902267247438431, "learning_rate": 3.139883472000745e-05, "loss": 0.7486381530761719, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11412, "step": 355, "tokens/total": 93061120, "tokens/train_per_sec_per_gpu": 257.11, "tokens/trainable": 7768328 }, { "epoch": 0.3786227067269343, "grad_norm": 0.05966390669345856, "learning_rate": 3.133780249852799e-05, "loss": 0.7499316930770874, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11686, "step": 356, "tokens/total": 93323264, "tokens/train_per_sec_per_gpu": 218.24, "tokens/trainable": 7791928 }, { "epoch": 0.37968625365594255, "grad_norm": 0.06466083228588104, "learning_rate": 3.127661429977872e-05, "loss": 0.754686713218689, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12695, "step": 357, "tokens/total": 93585408, "tokens/train_per_sec_per_gpu": 196.44, "tokens/trainable": 7812051 }, { "epoch": 0.38074980058495084, "grad_norm": 0.058210499584674835, "learning_rate": 3.1215270965542544e-05, "loss": 0.7765249013900757, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.1739, "step": 358, "tokens/total": 93847552, "tokens/train_per_sec_per_gpu": 207.04, "tokens/trainable": 7833084 }, { "epoch": 0.38181334751395907, "grad_norm": 0.057726673781871796, "learning_rate": 3.115377333973659e-05, "loss": 0.7482200264930725, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11324, "step": 359, "tokens/total": 94109696, "tokens/train_per_sec_per_gpu": 228.28, "tokens/trainable": 7857441 }, { "epoch": 0.3828768944429673, "grad_norm": 0.06571833789348602, "learning_rate": 3.109212226840063e-05, "loss": 0.8272304534912109, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.28698, "step": 360, "tokens/total": 94371840, "tokens/train_per_sec_per_gpu": 168.79, "tokens/trainable": 7879005 }, { "epoch": 0.3839404413719755, "grad_norm": 0.06610347330570221, "learning_rate": 3.103031859968542e-05, "loss": 0.7621959447860718, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.14298, "step": 361, "tokens/total": 94633984, "tokens/train_per_sec_per_gpu": 174.29, "tokens/trainable": 7899540 }, { "epoch": 0.38500398830098376, "grad_norm": 0.05712476745247841, "learning_rate": 3.096836318384103e-05, "loss": 0.7930388450622559, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.2101, "step": 362, "tokens/total": 94896128, "tokens/train_per_sec_per_gpu": 220.27, "tokens/trainable": 7922542 }, { "epoch": 0.38606753522999204, "grad_norm": 0.06603259593248367, "learning_rate": 3.0906256873205193e-05, "loss": 0.7841721773147583, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19059, "step": 363, "tokens/total": 95158272, "tokens/train_per_sec_per_gpu": 176.27, "tokens/trainable": 7945013 }, { "epoch": 0.3871310821590003, "grad_norm": 0.056947011500597, "learning_rate": 3.08440005221915e-05, "loss": 0.7464731335639954, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10955, "step": 364, "tokens/total": 95420416, "tokens/train_per_sec_per_gpu": 169.25, "tokens/trainable": 7965994 }, { "epoch": 0.3881946290880085, "grad_norm": 0.06810449063777924, "learning_rate": 3.0781594987277724e-05, "loss": 0.8506579399108887, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.34119, "step": 365, "tokens/total": 95682560, "tokens/train_per_sec_per_gpu": 215.35, "tokens/trainable": 7987278 }, { "epoch": 0.38925817601701673, "grad_norm": 0.05948546528816223, "learning_rate": 3.071904112699397e-05, "loss": 0.8312458395957947, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.29618, "step": 366, "tokens/total": 95944704, "tokens/train_per_sec_per_gpu": 215.65, "tokens/trainable": 8011500 }, { "epoch": 0.390321722946025, "grad_norm": 0.06788976490497589, "learning_rate": 3.0656339801910926e-05, "loss": 0.7707929611206055, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.16148, "step": 367, "tokens/total": 96206848, "tokens/train_per_sec_per_gpu": 205.07, "tokens/trainable": 8032365 }, { "epoch": 0.39138526987503325, "grad_norm": 0.058084722608327866, "learning_rate": 3.059349187462798e-05, "loss": 0.6599326729774475, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 1.93466, "step": 368, "tokens/total": 96468992, "tokens/train_per_sec_per_gpu": 228.45, "tokens/trainable": 8055039 }, { "epoch": 0.3924488168040415, "grad_norm": 0.06115228310227394, "learning_rate": 3.053049820976135e-05, "loss": 0.760746955871582, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13987, "step": 369, "tokens/total": 96731136, "tokens/train_per_sec_per_gpu": 189.31, "tokens/trainable": 8078259 }, { "epoch": 0.3935123637330497, "grad_norm": 0.07438641041517258, "learning_rate": 3.0467359673932244e-05, "loss": 0.7815507650375366, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.18486, "step": 370, "tokens/total": 96993280, "tokens/train_per_sec_per_gpu": 151.93, "tokens/trainable": 8099819 }, { "epoch": 0.39457591066205794, "grad_norm": 0.06393261253833771, "learning_rate": 3.040407713575487e-05, "loss": 0.7451884746551514, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10684, "step": 371, "tokens/total": 97255424, "tokens/train_per_sec_per_gpu": 177.57, "tokens/trainable": 8121758 }, { "epoch": 0.3956394575910662, "grad_norm": 0.058520760387182236, "learning_rate": 3.034065146582452e-05, "loss": 0.8240416049957275, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.27969, "step": 372, "tokens/total": 97517568, "tokens/train_per_sec_per_gpu": 214.86, "tokens/trainable": 8144183 }, { "epoch": 0.39670300452007445, "grad_norm": 0.06523202359676361, "learning_rate": 3.0277083536705604e-05, "loss": 0.7511664032936096, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11947, "step": 373, "tokens/total": 97779712, "tokens/train_per_sec_per_gpu": 172.88, "tokens/trainable": 8164676 }, { "epoch": 0.3977665514490827, "grad_norm": 0.06993680447340012, "learning_rate": 3.0213374222919617e-05, "loss": 0.7678710222244263, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15517, "step": 374, "tokens/total": 98041856, "tokens/train_per_sec_per_gpu": 161.85, "tokens/trainable": 8184550 }, { "epoch": 0.3988300983780909, "grad_norm": 0.05952773615717888, "learning_rate": 3.0149524400933114e-05, "loss": 0.7702289819717407, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.16026, "step": 375, "tokens/total": 98304000, "tokens/train_per_sec_per_gpu": 174.86, "tokens/trainable": 8206571 }, { "epoch": 0.3998936453070992, "grad_norm": 0.06135401502251625, "learning_rate": 3.008553494914569e-05, "loss": 0.6841228008270264, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 1.98203, "step": 376, "tokens/total": 98566144, "tokens/train_per_sec_per_gpu": 194.82, "tokens/trainable": 8229068 }, { "epoch": 0.40095719223610743, "grad_norm": 0.0706343874335289, "learning_rate": 3.002140674787783e-05, "loss": 0.7617697715759277, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.14206, "step": 377, "tokens/total": 98828288, "tokens/train_per_sec_per_gpu": 145.29, "tokens/trainable": 8251266 }, { "epoch": 0.40202073916511566, "grad_norm": 0.0608975812792778, "learning_rate": 2.995714067935887e-05, "loss": 0.7577897310256958, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13356, "step": 378, "tokens/total": 99090432, "tokens/train_per_sec_per_gpu": 148.13, "tokens/trainable": 8273480 }, { "epoch": 0.4030842860941239, "grad_norm": 0.06372503191232681, "learning_rate": 2.9892737627714786e-05, "loss": 0.7569035291671753, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13167, "step": 379, "tokens/total": 99352576, "tokens/train_per_sec_per_gpu": 164.36, "tokens/trainable": 8293250 }, { "epoch": 0.4041478330231321, "grad_norm": 0.07930820435285568, "learning_rate": 2.9828198478956093e-05, "loss": 0.7412185668945312, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09849, "step": 380, "tokens/total": 99614720, "tokens/train_per_sec_per_gpu": 165.37, "tokens/trainable": 8313207 }, { "epoch": 0.4052113799521404, "grad_norm": 0.0618307963013649, "learning_rate": 2.976352412096563e-05, "loss": 0.718746542930603, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.05186, "step": 381, "tokens/total": 99876864, "tokens/train_per_sec_per_gpu": 195.0, "tokens/trainable": 8334762 }, { "epoch": 0.40627492688114863, "grad_norm": 0.06570852547883987, "learning_rate": 2.9698715443486338e-05, "loss": 0.7648171186447144, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.1486, "step": 382, "tokens/total": 100139008, "tokens/train_per_sec_per_gpu": 174.53, "tokens/trainable": 8354787 }, { "epoch": 0.40733847381015686, "grad_norm": 0.06311152130365372, "learning_rate": 2.9633773338109027e-05, "loss": 0.7888460159301758, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.20086, "step": 383, "tokens/total": 100401152, "tokens/train_per_sec_per_gpu": 164.71, "tokens/trainable": 8376673 }, { "epoch": 0.4084020207391651, "grad_norm": 0.06162691116333008, "learning_rate": 2.9568698698260126e-05, "loss": 0.7124409675598145, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.03896, "step": 384, "tokens/total": 100663296, "tokens/train_per_sec_per_gpu": 224.89, "tokens/trainable": 8399066 }, { "epoch": 0.4094655676681734, "grad_norm": 0.059788983315229416, "learning_rate": 2.9503492419189366e-05, "loss": 0.7600404024124146, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13836, "step": 385, "tokens/total": 100925440, "tokens/train_per_sec_per_gpu": 200.39, "tokens/trainable": 8422560 }, { "epoch": 0.4105291145971816, "grad_norm": 0.06216095760464668, "learning_rate": 2.9438155397957474e-05, "loss": 0.8070495128631592, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.24129, "step": 386, "tokens/total": 101187584, "tokens/train_per_sec_per_gpu": 149.86, "tokens/trainable": 8443830 }, { "epoch": 0.41159266152618984, "grad_norm": 0.05838804319500923, "learning_rate": 2.937268853342383e-05, "loss": 0.759508490562439, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13723, "step": 387, "tokens/total": 101449728, "tokens/train_per_sec_per_gpu": 136.92, "tokens/trainable": 8465737 }, { "epoch": 0.41265620845519807, "grad_norm": 0.05906308814883232, "learning_rate": 2.9307092726234127e-05, "loss": 0.7870290279388428, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19686, "step": 388, "tokens/total": 101711872, "tokens/train_per_sec_per_gpu": 176.32, "tokens/trainable": 8488568 }, { "epoch": 0.41371975538420636, "grad_norm": 0.06456853449344635, "learning_rate": 2.9241368878807925e-05, "loss": 0.7161662578582764, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.04657, "step": 389, "tokens/total": 101974016, "tokens/train_per_sec_per_gpu": 181.97, "tokens/trainable": 8510960 }, { "epoch": 0.4147833023132146, "grad_norm": 0.0667351484298706, "learning_rate": 2.9175517895326292e-05, "loss": 0.8066511154174805, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.24039, "step": 390, "tokens/total": 102236160, "tokens/train_per_sec_per_gpu": 208.78, "tokens/trainable": 8532514 }, { "epoch": 0.4158468492422228, "grad_norm": 0.06246356666088104, "learning_rate": 2.9109540681719322e-05, "loss": 0.7846518158912659, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19164, "step": 391, "tokens/total": 102498304, "tokens/train_per_sec_per_gpu": 224.1, "tokens/trainable": 8555545 }, { "epoch": 0.41691039617123105, "grad_norm": 0.06540708988904953, "learning_rate": 2.9043438145653717e-05, "loss": 0.7380800843238831, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09192, "step": 392, "tokens/total": 102760448, "tokens/train_per_sec_per_gpu": 190.76, "tokens/trainable": 8577885 }, { "epoch": 0.4179739431002393, "grad_norm": 0.062018416821956635, "learning_rate": 2.8977211196520257e-05, "loss": 0.774357795715332, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.1692, "step": 393, "tokens/total": 103022592, "tokens/train_per_sec_per_gpu": 212.39, "tokens/trainable": 8600480 }, { "epoch": 0.41903749002924756, "grad_norm": 0.062351830303668976, "learning_rate": 2.8910860745421305e-05, "loss": 0.7095509767532349, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.03308, "step": 394, "tokens/total": 103284736, "tokens/train_per_sec_per_gpu": 166.14, "tokens/trainable": 8620831 }, { "epoch": 0.4201010369582558, "grad_norm": 0.06218433752655983, "learning_rate": 2.884438770515829e-05, "loss": 0.7856686115264893, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19387, "step": 395, "tokens/total": 103546880, "tokens/train_per_sec_per_gpu": 235.06, "tokens/trainable": 8646017 }, { "epoch": 0.421164583887264, "grad_norm": 0.07032019644975662, "learning_rate": 2.877779299021912e-05, "loss": 0.7358700037002563, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.0873, "step": 396, "tokens/total": 103809024, "tokens/train_per_sec_per_gpu": 132.47, "tokens/trainable": 8665893 }, { "epoch": 0.42222813081627225, "grad_norm": 0.057751379907131195, "learning_rate": 2.871107751676561e-05, "loss": 0.7476328015327454, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11199, "step": 397, "tokens/total": 104071168, "tokens/train_per_sec_per_gpu": 216.14, "tokens/trainable": 8689230 }, { "epoch": 0.42329167774528054, "grad_norm": 0.060179274529218674, "learning_rate": 2.8644242202620907e-05, "loss": 0.7824307084083557, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.18678, "step": 398, "tokens/total": 104333312, "tokens/train_per_sec_per_gpu": 207.33, "tokens/trainable": 8710606 }, { "epoch": 0.42435522467428877, "grad_norm": 0.07202541828155518, "learning_rate": 2.857728796725682e-05, "loss": 0.7747771739959717, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.17011, "step": 399, "tokens/total": 104595456, "tokens/train_per_sec_per_gpu": 152.9, "tokens/trainable": 8732079 }, { "epoch": 0.425418771603297, "grad_norm": 0.06443008780479431, "learning_rate": 2.8510215731781194e-05, "loss": 0.7466020584106445, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10982, "step": 400, "tokens/total": 104857600, "tokens/train_per_sec_per_gpu": 166.03, "tokens/trainable": 8754126 }, { "epoch": 0.4264823185323052, "grad_norm": 0.05945134907960892, "learning_rate": 2.844302641892523e-05, "loss": 0.7561734914779663, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13011, "step": 401, "tokens/total": 105119744, "tokens/train_per_sec_per_gpu": 184.08, "tokens/trainable": 8775310 }, { "epoch": 0.42754586546131346, "grad_norm": 0.06570764631032944, "learning_rate": 2.83757209530308e-05, "loss": 0.728512167930603, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.072, "step": 402, "tokens/total": 105381888, "tokens/train_per_sec_per_gpu": 160.39, "tokens/trainable": 8795593 }, { "epoch": 0.42860941239032174, "grad_norm": 0.06125401705503464, "learning_rate": 2.8308300260037734e-05, "loss": 0.754820704460144, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12723, "step": 403, "tokens/total": 105644032, "tokens/train_per_sec_per_gpu": 172.16, "tokens/trainable": 8817946 }, { "epoch": 0.42967295931933, "grad_norm": 0.06924828141927719, "learning_rate": 2.8240765267471056e-05, "loss": 0.7697207927703857, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15916, "step": 404, "tokens/total": 105906176, "tokens/train_per_sec_per_gpu": 208.57, "tokens/trainable": 8840440 }, { "epoch": 0.4307365062483382, "grad_norm": 0.06233648210763931, "learning_rate": 2.8173116904428242e-05, "loss": 0.7522628307342529, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.1218, "step": 405, "tokens/total": 106168320, "tokens/train_per_sec_per_gpu": 192.61, "tokens/trainable": 8862223 }, { "epoch": 0.43180005317734643, "grad_norm": 0.05627689138054848, "learning_rate": 2.810535610156646e-05, "loss": 0.8060315847396851, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.23901, "step": 406, "tokens/total": 106430464, "tokens/train_per_sec_per_gpu": 240.07, "tokens/trainable": 8888361 }, { "epoch": 0.4328636001063547, "grad_norm": 0.060442451387643814, "learning_rate": 2.803748379108972e-05, "loss": 0.7366658449172974, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.08896, "step": 407, "tokens/total": 106692608, "tokens/train_per_sec_per_gpu": 194.32, "tokens/trainable": 8911616 }, { "epoch": 0.43392714703536295, "grad_norm": 0.06363833695650101, "learning_rate": 2.7969500906736065e-05, "loss": 0.777472734451294, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.17597, "step": 408, "tokens/total": 106954752, "tokens/train_per_sec_per_gpu": 205.98, "tokens/trainable": 8935498 }, { "epoch": 0.4349906939643712, "grad_norm": 0.060264162719249725, "learning_rate": 2.7901408383764776e-05, "loss": 0.8076545000076294, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.24264, "step": 409, "tokens/total": 107216896, "tokens/train_per_sec_per_gpu": 191.96, "tokens/trainable": 8959641 }, { "epoch": 0.4360542408933794, "grad_norm": 0.0615147240459919, "learning_rate": 2.783320715894341e-05, "loss": 0.7697659730911255, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15926, "step": 410, "tokens/total": 107479040, "tokens/train_per_sec_per_gpu": 229.4, "tokens/trainable": 8981683 }, { "epoch": 0.43711778782238764, "grad_norm": 0.05955628678202629, "learning_rate": 2.7764898170534993e-05, "loss": 0.7136242389678955, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.04138, "step": 411, "tokens/total": 107741184, "tokens/train_per_sec_per_gpu": 197.85, "tokens/trainable": 9002018 }, { "epoch": 0.4381813347513959, "grad_norm": 0.05883246660232544, "learning_rate": 2.76964823582851e-05, "loss": 0.7270078659057617, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.06888, "step": 412, "tokens/total": 108003328, "tokens/train_per_sec_per_gpu": 209.83, "tokens/trainable": 9024288 }, { "epoch": 0.43924488168040415, "grad_norm": 0.06609495729207993, "learning_rate": 2.7627960663408874e-05, "loss": 0.7433674335479736, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10301, "step": 413, "tokens/total": 108265472, "tokens/train_per_sec_per_gpu": 205.82, "tokens/trainable": 9046881 }, { "epoch": 0.4403084286094124, "grad_norm": 0.07620103657245636, "learning_rate": 2.7559334028578135e-05, "loss": 0.7862449884414673, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19514, "step": 414, "tokens/total": 108527616, "tokens/train_per_sec_per_gpu": 181.75, "tokens/trainable": 9067519 }, { "epoch": 0.4413719755384206, "grad_norm": 0.06601685285568237, "learning_rate": 2.7490603397908393e-05, "loss": 0.7648828029632568, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.14874, "step": 415, "tokens/total": 108789760, "tokens/train_per_sec_per_gpu": 174.3, "tokens/trainable": 9089133 }, { "epoch": 0.4424355224674289, "grad_norm": 0.06513883173465729, "learning_rate": 2.742176971694585e-05, "loss": 0.7682895660400391, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15608, "step": 416, "tokens/total": 109051904, "tokens/train_per_sec_per_gpu": 177.11, "tokens/trainable": 9110997 }, { "epoch": 0.44349906939643713, "grad_norm": 0.06401567906141281, "learning_rate": 2.7352833932654402e-05, "loss": 0.7881615161895752, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19935, "step": 417, "tokens/total": 109314048, "tokens/train_per_sec_per_gpu": 176.32, "tokens/trainable": 9134139 }, { "epoch": 0.44456261632544536, "grad_norm": 0.07034140825271606, "learning_rate": 2.7283796993402613e-05, "loss": 0.8482910990715027, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.33565, "step": 418, "tokens/total": 109576192, "tokens/train_per_sec_per_gpu": 202.73, "tokens/trainable": 9155746 }, { "epoch": 0.4456261632544536, "grad_norm": 0.06932256370782852, "learning_rate": 2.721465984895066e-05, "loss": 0.7511000633239746, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11933, "step": 419, "tokens/total": 109838336, "tokens/train_per_sec_per_gpu": 151.07, "tokens/trainable": 9175997 }, { "epoch": 0.4466897101834618, "grad_norm": 0.05801774561405182, "learning_rate": 2.714542345043726e-05, "loss": 0.7160236835479736, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.04628, "step": 420, "tokens/total": 110100480, "tokens/train_per_sec_per_gpu": 206.36, "tokens/trainable": 9201154 }, { "epoch": 0.4477532571124701, "grad_norm": 0.0719035267829895, "learning_rate": 2.7076088750366617e-05, "loss": 0.7472874522209167, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11127, "step": 421, "tokens/total": 110362624, "tokens/train_per_sec_per_gpu": 221.89, "tokens/trainable": 9223146 }, { "epoch": 0.44881680404147833, "grad_norm": 0.06794784218072891, "learning_rate": 2.700665670259527e-05, "loss": 0.789624035358429, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.20257, "step": 422, "tokens/total": 110624768, "tokens/train_per_sec_per_gpu": 182.17, "tokens/trainable": 9245502 }, { "epoch": 0.44988035097048656, "grad_norm": 0.06335590779781342, "learning_rate": 2.693712826231903e-05, "loss": 0.720179557800293, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.0548, "step": 423, "tokens/total": 110886912, "tokens/train_per_sec_per_gpu": 184.76, "tokens/trainable": 9267573 }, { "epoch": 0.4509438978994948, "grad_norm": 0.06363626569509506, "learning_rate": 2.6867504386059776e-05, "loss": 0.771357536315918, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.1627, "step": 424, "tokens/total": 111149056, "tokens/train_per_sec_per_gpu": 181.22, "tokens/trainable": 9290024 }, { "epoch": 0.4520074448285031, "grad_norm": 0.06436197459697723, "learning_rate": 2.679778603165233e-05, "loss": 0.8299375772476196, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.29318, "step": 425, "tokens/total": 111411200, "tokens/train_per_sec_per_gpu": 181.49, "tokens/trainable": 9312089 }, { "epoch": 0.4530709917575113, "grad_norm": 0.07020284235477448, "learning_rate": 2.6727974158231312e-05, "loss": 0.7461998462677002, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10897, "step": 426, "tokens/total": 111673344, "tokens/train_per_sec_per_gpu": 179.65, "tokens/trainable": 9332072 }, { "epoch": 0.45413453868651954, "grad_norm": 0.07214612513780594, "learning_rate": 2.6658069726217863e-05, "loss": 0.7696465253829956, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.159, "step": 427, "tokens/total": 111935488, "tokens/train_per_sec_per_gpu": 248.27, "tokens/trainable": 9354855 }, { "epoch": 0.45519808561552777, "grad_norm": 0.07788221538066864, "learning_rate": 2.6588073697306494e-05, "loss": 0.7938302755355835, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.21185, "step": 428, "tokens/total": 112197632, "tokens/train_per_sec_per_gpu": 193.12, "tokens/trainable": 9376477 }, { "epoch": 0.456261632544536, "grad_norm": 0.06307144463062286, "learning_rate": 2.6517987034451846e-05, "loss": 0.735474705696106, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.08647, "step": 429, "tokens/total": 112459776, "tokens/train_per_sec_per_gpu": 150.92, "tokens/trainable": 9396690 }, { "epoch": 0.4573251794735443, "grad_norm": 0.06643529236316681, "learning_rate": 2.6447810701855436e-05, "loss": 0.7078378200531006, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.0296, "step": 430, "tokens/total": 112721920, "tokens/train_per_sec_per_gpu": 209.14, "tokens/trainable": 9418386 }, { "epoch": 0.4583887264025525, "grad_norm": 0.06323806941509247, "learning_rate": 2.637754566495238e-05, "loss": 0.7661035060882568, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15137, "step": 431, "tokens/total": 112984064, "tokens/train_per_sec_per_gpu": 174.47, "tokens/trainable": 9440604 }, { "epoch": 0.45945227333156075, "grad_norm": 0.06076245754957199, "learning_rate": 2.6307192890398126e-05, "loss": 0.7844895720481873, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19129, "step": 432, "tokens/total": 113246208, "tokens/train_per_sec_per_gpu": 186.77, "tokens/trainable": 9464422 }, { "epoch": 0.460515820260569, "grad_norm": 0.0693245679140091, "learning_rate": 2.6236753346055176e-05, "loss": 0.6995319724082947, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.01281, "step": 433, "tokens/total": 113508352, "tokens/train_per_sec_per_gpu": 182.76, "tokens/trainable": 9485760 }, { "epoch": 0.46157936718957726, "grad_norm": 0.060889989137649536, "learning_rate": 2.6166228000979726e-05, "loss": 0.6955425143241882, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.0048, "step": 434, "tokens/total": 113770496, "tokens/train_per_sec_per_gpu": 201.76, "tokens/trainable": 9508485 }, { "epoch": 0.4626429141185855, "grad_norm": 0.06848379224538803, "learning_rate": 2.6095617825408357e-05, "loss": 0.7852069735527039, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19286, "step": 435, "tokens/total": 114032640, "tokens/train_per_sec_per_gpu": 189.68, "tokens/trainable": 9530965 }, { "epoch": 0.4637064610475937, "grad_norm": 0.07383300364017487, "learning_rate": 2.6024923790744686e-05, "loss": 0.7191091775894165, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.0526, "step": 436, "tokens/total": 114294784, "tokens/train_per_sec_per_gpu": 196.81, "tokens/trainable": 9553360 }, { "epoch": 0.46477000797660195, "grad_norm": 0.06411275267601013, "learning_rate": 2.5954146869546018e-05, "loss": 0.7149425148963928, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.04407, "step": 437, "tokens/total": 114556928, "tokens/train_per_sec_per_gpu": 222.64, "tokens/trainable": 9574783 }, { "epoch": 0.46583355490561024, "grad_norm": 0.06931298971176147, "learning_rate": 2.588328803550993e-05, "loss": 0.8728055953979492, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.39362, "step": 438, "tokens/total": 114819072, "tokens/train_per_sec_per_gpu": 154.42, "tokens/trainable": 9595109 }, { "epoch": 0.46689710183461847, "grad_norm": 0.07122842967510223, "learning_rate": 2.5812348263460916e-05, "loss": 0.8005967140197754, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.22687, "step": 439, "tokens/total": 115081216, "tokens/train_per_sec_per_gpu": 195.46, "tokens/trainable": 9615835 }, { "epoch": 0.4679606487636267, "grad_norm": 0.07328256964683533, "learning_rate": 2.5741328529336934e-05, "loss": 0.8034292459487915, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.23319, "step": 440, "tokens/total": 115343360, "tokens/train_per_sec_per_gpu": 232.42, "tokens/trainable": 9638791 }, { "epoch": 0.4690241956926349, "grad_norm": 0.07195594161748886, "learning_rate": 2.5670229810176026e-05, "loss": 0.8185476064682007, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.2672, "step": 441, "tokens/total": 115605504, "tokens/train_per_sec_per_gpu": 154.16, "tokens/trainable": 9659990 }, { "epoch": 0.47008774262164316, "grad_norm": 0.07275015860795975, "learning_rate": 2.5599053084102838e-05, "loss": 0.8078755736351013, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.24314, "step": 442, "tokens/total": 115867648, "tokens/train_per_sec_per_gpu": 167.17, "tokens/trainable": 9680516 }, { "epoch": 0.47115128955065144, "grad_norm": 0.07171031087636948, "learning_rate": 2.5527799330315182e-05, "loss": 0.7240858674049377, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.06284, "step": 443, "tokens/total": 116129792, "tokens/train_per_sec_per_gpu": 189.32, "tokens/trainable": 9700527 }, { "epoch": 0.47221483647965967, "grad_norm": 0.05753646418452263, "learning_rate": 2.5456469529070566e-05, "loss": 0.7394333481788635, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09475, "step": 444, "tokens/total": 116391936, "tokens/train_per_sec_per_gpu": 215.9, "tokens/trainable": 9725734 }, { "epoch": 0.4732783834086679, "grad_norm": 0.0627320408821106, "learning_rate": 2.5385064661672692e-05, "loss": 0.7002488970756531, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.01425, "step": 445, "tokens/total": 116654080, "tokens/train_per_sec_per_gpu": 166.84, "tokens/trainable": 9748076 }, { "epoch": 0.47434193033767613, "grad_norm": 0.06529032438993454, "learning_rate": 2.5313585710457985e-05, "loss": 0.7247512936592102, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.06422, "step": 446, "tokens/total": 116916224, "tokens/train_per_sec_per_gpu": 182.03, "tokens/trainable": 9770158 }, { "epoch": 0.4754054772666844, "grad_norm": 0.07498825341463089, "learning_rate": 2.5242033658782043e-05, "loss": 0.7564839124679565, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13077, "step": 447, "tokens/total": 117178368, "tokens/train_per_sec_per_gpu": 177.15, "tokens/trainable": 9791564 }, { "epoch": 0.47646902419569265, "grad_norm": 0.07310041040182114, "learning_rate": 2.5170409491006145e-05, "loss": 0.8869813680648804, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.42779, "step": 448, "tokens/total": 117440512, "tokens/train_per_sec_per_gpu": 153.04, "tokens/trainable": 9812412 }, { "epoch": 0.4775325711247009, "grad_norm": 0.06502556055784225, "learning_rate": 2.5098714192483683e-05, "loss": 0.733482837677002, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.08232, "step": 449, "tokens/total": 117702656, "tokens/train_per_sec_per_gpu": 234.84, "tokens/trainable": 9836100 }, { "epoch": 0.4785961180537091, "grad_norm": 0.0711643397808075, "learning_rate": 2.50269487495466e-05, "loss": 0.7789556384086609, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.1792, "step": 450, "tokens/total": 117964800, "tokens/train_per_sec_per_gpu": 198.82, "tokens/trainable": 9857145 }, { "epoch": 0.4785961180537091, "eval_loss": 0.7710337042808533, "eval_ppl": 2.162, "eval_runtime": 237.2453, "eval_samples_per_second": 28.182, "eval_steps_per_second": 1.762, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 38.19, "memory/max_allocated (GiB)": 38.19, "step": 450 }, { "epoch": 0.47965966498271734, "grad_norm": 0.06686612218618393, "learning_rate": 2.4955114149491865e-05, "loss": 0.7786468267440796, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.17852, "step": 451, "tokens/total": 118226944, "tokens/train_per_sec_per_gpu": 164.67, "tokens/trainable": 9878218 }, { "epoch": 0.4807232119117256, "grad_norm": 0.0646248385310173, "learning_rate": 2.488321138056783e-05, "loss": 0.7580331563949585, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13407, "step": 452, "tokens/total": 118489088, "tokens/train_per_sec_per_gpu": 244.84, "tokens/trainable": 9900786 }, { "epoch": 0.48178675884073385, "grad_norm": 0.06668704003095627, "learning_rate": 2.481124143196069e-05, "loss": 0.7126317620277405, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.03935, "step": 453, "tokens/total": 118751232, "tokens/train_per_sec_per_gpu": 175.52, "tokens/trainable": 9921265 }, { "epoch": 0.4828503057697421, "grad_norm": 0.06527574360370636, "learning_rate": 2.473920529378083e-05, "loss": 0.8261401653289795, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.28448, "step": 454, "tokens/total": 119013376, "tokens/train_per_sec_per_gpu": 182.36, "tokens/trainable": 9942755 }, { "epoch": 0.4839138526987503, "grad_norm": 0.07106975466012955, "learning_rate": 2.4667103957049237e-05, "loss": 0.7335352897644043, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.08243, "step": 455, "tokens/total": 119275520, "tokens/train_per_sec_per_gpu": 161.14, "tokens/trainable": 9963450 }, { "epoch": 0.4849773996277586, "grad_norm": 0.06658606976270676, "learning_rate": 2.4594938413683842e-05, "loss": 0.8705431222915649, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.38821, "step": 456, "tokens/total": 119537664, "tokens/train_per_sec_per_gpu": 221.13, "tokens/trainable": 9986403 }, { "epoch": 0.48604094655676683, "grad_norm": 0.06788789480924606, "learning_rate": 2.4522709656485896e-05, "loss": 0.8447569608688354, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.32741, "step": 457, "tokens/total": 119799808, "tokens/train_per_sec_per_gpu": 181.47, "tokens/trainable": 10007854 }, { "epoch": 0.48710449348577506, "grad_norm": 0.06112990900874138, "learning_rate": 2.445041867912629e-05, "loss": 0.7580868601799011, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13419, "step": 458, "tokens/total": 120061952, "tokens/train_per_sec_per_gpu": 243.31, "tokens/trainable": 10033216 }, { "epoch": 0.4881680404147833, "grad_norm": 0.06946436315774918, "learning_rate": 2.43780664761319e-05, "loss": 0.7739719152450562, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.16836, "step": 459, "tokens/total": 120324096, "tokens/train_per_sec_per_gpu": 186.88, "tokens/trainable": 10054546 }, { "epoch": 0.4892315873437915, "grad_norm": 0.06711182743310928, "learning_rate": 2.4305654042871893e-05, "loss": 0.7676090002059937, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15461, "step": 460, "tokens/total": 120586240, "tokens/train_per_sec_per_gpu": 203.9, "tokens/trainable": 10077655 }, { "epoch": 0.4902951342727998, "grad_norm": 0.07618647068738937, "learning_rate": 2.4233182375544052e-05, "loss": 0.8108090758323669, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.24973, "step": 461, "tokens/total": 120848384, "tokens/train_per_sec_per_gpu": 203.76, "tokens/trainable": 10099531 }, { "epoch": 0.49135868120180803, "grad_norm": 0.062073446810245514, "learning_rate": 2.4160652471161043e-05, "loss": 0.7443853616714478, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10515, "step": 462, "tokens/total": 121110528, "tokens/train_per_sec_per_gpu": 172.63, "tokens/trainable": 10121476 }, { "epoch": 0.49242222813081626, "grad_norm": 0.06608369201421738, "learning_rate": 2.408806532753674e-05, "loss": 0.7803705930709839, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.18228, "step": 463, "tokens/total": 121372672, "tokens/train_per_sec_per_gpu": 193.2, "tokens/trainable": 10142900 }, { "epoch": 0.4934857750598245, "grad_norm": 0.06678762286901474, "learning_rate": 2.4015421943272442e-05, "loss": 0.7602465152740479, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.1388, "step": 464, "tokens/total": 121634816, "tokens/train_per_sec_per_gpu": 144.23, "tokens/trainable": 10162519 }, { "epoch": 0.4945493219888328, "grad_norm": 0.07111706584692001, "learning_rate": 2.3942723317743194e-05, "loss": 0.8450040817260742, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.32799, "step": 465, "tokens/total": 121896960, "tokens/train_per_sec_per_gpu": 172.92, "tokens/trainable": 10184843 }, { "epoch": 0.495612868917841, "grad_norm": 0.06318546831607819, "learning_rate": 2.3869970451083996e-05, "loss": 0.7744694948196411, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.16944, "step": 466, "tokens/total": 122159104, "tokens/train_per_sec_per_gpu": 211.14, "tokens/trainable": 10208041 }, { "epoch": 0.49667641584684924, "grad_norm": 0.06685180962085724, "learning_rate": 2.3797164344176054e-05, "loss": 0.7870070934295654, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19681, "step": 467, "tokens/total": 122421248, "tokens/train_per_sec_per_gpu": 188.13, "tokens/trainable": 10229287 }, { "epoch": 0.49773996277585747, "grad_norm": 0.058472346514463425, "learning_rate": 2.3724305998633033e-05, "loss": 0.7486791610717773, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11421, "step": 468, "tokens/total": 122683392, "tokens/train_per_sec_per_gpu": 193.34, "tokens/trainable": 10253293 }, { "epoch": 0.4988035097048657, "grad_norm": 0.06563540548086166, "learning_rate": 2.365139641678724e-05, "loss": 0.793043851852417, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.21011, "step": 469, "tokens/total": 122945536, "tokens/train_per_sec_per_gpu": 208.04, "tokens/trainable": 10275964 }, { "epoch": 0.499867056633874, "grad_norm": 0.06955686956644058, "learning_rate": 2.3578436601675857e-05, "loss": 0.767907977104187, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15525, "step": 470, "tokens/total": 123207680, "tokens/train_per_sec_per_gpu": 162.65, "tokens/trainable": 10294893 }, { "epoch": 0.5009306035628822, "grad_norm": 0.06527870893478394, "learning_rate": 2.3505427557027153e-05, "loss": 0.7281315922737122, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.07121, "step": 471, "tokens/total": 123469824, "tokens/train_per_sec_per_gpu": 200.9, "tokens/trainable": 10318126 }, { "epoch": 0.5019941504918904, "grad_norm": 0.05844523012638092, "learning_rate": 2.3432370287246644e-05, "loss": 0.8092571496963501, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.24624, "step": 472, "tokens/total": 123731968, "tokens/train_per_sec_per_gpu": 191.12, "tokens/trainable": 10340892 }, { "epoch": 0.5030576974208987, "grad_norm": 0.0627407431602478, "learning_rate": 2.3359265797403297e-05, "loss": 0.8085892200469971, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.24474, "step": 473, "tokens/total": 123994112, "tokens/train_per_sec_per_gpu": 171.13, "tokens/trainable": 10364090 }, { "epoch": 0.5041212443499069, "grad_norm": 0.06694270670413971, "learning_rate": 2.3286115093215717e-05, "loss": 0.7958250045776367, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.21627, "step": 474, "tokens/total": 124256256, "tokens/train_per_sec_per_gpu": 199.51, "tokens/trainable": 10385759 }, { "epoch": 0.5051847912789151, "grad_norm": 0.06240719184279442, "learning_rate": 2.3212919181038264e-05, "loss": 0.7695267200469971, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15874, "step": 475, "tokens/total": 124518400, "tokens/train_per_sec_per_gpu": 229.77, "tokens/trainable": 10411362 }, { "epoch": 0.5062483382079235, "grad_norm": 0.06532912701368332, "learning_rate": 2.313967906784725e-05, "loss": 0.7777595520019531, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.17659, "step": 476, "tokens/total": 124780544, "tokens/train_per_sec_per_gpu": 235.82, "tokens/trainable": 10433694 }, { "epoch": 0.5073118851369317, "grad_norm": 0.07455461472272873, "learning_rate": 2.306639576122708e-05, "loss": 0.8379300832748413, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.31158, "step": 477, "tokens/total": 125042688, "tokens/train_per_sec_per_gpu": 163.21, "tokens/trainable": 10452642 }, { "epoch": 0.5083754320659399, "grad_norm": 0.06591842323541641, "learning_rate": 2.2993070269356372e-05, "loss": 0.8277432322502136, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.28815, "step": 478, "tokens/total": 125304832, "tokens/train_per_sec_per_gpu": 150.23, "tokens/trainable": 10473380 }, { "epoch": 0.5094389789949482, "grad_norm": 0.06278102844953537, "learning_rate": 2.2919703600994096e-05, "loss": 0.7098827362060547, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.03375, "step": 479, "tokens/total": 125566976, "tokens/train_per_sec_per_gpu": 225.4, "tokens/trainable": 10496551 }, { "epoch": 0.5105025259239564, "grad_norm": 0.0822196677327156, "learning_rate": 2.2846296765465708e-05, "loss": 0.7754343152046204, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.17154, "step": 480, "tokens/total": 125829120, "tokens/train_per_sec_per_gpu": 189.13, "tokens/trainable": 10518279 }, { "epoch": 0.5115660728529646, "grad_norm": 0.06427222490310669, "learning_rate": 2.2772850772649245e-05, "loss": 0.7595022916793823, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13721, "step": 481, "tokens/total": 126091264, "tokens/train_per_sec_per_gpu": 199.34, "tokens/trainable": 10541893 }, { "epoch": 0.5126296197819729, "grad_norm": 0.07555945217609406, "learning_rate": 2.269936663296146e-05, "loss": 0.7030783295631409, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.01996, "step": 482, "tokens/total": 126353408, "tokens/train_per_sec_per_gpu": 131.21, "tokens/trainable": 10560989 }, { "epoch": 0.5136931667109811, "grad_norm": 0.06928715854883194, "learning_rate": 2.262584535734387e-05, "loss": 0.7435761094093323, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10344, "step": 483, "tokens/total": 126615552, "tokens/train_per_sec_per_gpu": 198.19, "tokens/trainable": 10582215 }, { "epoch": 0.5147567136399893, "grad_norm": 0.07140571624040604, "learning_rate": 2.2552287957248914e-05, "loss": 0.7427330017089844, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10167, "step": 484, "tokens/total": 126877696, "tokens/train_per_sec_per_gpu": 177.97, "tokens/trainable": 10603392 }, { "epoch": 0.5158202605689977, "grad_norm": 0.05892965570092201, "learning_rate": 2.2478695444625993e-05, "loss": 0.7203789949417114, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.05521, "step": 485, "tokens/total": 127139840, "tokens/train_per_sec_per_gpu": 178.16, "tokens/trainable": 10625660 }, { "epoch": 0.5168838074980059, "grad_norm": 0.06429523229598999, "learning_rate": 2.240506883190756e-05, "loss": 0.7731969952583313, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.16668, "step": 486, "tokens/total": 127401984, "tokens/train_per_sec_per_gpu": 219.13, "tokens/trainable": 10649385 }, { "epoch": 0.5179473544270141, "grad_norm": 0.07350295782089233, "learning_rate": 2.2331409131995186e-05, "loss": 0.7495805621147156, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11611, "step": 487, "tokens/total": 127664128, "tokens/train_per_sec_per_gpu": 234.6, "tokens/trainable": 10670554 }, { "epoch": 0.5190109013560223, "grad_norm": 0.06763040274381638, "learning_rate": 2.2257717358245645e-05, "loss": 0.7657451629638672, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.1506, "step": 488, "tokens/total": 127926272, "tokens/train_per_sec_per_gpu": 211.17, "tokens/trainable": 10694102 }, { "epoch": 0.5200744482850306, "grad_norm": 0.059085726737976074, "learning_rate": 2.2183994524456946e-05, "loss": 0.7091976404190063, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.03236, "step": 489, "tokens/total": 128188416, "tokens/train_per_sec_per_gpu": 185.08, "tokens/trainable": 10716407 }, { "epoch": 0.5211379952140388, "grad_norm": 0.06827449053525925, "learning_rate": 2.2110241644854415e-05, "loss": 0.7978835105895996, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.22084, "step": 490, "tokens/total": 128450560, "tokens/train_per_sec_per_gpu": 241.5, "tokens/trainable": 10739309 }, { "epoch": 0.522201542143047, "grad_norm": 0.06967286020517349, "learning_rate": 2.2036459734076715e-05, "loss": 0.7917447090148926, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.20724, "step": 491, "tokens/total": 128712704, "tokens/train_per_sec_per_gpu": 183.24, "tokens/trainable": 10759856 }, { "epoch": 0.5232650890720553, "grad_norm": 0.06789640337228775, "learning_rate": 2.196264980716189e-05, "loss": 0.785992443561554, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19458, "step": 492, "tokens/total": 128974848, "tokens/train_per_sec_per_gpu": 208.01, "tokens/trainable": 10783453 }, { "epoch": 0.5243286360010635, "grad_norm": 0.07713824510574341, "learning_rate": 2.1888812879533438e-05, "loss": 0.740135908126831, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09622, "step": 493, "tokens/total": 129236992, "tokens/train_per_sec_per_gpu": 193.38, "tokens/trainable": 10803022 }, { "epoch": 0.5253921829300718, "grad_norm": 0.06190735474228859, "learning_rate": 2.1814949966986288e-05, "loss": 0.7506577968597412, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11839, "step": 494, "tokens/total": 129499136, "tokens/train_per_sec_per_gpu": 186.68, "tokens/trainable": 10825782 }, { "epoch": 0.5264557298590801, "grad_norm": 0.07524209469556808, "learning_rate": 2.174106208567286e-05, "loss": 0.8435920476913452, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.3247, "step": 495, "tokens/total": 129761280, "tokens/train_per_sec_per_gpu": 209.07, "tokens/trainable": 10848136 }, { "epoch": 0.5275192767880883, "grad_norm": 0.0792510136961937, "learning_rate": 2.166715025208908e-05, "loss": 0.795114278793335, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.21469, "step": 496, "tokens/total": 130023424, "tokens/train_per_sec_per_gpu": 176.49, "tokens/trainable": 10869582 }, { "epoch": 0.5285828237170965, "grad_norm": 0.06474092602729797, "learning_rate": 2.1593215483060382e-05, "loss": 0.7897614240646362, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.20287, "step": 497, "tokens/total": 130285568, "tokens/train_per_sec_per_gpu": 209.6, "tokens/trainable": 10893683 }, { "epoch": 0.5296463706461048, "grad_norm": 0.06536766141653061, "learning_rate": 2.151925879572774e-05, "loss": 0.7543013095855713, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12613, "step": 498, "tokens/total": 130547712, "tokens/train_per_sec_per_gpu": 224.22, "tokens/trainable": 10916399 }, { "epoch": 0.530709917575113, "grad_norm": 0.06765280663967133, "learning_rate": 2.144528120753365e-05, "loss": 0.7390546202659607, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09395, "step": 499, "tokens/total": 130809856, "tokens/train_per_sec_per_gpu": 169.0, "tokens/trainable": 10938004 }, { "epoch": 0.5317734645041212, "grad_norm": 0.07433243840932846, "learning_rate": 2.137128373620817e-05, "loss": 0.7480766773223877, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11293, "step": 500, "tokens/total": 131072000, "tokens/train_per_sec_per_gpu": 188.24, "tokens/trainable": 10960082 }, { "epoch": 0.5328370114331294, "grad_norm": 0.058342620730400085, "learning_rate": 2.129726739975486e-05, "loss": 0.73946613073349, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09482, "step": 501, "tokens/total": 131334144, "tokens/train_per_sec_per_gpu": 230.56, "tokens/trainable": 10983591 }, { "epoch": 0.5339005583621377, "grad_norm": 0.06705200672149658, "learning_rate": 2.1223233216436858e-05, "loss": 0.7899049520492554, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.20319, "step": 502, "tokens/total": 131596288, "tokens/train_per_sec_per_gpu": 177.49, "tokens/trainable": 11007036 }, { "epoch": 0.534964105291146, "grad_norm": 0.07227369397878647, "learning_rate": 2.114918220476279e-05, "loss": 0.7793487310409546, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.18005, "step": 503, "tokens/total": 131858432, "tokens/train_per_sec_per_gpu": 173.77, "tokens/trainable": 11030251 }, { "epoch": 0.5360276522201542, "grad_norm": 0.0691395178437233, "learning_rate": 2.1075115383472803e-05, "loss": 0.7794291973114014, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.18023, "step": 504, "tokens/total": 132120576, "tokens/train_per_sec_per_gpu": 161.28, "tokens/trainable": 11049765 }, { "epoch": 0.5370911991491625, "grad_norm": 0.06914931535720825, "learning_rate": 2.1001033771524556e-05, "loss": 0.7482678890228271, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11334, "step": 505, "tokens/total": 132382720, "tokens/train_per_sec_per_gpu": 204.45, "tokens/trainable": 11072121 }, { "epoch": 0.5381547460781707, "grad_norm": 0.06037479639053345, "learning_rate": 2.0926938388079168e-05, "loss": 0.7241630554199219, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.063, "step": 506, "tokens/total": 132644864, "tokens/train_per_sec_per_gpu": 211.36, "tokens/trainable": 11094232 }, { "epoch": 0.5392182930071789, "grad_norm": 0.06841259449720383, "learning_rate": 2.085283025248723e-05, "loss": 0.7322399616241455, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.07973, "step": 507, "tokens/total": 132907008, "tokens/train_per_sec_per_gpu": 200.68, "tokens/trainable": 11118027 }, { "epoch": 0.5402818399361872, "grad_norm": 0.0649460032582283, "learning_rate": 2.0778710384274757e-05, "loss": 0.7410999536514282, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09824, "step": 508, "tokens/total": 133169152, "tokens/train_per_sec_per_gpu": 187.65, "tokens/trainable": 11142472 }, { "epoch": 0.5413453868651954, "grad_norm": 0.06589141488075256, "learning_rate": 2.0704579803129184e-05, "loss": 0.7477791905403137, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.1123, "step": 509, "tokens/total": 133431296, "tokens/train_per_sec_per_gpu": 186.9, "tokens/trainable": 11164967 }, { "epoch": 0.5424089337942036, "grad_norm": 0.07234744727611542, "learning_rate": 2.0630439528885314e-05, "loss": 0.82126784324646, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.27338, "step": 510, "tokens/total": 133693440, "tokens/train_per_sec_per_gpu": 183.66, "tokens/trainable": 11187755 }, { "epoch": 0.5434724807232119, "grad_norm": 0.06735506653785706, "learning_rate": 2.0556290581511314e-05, "loss": 0.7757540941238403, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.17223, "step": 511, "tokens/total": 133955584, "tokens/train_per_sec_per_gpu": 200.97, "tokens/trainable": 11210450 }, { "epoch": 0.5445360276522202, "grad_norm": 0.06329286843538284, "learning_rate": 2.0482133981094656e-05, "loss": 0.7571574449539185, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13221, "step": 512, "tokens/total": 134217728, "tokens/train_per_sec_per_gpu": 193.23, "tokens/trainable": 11232157 }, { "epoch": 0.5455995745812284, "grad_norm": 0.06758707016706467, "learning_rate": 2.0407970747828113e-05, "loss": 0.7560121417045593, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12977, "step": 513, "tokens/total": 134479872, "tokens/train_per_sec_per_gpu": 171.6, "tokens/trainable": 11253596 }, { "epoch": 0.5466631215102367, "grad_norm": 0.067698173224926, "learning_rate": 2.033380190199569e-05, "loss": 0.7989984750747681, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.22331, "step": 514, "tokens/total": 134742016, "tokens/train_per_sec_per_gpu": 206.04, "tokens/trainable": 11276800 }, { "epoch": 0.5477266684392449, "grad_norm": 0.06352519989013672, "learning_rate": 2.025962846395862e-05, "loss": 0.7677520513534546, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15492, "step": 515, "tokens/total": 135004160, "tokens/train_per_sec_per_gpu": 189.51, "tokens/trainable": 11300134 }, { "epoch": 0.5487902153682531, "grad_norm": 0.0675693228840828, "learning_rate": 2.0185451454141307e-05, "loss": 0.8176583647727966, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.26519, "step": 516, "tokens/total": 135266304, "tokens/train_per_sec_per_gpu": 170.04, "tokens/trainable": 11322260 }, { "epoch": 0.5498537622972614, "grad_norm": 0.07476314902305603, "learning_rate": 2.0111271893017298e-05, "loss": 0.7549651861190796, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12754, "step": 517, "tokens/total": 135528448, "tokens/train_per_sec_per_gpu": 178.37, "tokens/trainable": 11342564 }, { "epoch": 0.5509173092262696, "grad_norm": 0.08098773658275604, "learning_rate": 2.0037090801095217e-05, "loss": 0.8205512762069702, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.27175, "step": 518, "tokens/total": 135790592, "tokens/train_per_sec_per_gpu": 195.1, "tokens/trainable": 11364054 }, { "epoch": 0.5519808561552778, "grad_norm": 0.07975499331951141, "learning_rate": 1.9962909198904782e-05, "loss": 0.822569727897644, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.27634, "step": 519, "tokens/total": 136052736, "tokens/train_per_sec_per_gpu": 185.35, "tokens/trainable": 11385094 }, { "epoch": 0.553044403084286, "grad_norm": 0.0654132217168808, "learning_rate": 1.9888728106982712e-05, "loss": 0.7287492156028748, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.07249, "step": 520, "tokens/total": 136314880, "tokens/train_per_sec_per_gpu": 188.86, "tokens/trainable": 11409016 }, { "epoch": 0.5541079500132944, "grad_norm": 0.06833093613386154, "learning_rate": 1.98145485458587e-05, "loss": 0.7531672716140747, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12372, "step": 521, "tokens/total": 136577024, "tokens/train_per_sec_per_gpu": 246.44, "tokens/trainable": 11431115 }, { "epoch": 0.5551714969423026, "grad_norm": 0.0704493448138237, "learning_rate": 1.9740371536041388e-05, "loss": 0.8314918875694275, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.29674, "step": 522, "tokens/total": 136839168, "tokens/train_per_sec_per_gpu": 211.44, "tokens/trainable": 11454368 }, { "epoch": 0.5562350438713108, "grad_norm": 0.07962756603956223, "learning_rate": 1.966619809800432e-05, "loss": 0.7510333061218262, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11919, "step": 523, "tokens/total": 137101312, "tokens/train_per_sec_per_gpu": 174.57, "tokens/trainable": 11474895 }, { "epoch": 0.5572985908003191, "grad_norm": 0.07758195698261261, "learning_rate": 1.95920292521719e-05, "loss": 0.7872558236122131, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19736, "step": 524, "tokens/total": 137363456, "tokens/train_per_sec_per_gpu": 204.94, "tokens/trainable": 11494814 }, { "epoch": 0.5583621377293273, "grad_norm": 0.06898010522127151, "learning_rate": 1.9517866018905347e-05, "loss": 0.7131535410881042, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.04042, "step": 525, "tokens/total": 137625600, "tokens/train_per_sec_per_gpu": 186.39, "tokens/trainable": 11516315 }, { "epoch": 0.5594256846583355, "grad_norm": 0.0707436203956604, "learning_rate": 1.9443709418488692e-05, "loss": 0.7908194065093994, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.2052, "step": 526, "tokens/total": 137887744, "tokens/train_per_sec_per_gpu": 178.33, "tokens/trainable": 11536975 }, { "epoch": 0.5604892315873438, "grad_norm": 0.06255731731653214, "learning_rate": 1.9369560471114693e-05, "loss": 0.7359522581100464, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.08747, "step": 527, "tokens/total": 138149888, "tokens/train_per_sec_per_gpu": 191.7, "tokens/trainable": 11558976 }, { "epoch": 0.561552778516352, "grad_norm": 0.06818301230669022, "learning_rate": 1.9295420196870826e-05, "loss": 0.759716808795929, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13767, "step": 528, "tokens/total": 138412032, "tokens/train_per_sec_per_gpu": 228.46, "tokens/trainable": 11580954 }, { "epoch": 0.5626163254453602, "grad_norm": 0.07047592103481293, "learning_rate": 1.922128961572525e-05, "loss": 0.8164031505584717, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.26235, "step": 529, "tokens/total": 138674176, "tokens/train_per_sec_per_gpu": 185.46, "tokens/trainable": 11603196 }, { "epoch": 0.5636798723743686, "grad_norm": 0.06535010784864426, "learning_rate": 1.9147169747512773e-05, "loss": 0.7326769828796387, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.08064, "step": 530, "tokens/total": 138936320, "tokens/train_per_sec_per_gpu": 192.3, "tokens/trainable": 11625136 }, { "epoch": 0.5647434193033768, "grad_norm": 0.20296123623847961, "learning_rate": 1.9073061611920835e-05, "loss": 0.8644706606864929, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.37375, "step": 531, "tokens/total": 139198464, "tokens/train_per_sec_per_gpu": 168.12, "tokens/trainable": 11649882 }, { "epoch": 0.565806966232385, "grad_norm": 0.07702672481536865, "learning_rate": 1.899896622847545e-05, "loss": 0.6990363001823425, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.01181, "step": 532, "tokens/total": 139460608, "tokens/train_per_sec_per_gpu": 188.85, "tokens/trainable": 11671814 }, { "epoch": 0.5668705131613933, "grad_norm": 0.06929226219654083, "learning_rate": 1.89248846165272e-05, "loss": 0.7941587567329407, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.21258, "step": 533, "tokens/total": 139722752, "tokens/train_per_sec_per_gpu": 194.34, "tokens/trainable": 11695013 }, { "epoch": 0.5679340600904015, "grad_norm": 0.07153689116239548, "learning_rate": 1.885081779523722e-05, "loss": 0.7691222429275513, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15787, "step": 534, "tokens/total": 139984896, "tokens/train_per_sec_per_gpu": 240.08, "tokens/trainable": 11717160 }, { "epoch": 0.5689976070194097, "grad_norm": 0.06492677330970764, "learning_rate": 1.8776766783563152e-05, "loss": 0.7174063920974731, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.04911, "step": 535, "tokens/total": 140247040, "tokens/train_per_sec_per_gpu": 162.24, "tokens/trainable": 11738101 }, { "epoch": 0.5700611539484179, "grad_norm": 0.06563756614923477, "learning_rate": 1.8702732600245138e-05, "loss": 0.7257460355758667, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.06627, "step": 536, "tokens/total": 140509184, "tokens/train_per_sec_per_gpu": 173.62, "tokens/trainable": 11759125 }, { "epoch": 0.5711247008774262, "grad_norm": 0.06024221330881119, "learning_rate": 1.8628716263791837e-05, "loss": 0.747328519821167, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11135, "step": 537, "tokens/total": 140771328, "tokens/train_per_sec_per_gpu": 207.16, "tokens/trainable": 11782122 }, { "epoch": 0.5721882478064345, "grad_norm": 0.06131380796432495, "learning_rate": 1.8554718792466353e-05, "loss": 0.7804300785064697, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.18241, "step": 538, "tokens/total": 141033472, "tokens/train_per_sec_per_gpu": 198.93, "tokens/trainable": 11805081 }, { "epoch": 0.5732517947354427, "grad_norm": 0.0673721581697464, "learning_rate": 1.8480741204272268e-05, "loss": 0.8499374389648438, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.3395, "step": 539, "tokens/total": 141295616, "tokens/train_per_sec_per_gpu": 205.27, "tokens/trainable": 11827941 }, { "epoch": 0.574315341664451, "grad_norm": 0.07307814806699753, "learning_rate": 1.8406784516939628e-05, "loss": 0.740190863609314, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09634, "step": 540, "tokens/total": 141557760, "tokens/train_per_sec_per_gpu": 200.34, "tokens/trainable": 11850776 }, { "epoch": 0.5753788885934592, "grad_norm": 0.06450652331113815, "learning_rate": 1.8332849747910925e-05, "loss": 0.7271907329559326, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.06926, "step": 541, "tokens/total": 141819904, "tokens/train_per_sec_per_gpu": 199.39, "tokens/trainable": 11872489 }, { "epoch": 0.5764424355224674, "grad_norm": 0.06675565242767334, "learning_rate": 1.825893791432714e-05, "loss": 0.7834812998771667, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.18908, "step": 542, "tokens/total": 142082048, "tokens/train_per_sec_per_gpu": 220.61, "tokens/trainable": 11894441 }, { "epoch": 0.5775059824514757, "grad_norm": 0.07322388142347336, "learning_rate": 1.8185050033013715e-05, "loss": 0.8086245656013489, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.24482, "step": 543, "tokens/total": 142344192, "tokens/train_per_sec_per_gpu": 179.2, "tokens/trainable": 11915451 }, { "epoch": 0.5785695293804839, "grad_norm": 0.07201948016881943, "learning_rate": 1.811118712046657e-05, "loss": 0.8121210336685181, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.25268, "step": 544, "tokens/total": 142606336, "tokens/train_per_sec_per_gpu": 175.39, "tokens/trainable": 11937488 }, { "epoch": 0.5796330763094921, "grad_norm": 0.06886903196573257, "learning_rate": 1.8037350192838117e-05, "loss": 0.7545644044876099, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12668, "step": 545, "tokens/total": 142868480, "tokens/train_per_sec_per_gpu": 194.01, "tokens/trainable": 11959978 }, { "epoch": 0.5806966232385004, "grad_norm": 0.07075604796409607, "learning_rate": 1.7963540265923298e-05, "loss": 0.7854512333869934, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.1934, "step": 546, "tokens/total": 143130624, "tokens/train_per_sec_per_gpu": 187.1, "tokens/trainable": 11980805 }, { "epoch": 0.5817601701675087, "grad_norm": 0.06993231177330017, "learning_rate": 1.788975835514559e-05, "loss": 0.7716733813285828, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.16338, "step": 547, "tokens/total": 143392768, "tokens/train_per_sec_per_gpu": 176.49, "tokens/trainable": 12002477 }, { "epoch": 0.5828237170965169, "grad_norm": 0.06735540181398392, "learning_rate": 1.7816005475543057e-05, "loss": 0.7268585562705994, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.06857, "step": 548, "tokens/total": 143654912, "tokens/train_per_sec_per_gpu": 168.14, "tokens/trainable": 12024394 }, { "epoch": 0.5838872640255252, "grad_norm": 0.06800015270709991, "learning_rate": 1.7742282641754362e-05, "loss": 0.7047498226165771, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.02334, "step": 549, "tokens/total": 143917056, "tokens/train_per_sec_per_gpu": 173.12, "tokens/trainable": 12045702 }, { "epoch": 0.5849508109545334, "grad_norm": 0.07099801301956177, "learning_rate": 1.766859086800482e-05, "loss": 0.7587050199508667, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13551, "step": 550, "tokens/total": 144179200, "tokens/train_per_sec_per_gpu": 142.06, "tokens/trainable": 12067322 }, { "epoch": 0.5860143578835416, "grad_norm": 0.06389941275119781, "learning_rate": 1.759493116809245e-05, "loss": 0.7534220814704895, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12426, "step": 551, "tokens/total": 144441344, "tokens/train_per_sec_per_gpu": 208.95, "tokens/trainable": 12090670 }, { "epoch": 0.5870779048125498, "grad_norm": 0.08602443337440491, "learning_rate": 1.7521304555374013e-05, "loss": 0.7858190536499023, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.1942, "step": 552, "tokens/total": 144703488, "tokens/train_per_sec_per_gpu": 183.64, "tokens/trainable": 12112909 }, { "epoch": 0.5881414517415581, "grad_norm": 0.06364187598228455, "learning_rate": 1.7447712042751086e-05, "loss": 0.8089983463287354, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.24566, "step": 553, "tokens/total": 144965632, "tokens/train_per_sec_per_gpu": 221.48, "tokens/trainable": 12136873 }, { "epoch": 0.5892049986705663, "grad_norm": 0.06705693900585175, "learning_rate": 1.7374154642656133e-05, "loss": 0.7520922422409058, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12143, "step": 554, "tokens/total": 145227776, "tokens/train_per_sec_per_gpu": 211.51, "tokens/trainable": 12158329 }, { "epoch": 0.5902685455995745, "grad_norm": 0.06293050199747086, "learning_rate": 1.730063336703855e-05, "loss": 0.833366870880127, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.30105, "step": 555, "tokens/total": 145489920, "tokens/train_per_sec_per_gpu": 209.36, "tokens/trainable": 12180630 }, { "epoch": 0.5913320925285829, "grad_norm": 0.07079198956489563, "learning_rate": 1.722714922735076e-05, "loss": 0.6932408213615417, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.00019, "step": 556, "tokens/total": 145752064, "tokens/train_per_sec_per_gpu": 194.23, "tokens/trainable": 12202488 }, { "epoch": 0.5923956394575911, "grad_norm": 0.07012391835451126, "learning_rate": 1.7153703234534302e-05, "loss": 0.7621327042579651, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.14284, "step": 557, "tokens/total": 146014208, "tokens/train_per_sec_per_gpu": 216.52, "tokens/trainable": 12226431 }, { "epoch": 0.5934591863865993, "grad_norm": 0.061280712485313416, "learning_rate": 1.708029639900591e-05, "loss": 0.7005034685134888, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.01477, "step": 558, "tokens/total": 146276352, "tokens/train_per_sec_per_gpu": 192.83, "tokens/trainable": 12247296 }, { "epoch": 0.5945227333156076, "grad_norm": 0.06636322289705276, "learning_rate": 1.7006929730643635e-05, "loss": 0.6887087821960449, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 1.99114, "step": 559, "tokens/total": 146538496, "tokens/train_per_sec_per_gpu": 207.99, "tokens/trainable": 12268797 }, { "epoch": 0.5955862802446158, "grad_norm": 0.07723846286535263, "learning_rate": 1.6933604238772924e-05, "loss": 0.6889795064926147, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 1.99168, "step": 560, "tokens/total": 146800640, "tokens/train_per_sec_per_gpu": 200.15, "tokens/trainable": 12289775 }, { "epoch": 0.596649827173624, "grad_norm": 0.06970304995775223, "learning_rate": 1.6860320932152755e-05, "loss": 0.7143691182136536, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.0429, "step": 561, "tokens/total": 147062784, "tokens/train_per_sec_per_gpu": 167.54, "tokens/trainable": 12309110 }, { "epoch": 0.5977133741026323, "grad_norm": 0.07083282619714737, "learning_rate": 1.6787080818961746e-05, "loss": 0.7616149187088013, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.14173, "step": 562, "tokens/total": 147324928, "tokens/train_per_sec_per_gpu": 170.61, "tokens/trainable": 12329610 }, { "epoch": 0.5987769210316405, "grad_norm": 0.06794283539056778, "learning_rate": 1.6713884906784293e-05, "loss": 0.7875048518180847, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19791, "step": 563, "tokens/total": 147587072, "tokens/train_per_sec_per_gpu": 160.01, "tokens/trainable": 12350916 }, { "epoch": 0.5998404679606487, "grad_norm": 0.07991766929626465, "learning_rate": 1.6640734202596702e-05, "loss": 0.7792809009552002, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.1799, "step": 564, "tokens/total": 147849216, "tokens/train_per_sec_per_gpu": 155.73, "tokens/trainable": 12370256 }, { "epoch": 0.6009040148896571, "grad_norm": 0.07368163019418716, "learning_rate": 1.6567629712753363e-05, "loss": 0.7557001113891602, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.1291, "step": 565, "tokens/total": 148111360, "tokens/train_per_sec_per_gpu": 184.87, "tokens/trainable": 12391117 }, { "epoch": 0.6019675618186653, "grad_norm": 0.07445425540208817, "learning_rate": 1.6494572442972857e-05, "loss": 0.7838120460510254, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.1898, "step": 566, "tokens/total": 148373504, "tokens/train_per_sec_per_gpu": 188.91, "tokens/trainable": 12412378 }, { "epoch": 0.6030311087476735, "grad_norm": 0.0670490711927414, "learning_rate": 1.642156339832415e-05, "loss": 0.7417568564414978, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09962, "step": 567, "tokens/total": 148635648, "tokens/train_per_sec_per_gpu": 170.03, "tokens/trainable": 12433097 }, { "epoch": 0.6040946556766817, "grad_norm": 0.06880825757980347, "learning_rate": 1.634860358321277e-05, "loss": 0.7300165891647339, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.07512, "step": 568, "tokens/total": 148897792, "tokens/train_per_sec_per_gpu": 195.18, "tokens/trainable": 12455206 }, { "epoch": 0.60515820260569, "grad_norm": 0.07096575200557709, "learning_rate": 1.627569400136697e-05, "loss": 0.8089407682418823, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.24553, "step": 569, "tokens/total": 149159936, "tokens/train_per_sec_per_gpu": 191.16, "tokens/trainable": 12478007 }, { "epoch": 0.6062217495346982, "grad_norm": 0.0826595202088356, "learning_rate": 1.620283565582395e-05, "loss": 0.7439614534378052, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10425, "step": 570, "tokens/total": 149422080, "tokens/train_per_sec_per_gpu": 186.55, "tokens/trainable": 12498799 }, { "epoch": 0.6072852964637064, "grad_norm": 0.08035894483327866, "learning_rate": 1.6130029548916007e-05, "loss": 0.7608100175857544, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.14001, "step": 571, "tokens/total": 149684224, "tokens/train_per_sec_per_gpu": 177.96, "tokens/trainable": 12518407 }, { "epoch": 0.6083488433927147, "grad_norm": 0.07164735347032547, "learning_rate": 1.605727668225681e-05, "loss": 0.8451493978500366, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.32833, "step": 572, "tokens/total": 149946368, "tokens/train_per_sec_per_gpu": 219.11, "tokens/trainable": 12541561 }, { "epoch": 0.6094123903217229, "grad_norm": 0.07182083278894424, "learning_rate": 1.5984578056727564e-05, "loss": 0.8233068585395813, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.27802, "step": 573, "tokens/total": 150208512, "tokens/train_per_sec_per_gpu": 208.4, "tokens/trainable": 12562762 }, { "epoch": 0.6104759372507312, "grad_norm": 0.06765095144510269, "learning_rate": 1.591193467246327e-05, "loss": 0.7087922096252441, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.03154, "step": 574, "tokens/total": 150470656, "tokens/train_per_sec_per_gpu": 149.32, "tokens/trainable": 12583197 }, { "epoch": 0.6115394841797395, "grad_norm": 0.07710757106542587, "learning_rate": 1.5839347528838957e-05, "loss": 0.7562744617462158, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13032, "step": 575, "tokens/total": 150732800, "tokens/train_per_sec_per_gpu": 121.82, "tokens/trainable": 12601739 }, { "epoch": 0.6126030311087477, "grad_norm": 0.07722500711679459, "learning_rate": 1.5766817624455954e-05, "loss": 0.8186255693435669, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.26738, "step": 576, "tokens/total": 150994944, "tokens/train_per_sec_per_gpu": 177.23, "tokens/trainable": 12623589 }, { "epoch": 0.6136665780377559, "grad_norm": 0.06252503395080566, "learning_rate": 1.569434595712811e-05, "loss": 0.7135177850723267, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.04116, "step": 577, "tokens/total": 151257088, "tokens/train_per_sec_per_gpu": 216.51, "tokens/trainable": 12646370 }, { "epoch": 0.6147301249667642, "grad_norm": 0.07124295085668564, "learning_rate": 1.5621933523868106e-05, "loss": 0.7426387667655945, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10147, "step": 578, "tokens/total": 151519232, "tokens/train_per_sec_per_gpu": 164.29, "tokens/trainable": 12666523 }, { "epoch": 0.6157936718957724, "grad_norm": 0.06870193779468536, "learning_rate": 1.5549581320873715e-05, "loss": 0.7106361389160156, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.03529, "step": 579, "tokens/total": 151781376, "tokens/train_per_sec_per_gpu": 203.55, "tokens/trainable": 12689727 }, { "epoch": 0.6168572188247806, "grad_norm": 0.06538794189691544, "learning_rate": 1.5477290343514108e-05, "loss": 0.7434192299842834, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10311, "step": 580, "tokens/total": 152043520, "tokens/train_per_sec_per_gpu": 220.1, "tokens/trainable": 12713675 }, { "epoch": 0.6179207657537888, "grad_norm": 0.08144285529851913, "learning_rate": 1.5405061586316158e-05, "loss": 0.7463376522064209, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10926, "step": 581, "tokens/total": 152305664, "tokens/train_per_sec_per_gpu": 199.52, "tokens/trainable": 12734720 }, { "epoch": 0.6189843126827971, "grad_norm": 0.0673174113035202, "learning_rate": 1.533289604295077e-05, "loss": 0.6784200668334961, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 1.97076, "step": 582, "tokens/total": 152567808, "tokens/train_per_sec_per_gpu": 185.45, "tokens/trainable": 12757403 }, { "epoch": 0.6200478596118054, "grad_norm": 0.07269315421581268, "learning_rate": 1.5260794706219176e-05, "loss": 0.769637942314148, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15898, "step": 583, "tokens/total": 152829952, "tokens/train_per_sec_per_gpu": 198.93, "tokens/trainable": 12778686 }, { "epoch": 0.6211114065408136, "grad_norm": 0.07265086472034454, "learning_rate": 1.5188758568039318e-05, "loss": 0.783679723739624, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.18951, "step": 584, "tokens/total": 153092096, "tokens/train_per_sec_per_gpu": 172.98, "tokens/trainable": 12799072 }, { "epoch": 0.6221749534698219, "grad_norm": 0.06607359647750854, "learning_rate": 1.5116788619432177e-05, "loss": 0.7359644770622253, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.08749, "step": 585, "tokens/total": 153354240, "tokens/train_per_sec_per_gpu": 217.63, "tokens/trainable": 12823018 }, { "epoch": 0.6232385003988301, "grad_norm": 0.07835246622562408, "learning_rate": 1.5044885850508137e-05, "loss": 0.7655049562454224, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15008, "step": 586, "tokens/total": 153616384, "tokens/train_per_sec_per_gpu": 172.13, "tokens/trainable": 12843433 }, { "epoch": 0.6243020473278383, "grad_norm": 0.06266583502292633, "learning_rate": 1.4973051250453399e-05, "loss": 0.784702718257904, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19176, "step": 587, "tokens/total": 153878528, "tokens/train_per_sec_per_gpu": 181.28, "tokens/trainable": 12866501 }, { "epoch": 0.6253655942568466, "grad_norm": 0.07751967012882233, "learning_rate": 1.4901285807516326e-05, "loss": 0.7583497762680054, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13475, "step": 588, "tokens/total": 154140672, "tokens/train_per_sec_per_gpu": 186.01, "tokens/trainable": 12887753 }, { "epoch": 0.6264291411858548, "grad_norm": 0.06350097805261612, "learning_rate": 1.4829590508993859e-05, "loss": 0.7606030702590942, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13957, "step": 589, "tokens/total": 154402816, "tokens/train_per_sec_per_gpu": 173.15, "tokens/trainable": 12910246 }, { "epoch": 0.627492688114863, "grad_norm": 0.0697028860449791, "learning_rate": 1.4757966341217963e-05, "loss": 0.7563662528991699, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13052, "step": 590, "tokens/total": 154664960, "tokens/train_per_sec_per_gpu": 205.69, "tokens/trainable": 12933119 }, { "epoch": 0.6285562350438713, "grad_norm": 0.07811742275953293, "learning_rate": 1.4686414289542023e-05, "loss": 0.8478161096572876, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.33454, "step": 591, "tokens/total": 154927104, "tokens/train_per_sec_per_gpu": 200.8, "tokens/trainable": 12953870 }, { "epoch": 0.6296197819728796, "grad_norm": 0.06654322147369385, "learning_rate": 1.461493533832731e-05, "loss": 0.7657591104507446, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15063, "step": 592, "tokens/total": 155189248, "tokens/train_per_sec_per_gpu": 193.4, "tokens/trainable": 12976578 }, { "epoch": 0.6306833289018878, "grad_norm": 0.07657228410243988, "learning_rate": 1.454353047092944e-05, "loss": 0.839828372001648, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.31597, "step": 593, "tokens/total": 155451392, "tokens/train_per_sec_per_gpu": 228.66, "tokens/trainable": 13000674 }, { "epoch": 0.6317468758308961, "grad_norm": 0.07358460128307343, "learning_rate": 1.4472200669684821e-05, "loss": 0.6963605284690857, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.00644, "step": 594, "tokens/total": 155713536, "tokens/train_per_sec_per_gpu": 199.12, "tokens/trainable": 13021140 }, { "epoch": 0.6328104227599043, "grad_norm": 0.07676289230585098, "learning_rate": 1.4400946915897168e-05, "loss": 0.8120332956314087, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.25248, "step": 595, "tokens/total": 155975680, "tokens/train_per_sec_per_gpu": 172.52, "tokens/trainable": 13041462 }, { "epoch": 0.6338739696889125, "grad_norm": 0.07129766792058945, "learning_rate": 1.4329770189823982e-05, "loss": 0.7258169054985046, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.06642, "step": 596, "tokens/total": 156237824, "tokens/train_per_sec_per_gpu": 199.58, "tokens/trainable": 13062706 }, { "epoch": 0.6349375166179207, "grad_norm": 0.0697442814707756, "learning_rate": 1.4258671470663075e-05, "loss": 0.7802278995513916, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.18197, "step": 597, "tokens/total": 156499968, "tokens/train_per_sec_per_gpu": 205.89, "tokens/trainable": 13087819 }, { "epoch": 0.636001063546929, "grad_norm": 0.07785540819168091, "learning_rate": 1.4187651736539092e-05, "loss": 0.756123423576355, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13, "step": 598, "tokens/total": 156762112, "tokens/train_per_sec_per_gpu": 196.25, "tokens/trainable": 13109448 }, { "epoch": 0.6370646104759372, "grad_norm": 0.07135733217000961, "learning_rate": 1.4116711964490076e-05, "loss": 0.760696530342102, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13977, "step": 599, "tokens/total": 157024256, "tokens/train_per_sec_per_gpu": 169.38, "tokens/trainable": 13130856 }, { "epoch": 0.6381281574049454, "grad_norm": 0.07068773359060287, "learning_rate": 1.404585313045399e-05, "loss": 0.7768102884292603, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.17453, "step": 600, "tokens/total": 157286400, "tokens/train_per_sec_per_gpu": 200.62, "tokens/trainable": 13154412 }, { "epoch": 0.6381281574049454, "eval_loss": 0.7701326012611389, "eval_ppl": 2.16005, "eval_runtime": 237.2245, "eval_samples_per_second": 28.184, "eval_steps_per_second": 1.762, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 38.19, "memory/max_allocated (GiB)": 38.19, "step": 600 }, { "epoch": 0.6391917043339538, "grad_norm": 0.06947837024927139, "learning_rate": 1.3975076209255321e-05, "loss": 0.7434956431388855, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10327, "step": 601, "tokens/total": 157548544, "tokens/train_per_sec_per_gpu": 185.08, "tokens/trainable": 13176877 }, { "epoch": 0.640255251262962, "grad_norm": 0.08173263818025589, "learning_rate": 1.3904382174591654e-05, "loss": 0.7505627870559692, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11819, "step": 602, "tokens/total": 157810688, "tokens/train_per_sec_per_gpu": 157.36, "tokens/trainable": 13197398 }, { "epoch": 0.6413187981919702, "grad_norm": 0.07958182692527771, "learning_rate": 1.3833771999020274e-05, "loss": 0.7926914095878601, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.20933, "step": 603, "tokens/total": 158072832, "tokens/train_per_sec_per_gpu": 143.13, "tokens/trainable": 13219479 }, { "epoch": 0.6423823451209785, "grad_norm": 0.08310385793447495, "learning_rate": 1.3763246653944824e-05, "loss": 0.7650701999664307, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.14915, "step": 604, "tokens/total": 158334976, "tokens/train_per_sec_per_gpu": 156.87, "tokens/trainable": 13239934 }, { "epoch": 0.6434458920499867, "grad_norm": 0.07890634983778, "learning_rate": 1.3692807109601875e-05, "loss": 0.7053734064102173, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.0246, "step": 605, "tokens/total": 158597120, "tokens/train_per_sec_per_gpu": 183.26, "tokens/trainable": 13259732 }, { "epoch": 0.6445094389789949, "grad_norm": 0.06404001265764236, "learning_rate": 1.3622454335047631e-05, "loss": 0.737473726272583, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09065, "step": 606, "tokens/total": 158859264, "tokens/train_per_sec_per_gpu": 235.57, "tokens/trainable": 13284062 }, { "epoch": 0.6455729859080032, "grad_norm": 0.07135722041130066, "learning_rate": 1.3552189298144573e-05, "loss": 0.7033129334449768, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.02044, "step": 607, "tokens/total": 159121408, "tokens/train_per_sec_per_gpu": 176.01, "tokens/trainable": 13306868 }, { "epoch": 0.6466365328370114, "grad_norm": 0.0779723972082138, "learning_rate": 1.3482012965548161e-05, "loss": 0.8055770993232727, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.23799, "step": 608, "tokens/total": 159383552, "tokens/train_per_sec_per_gpu": 189.42, "tokens/trainable": 13327922 }, { "epoch": 0.6477000797660196, "grad_norm": 0.08110717684030533, "learning_rate": 1.341192630269351e-05, "loss": 0.817200779914856, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.26415, "step": 609, "tokens/total": 159645696, "tokens/train_per_sec_per_gpu": 170.71, "tokens/trainable": 13347786 }, { "epoch": 0.648763626695028, "grad_norm": 0.06666215509176254, "learning_rate": 1.3341930273782144e-05, "loss": 0.704579770565033, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.023, "step": 610, "tokens/total": 159907840, "tokens/train_per_sec_per_gpu": 200.63, "tokens/trainable": 13371502 }, { "epoch": 0.6498271736240362, "grad_norm": 0.06857079267501831, "learning_rate": 1.3272025841768693e-05, "loss": 0.8532009720802307, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.34715, "step": 611, "tokens/total": 160169984, "tokens/train_per_sec_per_gpu": 191.0, "tokens/trainable": 13393637 }, { "epoch": 0.6508907205530444, "grad_norm": 0.0732354000210762, "learning_rate": 1.320221396834767e-05, "loss": 0.7995498180389404, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.22454, "step": 612, "tokens/total": 160432128, "tokens/train_per_sec_per_gpu": 189.51, "tokens/trainable": 13415692 }, { "epoch": 0.6519542674820527, "grad_norm": 0.06430143862962723, "learning_rate": 1.3132495613940237e-05, "loss": 0.76482093334198, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.14861, "step": 613, "tokens/total": 160694272, "tokens/train_per_sec_per_gpu": 207.14, "tokens/trainable": 13438366 }, { "epoch": 0.6530178144110609, "grad_norm": 0.06737808883190155, "learning_rate": 1.3062871737680976e-05, "loss": 0.7430492043495178, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10234, "step": 614, "tokens/total": 160956416, "tokens/train_per_sec_per_gpu": 188.74, "tokens/trainable": 13458353 }, { "epoch": 0.6540813613400691, "grad_norm": 0.0812523365020752, "learning_rate": 1.2993343297404732e-05, "loss": 0.8373923897743225, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.31033, "step": 615, "tokens/total": 161218560, "tokens/train_per_sec_per_gpu": 198.52, "tokens/trainable": 13479819 }, { "epoch": 0.6551449082690773, "grad_norm": 0.07273576408624649, "learning_rate": 1.2923911249633391e-05, "loss": 0.753447413444519, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12431, "step": 616, "tokens/total": 161480704, "tokens/train_per_sec_per_gpu": 187.46, "tokens/trainable": 13500295 }, { "epoch": 0.6562084551980856, "grad_norm": 0.07487501949071884, "learning_rate": 1.2854576549562743e-05, "loss": 0.7411309480667114, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09831, "step": 617, "tokens/total": 161742848, "tokens/train_per_sec_per_gpu": 158.41, "tokens/trainable": 13518908 }, { "epoch": 0.6572720021270939, "grad_norm": 0.060535646975040436, "learning_rate": 1.2785340151049348e-05, "loss": 0.6992688179016113, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.01228, "step": 618, "tokens/total": 162004992, "tokens/train_per_sec_per_gpu": 195.11, "tokens/trainable": 13542123 }, { "epoch": 0.6583355490561021, "grad_norm": 0.07925450801849365, "learning_rate": 1.2716203006597389e-05, "loss": 0.7158269882202148, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.04588, "step": 619, "tokens/total": 162267136, "tokens/train_per_sec_per_gpu": 154.73, "tokens/trainable": 13560742 }, { "epoch": 0.6593990959851104, "grad_norm": 0.07715223729610443, "learning_rate": 1.2647166067345598e-05, "loss": 0.7587981224060059, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13571, "step": 620, "tokens/total": 162529280, "tokens/train_per_sec_per_gpu": 190.53, "tokens/trainable": 13581062 }, { "epoch": 0.6604626429141186, "grad_norm": 0.0761185884475708, "learning_rate": 1.2578230283054153e-05, "loss": 0.7682688236236572, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15603, "step": 621, "tokens/total": 162791424, "tokens/train_per_sec_per_gpu": 193.3, "tokens/trainable": 13602325 }, { "epoch": 0.6615261898431268, "grad_norm": 0.0685892179608345, "learning_rate": 1.2509396602091612e-05, "loss": 0.7407702207565308, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09755, "step": 622, "tokens/total": 163053568, "tokens/train_per_sec_per_gpu": 178.64, "tokens/trainable": 13624055 }, { "epoch": 0.6625897367721351, "grad_norm": 0.07496423274278641, "learning_rate": 1.2440665971421872e-05, "loss": 0.7656638622283936, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15042, "step": 623, "tokens/total": 163315712, "tokens/train_per_sec_per_gpu": 182.21, "tokens/trainable": 13643250 }, { "epoch": 0.6636532837011433, "grad_norm": 0.07262540608644485, "learning_rate": 1.2372039336591137e-05, "loss": 0.801539421081543, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.22897, "step": 624, "tokens/total": 163577856, "tokens/train_per_sec_per_gpu": 196.93, "tokens/trainable": 13664253 }, { "epoch": 0.6647168306301515, "grad_norm": 0.07125604152679443, "learning_rate": 1.230351764171491e-05, "loss": 0.7554821968078613, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12864, "step": 625, "tokens/total": 163840000, "tokens/train_per_sec_per_gpu": 174.72, "tokens/trainable": 13685345 }, { "epoch": 0.6657803775591598, "grad_norm": 0.071174755692482, "learning_rate": 1.2235101829465003e-05, "loss": 0.8233163952827454, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.27804, "step": 626, "tokens/total": 164102144, "tokens/train_per_sec_per_gpu": 235.38, "tokens/trainable": 13708541 }, { "epoch": 0.6668439244881681, "grad_norm": 0.07615090906620026, "learning_rate": 1.2166792841056596e-05, "loss": 0.7587050199508667, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13551, "step": 627, "tokens/total": 164364288, "tokens/train_per_sec_per_gpu": 220.58, "tokens/trainable": 13730305 }, { "epoch": 0.6679074714171763, "grad_norm": 0.08138352632522583, "learning_rate": 1.2098591616235231e-05, "loss": 0.7529855370521545, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12333, "step": 628, "tokens/total": 164626432, "tokens/train_per_sec_per_gpu": 193.64, "tokens/trainable": 13752153 }, { "epoch": 0.6689710183461846, "grad_norm": 0.07478975504636765, "learning_rate": 1.2030499093263938e-05, "loss": 0.7286227941513062, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.07222, "step": 629, "tokens/total": 164888576, "tokens/train_per_sec_per_gpu": 193.85, "tokens/trainable": 13774483 }, { "epoch": 0.6700345652751928, "grad_norm": 0.07285770773887634, "learning_rate": 1.1962516208910295e-05, "loss": 0.733648955821991, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.08267, "step": 630, "tokens/total": 165150720, "tokens/train_per_sec_per_gpu": 184.41, "tokens/trainable": 13795589 }, { "epoch": 0.671098112204201, "grad_norm": 0.0695388987660408, "learning_rate": 1.1894643898433541e-05, "loss": 0.7216265201568604, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.05778, "step": 631, "tokens/total": 165412864, "tokens/train_per_sec_per_gpu": 204.39, "tokens/trainable": 13817860 }, { "epoch": 0.6721616591332092, "grad_norm": 0.0669473186135292, "learning_rate": 1.1826883095571758e-05, "loss": 0.8506951332092285, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.34127, "step": 632, "tokens/total": 165675008, "tokens/train_per_sec_per_gpu": 251.79, "tokens/trainable": 13842305 }, { "epoch": 0.6732252060622175, "grad_norm": 0.07681821286678314, "learning_rate": 1.1759234732528952e-05, "loss": 0.7686042785644531, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15675, "step": 633, "tokens/total": 165937152, "tokens/train_per_sec_per_gpu": 177.33, "tokens/trainable": 13861902 }, { "epoch": 0.6742887529912257, "grad_norm": 0.06518401950597763, "learning_rate": 1.1691699739962275e-05, "loss": 0.8041465878486633, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.23479, "step": 634, "tokens/total": 166199296, "tokens/train_per_sec_per_gpu": 207.88, "tokens/trainable": 13884726 }, { "epoch": 0.6753522999202339, "grad_norm": 0.06772468984127045, "learning_rate": 1.1624279046969208e-05, "loss": 0.7493684887886047, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11566, "step": 635, "tokens/total": 166461440, "tokens/train_per_sec_per_gpu": 221.3, "tokens/trainable": 13908358 }, { "epoch": 0.6764158468492423, "grad_norm": 0.07777251303195953, "learning_rate": 1.1556973581074784e-05, "loss": 0.7796363830566406, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.18068, "step": 636, "tokens/total": 166723584, "tokens/train_per_sec_per_gpu": 158.72, "tokens/trainable": 13930851 }, { "epoch": 0.6774793937782505, "grad_norm": 0.07164981216192245, "learning_rate": 1.1489784268218811e-05, "loss": 0.7849699258804321, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19234, "step": 637, "tokens/total": 166985728, "tokens/train_per_sec_per_gpu": 226.45, "tokens/trainable": 13952042 }, { "epoch": 0.6785429407072587, "grad_norm": 0.07446952164173126, "learning_rate": 1.1422712032743186e-05, "loss": 0.7363911867141724, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.08839, "step": 638, "tokens/total": 167247872, "tokens/train_per_sec_per_gpu": 176.2, "tokens/trainable": 13972004 }, { "epoch": 0.679606487636267, "grad_norm": 0.07111480087041855, "learning_rate": 1.1355757797379093e-05, "loss": 0.7901080846786499, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.20363, "step": 639, "tokens/total": 167510016, "tokens/train_per_sec_per_gpu": 204.49, "tokens/trainable": 13994787 }, { "epoch": 0.6806700345652752, "grad_norm": 0.07409324496984482, "learning_rate": 1.1288922483234395e-05, "loss": 0.7688755393028259, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15734, "step": 640, "tokens/total": 167772160, "tokens/train_per_sec_per_gpu": 183.2, "tokens/trainable": 14016072 }, { "epoch": 0.6817335814942834, "grad_norm": 0.06080978736281395, "learning_rate": 1.1222207009780888e-05, "loss": 0.731904923915863, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.07904, "step": 641, "tokens/total": 168034304, "tokens/train_per_sec_per_gpu": 193.71, "tokens/trainable": 14039858 }, { "epoch": 0.6827971284232917, "grad_norm": 0.07461950182914734, "learning_rate": 1.1155612294841713e-05, "loss": 0.7744308710098267, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.16936, "step": 642, "tokens/total": 168296448, "tokens/train_per_sec_per_gpu": 190.26, "tokens/trainable": 14062445 }, { "epoch": 0.6838606753522999, "grad_norm": 0.06637416779994965, "learning_rate": 1.10891392545787e-05, "loss": 0.6965582370758057, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.00683, "step": 643, "tokens/total": 168558592, "tokens/train_per_sec_per_gpu": 202.21, "tokens/trainable": 14085386 }, { "epoch": 0.6849242222813081, "grad_norm": 0.08194036036729813, "learning_rate": 1.1022788803479747e-05, "loss": 0.7541095018386841, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12572, "step": 644, "tokens/total": 168820736, "tokens/train_per_sec_per_gpu": 200.39, "tokens/trainable": 14107399 }, { "epoch": 0.6859877692103165, "grad_norm": 0.07168550044298172, "learning_rate": 1.095656185434629e-05, "loss": 0.7705248594284058, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.1609, "step": 645, "tokens/total": 169082880, "tokens/train_per_sec_per_gpu": 179.15, "tokens/trainable": 14129284 }, { "epoch": 0.6870513161393247, "grad_norm": 0.0729442834854126, "learning_rate": 1.0890459318280681e-05, "loss": 0.7964296340942383, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.21761, "step": 646, "tokens/total": 169345024, "tokens/train_per_sec_per_gpu": 210.01, "tokens/trainable": 14151413 }, { "epoch": 0.6881148630683329, "grad_norm": 0.07156070321798325, "learning_rate": 1.0824482104673723e-05, "loss": 0.7291166186332703, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.07325, "step": 647, "tokens/total": 169607168, "tokens/train_per_sec_per_gpu": 187.04, "tokens/trainable": 14172388 }, { "epoch": 0.6891784099973411, "grad_norm": 0.07664386928081512, "learning_rate": 1.0758631121192075e-05, "loss": 0.847728967666626, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.33434, "step": 648, "tokens/total": 169869312, "tokens/train_per_sec_per_gpu": 183.41, "tokens/trainable": 14194504 }, { "epoch": 0.6902419569263494, "grad_norm": 0.06855987012386322, "learning_rate": 1.0692907273765878e-05, "loss": 0.809911847114563, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.24771, "step": 649, "tokens/total": 170131456, "tokens/train_per_sec_per_gpu": 175.96, "tokens/trainable": 14217186 }, { "epoch": 0.6913055038553576, "grad_norm": 0.069715715944767, "learning_rate": 1.0627311466576167e-05, "loss": 0.7290323972702026, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.07307, "step": 650, "tokens/total": 170393600, "tokens/train_per_sec_per_gpu": 226.27, "tokens/trainable": 14240939 }, { "epoch": 0.6923690507843658, "grad_norm": 0.06665818393230438, "learning_rate": 1.0561844602042535e-05, "loss": 0.696631908416748, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.00698, "step": 651, "tokens/total": 170655744, "tokens/train_per_sec_per_gpu": 165.96, "tokens/trainable": 14262378 }, { "epoch": 0.6934325977133741, "grad_norm": 0.06637638807296753, "learning_rate": 1.0496507580810637e-05, "loss": 0.7380032539367676, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09175, "step": 652, "tokens/total": 170917888, "tokens/train_per_sec_per_gpu": 210.8, "tokens/trainable": 14284893 }, { "epoch": 0.6944961446423823, "grad_norm": 0.07084432989358902, "learning_rate": 1.0431301301739882e-05, "loss": 0.8238184452056885, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.27919, "step": 653, "tokens/total": 171180032, "tokens/train_per_sec_per_gpu": 161.41, "tokens/trainable": 14305618 }, { "epoch": 0.6955596915713906, "grad_norm": 0.07500351220369339, "learning_rate": 1.0366226661890976e-05, "loss": 0.8070303201675415, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.24124, "step": 654, "tokens/total": 171442176, "tokens/train_per_sec_per_gpu": 198.07, "tokens/trainable": 14327779 }, { "epoch": 0.6966232385003989, "grad_norm": 0.06748203933238983, "learning_rate": 1.0301284556513669e-05, "loss": 0.7480711340904236, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11292, "step": 655, "tokens/total": 171704320, "tokens/train_per_sec_per_gpu": 198.9, "tokens/trainable": 14350224 }, { "epoch": 0.6976867854294071, "grad_norm": 0.07086233794689178, "learning_rate": 1.023647587903438e-05, "loss": 0.7530485391616821, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12346, "step": 656, "tokens/total": 171966464, "tokens/train_per_sec_per_gpu": 186.03, "tokens/trainable": 14370865 }, { "epoch": 0.6987503323584153, "grad_norm": 0.07139495760202408, "learning_rate": 1.017180152104391e-05, "loss": 0.7517107725143433, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12062, "step": 657, "tokens/total": 172228608, "tokens/train_per_sec_per_gpu": 208.93, "tokens/trainable": 14393067 }, { "epoch": 0.6998138792874236, "grad_norm": 0.07312899082899094, "learning_rate": 1.0107262372285224e-05, "loss": 0.737115740776062, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.0899, "step": 658, "tokens/total": 172490752, "tokens/train_per_sec_per_gpu": 211.06, "tokens/trainable": 14414022 }, { "epoch": 0.7008774262164318, "grad_norm": 0.06322194635868073, "learning_rate": 1.004285932064113e-05, "loss": 0.7515996098518372, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12039, "step": 659, "tokens/total": 172752896, "tokens/train_per_sec_per_gpu": 264.01, "tokens/trainable": 14441310 }, { "epoch": 0.70194097314544, "grad_norm": 0.06637128442525864, "learning_rate": 9.978593252122168e-06, "loss": 0.7236040830612183, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.06185, "step": 660, "tokens/total": 173015040, "tokens/train_per_sec_per_gpu": 205.79, "tokens/trainable": 14462654 }, { "epoch": 0.7030045200744482, "grad_norm": 0.07252184301614761, "learning_rate": 9.914465050854312e-06, "loss": 0.7753311395645142, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.17131, "step": 661, "tokens/total": 173277184, "tokens/train_per_sec_per_gpu": 186.41, "tokens/trainable": 14484643 }, { "epoch": 0.7040680670034565, "grad_norm": 0.06677009165287018, "learning_rate": 9.85047559906689e-06, "loss": 0.7773078680038452, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.17561, "step": 662, "tokens/total": 173539328, "tokens/train_per_sec_per_gpu": 200.95, "tokens/trainable": 14507742 }, { "epoch": 0.7051316139324648, "grad_norm": 0.06489443778991699, "learning_rate": 9.78662577708039e-06, "loss": 0.8098626136779785, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.2476, "step": 663, "tokens/total": 173801472, "tokens/train_per_sec_per_gpu": 209.29, "tokens/trainable": 14531656 }, { "epoch": 0.706195160861473, "grad_norm": 0.0692375972867012, "learning_rate": 9.722916463294405e-06, "loss": 0.7761327624320984, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.17305, "step": 664, "tokens/total": 174063616, "tokens/train_per_sec_per_gpu": 180.0, "tokens/trainable": 14553153 }, { "epoch": 0.7072587077904813, "grad_norm": 0.07724090665578842, "learning_rate": 9.659348534175484e-06, "loss": 0.7831587195396423, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.18837, "step": 665, "tokens/total": 174325760, "tokens/train_per_sec_per_gpu": 183.64, "tokens/trainable": 14575227 }, { "epoch": 0.7083222547194895, "grad_norm": 0.07470756769180298, "learning_rate": 9.595922864245135e-06, "loss": 0.799434244632721, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.22428, "step": 666, "tokens/total": 174587904, "tokens/train_per_sec_per_gpu": 224.55, "tokens/trainable": 14598217 }, { "epoch": 0.7093858016484977, "grad_norm": 0.07464490085840225, "learning_rate": 9.532640326067764e-06, "loss": 0.7516946792602539, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12059, "step": 667, "tokens/total": 174850048, "tokens/train_per_sec_per_gpu": 200.11, "tokens/trainable": 14620170 }, { "epoch": 0.710449348577506, "grad_norm": 0.06981492787599564, "learning_rate": 9.469501790238654e-06, "loss": 0.7612972259521484, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.14105, "step": 668, "tokens/total": 175112192, "tokens/train_per_sec_per_gpu": 176.73, "tokens/trainable": 14641941 }, { "epoch": 0.7115128955065142, "grad_norm": 0.06430601328611374, "learning_rate": 9.406508125372034e-06, "loss": 0.7262794971466064, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.06737, "step": 669, "tokens/total": 175374336, "tokens/train_per_sec_per_gpu": 186.94, "tokens/trainable": 14664035 }, { "epoch": 0.7125764424355224, "grad_norm": 0.06467308104038239, "learning_rate": 9.343660198089072e-06, "loss": 0.751503586769104, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12019, "step": 670, "tokens/total": 175636480, "tokens/train_per_sec_per_gpu": 171.75, "tokens/trainable": 14684884 }, { "epoch": 0.7136399893645307, "grad_norm": 0.07673154026269913, "learning_rate": 9.280958873006032e-06, "loss": 0.771912693977356, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.1639, "step": 671, "tokens/total": 175898624, "tokens/train_per_sec_per_gpu": 185.49, "tokens/trainable": 14705843 }, { "epoch": 0.714703536293539, "grad_norm": 0.06743910163640976, "learning_rate": 9.21840501272228e-06, "loss": 0.7066195011138916, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.02713, "step": 672, "tokens/total": 176160768, "tokens/train_per_sec_per_gpu": 202.95, "tokens/trainable": 14728558 }, { "epoch": 0.7157670832225472, "grad_norm": 0.07574694603681564, "learning_rate": 9.155999477808503e-06, "loss": 0.7232800126075745, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.06118, "step": 673, "tokens/total": 176422912, "tokens/train_per_sec_per_gpu": 203.45, "tokens/trainable": 14748880 }, { "epoch": 0.7168306301515555, "grad_norm": 0.06880473345518112, "learning_rate": 9.093743126794818e-06, "loss": 0.7625008225440979, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.14363, "step": 674, "tokens/total": 176685056, "tokens/train_per_sec_per_gpu": 202.57, "tokens/trainable": 14770901 }, { "epoch": 0.7178941770805637, "grad_norm": 0.07242193818092346, "learning_rate": 9.031636816158974e-06, "loss": 0.7549704909324646, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12755, "step": 675, "tokens/total": 176947200, "tokens/train_per_sec_per_gpu": 170.41, "tokens/trainable": 14791194 }, { "epoch": 0.7189577240095719, "grad_norm": 0.07365458458662033, "learning_rate": 8.969681400314589e-06, "loss": 0.8114491701126099, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.25117, "step": 676, "tokens/total": 177209344, "tokens/train_per_sec_per_gpu": 170.56, "tokens/trainable": 14811680 }, { "epoch": 0.7200212709385801, "grad_norm": 0.08069625496864319, "learning_rate": 8.907877731599372e-06, "loss": 0.7868974208831787, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19657, "step": 677, "tokens/total": 177471488, "tokens/train_per_sec_per_gpu": 214.41, "tokens/trainable": 14831836 }, { "epoch": 0.7210848178675884, "grad_norm": 0.07136721163988113, "learning_rate": 8.846226660263415e-06, "loss": 0.7520813345909119, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12141, "step": 678, "tokens/total": 177733632, "tokens/train_per_sec_per_gpu": 165.9, "tokens/trainable": 14852202 }, { "epoch": 0.7221483647965966, "grad_norm": 0.06927739828824997, "learning_rate": 8.78472903445746e-06, "loss": 0.7342413067817688, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.0839, "step": 679, "tokens/total": 177995776, "tokens/train_per_sec_per_gpu": 199.05, "tokens/trainable": 14873216 }, { "epoch": 0.7232119117256048, "grad_norm": 0.06386187672615051, "learning_rate": 8.723385700221288e-06, "loss": 0.7768787145614624, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.17467, "step": 680, "tokens/total": 178257920, "tokens/train_per_sec_per_gpu": 263.1, "tokens/trainable": 14895853 }, { "epoch": 0.7242754586546132, "grad_norm": 0.08089049160480499, "learning_rate": 8.662197501472016e-06, "loss": 0.7622289061546326, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.14305, "step": 681, "tokens/total": 178520064, "tokens/train_per_sec_per_gpu": 179.05, "tokens/trainable": 14917187 }, { "epoch": 0.7253390055836214, "grad_norm": 0.07812239229679108, "learning_rate": 8.601165279992549e-06, "loss": 0.7714396715164185, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.16288, "step": 682, "tokens/total": 178782208, "tokens/train_per_sec_per_gpu": 157.1, "tokens/trainable": 14936575 }, { "epoch": 0.7264025525126296, "grad_norm": 0.07455753535032272, "learning_rate": 8.540289875419962e-06, "loss": 0.763314962387085, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.14538, "step": 683, "tokens/total": 179044352, "tokens/train_per_sec_per_gpu": 199.99, "tokens/trainable": 14959566 }, { "epoch": 0.7274660994416379, "grad_norm": 0.06646628677845001, "learning_rate": 8.47957212523394e-06, "loss": 0.7563636302947998, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13051, "step": 684, "tokens/total": 179306496, "tokens/train_per_sec_per_gpu": 202.47, "tokens/trainable": 14981954 }, { "epoch": 0.7285296463706461, "grad_norm": 0.08660077303647995, "learning_rate": 8.419012864745297e-06, "loss": 0.7826834917068481, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.18733, "step": 685, "tokens/total": 179568640, "tokens/train_per_sec_per_gpu": 169.37, "tokens/trainable": 15001703 }, { "epoch": 0.7295931932996543, "grad_norm": 0.07473118603229523, "learning_rate": 8.358612927084435e-06, "loss": 0.8132219910621643, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.25516, "step": 686, "tokens/total": 179830784, "tokens/train_per_sec_per_gpu": 176.67, "tokens/trainable": 15024105 }, { "epoch": 0.7306567402286626, "grad_norm": 0.06741170585155487, "learning_rate": 8.29837314318993e-06, "loss": 0.6950328946113586, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.00377, "step": 687, "tokens/total": 180092928, "tokens/train_per_sec_per_gpu": 190.47, "tokens/trainable": 15045856 }, { "epoch": 0.7317202871576708, "grad_norm": 0.07606185227632523, "learning_rate": 8.23829434179707e-06, "loss": 0.8332937955856323, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.30088, "step": 688, "tokens/total": 180355072, "tokens/train_per_sec_per_gpu": 194.57, "tokens/trainable": 15067803 }, { "epoch": 0.732783834086679, "grad_norm": 0.07419081032276154, "learning_rate": 8.178377349426471e-06, "loss": 0.81829434633255, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.26663, "step": 689, "tokens/total": 180617216, "tokens/train_per_sec_per_gpu": 179.35, "tokens/trainable": 15090573 }, { "epoch": 0.7338473810156874, "grad_norm": 0.0726039931178093, "learning_rate": 8.118622990372676e-06, "loss": 0.8616635203361511, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.3671, "step": 690, "tokens/total": 180879360, "tokens/train_per_sec_per_gpu": 200.44, "tokens/trainable": 15112666 }, { "epoch": 0.7349109279446956, "grad_norm": 0.07402276247739792, "learning_rate": 8.059032086692864e-06, "loss": 0.7540690302848816, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12563, "step": 691, "tokens/total": 181141504, "tokens/train_per_sec_per_gpu": 168.0, "tokens/trainable": 15133098 }, { "epoch": 0.7359744748737038, "grad_norm": 0.07309621572494507, "learning_rate": 7.999605458195486e-06, "loss": 0.7695388793945312, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15877, "step": 692, "tokens/total": 181403648, "tokens/train_per_sec_per_gpu": 181.09, "tokens/trainable": 15153899 }, { "epoch": 0.737038021802712, "grad_norm": 0.06912367045879364, "learning_rate": 7.94034392242903e-06, "loss": 0.7535648345947266, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12456, "step": 693, "tokens/total": 181665792, "tokens/train_per_sec_per_gpu": 183.77, "tokens/trainable": 15176330 }, { "epoch": 0.7381015687317203, "grad_norm": 0.07674102485179901, "learning_rate": 7.881248294670771e-06, "loss": 0.7670722007751465, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15345, "step": 694, "tokens/total": 181927936, "tokens/train_per_sec_per_gpu": 172.46, "tokens/trainable": 15198450 }, { "epoch": 0.7391651156607285, "grad_norm": 0.07187400758266449, "learning_rate": 7.82231938791551e-06, "loss": 0.7742888331413269, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.16905, "step": 695, "tokens/total": 182190080, "tokens/train_per_sec_per_gpu": 209.81, "tokens/trainable": 15222309 }, { "epoch": 0.7402286625897367, "grad_norm": 0.07647348195314407, "learning_rate": 7.763558012864446e-06, "loss": 0.8115613460540771, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.25142, "step": 696, "tokens/total": 182452224, "tokens/train_per_sec_per_gpu": 160.85, "tokens/trainable": 15245790 }, { "epoch": 0.741292209518745, "grad_norm": 0.07111165672540665, "learning_rate": 7.704964977913984e-06, "loss": 0.8241356611251831, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.27991, "step": 697, "tokens/total": 182714368, "tokens/train_per_sec_per_gpu": 222.33, "tokens/trainable": 15268790 }, { "epoch": 0.7423557564477533, "grad_norm": 0.07145415991544724, "learning_rate": 7.646541089144638e-06, "loss": 0.7397055625915527, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09532, "step": 698, "tokens/total": 182976512, "tokens/train_per_sec_per_gpu": 197.27, "tokens/trainable": 15289822 }, { "epoch": 0.7434193033767615, "grad_norm": 0.08257947117090225, "learning_rate": 7.588287150309928e-06, "loss": 0.7899596691131592, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.20331, "step": 699, "tokens/total": 183238656, "tokens/train_per_sec_per_gpu": 172.65, "tokens/trainable": 15310952 }, { "epoch": 0.7444828503057698, "grad_norm": 0.07342710345983505, "learning_rate": 7.530203962825331e-06, "loss": 0.7988142371177673, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.2229, "step": 700, "tokens/total": 183500800, "tokens/train_per_sec_per_gpu": 202.61, "tokens/trainable": 15335255 }, { "epoch": 0.745546397234778, "grad_norm": 0.07142513245344162, "learning_rate": 7.4722923257572335e-06, "loss": 0.8055736422538757, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.23798, "step": 701, "tokens/total": 183762944, "tokens/train_per_sec_per_gpu": 234.19, "tokens/trainable": 15359283 }, { "epoch": 0.7466099441637862, "grad_norm": 0.0752025619149208, "learning_rate": 7.414553035811978e-06, "loss": 0.7718970775604248, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.16387, "step": 702, "tokens/total": 184025088, "tokens/train_per_sec_per_gpu": 222.69, "tokens/trainable": 15380449 }, { "epoch": 0.7476734910927945, "grad_norm": 0.07547293603420258, "learning_rate": 7.35698688732486e-06, "loss": 0.7342941761016846, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.08401, "step": 703, "tokens/total": 184287232, "tokens/train_per_sec_per_gpu": 152.07, "tokens/trainable": 15400504 }, { "epoch": 0.7487370380218027, "grad_norm": 0.06701923161745071, "learning_rate": 7.299594672249231e-06, "loss": 0.8081640005111694, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.24378, "step": 704, "tokens/total": 184549376, "tokens/train_per_sec_per_gpu": 185.98, "tokens/trainable": 15423145 }, { "epoch": 0.7498005849508109, "grad_norm": 0.06758899241685867, "learning_rate": 7.242377180145603e-06, "loss": 0.7538725733757019, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12521, "step": 705, "tokens/total": 184811520, "tokens/train_per_sec_per_gpu": 169.11, "tokens/trainable": 15446585 }, { "epoch": 0.7508641318798192, "grad_norm": 0.07745972275733948, "learning_rate": 7.1853351981707504e-06, "loss": 0.8155503273010254, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.26042, "step": 706, "tokens/total": 185073664, "tokens/train_per_sec_per_gpu": 202.79, "tokens/trainable": 15467738 }, { "epoch": 0.7519276788088275, "grad_norm": 0.06426619738340378, "learning_rate": 7.128469511066933e-06, "loss": 0.7379392385482788, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.09162, "step": 707, "tokens/total": 185335808, "tokens/train_per_sec_per_gpu": 199.47, "tokens/trainable": 15490043 }, { "epoch": 0.7529912257378357, "grad_norm": 0.07671428471803665, "learning_rate": 7.071780901151049e-06, "loss": 0.8222740292549133, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.27567, "step": 708, "tokens/total": 185597952, "tokens/train_per_sec_per_gpu": 187.24, "tokens/trainable": 15512912 }, { "epoch": 0.754054772666844, "grad_norm": 0.07356390357017517, "learning_rate": 7.015270148303919e-06, "loss": 0.7136031985282898, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.04133, "step": 709, "tokens/total": 185860096, "tokens/train_per_sec_per_gpu": 188.47, "tokens/trainable": 15532378 }, { "epoch": 0.7551183195958522, "grad_norm": 0.07700134813785553, "learning_rate": 6.958938029959508e-06, "loss": 0.7238497734069824, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.06236, "step": 710, "tokens/total": 186122240, "tokens/train_per_sec_per_gpu": 203.44, "tokens/trainable": 15553126 }, { "epoch": 0.7561818665248604, "grad_norm": 0.06828487664461136, "learning_rate": 6.902785321094301e-06, "loss": 0.7130833864212036, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.04027, "step": 711, "tokens/total": 186384384, "tokens/train_per_sec_per_gpu": 226.96, "tokens/trainable": 15575744 }, { "epoch": 0.7572454134538686, "grad_norm": 0.07853987067937851, "learning_rate": 6.846812794216546e-06, "loss": 0.7818912267684937, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.1856, "step": 712, "tokens/total": 186646528, "tokens/train_per_sec_per_gpu": 165.77, "tokens/trainable": 15595629 }, { "epoch": 0.7583089603828769, "grad_norm": 0.08049149066209793, "learning_rate": 6.791021219355722e-06, "loss": 0.860575795173645, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.36452, "step": 713, "tokens/total": 186908672, "tokens/train_per_sec_per_gpu": 197.57, "tokens/trainable": 15617377 }, { "epoch": 0.7593725073118851, "grad_norm": 0.07803945243358612, "learning_rate": 6.735411364051865e-06, "loss": 0.8134850263595581, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.25576, "step": 714, "tokens/total": 187170816, "tokens/train_per_sec_per_gpu": 170.03, "tokens/trainable": 15638182 }, { "epoch": 0.7604360542408933, "grad_norm": 0.07364539057016373, "learning_rate": 6.679983993345067e-06, "loss": 0.7291663885116577, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.07335, "step": 715, "tokens/total": 187432960, "tokens/train_per_sec_per_gpu": 165.64, "tokens/trainable": 15658499 }, { "epoch": 0.7614996011699017, "grad_norm": 0.06459354609251022, "learning_rate": 6.624739869764931e-06, "loss": 0.6955982446670532, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.00491, "step": 716, "tokens/total": 187695104, "tokens/train_per_sec_per_gpu": 218.83, "tokens/trainable": 15681617 }, { "epoch": 0.7625631480989099, "grad_norm": 0.07153891026973724, "learning_rate": 6.5696797533200596e-06, "loss": 0.7669087648391724, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.1531, "step": 717, "tokens/total": 187957248, "tokens/train_per_sec_per_gpu": 215.72, "tokens/trainable": 15703269 }, { "epoch": 0.7636266950279181, "grad_norm": 0.0816182941198349, "learning_rate": 6.514804401487642e-06, "loss": 0.7865086197853088, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.19572, "step": 718, "tokens/total": 188219392, "tokens/train_per_sec_per_gpu": 197.11, "tokens/trainable": 15723336 }, { "epoch": 0.7646902419569264, "grad_norm": 0.07151731103658676, "learning_rate": 6.460114569202989e-06, "loss": 0.7884716987609863, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.20003, "step": 719, "tokens/total": 188481536, "tokens/train_per_sec_per_gpu": 194.18, "tokens/trainable": 15744945 }, { "epoch": 0.7657537888859346, "grad_norm": 0.08032439649105072, "learning_rate": 6.405611008849184e-06, "loss": 0.7517272233963013, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.12066, "step": 720, "tokens/total": 188743680, "tokens/train_per_sec_per_gpu": 182.56, "tokens/trainable": 15766209 }, { "epoch": 0.7668173358149428, "grad_norm": 0.0685308575630188, "learning_rate": 6.351294470246694e-06, "loss": 0.710903525352478, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.03583, "step": 721, "tokens/total": 189005824, "tokens/train_per_sec_per_gpu": 177.14, "tokens/trainable": 15790422 }, { "epoch": 0.767880882743951, "grad_norm": 0.06389962136745453, "learning_rate": 6.2971657006431175e-06, "loss": 0.6497384905815125, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 1.91504, "step": 722, "tokens/total": 189267968, "tokens/train_per_sec_per_gpu": 211.94, "tokens/trainable": 15812341 }, { "epoch": 0.7689444296729593, "grad_norm": 0.08121001720428467, "learning_rate": 6.243225444702823e-06, "loss": 0.7910786867141724, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.20577, "step": 723, "tokens/total": 189530112, "tokens/train_per_sec_per_gpu": 167.38, "tokens/trainable": 15832072 }, { "epoch": 0.7700079766019675, "grad_norm": 0.07116419076919556, "learning_rate": 6.1894744444967525e-06, "loss": 0.820841372013092, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.27241, "step": 724, "tokens/total": 189792256, "tokens/train_per_sec_per_gpu": 226.98, "tokens/trainable": 15855273 }, { "epoch": 0.7710715235309759, "grad_norm": 0.07092837989330292, "learning_rate": 6.135913439492227e-06, "loss": 0.7487311363220215, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11432, "step": 725, "tokens/total": 190054400, "tokens/train_per_sec_per_gpu": 171.4, "tokens/trainable": 15877353 }, { "epoch": 0.7721350704599841, "grad_norm": 0.07904180139303207, "learning_rate": 6.0825431665427185e-06, "loss": 0.779388427734375, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.18014, "step": 726, "tokens/total": 190316544, "tokens/train_per_sec_per_gpu": 166.96, "tokens/trainable": 15899580 }, { "epoch": 0.7731986173889923, "grad_norm": 0.08124149590730667, "learning_rate": 6.029364359877772e-06, "loss": 0.7941169738769531, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.21249, "step": 727, "tokens/total": 190578688, "tokens/train_per_sec_per_gpu": 164.05, "tokens/trainable": 15919312 }, { "epoch": 0.7742621643180005, "grad_norm": 0.07375580817461014, "learning_rate": 5.976377751092867e-06, "loss": 0.7667893171310425, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15284, "step": 728, "tokens/total": 190840832, "tokens/train_per_sec_per_gpu": 183.39, "tokens/trainable": 15940854 }, { "epoch": 0.7753257112470088, "grad_norm": 0.07292015105485916, "learning_rate": 5.923584069139372e-06, "loss": 0.7329660654067993, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.08124, "step": 729, "tokens/total": 191102976, "tokens/train_per_sec_per_gpu": 179.98, "tokens/trainable": 15960364 }, { "epoch": 0.776389258176017, "grad_norm": 0.07190407067537308, "learning_rate": 5.870984040314491e-06, "loss": 0.7781997919082642, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.17755, "step": 730, "tokens/total": 191365120, "tokens/train_per_sec_per_gpu": 181.25, "tokens/trainable": 15981713 }, { "epoch": 0.7774528051050252, "grad_norm": 0.08158424496650696, "learning_rate": 5.81857838825131e-06, "loss": 0.7884747385978699, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.20004, "step": 731, "tokens/total": 191627264, "tokens/train_per_sec_per_gpu": 192.3, "tokens/trainable": 16002933 }, { "epoch": 0.7785163520340335, "grad_norm": 0.06751953810453415, "learning_rate": 5.7663678339087995e-06, "loss": 0.7473230361938477, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.11134, "step": 732, "tokens/total": 191889408, "tokens/train_per_sec_per_gpu": 204.72, "tokens/trainable": 16023922 }, { "epoch": 0.7795798989630417, "grad_norm": 0.07681619375944138, "learning_rate": 5.714353095561929e-06, "loss": 0.669176459312439, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 1.95263, "step": 733, "tokens/total": 192151552, "tokens/train_per_sec_per_gpu": 214.29, "tokens/trainable": 16047794 }, { "epoch": 0.78064344589205, "grad_norm": 0.07466083765029907, "learning_rate": 5.66253488879178e-06, "loss": 0.7656868696212769, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15047, "step": 734, "tokens/total": 192413696, "tokens/train_per_sec_per_gpu": 172.48, "tokens/trainable": 16067607 }, { "epoch": 0.7817069928210583, "grad_norm": 0.07689571380615234, "learning_rate": 5.6109139264756715e-06, "loss": 0.782907247543335, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.18782, "step": 735, "tokens/total": 192675840, "tokens/train_per_sec_per_gpu": 141.32, "tokens/trainable": 16086585 }, { "epoch": 0.7827705397500665, "grad_norm": 0.06553677469491959, "learning_rate": 5.55949091877741e-06, "loss": 0.805263876914978, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.23729, "step": 736, "tokens/total": 192937984, "tokens/train_per_sec_per_gpu": 213.73, "tokens/trainable": 16110209 }, { "epoch": 0.7838340866790747, "grad_norm": 0.07252180576324463, "learning_rate": 5.508266573137449e-06, "loss": 0.7305166125297546, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.07615, "step": 737, "tokens/total": 193200128, "tokens/train_per_sec_per_gpu": 189.19, "tokens/trainable": 16132001 }, { "epoch": 0.784897633608083, "grad_norm": 0.08187612891197205, "learning_rate": 5.457241594263219e-06, "loss": 0.8013345003128052, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.22851, "step": 738, "tokens/total": 193462272, "tokens/train_per_sec_per_gpu": 196.91, "tokens/trainable": 16152126 }, { "epoch": 0.7859611805370912, "grad_norm": 0.06783902645111084, "learning_rate": 5.4064166841194e-06, "loss": 0.7070966958999634, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.02809, "step": 739, "tokens/total": 193724416, "tokens/train_per_sec_per_gpu": 187.99, "tokens/trainable": 16173209 }, { "epoch": 0.7870247274660994, "grad_norm": 0.07314567267894745, "learning_rate": 5.3557925419182764e-06, "loss": 0.7426178455352783, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.10143, "step": 740, "tokens/total": 193986560, "tokens/train_per_sec_per_gpu": 192.09, "tokens/trainable": 16193859 }, { "epoch": 0.7880882743951076, "grad_norm": 0.07094215601682663, "learning_rate": 5.305369864110095e-06, "loss": 0.7928224802017212, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.20962, "step": 741, "tokens/total": 194248704, "tokens/train_per_sec_per_gpu": 206.94, "tokens/trainable": 16217465 }, { "epoch": 0.7891518213241159, "grad_norm": 0.07578529417514801, "learning_rate": 5.255149344373525e-06, "loss": 0.7676488161087036, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.15469, "step": 742, "tokens/total": 194510848, "tokens/train_per_sec_per_gpu": 209.86, "tokens/trainable": 16237798 }, { "epoch": 0.7902153682531242, "grad_norm": 0.07046926766633987, "learning_rate": 5.205131673606072e-06, "loss": 0.7564293742179871, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13065, "step": 743, "tokens/total": 194772992, "tokens/train_per_sec_per_gpu": 188.6, "tokens/trainable": 16259923 }, { "epoch": 0.7912789151821324, "grad_norm": 0.07736487686634064, "learning_rate": 5.155317539914601e-06, "loss": 0.8347206115722656, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.30417, "step": 744, "tokens/total": 195035136, "tokens/train_per_sec_per_gpu": 185.07, "tokens/trainable": 16281319 }, { "epoch": 0.7923424621111407, "grad_norm": 0.07465264946222305, "learning_rate": 5.105707628605872e-06, "loss": 0.7569116950035095, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.13168, "step": 745, "tokens/total": 195297280, "tokens/train_per_sec_per_gpu": 208.28, "tokens/trainable": 16304177 }, { "epoch": 0.7934060090401489, "grad_norm": 0.07319594919681549, "learning_rate": 5.056302622177074e-06, "loss": 0.8522895574569702, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.34501, "step": 746, "tokens/total": 195559424, "tokens/train_per_sec_per_gpu": 183.83, "tokens/trainable": 16325635 }, { "epoch": 0.7944695559691571, "grad_norm": 0.06852512806653976, "learning_rate": 5.007103200306493e-06, "loss": 0.6919762492179871, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 1.99766, "step": 747, "tokens/total": 195821568, "tokens/train_per_sec_per_gpu": 169.51, "tokens/trainable": 16347890 }, { "epoch": 0.7955331028981654, "grad_norm": 0.06914964318275452, "learning_rate": 4.958110039844109e-06, "loss": 0.7257647514343262, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.06631, "step": 748, "tokens/total": 196083712, "tokens/train_per_sec_per_gpu": 180.18, "tokens/trainable": 16370035 }, { "epoch": 0.7965966498271736, "grad_norm": 0.08356507867574692, "learning_rate": 4.9093238148023206e-06, "loss": 0.8562688827514648, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.35436, "step": 749, "tokens/total": 196345856, "tokens/train_per_sec_per_gpu": 163.34, "tokens/trainable": 16389246 }, { "epoch": 0.7976601967561818, "grad_norm": 0.06851000338792801, "learning_rate": 4.860745196346652e-06, "loss": 0.732913076877594, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 46.14, "memory/max_allocated (GiB)": 46.14, "ppl": 2.08113, "step": 750, "tokens/total": 196608000, "tokens/train_per_sec_per_gpu": 191.77, "tokens/trainable": 16410595 }, { "epoch": 0.7976601967561818, "eval_loss": 0.7697240710258484, "eval_ppl": 2.15917, "eval_runtime": 237.415, "eval_samples_per_second": 28.162, "eval_steps_per_second": 1.761, "memory/device_reserved (GiB)": 52.46, "memory/max_active (GiB)": 38.19, "memory/max_allocated (GiB)": 38.19, "step": 750 } ], "logging_steps": 1, "max_steps": 941, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 150, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1000, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.388464118595584e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }