felixwangg's picture
Upload folder using huggingface_hub
2360457 verified
Raw
History Blame Contribute Delete
341 kB
{
"best_global_step": 750,
"best_metric": 0.7697240710258484,
"best_model_checkpoint": "/home/tkwang/scratch/SecSteer-v2/axolotl-outputs/lora/Qwen2.5-Coder-7B-evol-stage1/checkpoint-750",
"epoch": 0.7976601967561818,
"eval_steps": 150,
"global_step": 750,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0,
"eval_loss": 0.804951012134552,
"eval_ppl": 2.23659,
"eval_runtime": 237.4236,
"eval_samples_per_second": 28.161,
"eval_steps_per_second": 1.761,
"memory/device_reserved (GiB)": 41.82,
"memory/max_active (GiB)": 37.85,
"memory/max_allocated (GiB)": 37.85,
"step": 0
},
{
"epoch": 0.0010635469290082426,
"grad_norm": 0.0461450070142746,
"learning_rate": 0.0,
"loss": 0.7683508396148682,
"memory/device_reserved (GiB)": 51.3,
"memory/max_active (GiB)": 45.83,
"memory/max_allocated (GiB)": 45.83,
"ppl": 2.15621,
"step": 1,
"tokens/total": 262144,
"tokens/train_per_sec_per_gpu": 143.71,
"tokens/trainable": 21968
},
{
"epoch": 0.002127093858016485,
"grad_norm": 0.053678449243307114,
"learning_rate": 4.2553191489361704e-07,
"loss": 0.8233645558357239,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.27815,
"step": 2,
"tokens/total": 524288,
"tokens/train_per_sec_per_gpu": 206.75,
"tokens/trainable": 44841
},
{
"epoch": 0.0031906407870247273,
"grad_norm": 0.05005470663309097,
"learning_rate": 8.510638297872341e-07,
"loss": 0.8364737033843994,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.30821,
"step": 3,
"tokens/total": 786432,
"tokens/train_per_sec_per_gpu": 190.31,
"tokens/trainable": 67815
},
{
"epoch": 0.00425418771603297,
"grad_norm": 0.04871873930096626,
"learning_rate": 1.276595744680851e-06,
"loss": 0.7923524975776672,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.20859,
"step": 4,
"tokens/total": 1048576,
"tokens/train_per_sec_per_gpu": 172.64,
"tokens/trainable": 89239
},
{
"epoch": 0.0053177346450412125,
"grad_norm": 0.052344731986522675,
"learning_rate": 1.7021276595744682e-06,
"loss": 0.7642413973808289,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.14736,
"step": 5,
"tokens/total": 1310720,
"tokens/train_per_sec_per_gpu": 185.1,
"tokens/trainable": 109336
},
{
"epoch": 0.006381281574049455,
"grad_norm": 0.04826882481575012,
"learning_rate": 2.1276595744680853e-06,
"loss": 0.8659416437149048,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.37724,
"step": 6,
"tokens/total": 1572864,
"tokens/train_per_sec_per_gpu": 183.2,
"tokens/trainable": 130625
},
{
"epoch": 0.007444828503057698,
"grad_norm": 0.05040327087044716,
"learning_rate": 2.553191489361702e-06,
"loss": 0.8249338865280151,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.28173,
"step": 7,
"tokens/total": 1835008,
"tokens/train_per_sec_per_gpu": 209.68,
"tokens/trainable": 153061
},
{
"epoch": 0.00850837543206594,
"grad_norm": 0.05111980810761452,
"learning_rate": 2.978723404255319e-06,
"loss": 0.7967497706413269,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.21832,
"step": 8,
"tokens/total": 2097152,
"tokens/train_per_sec_per_gpu": 185.83,
"tokens/trainable": 173852
},
{
"epoch": 0.009571922361074183,
"grad_norm": 0.04268274083733559,
"learning_rate": 3.4042553191489363e-06,
"loss": 0.7484039068222046,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11362,
"step": 9,
"tokens/total": 2359296,
"tokens/train_per_sec_per_gpu": 214.69,
"tokens/trainable": 196255
},
{
"epoch": 0.010635469290082425,
"grad_norm": 0.04418900981545448,
"learning_rate": 3.8297872340425535e-06,
"loss": 0.8072069883346558,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.24164,
"step": 10,
"tokens/total": 2621440,
"tokens/train_per_sec_per_gpu": 168.89,
"tokens/trainable": 217227
},
{
"epoch": 0.011699016219090667,
"grad_norm": 0.04787033051252365,
"learning_rate": 4.255319148936171e-06,
"loss": 0.7684656381607056,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15645,
"step": 11,
"tokens/total": 2883584,
"tokens/train_per_sec_per_gpu": 187.36,
"tokens/trainable": 240990
},
{
"epoch": 0.01276256314809891,
"grad_norm": 0.04942560940980911,
"learning_rate": 4.680851063829788e-06,
"loss": 0.8147498369216919,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.25861,
"step": 12,
"tokens/total": 3145728,
"tokens/train_per_sec_per_gpu": 146.78,
"tokens/trainable": 263455
},
{
"epoch": 0.013826110077107153,
"grad_norm": 0.05288751795887947,
"learning_rate": 5.106382978723404e-06,
"loss": 0.8472910523414612,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.33332,
"step": 13,
"tokens/total": 3407872,
"tokens/train_per_sec_per_gpu": 179.19,
"tokens/trainable": 286052
},
{
"epoch": 0.014889657006115395,
"grad_norm": 0.05669346824288368,
"learning_rate": 5.531914893617022e-06,
"loss": 0.7845062017440796,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19132,
"step": 14,
"tokens/total": 3670016,
"tokens/train_per_sec_per_gpu": 147.62,
"tokens/trainable": 305257
},
{
"epoch": 0.015953203935123637,
"grad_norm": 0.04507856070995331,
"learning_rate": 5.957446808510638e-06,
"loss": 0.7491350769996643,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11517,
"step": 15,
"tokens/total": 3932160,
"tokens/train_per_sec_per_gpu": 254.85,
"tokens/trainable": 328628
},
{
"epoch": 0.01701675086413188,
"grad_norm": 0.04711790010333061,
"learning_rate": 6.382978723404256e-06,
"loss": 0.7844012975692749,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19109,
"step": 16,
"tokens/total": 4194304,
"tokens/train_per_sec_per_gpu": 210.21,
"tokens/trainable": 351162
},
{
"epoch": 0.01808029779314012,
"grad_norm": 0.04914192110300064,
"learning_rate": 6.808510638297873e-06,
"loss": 0.8149253129959106,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.25901,
"step": 17,
"tokens/total": 4456448,
"tokens/train_per_sec_per_gpu": 172.85,
"tokens/trainable": 373818
},
{
"epoch": 0.019143844722148366,
"grad_norm": 0.057912107557058334,
"learning_rate": 7.234042553191491e-06,
"loss": 0.7840430736541748,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19031,
"step": 18,
"tokens/total": 4718592,
"tokens/train_per_sec_per_gpu": 154.54,
"tokens/trainable": 393528
},
{
"epoch": 0.020207391651156606,
"grad_norm": 0.05160650238394737,
"learning_rate": 7.659574468085107e-06,
"loss": 0.8152034282684326,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.25964,
"step": 19,
"tokens/total": 4980736,
"tokens/train_per_sec_per_gpu": 188.44,
"tokens/trainable": 415305
},
{
"epoch": 0.02127093858016485,
"grad_norm": 0.052523426711559296,
"learning_rate": 8.085106382978723e-06,
"loss": 0.816782534122467,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.26321,
"step": 20,
"tokens/total": 5242880,
"tokens/train_per_sec_per_gpu": 223.4,
"tokens/trainable": 438205
},
{
"epoch": 0.022334485509173094,
"grad_norm": 0.06776182353496552,
"learning_rate": 8.510638297872341e-06,
"loss": 0.8410882949829102,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.31889,
"step": 21,
"tokens/total": 5505024,
"tokens/train_per_sec_per_gpu": 177.12,
"tokens/trainable": 457521
},
{
"epoch": 0.023398032438181334,
"grad_norm": 0.05495529994368553,
"learning_rate": 8.936170212765958e-06,
"loss": 0.7947180271148682,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.21382,
"step": 22,
"tokens/total": 5767168,
"tokens/train_per_sec_per_gpu": 175.57,
"tokens/trainable": 478475
},
{
"epoch": 0.024461579367189578,
"grad_norm": 0.061899591237306595,
"learning_rate": 9.361702127659576e-06,
"loss": 0.8852798342704773,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.42366,
"step": 23,
"tokens/total": 6029312,
"tokens/train_per_sec_per_gpu": 142.65,
"tokens/trainable": 498659
},
{
"epoch": 0.02552512629619782,
"grad_norm": 0.0498763844370842,
"learning_rate": 9.787234042553192e-06,
"loss": 0.672831654548645,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 1.95978,
"step": 24,
"tokens/total": 6291456,
"tokens/train_per_sec_per_gpu": 214.6,
"tokens/trainable": 521341
},
{
"epoch": 0.026588673225206062,
"grad_norm": 0.056778695434331894,
"learning_rate": 1.0212765957446808e-05,
"loss": 0.7445163726806641,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10542,
"step": 25,
"tokens/total": 6553600,
"tokens/train_per_sec_per_gpu": 191.7,
"tokens/trainable": 542902
},
{
"epoch": 0.027652220154214306,
"grad_norm": 0.05652826279401779,
"learning_rate": 1.0638297872340426e-05,
"loss": 0.7965201139450073,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.21781,
"step": 26,
"tokens/total": 6815744,
"tokens/train_per_sec_per_gpu": 173.66,
"tokens/trainable": 565280
},
{
"epoch": 0.028715767083222547,
"grad_norm": 0.05963267385959625,
"learning_rate": 1.1063829787234044e-05,
"loss": 0.8641867637634277,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.37308,
"step": 27,
"tokens/total": 7077888,
"tokens/train_per_sec_per_gpu": 197.23,
"tokens/trainable": 587539
},
{
"epoch": 0.02977931401223079,
"grad_norm": 0.06136506423354149,
"learning_rate": 1.1489361702127662e-05,
"loss": 0.8187180161476135,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.26759,
"step": 28,
"tokens/total": 7340032,
"tokens/train_per_sec_per_gpu": 222.81,
"tokens/trainable": 609839
},
{
"epoch": 0.03084286094123903,
"grad_norm": 0.0565866194665432,
"learning_rate": 1.1914893617021277e-05,
"loss": 0.8024689555168152,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.23104,
"step": 29,
"tokens/total": 7602176,
"tokens/train_per_sec_per_gpu": 204.38,
"tokens/trainable": 631280
},
{
"epoch": 0.031906407870247275,
"grad_norm": 0.06343540549278259,
"learning_rate": 1.2340425531914895e-05,
"loss": 0.7662019729614258,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15158,
"step": 30,
"tokens/total": 7864320,
"tokens/train_per_sec_per_gpu": 228.13,
"tokens/trainable": 652766
},
{
"epoch": 0.03296995479925552,
"grad_norm": 0.056531310081481934,
"learning_rate": 1.2765957446808513e-05,
"loss": 0.8442375659942627,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.3262,
"step": 31,
"tokens/total": 8126464,
"tokens/train_per_sec_per_gpu": 190.64,
"tokens/trainable": 674612
},
{
"epoch": 0.03403350172826376,
"grad_norm": 0.06304491311311722,
"learning_rate": 1.3191489361702127e-05,
"loss": 0.8129785060882568,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.25461,
"step": 32,
"tokens/total": 8388608,
"tokens/train_per_sec_per_gpu": 214.77,
"tokens/trainable": 695779
},
{
"epoch": 0.035097048657272,
"grad_norm": 0.05908438190817833,
"learning_rate": 1.3617021276595745e-05,
"loss": 0.7475928068161011,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11191,
"step": 33,
"tokens/total": 8650752,
"tokens/train_per_sec_per_gpu": 177.7,
"tokens/trainable": 716063
},
{
"epoch": 0.03616059558628024,
"grad_norm": 0.049326092004776,
"learning_rate": 1.4042553191489363e-05,
"loss": 0.7267792224884033,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.06841,
"step": 34,
"tokens/total": 8912896,
"tokens/train_per_sec_per_gpu": 201.94,
"tokens/trainable": 737753
},
{
"epoch": 0.03722414251528849,
"grad_norm": 0.04339035972952843,
"learning_rate": 1.4468085106382981e-05,
"loss": 0.8321285247802734,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.29821,
"step": 35,
"tokens/total": 9175040,
"tokens/train_per_sec_per_gpu": 238.51,
"tokens/trainable": 761270
},
{
"epoch": 0.03828768944429673,
"grad_norm": 0.040435630828142166,
"learning_rate": 1.4893617021276596e-05,
"loss": 0.7482062578201294,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11321,
"step": 36,
"tokens/total": 9437184,
"tokens/train_per_sec_per_gpu": 235.13,
"tokens/trainable": 783478
},
{
"epoch": 0.039351236373304975,
"grad_norm": 0.04054463654756546,
"learning_rate": 1.5319148936170214e-05,
"loss": 0.7387034296989441,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09322,
"step": 37,
"tokens/total": 9699328,
"tokens/train_per_sec_per_gpu": 173.41,
"tokens/trainable": 805224
},
{
"epoch": 0.04041478330231321,
"grad_norm": 0.03981109336018562,
"learning_rate": 1.5744680851063832e-05,
"loss": 0.7435116767883301,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10331,
"step": 38,
"tokens/total": 9961472,
"tokens/train_per_sec_per_gpu": 185.32,
"tokens/trainable": 828977
},
{
"epoch": 0.041478330231321456,
"grad_norm": 0.04639929160475731,
"learning_rate": 1.6170212765957446e-05,
"loss": 0.8203743696212769,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.27135,
"step": 39,
"tokens/total": 10223616,
"tokens/train_per_sec_per_gpu": 183.52,
"tokens/trainable": 850008
},
{
"epoch": 0.0425418771603297,
"grad_norm": 0.04261818155646324,
"learning_rate": 1.6595744680851064e-05,
"loss": 0.8682478070259094,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.38273,
"step": 40,
"tokens/total": 10485760,
"tokens/train_per_sec_per_gpu": 219.05,
"tokens/trainable": 872900
},
{
"epoch": 0.043605424089337944,
"grad_norm": 0.04111519455909729,
"learning_rate": 1.7021276595744682e-05,
"loss": 0.7811744213104248,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.18404,
"step": 41,
"tokens/total": 10747904,
"tokens/train_per_sec_per_gpu": 204.48,
"tokens/trainable": 895505
},
{
"epoch": 0.04466897101834619,
"grad_norm": 0.03414495289325714,
"learning_rate": 1.74468085106383e-05,
"loss": 0.8004995584487915,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.22665,
"step": 42,
"tokens/total": 11010048,
"tokens/train_per_sec_per_gpu": 198.04,
"tokens/trainable": 917814
},
{
"epoch": 0.045732517947354424,
"grad_norm": 0.030925795435905457,
"learning_rate": 1.7872340425531915e-05,
"loss": 0.7756137251853943,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.17192,
"step": 43,
"tokens/total": 11272192,
"tokens/train_per_sec_per_gpu": 195.24,
"tokens/trainable": 939811
},
{
"epoch": 0.04679606487636267,
"grad_norm": 0.026804521679878235,
"learning_rate": 1.8297872340425533e-05,
"loss": 0.6872485876083374,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 1.98824,
"step": 44,
"tokens/total": 11534336,
"tokens/train_per_sec_per_gpu": 228.04,
"tokens/trainable": 963796
},
{
"epoch": 0.04785961180537091,
"grad_norm": 0.02924325503408909,
"learning_rate": 1.872340425531915e-05,
"loss": 0.7919371128082275,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.20767,
"step": 45,
"tokens/total": 11796480,
"tokens/train_per_sec_per_gpu": 193.49,
"tokens/trainable": 984430
},
{
"epoch": 0.048923158734379156,
"grad_norm": 0.030018026009202003,
"learning_rate": 1.914893617021277e-05,
"loss": 0.7972186803817749,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.21936,
"step": 46,
"tokens/total": 12058624,
"tokens/train_per_sec_per_gpu": 180.38,
"tokens/trainable": 1004277
},
{
"epoch": 0.0499867056633874,
"grad_norm": 0.030266476795077324,
"learning_rate": 1.9574468085106384e-05,
"loss": 0.7901904582977295,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.20382,
"step": 47,
"tokens/total": 12320768,
"tokens/train_per_sec_per_gpu": 185.52,
"tokens/trainable": 1026347
},
{
"epoch": 0.05105025259239564,
"grad_norm": 0.028692839667201042,
"learning_rate": 2e-05,
"loss": 0.7853357791900635,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19314,
"step": 48,
"tokens/total": 12582912,
"tokens/train_per_sec_per_gpu": 167.93,
"tokens/trainable": 1048721
},
{
"epoch": 0.05211379952140388,
"grad_norm": 0.026348290964961052,
"learning_rate": 2.0425531914893616e-05,
"loss": 0.8077329397201538,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.24282,
"step": 49,
"tokens/total": 12845056,
"tokens/train_per_sec_per_gpu": 185.28,
"tokens/trainable": 1071780
},
{
"epoch": 0.053177346450412125,
"grad_norm": 0.029625559225678444,
"learning_rate": 2.0851063829787238e-05,
"loss": 0.7733415365219116,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.167,
"step": 50,
"tokens/total": 13107200,
"tokens/train_per_sec_per_gpu": 189.79,
"tokens/trainable": 1091760
},
{
"epoch": 0.05424089337942037,
"grad_norm": 0.027405593544244766,
"learning_rate": 2.1276595744680852e-05,
"loss": 0.7173340916633606,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.04896,
"step": 51,
"tokens/total": 13369344,
"tokens/train_per_sec_per_gpu": 188.87,
"tokens/trainable": 1113687
},
{
"epoch": 0.05530444030842861,
"grad_norm": 0.02946804091334343,
"learning_rate": 2.1702127659574467e-05,
"loss": 0.7727050185203552,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.16562,
"step": 52,
"tokens/total": 13631488,
"tokens/train_per_sec_per_gpu": 209.22,
"tokens/trainable": 1135391
},
{
"epoch": 0.05636798723743685,
"grad_norm": 0.02892529033124447,
"learning_rate": 2.2127659574468088e-05,
"loss": 0.8007056713104248,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.22711,
"step": 53,
"tokens/total": 13893632,
"tokens/train_per_sec_per_gpu": 202.13,
"tokens/trainable": 1159518
},
{
"epoch": 0.05743153416644509,
"grad_norm": 0.031362369656562805,
"learning_rate": 2.2553191489361703e-05,
"loss": 0.7692879438400269,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15823,
"step": 54,
"tokens/total": 14155776,
"tokens/train_per_sec_per_gpu": 151.27,
"tokens/trainable": 1178964
},
{
"epoch": 0.05849508109545334,
"grad_norm": 0.027873003855347633,
"learning_rate": 2.2978723404255324e-05,
"loss": 0.7864505052566528,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19559,
"step": 55,
"tokens/total": 14417920,
"tokens/train_per_sec_per_gpu": 213.4,
"tokens/trainable": 1201830
},
{
"epoch": 0.05955862802446158,
"grad_norm": 0.030442189425230026,
"learning_rate": 2.340425531914894e-05,
"loss": 0.7171883583068848,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.04866,
"step": 56,
"tokens/total": 14680064,
"tokens/train_per_sec_per_gpu": 163.06,
"tokens/trainable": 1221040
},
{
"epoch": 0.060622174953469825,
"grad_norm": 0.030432693660259247,
"learning_rate": 2.3829787234042553e-05,
"loss": 0.8123354911804199,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.25316,
"step": 57,
"tokens/total": 14942208,
"tokens/train_per_sec_per_gpu": 191.04,
"tokens/trainable": 1245037
},
{
"epoch": 0.06168572188247806,
"grad_norm": 0.030456526204943657,
"learning_rate": 2.4255319148936175e-05,
"loss": 0.7400133609771729,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09596,
"step": 58,
"tokens/total": 15204352,
"tokens/train_per_sec_per_gpu": 173.09,
"tokens/trainable": 1265410
},
{
"epoch": 0.0627492688114863,
"grad_norm": 0.028698932379484177,
"learning_rate": 2.468085106382979e-05,
"loss": 0.7431353330612183,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10252,
"step": 59,
"tokens/total": 15466496,
"tokens/train_per_sec_per_gpu": 204.25,
"tokens/trainable": 1287471
},
{
"epoch": 0.06381281574049455,
"grad_norm": 0.029805311933159828,
"learning_rate": 2.5106382978723404e-05,
"loss": 0.785997748374939,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.1946,
"step": 60,
"tokens/total": 15728640,
"tokens/train_per_sec_per_gpu": 224.54,
"tokens/trainable": 1309972
},
{
"epoch": 0.0648763626695028,
"grad_norm": 0.03344248980283737,
"learning_rate": 2.5531914893617025e-05,
"loss": 0.7236359119415283,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.06192,
"step": 61,
"tokens/total": 15990784,
"tokens/train_per_sec_per_gpu": 162.36,
"tokens/trainable": 1329223
},
{
"epoch": 0.06593990959851104,
"grad_norm": 0.03220194950699806,
"learning_rate": 2.595744680851064e-05,
"loss": 0.7307531833648682,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.07664,
"step": 62,
"tokens/total": 16252928,
"tokens/train_per_sec_per_gpu": 161.66,
"tokens/trainable": 1350124
},
{
"epoch": 0.06700345652751928,
"grad_norm": 0.032156504690647125,
"learning_rate": 2.6382978723404255e-05,
"loss": 0.8302059173583984,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.29379,
"step": 63,
"tokens/total": 16515072,
"tokens/train_per_sec_per_gpu": 174.14,
"tokens/trainable": 1372459
},
{
"epoch": 0.06806700345652753,
"grad_norm": 0.031544484198093414,
"learning_rate": 2.6808510638297876e-05,
"loss": 0.8317389488220215,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.29731,
"step": 64,
"tokens/total": 16777216,
"tokens/train_per_sec_per_gpu": 203.61,
"tokens/trainable": 1394135
},
{
"epoch": 0.06913055038553576,
"grad_norm": 0.028723040595650673,
"learning_rate": 2.723404255319149e-05,
"loss": 0.7596557140350342,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13754,
"step": 65,
"tokens/total": 17039360,
"tokens/train_per_sec_per_gpu": 175.65,
"tokens/trainable": 1416653
},
{
"epoch": 0.070194097314544,
"grad_norm": 0.03393164649605751,
"learning_rate": 2.7659574468085112e-05,
"loss": 0.8652482032775879,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.3756,
"step": 66,
"tokens/total": 17301504,
"tokens/train_per_sec_per_gpu": 195.94,
"tokens/trainable": 1437390
},
{
"epoch": 0.07125764424355224,
"grad_norm": 0.030688602477312088,
"learning_rate": 2.8085106382978727e-05,
"loss": 0.7935420274734497,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.21121,
"step": 67,
"tokens/total": 17563648,
"tokens/train_per_sec_per_gpu": 220.73,
"tokens/trainable": 1461784
},
{
"epoch": 0.07232119117256049,
"grad_norm": 0.03269756957888603,
"learning_rate": 2.851063829787234e-05,
"loss": 0.7466378211975098,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10989,
"step": 68,
"tokens/total": 17825792,
"tokens/train_per_sec_per_gpu": 172.64,
"tokens/trainable": 1482296
},
{
"epoch": 0.07338473810156873,
"grad_norm": 0.03261660039424896,
"learning_rate": 2.8936170212765963e-05,
"loss": 0.7684181928634644,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15635,
"step": 69,
"tokens/total": 18087936,
"tokens/train_per_sec_per_gpu": 153.82,
"tokens/trainable": 1503447
},
{
"epoch": 0.07444828503057697,
"grad_norm": 0.029810158535838127,
"learning_rate": 2.9361702127659577e-05,
"loss": 0.745851993560791,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10824,
"step": 70,
"tokens/total": 18350080,
"tokens/train_per_sec_per_gpu": 176.59,
"tokens/trainable": 1526617
},
{
"epoch": 0.07551183195958522,
"grad_norm": 0.03327067568898201,
"learning_rate": 2.9787234042553192e-05,
"loss": 0.8320407867431641,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.298,
"step": 71,
"tokens/total": 18612224,
"tokens/train_per_sec_per_gpu": 172.54,
"tokens/trainable": 1549503
},
{
"epoch": 0.07657537888859346,
"grad_norm": 0.030294055119156837,
"learning_rate": 3.0212765957446813e-05,
"loss": 0.7923359870910645,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.20855,
"step": 72,
"tokens/total": 18874368,
"tokens/train_per_sec_per_gpu": 186.06,
"tokens/trainable": 1572991
},
{
"epoch": 0.0776389258176017,
"grad_norm": 0.03210108354687691,
"learning_rate": 3.063829787234043e-05,
"loss": 0.746250569820404,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10908,
"step": 73,
"tokens/total": 19136512,
"tokens/train_per_sec_per_gpu": 140.36,
"tokens/trainable": 1594140
},
{
"epoch": 0.07870247274660995,
"grad_norm": 0.030283037573099136,
"learning_rate": 3.1063829787234046e-05,
"loss": 0.7411618232727051,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09837,
"step": 74,
"tokens/total": 19398656,
"tokens/train_per_sec_per_gpu": 208.45,
"tokens/trainable": 1616672
},
{
"epoch": 0.07976601967561818,
"grad_norm": 0.03764800727367401,
"learning_rate": 3.1489361702127664e-05,
"loss": 0.7435256242752075,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10334,
"step": 75,
"tokens/total": 19660800,
"tokens/train_per_sec_per_gpu": 167.25,
"tokens/trainable": 1635193
},
{
"epoch": 0.08082956660462642,
"grad_norm": 0.03491177409887314,
"learning_rate": 3.191489361702128e-05,
"loss": 0.7323366403579712,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.07993,
"step": 76,
"tokens/total": 19922944,
"tokens/train_per_sec_per_gpu": 214.58,
"tokens/trainable": 1656375
},
{
"epoch": 0.08189311353363467,
"grad_norm": 0.03278028592467308,
"learning_rate": 3.234042553191489e-05,
"loss": 0.7500340938568115,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11707,
"step": 77,
"tokens/total": 20185088,
"tokens/train_per_sec_per_gpu": 187.04,
"tokens/trainable": 1678155
},
{
"epoch": 0.08295666046264291,
"grad_norm": 0.032096248120069504,
"learning_rate": 3.276595744680851e-05,
"loss": 0.7423413991928101,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10085,
"step": 78,
"tokens/total": 20447232,
"tokens/train_per_sec_per_gpu": 209.49,
"tokens/trainable": 1701373
},
{
"epoch": 0.08402020739165116,
"grad_norm": 0.03354285657405853,
"learning_rate": 3.319148936170213e-05,
"loss": 0.8002707958221436,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.22614,
"step": 79,
"tokens/total": 20709376,
"tokens/train_per_sec_per_gpu": 178.24,
"tokens/trainable": 1721916
},
{
"epoch": 0.0850837543206594,
"grad_norm": 0.03523889556527138,
"learning_rate": 3.361702127659575e-05,
"loss": 0.7944124937057495,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.21314,
"step": 80,
"tokens/total": 20971520,
"tokens/train_per_sec_per_gpu": 221.61,
"tokens/trainable": 1744950
},
{
"epoch": 0.08614730124966764,
"grad_norm": 0.03261874243617058,
"learning_rate": 3.4042553191489365e-05,
"loss": 0.7720386981964111,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.16417,
"step": 81,
"tokens/total": 21233664,
"tokens/train_per_sec_per_gpu": 167.21,
"tokens/trainable": 1765422
},
{
"epoch": 0.08721084817867589,
"grad_norm": 0.033395156264305115,
"learning_rate": 3.446808510638298e-05,
"loss": 0.779296875,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.17994,
"step": 82,
"tokens/total": 21495808,
"tokens/train_per_sec_per_gpu": 217.82,
"tokens/trainable": 1787402
},
{
"epoch": 0.08827439510768413,
"grad_norm": 0.032813675701618195,
"learning_rate": 3.48936170212766e-05,
"loss": 0.7013646364212036,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.0165,
"step": 83,
"tokens/total": 21757952,
"tokens/train_per_sec_per_gpu": 165.9,
"tokens/trainable": 1807444
},
{
"epoch": 0.08933794203669237,
"grad_norm": 0.035501375794410706,
"learning_rate": 3.531914893617022e-05,
"loss": 0.7295072674751282,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.07406,
"step": 84,
"tokens/total": 22020096,
"tokens/train_per_sec_per_gpu": 182.67,
"tokens/trainable": 1827505
},
{
"epoch": 0.0904014889657006,
"grad_norm": 0.0353703536093235,
"learning_rate": 3.574468085106383e-05,
"loss": 0.7775543928146362,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.17614,
"step": 85,
"tokens/total": 22282240,
"tokens/train_per_sec_per_gpu": 188.9,
"tokens/trainable": 1847467
},
{
"epoch": 0.09146503589470885,
"grad_norm": 0.03491484373807907,
"learning_rate": 3.617021276595745e-05,
"loss": 0.8319449424743652,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.29778,
"step": 86,
"tokens/total": 22544384,
"tokens/train_per_sec_per_gpu": 214.28,
"tokens/trainable": 1868200
},
{
"epoch": 0.09252858282371709,
"grad_norm": 0.032434333115816116,
"learning_rate": 3.6595744680851066e-05,
"loss": 0.8519909977912903,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.34431,
"step": 87,
"tokens/total": 22806528,
"tokens/train_per_sec_per_gpu": 259.84,
"tokens/trainable": 1893531
},
{
"epoch": 0.09359212975272534,
"grad_norm": 0.04186626523733139,
"learning_rate": 3.7021276595744684e-05,
"loss": 0.8195874094963074,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.26956,
"step": 88,
"tokens/total": 23068672,
"tokens/train_per_sec_per_gpu": 178.33,
"tokens/trainable": 1914870
},
{
"epoch": 0.09465567668173358,
"grad_norm": 0.03298460692167282,
"learning_rate": 3.74468085106383e-05,
"loss": 0.7469631433486938,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11058,
"step": 89,
"tokens/total": 23330816,
"tokens/train_per_sec_per_gpu": 200.02,
"tokens/trainable": 1937963
},
{
"epoch": 0.09571922361074182,
"grad_norm": 0.03386974707245827,
"learning_rate": 3.787234042553192e-05,
"loss": 0.7484230399131775,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11366,
"step": 90,
"tokens/total": 23592960,
"tokens/train_per_sec_per_gpu": 193.44,
"tokens/trainable": 1961198
},
{
"epoch": 0.09678277053975007,
"grad_norm": 0.039303258061409,
"learning_rate": 3.829787234042554e-05,
"loss": 0.75224769115448,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12176,
"step": 91,
"tokens/total": 23855104,
"tokens/train_per_sec_per_gpu": 176.98,
"tokens/trainable": 1982479
},
{
"epoch": 0.09784631746875831,
"grad_norm": 0.03529525175690651,
"learning_rate": 3.872340425531915e-05,
"loss": 0.7571150064468384,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13212,
"step": 92,
"tokens/total": 24117248,
"tokens/train_per_sec_per_gpu": 227.43,
"tokens/trainable": 2004994
},
{
"epoch": 0.09890986439776656,
"grad_norm": 0.034970078617334366,
"learning_rate": 3.914893617021277e-05,
"loss": 0.7881733179092407,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19938,
"step": 93,
"tokens/total": 24379392,
"tokens/train_per_sec_per_gpu": 180.97,
"tokens/trainable": 2028105
},
{
"epoch": 0.0999734113267748,
"grad_norm": 0.036846473813056946,
"learning_rate": 3.9574468085106385e-05,
"loss": 0.7719080448150635,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.16389,
"step": 94,
"tokens/total": 24641536,
"tokens/train_per_sec_per_gpu": 186.35,
"tokens/trainable": 2051020
},
{
"epoch": 0.10103695825578303,
"grad_norm": 0.03919777274131775,
"learning_rate": 4e-05,
"loss": 0.7920703887939453,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.20796,
"step": 95,
"tokens/total": 24903680,
"tokens/train_per_sec_per_gpu": 147.2,
"tokens/trainable": 2070277
},
{
"epoch": 0.10210050518479127,
"grad_norm": 0.03782414644956589,
"learning_rate": 3.9999862427247416e-05,
"loss": 0.8352775573730469,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.30545,
"step": 96,
"tokens/total": 25165824,
"tokens/train_per_sec_per_gpu": 181.92,
"tokens/trainable": 2090839
},
{
"epoch": 0.10316405211379952,
"grad_norm": 0.03704574331641197,
"learning_rate": 3.999944971088228e-05,
"loss": 0.761550784111023,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.14159,
"step": 97,
"tokens/total": 25427968,
"tokens/train_per_sec_per_gpu": 206.15,
"tokens/trainable": 2112256
},
{
"epoch": 0.10422759904280776,
"grad_norm": 0.03385859355330467,
"learning_rate": 3.999876185658244e-05,
"loss": 0.7179139852523804,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.05015,
"step": 98,
"tokens/total": 25690112,
"tokens/train_per_sec_per_gpu": 187.02,
"tokens/trainable": 2136667
},
{
"epoch": 0.105291145971816,
"grad_norm": 0.03678734600543976,
"learning_rate": 3.99977988738109e-05,
"loss": 0.7438210248947144,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10396,
"step": 99,
"tokens/total": 25952256,
"tokens/train_per_sec_per_gpu": 178.09,
"tokens/trainable": 2157962
},
{
"epoch": 0.10635469290082425,
"grad_norm": 0.03856838122010231,
"learning_rate": 3.999656077581569e-05,
"loss": 0.7466105222702026,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10984,
"step": 100,
"tokens/total": 26214400,
"tokens/train_per_sec_per_gpu": 184.35,
"tokens/trainable": 2178396
},
{
"epoch": 0.1074182398298325,
"grad_norm": 0.03785452991724014,
"learning_rate": 3.9995047579629654e-05,
"loss": 0.7475836873054504,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11189,
"step": 101,
"tokens/total": 26476544,
"tokens/train_per_sec_per_gpu": 201.31,
"tokens/trainable": 2201039
},
{
"epoch": 0.10848178675884074,
"grad_norm": 0.03467912971973419,
"learning_rate": 3.9993259306070256e-05,
"loss": 0.7737405300140381,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.16786,
"step": 102,
"tokens/total": 26738688,
"tokens/train_per_sec_per_gpu": 185.99,
"tokens/trainable": 2223019
},
{
"epoch": 0.10954533368784898,
"grad_norm": 0.04308745265007019,
"learning_rate": 3.999119597973925e-05,
"loss": 0.8207772970199585,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.27227,
"step": 103,
"tokens/total": 27000832,
"tokens/train_per_sec_per_gpu": 147.51,
"tokens/trainable": 2242785
},
{
"epoch": 0.11060888061685722,
"grad_norm": 0.036378778517246246,
"learning_rate": 3.998885762902241e-05,
"loss": 0.7338327169418335,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.08305,
"step": 104,
"tokens/total": 27262976,
"tokens/train_per_sec_per_gpu": 179.24,
"tokens/trainable": 2264578
},
{
"epoch": 0.11167242754586545,
"grad_norm": 0.04125402122735977,
"learning_rate": 3.998624428608906e-05,
"loss": 0.8683584332466125,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.383,
"step": 105,
"tokens/total": 27525120,
"tokens/train_per_sec_per_gpu": 195.93,
"tokens/trainable": 2285624
},
{
"epoch": 0.1127359744748737,
"grad_norm": 0.03740216791629791,
"learning_rate": 3.9983355986891664e-05,
"loss": 0.7756333947181702,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.17197,
"step": 106,
"tokens/total": 27787264,
"tokens/train_per_sec_per_gpu": 246.08,
"tokens/trainable": 2308345
},
{
"epoch": 0.11379952140388194,
"grad_norm": 0.04036470502614975,
"learning_rate": 3.9980192771165364e-05,
"loss": 0.7976692914962769,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.22036,
"step": 107,
"tokens/total": 28049408,
"tokens/train_per_sec_per_gpu": 215.98,
"tokens/trainable": 2328867
},
{
"epoch": 0.11486306833289019,
"grad_norm": 0.03836773335933685,
"learning_rate": 3.997675468242738e-05,
"loss": 0.7081190943717957,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.03017,
"step": 108,
"tokens/total": 28311552,
"tokens/train_per_sec_per_gpu": 191.07,
"tokens/trainable": 2349916
},
{
"epoch": 0.11592661526189843,
"grad_norm": 0.035974569618701935,
"learning_rate": 3.9973041767976466e-05,
"loss": 0.7658606767654419,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15084,
"step": 109,
"tokens/total": 28573696,
"tokens/train_per_sec_per_gpu": 200.69,
"tokens/trainable": 2374197
},
{
"epoch": 0.11699016219090667,
"grad_norm": 0.0417025052011013,
"learning_rate": 3.9969054078892185e-05,
"loss": 0.8230124711990356,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.27735,
"step": 110,
"tokens/total": 28835840,
"tokens/train_per_sec_per_gpu": 195.73,
"tokens/trainable": 2395345
},
{
"epoch": 0.11805370911991492,
"grad_norm": 0.03637392073869705,
"learning_rate": 3.996479167003428e-05,
"loss": 0.7655156254768372,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.1501,
"step": 111,
"tokens/total": 29097984,
"tokens/train_per_sec_per_gpu": 214.49,
"tokens/trainable": 2418145
},
{
"epoch": 0.11911725604892316,
"grad_norm": 0.04198000580072403,
"learning_rate": 3.996025460004189e-05,
"loss": 0.8185654878616333,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.26725,
"step": 112,
"tokens/total": 29360128,
"tokens/train_per_sec_per_gpu": 236.86,
"tokens/trainable": 2439031
},
{
"epoch": 0.1201808029779314,
"grad_norm": 0.041592370718717575,
"learning_rate": 3.995544293133273e-05,
"loss": 0.8015573024749756,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.22901,
"step": 113,
"tokens/total": 29622272,
"tokens/train_per_sec_per_gpu": 186.73,
"tokens/trainable": 2458819
},
{
"epoch": 0.12124434990693965,
"grad_norm": 0.04079896956682205,
"learning_rate": 3.995035673010225e-05,
"loss": 0.7219120264053345,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.05837,
"step": 114,
"tokens/total": 29884416,
"tokens/train_per_sec_per_gpu": 184.15,
"tokens/trainable": 2480736
},
{
"epoch": 0.12230789683594788,
"grad_norm": 0.041768353432416916,
"learning_rate": 3.994499606632272e-05,
"loss": 0.8270866870880127,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.28665,
"step": 115,
"tokens/total": 30146560,
"tokens/train_per_sec_per_gpu": 193.64,
"tokens/trainable": 2504003
},
{
"epoch": 0.12337144376495612,
"grad_norm": 0.04556523263454437,
"learning_rate": 3.9939361013742275e-05,
"loss": 0.7384425401687622,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09267,
"step": 116,
"tokens/total": 30408704,
"tokens/train_per_sec_per_gpu": 213.94,
"tokens/trainable": 2524722
},
{
"epoch": 0.12443499069396437,
"grad_norm": 0.03868886083364487,
"learning_rate": 3.9933451649883866e-05,
"loss": 0.709857165813446,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.0337,
"step": 117,
"tokens/total": 30670848,
"tokens/train_per_sec_per_gpu": 231.95,
"tokens/trainable": 2546913
},
{
"epoch": 0.1254985376229726,
"grad_norm": 0.04056168347597122,
"learning_rate": 3.9927268056044266e-05,
"loss": 0.7398765087127686,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09568,
"step": 118,
"tokens/total": 30932992,
"tokens/train_per_sec_per_gpu": 183.97,
"tokens/trainable": 2568114
},
{
"epoch": 0.12656208455198087,
"grad_norm": 0.04197125881910324,
"learning_rate": 3.992081031729285e-05,
"loss": 0.7923115491867065,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.2085,
"step": 119,
"tokens/total": 31195136,
"tokens/train_per_sec_per_gpu": 199.13,
"tokens/trainable": 2588738
},
{
"epoch": 0.1276256314809891,
"grad_norm": 0.037061259150505066,
"learning_rate": 3.9914078522470526e-05,
"loss": 0.8101846575737,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.24832,
"step": 120,
"tokens/total": 31457280,
"tokens/train_per_sec_per_gpu": 193.31,
"tokens/trainable": 2613736
},
{
"epoch": 0.12868917840999733,
"grad_norm": 0.04570484906435013,
"learning_rate": 3.9907072764188435e-05,
"loss": 0.7499140501022339,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11682,
"step": 121,
"tokens/total": 31719424,
"tokens/train_per_sec_per_gpu": 180.47,
"tokens/trainable": 2634223
},
{
"epoch": 0.1297527253390056,
"grad_norm": 0.04544052109122276,
"learning_rate": 3.9899793138826736e-05,
"loss": 0.8020647764205933,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.23014,
"step": 122,
"tokens/total": 31981568,
"tokens/train_per_sec_per_gpu": 198.92,
"tokens/trainable": 2655869
},
{
"epoch": 0.13081627226801382,
"grad_norm": 0.04469464346766472,
"learning_rate": 3.989223974653323e-05,
"loss": 0.7518518567085266,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12092,
"step": 123,
"tokens/total": 32243712,
"tokens/train_per_sec_per_gpu": 208.31,
"tokens/trainable": 2679088
},
{
"epoch": 0.13187981919702207,
"grad_norm": 0.0395895391702652,
"learning_rate": 3.9884412691222016e-05,
"loss": 0.7855230569839478,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19355,
"step": 124,
"tokens/total": 32505856,
"tokens/train_per_sec_per_gpu": 195.72,
"tokens/trainable": 2702201
},
{
"epoch": 0.1329433661260303,
"grad_norm": 0.04236849397420883,
"learning_rate": 3.987631208057205e-05,
"loss": 0.775454044342041,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.17158,
"step": 125,
"tokens/total": 32768000,
"tokens/train_per_sec_per_gpu": 169.99,
"tokens/trainable": 2724593
},
{
"epoch": 0.13400691305503856,
"grad_norm": 0.04403228312730789,
"learning_rate": 3.986793802602566e-05,
"loss": 0.7912722229957581,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.2062,
"step": 126,
"tokens/total": 33030144,
"tokens/train_per_sec_per_gpu": 178.51,
"tokens/trainable": 2747546
},
{
"epoch": 0.1350704599840468,
"grad_norm": 0.0415693037211895,
"learning_rate": 3.985929064278701e-05,
"loss": 0.803294837474823,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.23289,
"step": 127,
"tokens/total": 33292288,
"tokens/train_per_sec_per_gpu": 199.75,
"tokens/trainable": 2769876
},
{
"epoch": 0.13613400691305505,
"grad_norm": 0.043098073452711105,
"learning_rate": 3.985037004982056e-05,
"loss": 0.8380795121192932,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.31192,
"step": 128,
"tokens/total": 33554432,
"tokens/train_per_sec_per_gpu": 197.32,
"tokens/trainable": 2791990
},
{
"epoch": 0.13719755384206328,
"grad_norm": 0.04367615282535553,
"learning_rate": 3.984117636984933e-05,
"loss": 0.7381528615951538,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09207,
"step": 129,
"tokens/total": 33816576,
"tokens/train_per_sec_per_gpu": 157.27,
"tokens/trainable": 2812157
},
{
"epoch": 0.1382611007710715,
"grad_norm": 0.03902239724993706,
"learning_rate": 3.983170972935333e-05,
"loss": 0.6622740030288696,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 1.9392,
"step": 130,
"tokens/total": 34078720,
"tokens/train_per_sec_per_gpu": 199.7,
"tokens/trainable": 2834450
},
{
"epoch": 0.13932464770007977,
"grad_norm": 0.04401889070868492,
"learning_rate": 3.982197025856772e-05,
"loss": 0.8131764531135559,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.25506,
"step": 131,
"tokens/total": 34340864,
"tokens/train_per_sec_per_gpu": 210.08,
"tokens/trainable": 2857868
},
{
"epoch": 0.140388194629088,
"grad_norm": 0.055351078510284424,
"learning_rate": 3.98119580914811e-05,
"loss": 0.8648597002029419,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.37467,
"step": 132,
"tokens/total": 34603008,
"tokens/train_per_sec_per_gpu": 164.9,
"tokens/trainable": 2877451
},
{
"epoch": 0.14145174155809626,
"grad_norm": 0.04292495548725128,
"learning_rate": 3.980167336583359e-05,
"loss": 0.8202415108680725,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.27105,
"step": 133,
"tokens/total": 34865152,
"tokens/train_per_sec_per_gpu": 179.62,
"tokens/trainable": 2898969
},
{
"epoch": 0.14251528848710449,
"grad_norm": 0.039902154356241226,
"learning_rate": 3.979111622311501e-05,
"loss": 0.7659401893615723,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15102,
"step": 134,
"tokens/total": 35127296,
"tokens/train_per_sec_per_gpu": 204.35,
"tokens/trainable": 2922244
},
{
"epoch": 0.14357883541611274,
"grad_norm": 0.04879293963313103,
"learning_rate": 3.978028680856286e-05,
"loss": 0.7666274309158325,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15249,
"step": 135,
"tokens/total": 35389440,
"tokens/train_per_sec_per_gpu": 203.21,
"tokens/trainable": 2941098
},
{
"epoch": 0.14464238234512097,
"grad_norm": 0.04211945831775665,
"learning_rate": 3.97691852711604e-05,
"loss": 0.7596578001976013,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13754,
"step": 136,
"tokens/total": 35651584,
"tokens/train_per_sec_per_gpu": 182.99,
"tokens/trainable": 2962728
},
{
"epoch": 0.14570592927412923,
"grad_norm": 0.05592913180589676,
"learning_rate": 3.975781176363451e-05,
"loss": 0.8827542066574097,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.41755,
"step": 137,
"tokens/total": 35913728,
"tokens/train_per_sec_per_gpu": 167.07,
"tokens/trainable": 2983012
},
{
"epoch": 0.14676947620313746,
"grad_norm": 0.042312368750572205,
"learning_rate": 3.9746166442453667e-05,
"loss": 0.7679699659347534,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15539,
"step": 138,
"tokens/total": 36175872,
"tokens/train_per_sec_per_gpu": 173.16,
"tokens/trainable": 3005801
},
{
"epoch": 0.14783302313214572,
"grad_norm": 0.043086566030979156,
"learning_rate": 3.973424946782578e-05,
"loss": 0.7419267892837524,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09998,
"step": 139,
"tokens/total": 36438016,
"tokens/train_per_sec_per_gpu": 179.27,
"tokens/trainable": 3027970
},
{
"epoch": 0.14889657006115395,
"grad_norm": 0.044038690626621246,
"learning_rate": 3.972206100369594e-05,
"loss": 0.8171659111976624,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.26407,
"step": 140,
"tokens/total": 36700160,
"tokens/train_per_sec_per_gpu": 156.51,
"tokens/trainable": 3051271
},
{
"epoch": 0.14996011699016218,
"grad_norm": 0.04058285430073738,
"learning_rate": 3.970960121774419e-05,
"loss": 0.8079518675804138,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.24331,
"step": 141,
"tokens/total": 36962304,
"tokens/train_per_sec_per_gpu": 167.22,
"tokens/trainable": 3074209
},
{
"epoch": 0.15102366391917044,
"grad_norm": 0.04522034898400307,
"learning_rate": 3.9696870281383255e-05,
"loss": 0.8723236322402954,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.39246,
"step": 142,
"tokens/total": 37224448,
"tokens/train_per_sec_per_gpu": 158.71,
"tokens/trainable": 3095651
},
{
"epoch": 0.15208721084817867,
"grad_norm": 0.04753715172410011,
"learning_rate": 3.968386836975611e-05,
"loss": 0.7392692565917969,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.0944,
"step": 143,
"tokens/total": 37486592,
"tokens/train_per_sec_per_gpu": 189.77,
"tokens/trainable": 3118220
},
{
"epoch": 0.15315075777718692,
"grad_norm": 0.045635782182216644,
"learning_rate": 3.9670595661733654e-05,
"loss": 0.8149927854537964,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.25916,
"step": 144,
"tokens/total": 37748736,
"tokens/train_per_sec_per_gpu": 209.61,
"tokens/trainable": 3143149
},
{
"epoch": 0.15421430470619515,
"grad_norm": 0.049446720629930496,
"learning_rate": 3.9657052339912166e-05,
"loss": 0.8048349022865295,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.23633,
"step": 145,
"tokens/total": 38010880,
"tokens/train_per_sec_per_gpu": 200.16,
"tokens/trainable": 3164817
},
{
"epoch": 0.1552778516352034,
"grad_norm": 0.04811964929103851,
"learning_rate": 3.9643238590610864e-05,
"loss": 0.7713128328323364,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.1626,
"step": 146,
"tokens/total": 38273024,
"tokens/train_per_sec_per_gpu": 210.98,
"tokens/trainable": 3185683
},
{
"epoch": 0.15634139856421164,
"grad_norm": 0.04874229058623314,
"learning_rate": 3.9629154603869294e-05,
"loss": 0.7867254614830017,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19619,
"step": 147,
"tokens/total": 38535168,
"tokens/train_per_sec_per_gpu": 161.76,
"tokens/trainable": 3207493
},
{
"epoch": 0.1574049454932199,
"grad_norm": 0.04509029909968376,
"learning_rate": 3.961480057344474e-05,
"loss": 0.8230168223381042,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.27736,
"step": 148,
"tokens/total": 38797312,
"tokens/train_per_sec_per_gpu": 165.98,
"tokens/trainable": 3227269
},
{
"epoch": 0.15846849242222813,
"grad_norm": 0.048180241137742996,
"learning_rate": 3.9600176696809555e-05,
"loss": 0.7925543785095215,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.20903,
"step": 149,
"tokens/total": 39059456,
"tokens/train_per_sec_per_gpu": 178.87,
"tokens/trainable": 3247693
},
{
"epoch": 0.15953203935123636,
"grad_norm": 0.044965874403715134,
"learning_rate": 3.9585283175148425e-05,
"loss": 0.7305552959442139,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.07623,
"step": 150,
"tokens/total": 39321600,
"tokens/train_per_sec_per_gpu": 185.73,
"tokens/trainable": 3271323
},
{
"epoch": 0.15953203935123636,
"eval_loss": 0.7756121754646301,
"eval_ppl": 2.17192,
"eval_runtime": 237.3162,
"eval_samples_per_second": 28.173,
"eval_steps_per_second": 1.761,
"memory/device_reserved (GiB)": 51.31,
"memory/max_active (GiB)": 38.19,
"memory/max_allocated (GiB)": 38.19,
"step": 150
},
{
"epoch": 0.16059558628024462,
"grad_norm": 0.05009883642196655,
"learning_rate": 3.9570120213355636e-05,
"loss": 0.7295466661453247,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.07414,
"step": 151,
"tokens/total": 39583744,
"tokens/train_per_sec_per_gpu": 212.53,
"tokens/trainable": 3293376
},
{
"epoch": 0.16165913320925285,
"grad_norm": 0.05454389378428459,
"learning_rate": 3.955468802003222e-05,
"loss": 0.8171148300170898,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.26396,
"step": 152,
"tokens/total": 39845888,
"tokens/train_per_sec_per_gpu": 160.49,
"tokens/trainable": 3314003
},
{
"epoch": 0.1627226801382611,
"grad_norm": 0.05227701738476753,
"learning_rate": 3.953898680748311e-05,
"loss": 0.8315908908843994,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.29697,
"step": 153,
"tokens/total": 40108032,
"tokens/train_per_sec_per_gpu": 178.92,
"tokens/trainable": 3335783
},
{
"epoch": 0.16378622706726934,
"grad_norm": 0.04624287411570549,
"learning_rate": 3.952301679171421e-05,
"loss": 0.7561501860618591,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13006,
"step": 154,
"tokens/total": 40370176,
"tokens/train_per_sec_per_gpu": 164.87,
"tokens/trainable": 3355886
},
{
"epoch": 0.1648497739962776,
"grad_norm": 0.04997319355607033,
"learning_rate": 3.950677819242943e-05,
"loss": 0.788512110710144,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.20012,
"step": 155,
"tokens/total": 40632320,
"tokens/train_per_sec_per_gpu": 188.8,
"tokens/trainable": 3378024
},
{
"epoch": 0.16591332092528582,
"grad_norm": 0.05008501932024956,
"learning_rate": 3.949027123302764e-05,
"loss": 0.8327994346618652,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.29975,
"step": 156,
"tokens/total": 40894464,
"tokens/train_per_sec_per_gpu": 163.0,
"tokens/trainable": 3399925
},
{
"epoch": 0.16697686785429408,
"grad_norm": 0.05233265459537506,
"learning_rate": 3.9473496140599626e-05,
"loss": 0.8238826394081116,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.27933,
"step": 157,
"tokens/total": 41156608,
"tokens/train_per_sec_per_gpu": 182.71,
"tokens/trainable": 3420640
},
{
"epoch": 0.1680404147833023,
"grad_norm": 0.05217234417796135,
"learning_rate": 3.945645314592495e-05,
"loss": 0.7473776340484619,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11146,
"step": 158,
"tokens/total": 41418752,
"tokens/train_per_sec_per_gpu": 223.76,
"tokens/trainable": 3442988
},
{
"epoch": 0.16910396171231057,
"grad_norm": 0.04567525163292885,
"learning_rate": 3.943914248346879e-05,
"loss": 0.7227488160133362,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.06009,
"step": 159,
"tokens/total": 41680896,
"tokens/train_per_sec_per_gpu": 177.03,
"tokens/trainable": 3464101
},
{
"epoch": 0.1701675086413188,
"grad_norm": 0.05548242852091789,
"learning_rate": 3.9421564391378685e-05,
"loss": 0.7955631017684937,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.21569,
"step": 160,
"tokens/total": 41943040,
"tokens/train_per_sec_per_gpu": 152.75,
"tokens/trainable": 3483419
},
{
"epoch": 0.17123105557032703,
"grad_norm": 0.04398762434720993,
"learning_rate": 3.9403719111481295e-05,
"loss": 0.7600826025009155,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13845,
"step": 161,
"tokens/total": 42205184,
"tokens/train_per_sec_per_gpu": 205.76,
"tokens/trainable": 3504602
},
{
"epoch": 0.1722946024993353,
"grad_norm": 0.04506729915738106,
"learning_rate": 3.9385606889279035e-05,
"loss": 0.7480685114860535,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11292,
"step": 162,
"tokens/total": 42467328,
"tokens/train_per_sec_per_gpu": 227.39,
"tokens/trainable": 3526406
},
{
"epoch": 0.17335814942834352,
"grad_norm": 0.044185835868120193,
"learning_rate": 3.9367227973946745e-05,
"loss": 0.7433359026908875,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10294,
"step": 163,
"tokens/total": 42729472,
"tokens/train_per_sec_per_gpu": 182.07,
"tokens/trainable": 3548313
},
{
"epoch": 0.17442169635735177,
"grad_norm": 0.05059230327606201,
"learning_rate": 3.934858261832822e-05,
"loss": 0.6835325956344604,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 1.98086,
"step": 164,
"tokens/total": 42991616,
"tokens/train_per_sec_per_gpu": 169.05,
"tokens/trainable": 3568190
},
{
"epoch": 0.17548524328636,
"grad_norm": 0.050207290798425674,
"learning_rate": 3.932967107893274e-05,
"loss": 0.7899980545043945,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.20339,
"step": 165,
"tokens/total": 43253760,
"tokens/train_per_sec_per_gpu": 205.5,
"tokens/trainable": 3588948
},
{
"epoch": 0.17654879021536826,
"grad_norm": 0.04827320948243141,
"learning_rate": 3.931049361593157e-05,
"loss": 0.7980469465255737,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.2212,
"step": 166,
"tokens/total": 43515904,
"tokens/train_per_sec_per_gpu": 173.52,
"tokens/trainable": 3609829
},
{
"epoch": 0.1776123371443765,
"grad_norm": 0.048085663467645645,
"learning_rate": 3.9291050493154336e-05,
"loss": 0.7630643844604492,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.14484,
"step": 167,
"tokens/total": 43778048,
"tokens/train_per_sec_per_gpu": 200.45,
"tokens/trainable": 3631205
},
{
"epoch": 0.17867588407338475,
"grad_norm": 0.05210770294070244,
"learning_rate": 3.927134197808544e-05,
"loss": 0.7417425513267517,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09959,
"step": 168,
"tokens/total": 44040192,
"tokens/train_per_sec_per_gpu": 138.92,
"tokens/trainable": 3651025
},
{
"epoch": 0.17973943100239298,
"grad_norm": 0.04291163384914398,
"learning_rate": 3.9251368341860343e-05,
"loss": 0.7509276866912842,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11896,
"step": 169,
"tokens/total": 44302336,
"tokens/train_per_sec_per_gpu": 189.85,
"tokens/trainable": 3674343
},
{
"epoch": 0.1808029779314012,
"grad_norm": 0.05076931044459343,
"learning_rate": 3.923112985926185e-05,
"loss": 0.7864252328872681,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19553,
"step": 170,
"tokens/total": 44564480,
"tokens/train_per_sec_per_gpu": 173.58,
"tokens/trainable": 3694079
},
{
"epoch": 0.18186652486040947,
"grad_norm": 0.046801142394542694,
"learning_rate": 3.921062680871635e-05,
"loss": 0.6877319812774658,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 1.9892,
"step": 171,
"tokens/total": 44826624,
"tokens/train_per_sec_per_gpu": 176.69,
"tokens/trainable": 3715203
},
{
"epoch": 0.1829300717894177,
"grad_norm": 0.05101482570171356,
"learning_rate": 3.9189859472289956e-05,
"loss": 0.800000786781311,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.22554,
"step": 172,
"tokens/total": 45088768,
"tokens/train_per_sec_per_gpu": 150.34,
"tokens/trainable": 3737632
},
{
"epoch": 0.18399361871842596,
"grad_norm": 0.04480605199933052,
"learning_rate": 3.916882813568461e-05,
"loss": 0.7518149614334106,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12085,
"step": 173,
"tokens/total": 45350912,
"tokens/train_per_sec_per_gpu": 216.08,
"tokens/trainable": 3761754
},
{
"epoch": 0.18505716564743419,
"grad_norm": 0.05060945823788643,
"learning_rate": 3.914753308823422e-05,
"loss": 0.8080068230628967,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.24343,
"step": 174,
"tokens/total": 45613056,
"tokens/train_per_sec_per_gpu": 147.42,
"tokens/trainable": 3784061
},
{
"epoch": 0.18612071257644244,
"grad_norm": 0.04640955105423927,
"learning_rate": 3.9125974622900596e-05,
"loss": 0.8282898664474487,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.2894,
"step": 175,
"tokens/total": 45875200,
"tokens/train_per_sec_per_gpu": 208.37,
"tokens/trainable": 3808216
},
{
"epoch": 0.18718425950545067,
"grad_norm": 0.04194442555308342,
"learning_rate": 3.91041530362695e-05,
"loss": 0.7835493087768555,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.18923,
"step": 176,
"tokens/total": 46137344,
"tokens/train_per_sec_per_gpu": 198.59,
"tokens/trainable": 3832481
},
{
"epoch": 0.18824780643445893,
"grad_norm": 0.049947503954172134,
"learning_rate": 3.90820686285465e-05,
"loss": 0.8008949756622314,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.22753,
"step": 177,
"tokens/total": 46399488,
"tokens/train_per_sec_per_gpu": 151.75,
"tokens/trainable": 3852693
},
{
"epoch": 0.18931135336346716,
"grad_norm": 0.04894804581999779,
"learning_rate": 3.905972170355286e-05,
"loss": 0.711793065071106,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.03764,
"step": 178,
"tokens/total": 46661632,
"tokens/train_per_sec_per_gpu": 178.97,
"tokens/trainable": 3873846
},
{
"epoch": 0.19037490029247542,
"grad_norm": 0.04731186851859093,
"learning_rate": 3.903711256872139e-05,
"loss": 0.7765140533447266,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.17388,
"step": 179,
"tokens/total": 46923776,
"tokens/train_per_sec_per_gpu": 206.26,
"tokens/trainable": 3898641
},
{
"epoch": 0.19143844722148365,
"grad_norm": 0.055336493998765945,
"learning_rate": 3.901424153509218e-05,
"loss": 0.8252753019332886,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.28251,
"step": 180,
"tokens/total": 47185920,
"tokens/train_per_sec_per_gpu": 170.2,
"tokens/trainable": 3919863
},
{
"epoch": 0.19250199415049188,
"grad_norm": 0.04609975218772888,
"learning_rate": 3.899110891730834e-05,
"loss": 0.7351381778717041,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.08577,
"step": 181,
"tokens/total": 47448064,
"tokens/train_per_sec_per_gpu": 177.94,
"tokens/trainable": 3942514
},
{
"epoch": 0.19356554107950014,
"grad_norm": 0.05186214670538902,
"learning_rate": 3.896771503361165e-05,
"loss": 0.7917764186859131,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.20731,
"step": 182,
"tokens/total": 47710208,
"tokens/train_per_sec_per_gpu": 200.82,
"tokens/trainable": 3964229
},
{
"epoch": 0.19462908800850837,
"grad_norm": 0.049349937587976456,
"learning_rate": 3.8944060205838204e-05,
"loss": 0.7680513858795166,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15556,
"step": 183,
"tokens/total": 47972352,
"tokens/train_per_sec_per_gpu": 190.57,
"tokens/trainable": 3983779
},
{
"epoch": 0.19569263493751662,
"grad_norm": 0.053863126784563065,
"learning_rate": 3.892014475941399e-05,
"loss": 0.7624801397323608,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.14359,
"step": 184,
"tokens/total": 48234496,
"tokens/train_per_sec_per_gpu": 165.8,
"tokens/trainable": 4004641
},
{
"epoch": 0.19675618186652485,
"grad_norm": 0.04809142276644707,
"learning_rate": 3.8895969023350384e-05,
"loss": 0.8235425353050232,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.27856,
"step": 185,
"tokens/total": 48496640,
"tokens/train_per_sec_per_gpu": 198.93,
"tokens/trainable": 4025071
},
{
"epoch": 0.1978197287955331,
"grad_norm": 0.04834391921758652,
"learning_rate": 3.8871533330239646e-05,
"loss": 0.7338411211967468,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.08307,
"step": 186,
"tokens/total": 48758784,
"tokens/train_per_sec_per_gpu": 185.51,
"tokens/trainable": 4048197
},
{
"epoch": 0.19888327572454134,
"grad_norm": 0.052105050534009933,
"learning_rate": 3.884683801625032e-05,
"loss": 0.7286123633384705,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.0722,
"step": 187,
"tokens/total": 49020928,
"tokens/train_per_sec_per_gpu": 175.75,
"tokens/trainable": 4069621
},
{
"epoch": 0.1999468226535496,
"grad_norm": 0.04482823610305786,
"learning_rate": 3.8821883421122645e-05,
"loss": 0.7568373680114746,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13152,
"step": 188,
"tokens/total": 49283072,
"tokens/train_per_sec_per_gpu": 203.94,
"tokens/trainable": 4093663
},
{
"epoch": 0.20101036958255783,
"grad_norm": 0.0523945651948452,
"learning_rate": 3.879666988816386e-05,
"loss": 0.755517840385437,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12871,
"step": 189,
"tokens/total": 49545216,
"tokens/train_per_sec_per_gpu": 160.35,
"tokens/trainable": 4113650
},
{
"epoch": 0.20207391651156606,
"grad_norm": 0.04911473020911217,
"learning_rate": 3.877119776424347e-05,
"loss": 0.7191125154495239,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.05261,
"step": 190,
"tokens/total": 49807360,
"tokens/train_per_sec_per_gpu": 221.01,
"tokens/trainable": 4136190
},
{
"epoch": 0.20313746344057432,
"grad_norm": 0.053953029215335846,
"learning_rate": 3.8745467399788506e-05,
"loss": 0.7884220480918884,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19992,
"step": 191,
"tokens/total": 50069504,
"tokens/train_per_sec_per_gpu": 154.41,
"tokens/trainable": 4158226
},
{
"epoch": 0.20420101036958255,
"grad_norm": 0.05675153061747551,
"learning_rate": 3.871947914877866e-05,
"loss": 0.7461360692977905,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10884,
"step": 192,
"tokens/total": 50331648,
"tokens/train_per_sec_per_gpu": 179.47,
"tokens/trainable": 4179916
},
{
"epoch": 0.2052645572985908,
"grad_norm": 0.05521610751748085,
"learning_rate": 3.869323336874146e-05,
"loss": 0.7196043729782104,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.05362,
"step": 193,
"tokens/total": 50593792,
"tokens/train_per_sec_per_gpu": 201.18,
"tokens/trainable": 4200779
},
{
"epoch": 0.20632810422759904,
"grad_norm": 0.056510064750909805,
"learning_rate": 3.8666730420747336e-05,
"loss": 0.8237625360488892,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.27906,
"step": 194,
"tokens/total": 50855936,
"tokens/train_per_sec_per_gpu": 171.77,
"tokens/trainable": 4222217
},
{
"epoch": 0.2073916511566073,
"grad_norm": 0.0467703752219677,
"learning_rate": 3.863997066940463e-05,
"loss": 0.7923108339309692,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.20849,
"step": 195,
"tokens/total": 51118080,
"tokens/train_per_sec_per_gpu": 193.25,
"tokens/trainable": 4244857
},
{
"epoch": 0.20845519808561552,
"grad_norm": 0.04992164671421051,
"learning_rate": 3.8612954482854606e-05,
"loss": 0.7622380256652832,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.14307,
"step": 196,
"tokens/total": 51380224,
"tokens/train_per_sec_per_gpu": 216.77,
"tokens/trainable": 4267572
},
{
"epoch": 0.20951874501462378,
"grad_norm": 0.049939945340156555,
"learning_rate": 3.8585682232766385e-05,
"loss": 0.7296082973480225,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.07427,
"step": 197,
"tokens/total": 51642368,
"tokens/train_per_sec_per_gpu": 214.99,
"tokens/trainable": 4290348
},
{
"epoch": 0.210582291943632,
"grad_norm": 0.05264829471707344,
"learning_rate": 3.8558154294331807e-05,
"loss": 0.7297487258911133,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.07456,
"step": 198,
"tokens/total": 51904512,
"tokens/train_per_sec_per_gpu": 178.25,
"tokens/trainable": 4310984
},
{
"epoch": 0.21164583887264027,
"grad_norm": 0.05678649619221687,
"learning_rate": 3.853037104626031e-05,
"loss": 0.8022236227989197,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.2305,
"step": 199,
"tokens/total": 52166656,
"tokens/train_per_sec_per_gpu": 212.29,
"tokens/trainable": 4332640
},
{
"epoch": 0.2127093858016485,
"grad_norm": 0.04627032205462456,
"learning_rate": 3.8502332870773675e-05,
"loss": 0.8206828832626343,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.27205,
"step": 200,
"tokens/total": 52428800,
"tokens/train_per_sec_per_gpu": 173.49,
"tokens/trainable": 4356101
},
{
"epoch": 0.21377293273065673,
"grad_norm": 0.04949687048792839,
"learning_rate": 3.847404015360081e-05,
"loss": 0.7801845073699951,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.18187,
"step": 201,
"tokens/total": 52690944,
"tokens/train_per_sec_per_gpu": 202.37,
"tokens/trainable": 4378733
},
{
"epoch": 0.214836479659665,
"grad_norm": 0.05199093371629715,
"learning_rate": 3.8445493283972414e-05,
"loss": 0.7552693486213684,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12818,
"step": 202,
"tokens/total": 52953088,
"tokens/train_per_sec_per_gpu": 165.43,
"tokens/trainable": 4401167
},
{
"epoch": 0.21590002658867322,
"grad_norm": 0.060018111020326614,
"learning_rate": 3.841669265461562e-05,
"loss": 0.8178205490112305,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.26556,
"step": 203,
"tokens/total": 53215232,
"tokens/train_per_sec_per_gpu": 192.79,
"tokens/trainable": 4421155
},
{
"epoch": 0.21696357351768147,
"grad_norm": 0.0454607792198658,
"learning_rate": 3.838763866174862e-05,
"loss": 0.798674464225769,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.22259,
"step": 204,
"tokens/total": 53477376,
"tokens/train_per_sec_per_gpu": 181.78,
"tokens/trainable": 4443402
},
{
"epoch": 0.2180271204466897,
"grad_norm": 0.05475523695349693,
"learning_rate": 3.835833170507519e-05,
"loss": 0.7501173615455627,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11725,
"step": 205,
"tokens/total": 53739520,
"tokens/train_per_sec_per_gpu": 174.06,
"tokens/trainable": 4465823
},
{
"epoch": 0.21909066737569796,
"grad_norm": 0.04717683792114258,
"learning_rate": 3.8328772187779196e-05,
"loss": 0.7843440771102905,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19097,
"step": 206,
"tokens/total": 54001664,
"tokens/train_per_sec_per_gpu": 172.19,
"tokens/trainable": 4490092
},
{
"epoch": 0.2201542143047062,
"grad_norm": 0.04792032018303871,
"learning_rate": 3.829896051651907e-05,
"loss": 0.7995268702507019,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.22449,
"step": 207,
"tokens/total": 54263808,
"tokens/train_per_sec_per_gpu": 198.34,
"tokens/trainable": 4513112
},
{
"epoch": 0.22121776123371445,
"grad_norm": 0.05625506862998009,
"learning_rate": 3.8268897101422154e-05,
"loss": 0.7814656496047974,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.18467,
"step": 208,
"tokens/total": 54525952,
"tokens/train_per_sec_per_gpu": 178.5,
"tokens/trainable": 4533887
},
{
"epoch": 0.22228130816272268,
"grad_norm": 0.05093343183398247,
"learning_rate": 3.823858235607915e-05,
"loss": 0.8371798396110535,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.30984,
"step": 209,
"tokens/total": 54788096,
"tokens/train_per_sec_per_gpu": 179.89,
"tokens/trainable": 4557864
},
{
"epoch": 0.2233448550917309,
"grad_norm": 0.051436666399240494,
"learning_rate": 3.820801669753833e-05,
"loss": 0.8444880247116089,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.32679,
"step": 210,
"tokens/total": 55050240,
"tokens/train_per_sec_per_gpu": 208.06,
"tokens/trainable": 4579529
},
{
"epoch": 0.22440840202073917,
"grad_norm": 0.046146344393491745,
"learning_rate": 3.8177200546299894e-05,
"loss": 0.7421606779098511,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10047,
"step": 211,
"tokens/total": 55312384,
"tokens/train_per_sec_per_gpu": 198.81,
"tokens/trainable": 4602512
},
{
"epoch": 0.2254719489497474,
"grad_norm": 0.06141940504312515,
"learning_rate": 3.81461343263101e-05,
"loss": 0.7741906642913818,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.16884,
"step": 212,
"tokens/total": 55574528,
"tokens/train_per_sec_per_gpu": 196.43,
"tokens/trainable": 4623604
},
{
"epoch": 0.22653549587875565,
"grad_norm": 0.061457011848688126,
"learning_rate": 3.81148184649555e-05,
"loss": 0.7711528539657593,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.16226,
"step": 213,
"tokens/total": 55836672,
"tokens/train_per_sec_per_gpu": 148.28,
"tokens/trainable": 4645354
},
{
"epoch": 0.22759904280776388,
"grad_norm": 0.05289280042052269,
"learning_rate": 3.8083253393057006e-05,
"loss": 0.7671029567718506,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15352,
"step": 214,
"tokens/total": 56098816,
"tokens/train_per_sec_per_gpu": 198.49,
"tokens/trainable": 4666565
},
{
"epoch": 0.22866258973677214,
"grad_norm": 0.05258488655090332,
"learning_rate": 3.805143954486401e-05,
"loss": 0.7617560029029846,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.14203,
"step": 215,
"tokens/total": 56360960,
"tokens/train_per_sec_per_gpu": 177.48,
"tokens/trainable": 4688241
},
{
"epoch": 0.22972613666578037,
"grad_norm": 0.05477464199066162,
"learning_rate": 3.801937735804838e-05,
"loss": 0.736034095287323,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.08764,
"step": 216,
"tokens/total": 56623104,
"tokens/train_per_sec_per_gpu": 164.79,
"tokens/trainable": 4710328
},
{
"epoch": 0.23078968359478863,
"grad_norm": 0.058859411627054214,
"learning_rate": 3.798706727369845e-05,
"loss": 0.7572994232177734,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13251,
"step": 217,
"tokens/total": 56885248,
"tokens/train_per_sec_per_gpu": 155.41,
"tokens/trainable": 4730784
},
{
"epoch": 0.23185323052379686,
"grad_norm": 0.055905554443597794,
"learning_rate": 3.795450973631293e-05,
"loss": 0.7484654188156128,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11375,
"step": 218,
"tokens/total": 57147392,
"tokens/train_per_sec_per_gpu": 195.23,
"tokens/trainable": 4752708
},
{
"epoch": 0.23291677745280512,
"grad_norm": 0.05177122727036476,
"learning_rate": 3.792170519379482e-05,
"loss": 0.6985906958580017,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.01092,
"step": 219,
"tokens/total": 57409536,
"tokens/train_per_sec_per_gpu": 151.36,
"tokens/trainable": 4776585
},
{
"epoch": 0.23398032438181335,
"grad_norm": 0.05102040618658066,
"learning_rate": 3.788865409744527e-05,
"loss": 0.7429978847503662,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10223,
"step": 220,
"tokens/total": 57671680,
"tokens/train_per_sec_per_gpu": 161.92,
"tokens/trainable": 4798099
},
{
"epoch": 0.23504387131082158,
"grad_norm": 0.04876242205500603,
"learning_rate": 3.785535690195728e-05,
"loss": 0.8290475010871887,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.29114,
"step": 221,
"tokens/total": 57933824,
"tokens/train_per_sec_per_gpu": 215.34,
"tokens/trainable": 4821746
},
{
"epoch": 0.23610741823982984,
"grad_norm": 0.04731612280011177,
"learning_rate": 3.782181406540954e-05,
"loss": 0.7661755084991455,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15152,
"step": 222,
"tokens/total": 58195968,
"tokens/train_per_sec_per_gpu": 213.65,
"tokens/trainable": 4845142
},
{
"epoch": 0.23717096516883807,
"grad_norm": 0.058613162487745285,
"learning_rate": 3.77880260492601e-05,
"loss": 0.7483052015304565,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11342,
"step": 223,
"tokens/total": 58458112,
"tokens/train_per_sec_per_gpu": 219.8,
"tokens/trainable": 4866360
},
{
"epoch": 0.23823451209784632,
"grad_norm": 0.05162626504898071,
"learning_rate": 3.775399331833998e-05,
"loss": 0.798062264919281,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.22123,
"step": 224,
"tokens/total": 58720256,
"tokens/train_per_sec_per_gpu": 144.23,
"tokens/trainable": 4886502
},
{
"epoch": 0.23929805902685455,
"grad_norm": 0.04998771846294403,
"learning_rate": 3.7719716340846845e-05,
"loss": 0.8206250667572021,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.27192,
"step": 225,
"tokens/total": 58982400,
"tokens/train_per_sec_per_gpu": 195.94,
"tokens/trainable": 4910020
},
{
"epoch": 0.2403616059558628,
"grad_norm": 0.056194525212049484,
"learning_rate": 3.768519558833849e-05,
"loss": 0.878259539604187,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.40671,
"step": 226,
"tokens/total": 59244544,
"tokens/train_per_sec_per_gpu": 157.32,
"tokens/trainable": 4931232
},
{
"epoch": 0.24142515288487104,
"grad_norm": 0.0521056093275547,
"learning_rate": 3.765043153572643e-05,
"loss": 0.759453535079956,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13711,
"step": 227,
"tokens/total": 59506688,
"tokens/train_per_sec_per_gpu": 193.4,
"tokens/trainable": 4954744
},
{
"epoch": 0.2424886998138793,
"grad_norm": 0.04911046847701073,
"learning_rate": 3.761542466126929e-05,
"loss": 0.7336410284042358,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.08265,
"step": 228,
"tokens/total": 59768832,
"tokens/train_per_sec_per_gpu": 207.37,
"tokens/trainable": 4975992
},
{
"epoch": 0.24355224674288753,
"grad_norm": 0.06128966435790062,
"learning_rate": 3.758017544656628e-05,
"loss": 0.7942535877227783,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.21279,
"step": 229,
"tokens/total": 60030976,
"tokens/train_per_sec_per_gpu": 185.07,
"tokens/trainable": 4996744
},
{
"epoch": 0.24461579367189576,
"grad_norm": 0.057662662118673325,
"learning_rate": 3.754468437655056e-05,
"loss": 0.7774747014045715,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.17597,
"step": 230,
"tokens/total": 60293120,
"tokens/train_per_sec_per_gpu": 144.76,
"tokens/trainable": 5015784
},
{
"epoch": 0.24567934060090402,
"grad_norm": 0.0500815324485302,
"learning_rate": 3.7508951939482543e-05,
"loss": 0.701805591583252,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.01739,
"step": 231,
"tokens/total": 60555264,
"tokens/train_per_sec_per_gpu": 198.76,
"tokens/trainable": 5037125
},
{
"epoch": 0.24674288752991225,
"grad_norm": 0.05346173420548439,
"learning_rate": 3.74729786269432e-05,
"loss": 0.7507196068763733,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11852,
"step": 232,
"tokens/total": 60817408,
"tokens/train_per_sec_per_gpu": 197.05,
"tokens/trainable": 5059497
},
{
"epoch": 0.2478064344589205,
"grad_norm": 0.05386090278625488,
"learning_rate": 3.7436764933827284e-05,
"loss": 0.7338147163391113,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.08301,
"step": 233,
"tokens/total": 61079552,
"tokens/train_per_sec_per_gpu": 157.02,
"tokens/trainable": 5080212
},
{
"epoch": 0.24886998138792873,
"grad_norm": 0.05847088247537613,
"learning_rate": 3.7400311358336555e-05,
"loss": 0.7185083627700806,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.05137,
"step": 234,
"tokens/total": 61341696,
"tokens/train_per_sec_per_gpu": 178.9,
"tokens/trainable": 5101917
},
{
"epoch": 0.249933528316937,
"grad_norm": 0.05917196720838547,
"learning_rate": 3.736361840197288e-05,
"loss": 0.7877013087272644,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19834,
"step": 235,
"tokens/total": 61603840,
"tokens/train_per_sec_per_gpu": 172.85,
"tokens/trainable": 5122988
},
{
"epoch": 0.2509970752459452,
"grad_norm": 0.05370993912220001,
"learning_rate": 3.732668656953136e-05,
"loss": 0.7264862656593323,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.0678,
"step": 236,
"tokens/total": 61865984,
"tokens/train_per_sec_per_gpu": 215.97,
"tokens/trainable": 5145194
},
{
"epoch": 0.25206062217495345,
"grad_norm": 0.05220884829759598,
"learning_rate": 3.728951636909338e-05,
"loss": 0.7533116340637207,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12402,
"step": 237,
"tokens/total": 62128128,
"tokens/train_per_sec_per_gpu": 211.82,
"tokens/trainable": 5167987
},
{
"epoch": 0.25312416910396174,
"grad_norm": 0.054722413420677185,
"learning_rate": 3.725210831201961e-05,
"loss": 0.7575439810752869,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13303,
"step": 238,
"tokens/total": 62390272,
"tokens/train_per_sec_per_gpu": 154.07,
"tokens/trainable": 5189725
},
{
"epoch": 0.25418771603296997,
"grad_norm": 0.05407283455133438,
"learning_rate": 3.721446291294301e-05,
"loss": 0.7419638633728027,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10006,
"step": 239,
"tokens/total": 62652416,
"tokens/train_per_sec_per_gpu": 241.74,
"tokens/trainable": 5213041
},
{
"epoch": 0.2552512629619782,
"grad_norm": 0.05353840813040733,
"learning_rate": 3.717658068976168e-05,
"loss": 0.7568483352661133,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13155,
"step": 240,
"tokens/total": 62914560,
"tokens/train_per_sec_per_gpu": 170.27,
"tokens/trainable": 5232929
},
{
"epoch": 0.25631480989098643,
"grad_norm": 0.05174530670046806,
"learning_rate": 3.713846216363179e-05,
"loss": 0.6827611923217773,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 1.97934,
"step": 241,
"tokens/total": 63176704,
"tokens/train_per_sec_per_gpu": 212.44,
"tokens/trainable": 5253894
},
{
"epoch": 0.25737835681999466,
"grad_norm": 0.050874270498752594,
"learning_rate": 3.7100107858960404e-05,
"loss": 0.7131105661392212,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.04033,
"step": 242,
"tokens/total": 63438848,
"tokens/train_per_sec_per_gpu": 167.64,
"tokens/trainable": 5275682
},
{
"epoch": 0.25844190374900294,
"grad_norm": 0.06289295852184296,
"learning_rate": 3.7061518303398244e-05,
"loss": 0.814382791519165,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.25778,
"step": 243,
"tokens/total": 63700992,
"tokens/train_per_sec_per_gpu": 181.58,
"tokens/trainable": 5296823
},
{
"epoch": 0.2595054506780112,
"grad_norm": 0.05226941406726837,
"learning_rate": 3.7022694027832456e-05,
"loss": 0.716779887676239,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.04783,
"step": 244,
"tokens/total": 63963136,
"tokens/train_per_sec_per_gpu": 202.14,
"tokens/trainable": 5318716
},
{
"epoch": 0.2605689976070194,
"grad_norm": 0.04937303066253662,
"learning_rate": 3.698363556637927e-05,
"loss": 0.7856250405311584,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19378,
"step": 245,
"tokens/total": 64225280,
"tokens/train_per_sec_per_gpu": 229.48,
"tokens/trainable": 5344987
},
{
"epoch": 0.26163254453602763,
"grad_norm": 0.049020808190107346,
"learning_rate": 3.694434345637671e-05,
"loss": 0.7429791688919067,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10219,
"step": 246,
"tokens/total": 64487424,
"tokens/train_per_sec_per_gpu": 226.82,
"tokens/trainable": 5369834
},
{
"epoch": 0.2626960914650359,
"grad_norm": 0.052014704793691635,
"learning_rate": 3.690481823837714e-05,
"loss": 0.7686535120010376,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15686,
"step": 247,
"tokens/total": 64749568,
"tokens/train_per_sec_per_gpu": 199.34,
"tokens/trainable": 5392410
},
{
"epoch": 0.26375963839404415,
"grad_norm": 0.060707978904247284,
"learning_rate": 3.686506045613986e-05,
"loss": 0.7760209441184998,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.17281,
"step": 248,
"tokens/total": 65011712,
"tokens/train_per_sec_per_gpu": 157.58,
"tokens/trainable": 5414349
},
{
"epoch": 0.2648231853230524,
"grad_norm": 0.06462915241718292,
"learning_rate": 3.6825070656623626e-05,
"loss": 0.7831145524978638,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.18828,
"step": 249,
"tokens/total": 65273856,
"tokens/train_per_sec_per_gpu": 159.08,
"tokens/trainable": 5433875
},
{
"epoch": 0.2658867322520606,
"grad_norm": 0.057429276406764984,
"learning_rate": 3.678484938997912e-05,
"loss": 0.6994718909263611,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.01269,
"step": 250,
"tokens/total": 65536000,
"tokens/train_per_sec_per_gpu": 166.54,
"tokens/trainable": 5454358
},
{
"epoch": 0.26695027918106884,
"grad_norm": 0.05674952268600464,
"learning_rate": 3.674439720954138e-05,
"loss": 0.7693842649459839,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15844,
"step": 251,
"tokens/total": 65798144,
"tokens/train_per_sec_per_gpu": 199.24,
"tokens/trainable": 5476733
},
{
"epoch": 0.2680138261100771,
"grad_norm": 0.06037837266921997,
"learning_rate": 3.670371467182219e-05,
"loss": 0.7689650058746338,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15753,
"step": 252,
"tokens/total": 66060288,
"tokens/train_per_sec_per_gpu": 184.65,
"tokens/trainable": 5498250
},
{
"epoch": 0.26907737303908535,
"grad_norm": 0.055501531809568405,
"learning_rate": 3.66628023365024e-05,
"loss": 0.7238588333129883,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.06238,
"step": 253,
"tokens/total": 66322432,
"tokens/train_per_sec_per_gpu": 227.0,
"tokens/trainable": 5520172
},
{
"epoch": 0.2701409199680936,
"grad_norm": 0.05900080129504204,
"learning_rate": 3.66216607664243e-05,
"loss": 0.7000092267990112,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.01377,
"step": 254,
"tokens/total": 66584576,
"tokens/train_per_sec_per_gpu": 153.45,
"tokens/trainable": 5539604
},
{
"epoch": 0.2712044668971018,
"grad_norm": 0.054649997502565384,
"learning_rate": 3.658029052758377e-05,
"loss": 0.7665979266166687,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15243,
"step": 255,
"tokens/total": 66846720,
"tokens/train_per_sec_per_gpu": 173.48,
"tokens/trainable": 5561266
},
{
"epoch": 0.2722680138261101,
"grad_norm": 0.05519971251487732,
"learning_rate": 3.653869218912258e-05,
"loss": 0.7569284439086914,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13172,
"step": 256,
"tokens/total": 67108864,
"tokens/train_per_sec_per_gpu": 171.2,
"tokens/trainable": 5583034
},
{
"epoch": 0.27333156075511833,
"grad_norm": 0.055965524166822433,
"learning_rate": 3.649686632332052e-05,
"loss": 0.7072951793670654,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.0285,
"step": 257,
"tokens/total": 67371008,
"tokens/train_per_sec_per_gpu": 175.76,
"tokens/trainable": 5604839
},
{
"epoch": 0.27439510768412656,
"grad_norm": 0.05398353934288025,
"learning_rate": 3.645481350558754e-05,
"loss": 0.7658364176750183,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15079,
"step": 258,
"tokens/total": 67633152,
"tokens/train_per_sec_per_gpu": 153.96,
"tokens/trainable": 5626766
},
{
"epoch": 0.2754586546131348,
"grad_norm": 0.04775004833936691,
"learning_rate": 3.6412534314455836e-05,
"loss": 0.6940434575080872,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.00179,
"step": 259,
"tokens/total": 67895296,
"tokens/train_per_sec_per_gpu": 142.3,
"tokens/trainable": 5647569
},
{
"epoch": 0.276522201542143,
"grad_norm": 0.05857420340180397,
"learning_rate": 3.637002933157187e-05,
"loss": 0.8444321155548096,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.32666,
"step": 260,
"tokens/total": 68157440,
"tokens/train_per_sec_per_gpu": 169.5,
"tokens/trainable": 5667784
},
{
"epoch": 0.2775857484711513,
"grad_norm": 0.05263550207018852,
"learning_rate": 3.6327299141688396e-05,
"loss": 0.8108729720115662,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.24987,
"step": 261,
"tokens/total": 68419584,
"tokens/train_per_sec_per_gpu": 143.48,
"tokens/trainable": 5689804
},
{
"epoch": 0.27864929540015954,
"grad_norm": 0.05478999391198158,
"learning_rate": 3.6284344332656396e-05,
"loss": 0.8421679735183716,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.32139,
"step": 262,
"tokens/total": 68681728,
"tokens/train_per_sec_per_gpu": 246.12,
"tokens/trainable": 5714024
},
{
"epoch": 0.27971284232916777,
"grad_norm": 0.06489334255456924,
"learning_rate": 3.6241165495417006e-05,
"loss": 0.8724418878555298,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.39275,
"step": 263,
"tokens/total": 68943872,
"tokens/train_per_sec_per_gpu": 166.84,
"tokens/trainable": 5734370
},
{
"epoch": 0.280776389258176,
"grad_norm": 0.06463494151830673,
"learning_rate": 3.619776322399336e-05,
"loss": 0.7543226480484009,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12617,
"step": 264,
"tokens/total": 69206016,
"tokens/train_per_sec_per_gpu": 214.95,
"tokens/trainable": 5757030
},
{
"epoch": 0.2818399361871843,
"grad_norm": 0.05572579428553581,
"learning_rate": 3.615413811548247e-05,
"loss": 0.7700116634368896,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15979,
"step": 265,
"tokens/total": 69468160,
"tokens/train_per_sec_per_gpu": 187.77,
"tokens/trainable": 5777899
},
{
"epoch": 0.2829034831161925,
"grad_norm": 0.05551740154623985,
"learning_rate": 3.6110290770046954e-05,
"loss": 0.7438211441040039,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10396,
"step": 266,
"tokens/total": 69730304,
"tokens/train_per_sec_per_gpu": 154.69,
"tokens/trainable": 5798783
},
{
"epoch": 0.28396703004520074,
"grad_norm": 0.06058730185031891,
"learning_rate": 3.606622179090682e-05,
"loss": 0.7585455179214478,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13517,
"step": 267,
"tokens/total": 69992448,
"tokens/train_per_sec_per_gpu": 180.57,
"tokens/trainable": 5819730
},
{
"epoch": 0.28503057697420897,
"grad_norm": 0.056012120097875595,
"learning_rate": 3.6021931784331136e-05,
"loss": 0.7486584186553955,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11416,
"step": 268,
"tokens/total": 70254592,
"tokens/train_per_sec_per_gpu": 156.75,
"tokens/trainable": 5842124
},
{
"epoch": 0.28609412390321726,
"grad_norm": 0.06160522252321243,
"learning_rate": 3.5977421359629715e-05,
"loss": 0.6910536289215088,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 1.99582,
"step": 269,
"tokens/total": 70516736,
"tokens/train_per_sec_per_gpu": 171.0,
"tokens/trainable": 5863426
},
{
"epoch": 0.2871576708322255,
"grad_norm": 0.061156366020441055,
"learning_rate": 3.593269112914472e-05,
"loss": 0.7586344480514526,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13536,
"step": 270,
"tokens/total": 70778880,
"tokens/train_per_sec_per_gpu": 165.2,
"tokens/trainable": 5884974
},
{
"epoch": 0.2882212177612337,
"grad_norm": 0.061479438096284866,
"learning_rate": 3.588774170824225e-05,
"loss": 0.8126254677772522,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.25382,
"step": 271,
"tokens/total": 71041024,
"tokens/train_per_sec_per_gpu": 213.13,
"tokens/trainable": 5906918
},
{
"epoch": 0.28928476469024195,
"grad_norm": 0.05486295372247696,
"learning_rate": 3.584257371530386e-05,
"loss": 0.7505637407302856,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11819,
"step": 272,
"tokens/total": 71303168,
"tokens/train_per_sec_per_gpu": 202.73,
"tokens/trainable": 5929901
},
{
"epoch": 0.2903483116192502,
"grad_norm": 0.05428226664662361,
"learning_rate": 3.579718777171806e-05,
"loss": 0.7298543453216553,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.07478,
"step": 273,
"tokens/total": 71565312,
"tokens/train_per_sec_per_gpu": 187.93,
"tokens/trainable": 5949798
},
{
"epoch": 0.29141185854825846,
"grad_norm": 0.06736662238836288,
"learning_rate": 3.5751584501871766e-05,
"loss": 0.7698936462402344,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15954,
"step": 274,
"tokens/total": 71827456,
"tokens/train_per_sec_per_gpu": 179.77,
"tokens/trainable": 5971772
},
{
"epoch": 0.2924754054772667,
"grad_norm": 0.05799878388643265,
"learning_rate": 3.570576453314172e-05,
"loss": 0.7739330530166626,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.16828,
"step": 275,
"tokens/total": 72089600,
"tokens/train_per_sec_per_gpu": 179.84,
"tokens/trainable": 5994035
},
{
"epoch": 0.2935389524062749,
"grad_norm": 0.051827434450387955,
"learning_rate": 3.565972849588584e-05,
"loss": 0.7918999791145325,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.20759,
"step": 276,
"tokens/total": 72351744,
"tokens/train_per_sec_per_gpu": 219.16,
"tokens/trainable": 6017969
},
{
"epoch": 0.29460249933528315,
"grad_norm": 0.05260028690099716,
"learning_rate": 3.561347702343456e-05,
"loss": 0.7098696827888489,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.03373,
"step": 277,
"tokens/total": 72613888,
"tokens/train_per_sec_per_gpu": 208.13,
"tokens/trainable": 6042126
},
{
"epoch": 0.29566604626429144,
"grad_norm": 0.05379115045070648,
"learning_rate": 3.556701075208213e-05,
"loss": 0.7800576090812683,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.1816,
"step": 278,
"tokens/total": 72876032,
"tokens/train_per_sec_per_gpu": 207.75,
"tokens/trainable": 6065523
},
{
"epoch": 0.29672959319329967,
"grad_norm": 0.05740763247013092,
"learning_rate": 3.5520330321077815e-05,
"loss": 0.8003265857696533,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.22627,
"step": 279,
"tokens/total": 73138176,
"tokens/train_per_sec_per_gpu": 204.22,
"tokens/trainable": 6088540
},
{
"epoch": 0.2977931401223079,
"grad_norm": 0.05483856424689293,
"learning_rate": 3.547343637261717e-05,
"loss": 0.7692792415618896,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15821,
"step": 280,
"tokens/total": 73400320,
"tokens/train_per_sec_per_gpu": 227.55,
"tokens/trainable": 6111049
},
{
"epoch": 0.29885668705131613,
"grad_norm": 0.06599462777376175,
"learning_rate": 3.5426329551833145e-05,
"loss": 0.7712551355361938,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.16248,
"step": 281,
"tokens/total": 73662464,
"tokens/train_per_sec_per_gpu": 194.72,
"tokens/trainable": 6131721
},
{
"epoch": 0.29992023398032436,
"grad_norm": 0.055346276611089706,
"learning_rate": 3.537901050678724e-05,
"loss": 0.7576462030410767,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13325,
"step": 282,
"tokens/total": 73924608,
"tokens/train_per_sec_per_gpu": 200.29,
"tokens/trainable": 6154147
},
{
"epoch": 0.30098378090933264,
"grad_norm": 0.06298188120126724,
"learning_rate": 3.533147988846059e-05,
"loss": 0.7945112586021423,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.21336,
"step": 283,
"tokens/total": 74186752,
"tokens/train_per_sec_per_gpu": 228.94,
"tokens/trainable": 6178704
},
{
"epoch": 0.3020473278383409,
"grad_norm": 0.05894119665026665,
"learning_rate": 3.5283738350744986e-05,
"loss": 0.7516214847564697,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12044,
"step": 284,
"tokens/total": 74448896,
"tokens/train_per_sec_per_gpu": 212.98,
"tokens/trainable": 6200161
},
{
"epoch": 0.3031108747673491,
"grad_norm": 0.05261611193418503,
"learning_rate": 3.5235786550433906e-05,
"loss": 0.7629417777061462,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.14458,
"step": 285,
"tokens/total": 74711040,
"tokens/train_per_sec_per_gpu": 219.51,
"tokens/trainable": 6222600
},
{
"epoch": 0.30417442169635733,
"grad_norm": 0.059760384261608124,
"learning_rate": 3.518762514721346e-05,
"loss": 0.7162789106369019,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.0468,
"step": 286,
"tokens/total": 74973184,
"tokens/train_per_sec_per_gpu": 235.97,
"tokens/trainable": 6246346
},
{
"epoch": 0.3052379686253656,
"grad_norm": 0.06057070195674896,
"learning_rate": 3.5139254803653346e-05,
"loss": 0.7634356021881104,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.14564,
"step": 287,
"tokens/total": 75235328,
"tokens/train_per_sec_per_gpu": 152.93,
"tokens/trainable": 6267246
},
{
"epoch": 0.30630151555437385,
"grad_norm": 0.05959314480423927,
"learning_rate": 3.509067618519768e-05,
"loss": 0.8070247173309326,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.24123,
"step": 288,
"tokens/total": 75497472,
"tokens/train_per_sec_per_gpu": 186.48,
"tokens/trainable": 6289062
},
{
"epoch": 0.3073650624833821,
"grad_norm": 0.06659425795078278,
"learning_rate": 3.5041889960155895e-05,
"loss": 0.6915267705917358,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 1.99676,
"step": 289,
"tokens/total": 75759616,
"tokens/train_per_sec_per_gpu": 194.38,
"tokens/trainable": 6310916
},
{
"epoch": 0.3084286094123903,
"grad_norm": 0.05240656062960625,
"learning_rate": 3.499289679969351e-05,
"loss": 0.8021942377090454,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.23043,
"step": 290,
"tokens/total": 76021760,
"tokens/train_per_sec_per_gpu": 207.96,
"tokens/trainable": 6334881
},
{
"epoch": 0.30949215634139854,
"grad_norm": 0.06096180900931358,
"learning_rate": 3.494369737782293e-05,
"loss": 0.7638937830924988,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.14662,
"step": 291,
"tokens/total": 76283904,
"tokens/train_per_sec_per_gpu": 205.29,
"tokens/trainable": 6357654
},
{
"epoch": 0.3105557032704068,
"grad_norm": 0.062094077467918396,
"learning_rate": 3.489429237139414e-05,
"loss": 0.7336180210113525,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.0826,
"step": 292,
"tokens/total": 76546048,
"tokens/train_per_sec_per_gpu": 178.46,
"tokens/trainable": 6377841
},
{
"epoch": 0.31161925019941505,
"grad_norm": 0.055953506380319595,
"learning_rate": 3.48446824600854e-05,
"loss": 0.7609624862670898,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.14034,
"step": 293,
"tokens/total": 76808192,
"tokens/train_per_sec_per_gpu": 168.65,
"tokens/trainable": 6400076
},
{
"epoch": 0.3126827971284233,
"grad_norm": 0.05196288600564003,
"learning_rate": 3.4794868326393935e-05,
"loss": 0.7413825988769531,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09884,
"step": 294,
"tokens/total": 77070336,
"tokens/train_per_sec_per_gpu": 172.78,
"tokens/trainable": 6422449
},
{
"epoch": 0.3137463440574315,
"grad_norm": 0.05353325977921486,
"learning_rate": 3.474485065562648e-05,
"loss": 0.7769887447357178,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.17491,
"step": 295,
"tokens/total": 77332480,
"tokens/train_per_sec_per_gpu": 221.08,
"tokens/trainable": 6445481
},
{
"epoch": 0.3148098909864398,
"grad_norm": 0.054653119295835495,
"learning_rate": 3.469463013588991e-05,
"loss": 0.7785749435424805,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.17837,
"step": 296,
"tokens/total": 77594624,
"tokens/train_per_sec_per_gpu": 210.56,
"tokens/trainable": 6467586
},
{
"epoch": 0.31587343791544803,
"grad_norm": 0.06027977168560028,
"learning_rate": 3.4644207458081735e-05,
"loss": 0.7743946313858032,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.16928,
"step": 297,
"tokens/total": 77856768,
"tokens/train_per_sec_per_gpu": 204.77,
"tokens/trainable": 6489041
},
{
"epoch": 0.31693698484445626,
"grad_norm": 0.05732357129454613,
"learning_rate": 3.45935833158806e-05,
"loss": 0.8058252334594727,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.23854,
"step": 298,
"tokens/total": 78118912,
"tokens/train_per_sec_per_gpu": 171.13,
"tokens/trainable": 6510717
},
{
"epoch": 0.3180005317734645,
"grad_norm": 0.05821244791150093,
"learning_rate": 3.454275840573679e-05,
"loss": 0.7749941945075989,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.17058,
"step": 299,
"tokens/total": 78381056,
"tokens/train_per_sec_per_gpu": 223.81,
"tokens/trainable": 6533545
},
{
"epoch": 0.3190640787024727,
"grad_norm": 0.05597531050443649,
"learning_rate": 3.4491733426862556e-05,
"loss": 0.7812941670417786,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.1843,
"step": 300,
"tokens/total": 78643200,
"tokens/train_per_sec_per_gpu": 185.6,
"tokens/trainable": 6556460
},
{
"epoch": 0.3190640787024727,
"eval_loss": 0.7725370526313782,
"eval_ppl": 2.16525,
"eval_runtime": 237.2784,
"eval_samples_per_second": 28.178,
"eval_steps_per_second": 1.762,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 38.19,
"memory/max_allocated (GiB)": 38.19,
"step": 300
},
{
"epoch": 0.320127625631481,
"grad_norm": 0.054633188992738724,
"learning_rate": 3.44405090812226e-05,
"loss": 0.7383944988250732,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09257,
"step": 301,
"tokens/total": 78905344,
"tokens/train_per_sec_per_gpu": 192.32,
"tokens/trainable": 6579346
},
{
"epoch": 0.32119117256048924,
"grad_norm": 0.05523587390780449,
"learning_rate": 3.438908607352433e-05,
"loss": 0.6943072080612183,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.00232,
"step": 302,
"tokens/total": 79167488,
"tokens/train_per_sec_per_gpu": 187.81,
"tokens/trainable": 6599223
},
{
"epoch": 0.32225471948949747,
"grad_norm": 0.058179691433906555,
"learning_rate": 3.433746511120823e-05,
"loss": 0.7615541815757751,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.1416,
"step": 303,
"tokens/total": 79429632,
"tokens/train_per_sec_per_gpu": 207.76,
"tokens/trainable": 6622095
},
{
"epoch": 0.3233182664185057,
"grad_norm": 0.06251095235347748,
"learning_rate": 3.428564690443807e-05,
"loss": 0.7761749029159546,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.17314,
"step": 304,
"tokens/total": 79691776,
"tokens/train_per_sec_per_gpu": 212.28,
"tokens/trainable": 6643820
},
{
"epoch": 0.324381813347514,
"grad_norm": 0.06721773743629456,
"learning_rate": 3.4233632166091205e-05,
"loss": 0.7868590354919434,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19649,
"step": 305,
"tokens/total": 79953920,
"tokens/train_per_sec_per_gpu": 171.72,
"tokens/trainable": 6664523
},
{
"epoch": 0.3254453602765222,
"grad_norm": 0.05866523087024689,
"learning_rate": 3.41814216117487e-05,
"loss": 0.7207814455032349,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.05604,
"step": 306,
"tokens/total": 80216064,
"tokens/train_per_sec_per_gpu": 128.88,
"tokens/trainable": 6685386
},
{
"epoch": 0.32650890720553044,
"grad_norm": 0.05545097589492798,
"learning_rate": 3.412901595968551e-05,
"loss": 0.7210918068885803,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.05668,
"step": 307,
"tokens/total": 80478208,
"tokens/train_per_sec_per_gpu": 193.0,
"tokens/trainable": 6706345
},
{
"epoch": 0.32757245413453867,
"grad_norm": 0.063844695687294,
"learning_rate": 3.407641593086063e-05,
"loss": 0.7825930118560791,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.18714,
"step": 308,
"tokens/total": 80740352,
"tokens/train_per_sec_per_gpu": 201.52,
"tokens/trainable": 6728441
},
{
"epoch": 0.32863600106354696,
"grad_norm": 0.05788377299904823,
"learning_rate": 3.4023622248907134e-05,
"loss": 0.7766852974891663,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.17425,
"step": 309,
"tokens/total": 81002496,
"tokens/train_per_sec_per_gpu": 240.97,
"tokens/trainable": 6754793
},
{
"epoch": 0.3296995479925552,
"grad_norm": 0.060298677533864975,
"learning_rate": 3.397063564012223e-05,
"loss": 0.7567377090454102,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13131,
"step": 310,
"tokens/total": 81264640,
"tokens/train_per_sec_per_gpu": 192.69,
"tokens/trainable": 6776213
},
{
"epoch": 0.3307630949215634,
"grad_norm": 0.05781788378953934,
"learning_rate": 3.391745683345729e-05,
"loss": 0.8195350766181946,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.26944,
"step": 311,
"tokens/total": 81526784,
"tokens/train_per_sec_per_gpu": 182.74,
"tokens/trainable": 6798309
},
{
"epoch": 0.33182664185057165,
"grad_norm": 0.05640830472111702,
"learning_rate": 3.3864086560507785e-05,
"loss": 0.8057565689086914,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.23839,
"step": 312,
"tokens/total": 81788928,
"tokens/train_per_sec_per_gpu": 297.01,
"tokens/trainable": 6823595
},
{
"epoch": 0.3328901887795799,
"grad_norm": 0.0735592171549797,
"learning_rate": 3.3810525555503254e-05,
"loss": 0.8060101270675659,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.23896,
"step": 313,
"tokens/total": 82051072,
"tokens/train_per_sec_per_gpu": 196.76,
"tokens/trainable": 6845473
},
{
"epoch": 0.33395373570858816,
"grad_norm": 0.06115385517477989,
"learning_rate": 3.3756774555297186e-05,
"loss": 0.782099723815918,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.18606,
"step": 314,
"tokens/total": 82313216,
"tokens/train_per_sec_per_gpu": 159.99,
"tokens/trainable": 6866224
},
{
"epoch": 0.3350172826375964,
"grad_norm": 0.0663752555847168,
"learning_rate": 3.3702834299356885e-05,
"loss": 0.7522889375686646,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12185,
"step": 315,
"tokens/total": 82575360,
"tokens/train_per_sec_per_gpu": 157.51,
"tokens/trainable": 6885515
},
{
"epoch": 0.3360808295666046,
"grad_norm": 0.05871765688061714,
"learning_rate": 3.3648705529753306e-05,
"loss": 0.7267637252807617,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.06838,
"step": 316,
"tokens/total": 82837504,
"tokens/train_per_sec_per_gpu": 176.94,
"tokens/trainable": 6906219
},
{
"epoch": 0.33714437649561285,
"grad_norm": 0.05956491827964783,
"learning_rate": 3.3594388991150825e-05,
"loss": 0.821346640586853,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.27356,
"step": 317,
"tokens/total": 83099648,
"tokens/train_per_sec_per_gpu": 222.29,
"tokens/trainable": 6929025
},
{
"epoch": 0.33820792342462114,
"grad_norm": 0.059474050998687744,
"learning_rate": 3.353988543079702e-05,
"loss": 0.8273679614067078,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.28729,
"step": 318,
"tokens/total": 83361792,
"tokens/train_per_sec_per_gpu": 228.13,
"tokens/trainable": 6951873
},
{
"epoch": 0.33927147035362937,
"grad_norm": 0.06504053622484207,
"learning_rate": 3.3485195598512365e-05,
"loss": 0.7731481790542603,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.16658,
"step": 319,
"tokens/total": 83623936,
"tokens/train_per_sec_per_gpu": 183.37,
"tokens/trainable": 6973435
},
{
"epoch": 0.3403350172826376,
"grad_norm": 0.0615144707262516,
"learning_rate": 3.343032024667994e-05,
"loss": 0.7920888662338257,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.208,
"step": 320,
"tokens/total": 83886080,
"tokens/train_per_sec_per_gpu": 168.97,
"tokens/trainable": 6993689
},
{
"epoch": 0.3413985642116458,
"grad_norm": 0.058340758085250854,
"learning_rate": 3.337526013023507e-05,
"loss": 0.7120217084884644,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.03811,
"step": 321,
"tokens/total": 84148224,
"tokens/train_per_sec_per_gpu": 174.54,
"tokens/trainable": 7015311
},
{
"epoch": 0.34246211114065406,
"grad_norm": 0.06086525321006775,
"learning_rate": 3.332001600665494e-05,
"loss": 0.8457379341125488,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.3297,
"step": 322,
"tokens/total": 84410368,
"tokens/train_per_sec_per_gpu": 205.39,
"tokens/trainable": 7038147
},
{
"epoch": 0.34352565806966234,
"grad_norm": 0.06164320930838585,
"learning_rate": 3.326458863594814e-05,
"loss": 0.8334550261497498,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.30126,
"step": 323,
"tokens/total": 84672512,
"tokens/train_per_sec_per_gpu": 198.45,
"tokens/trainable": 7059704
},
{
"epoch": 0.3445892049986706,
"grad_norm": 0.057081304490566254,
"learning_rate": 3.320897878064428e-05,
"loss": 0.838066041469574,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.31189,
"step": 324,
"tokens/total": 84934656,
"tokens/train_per_sec_per_gpu": 209.4,
"tokens/trainable": 7083870
},
{
"epoch": 0.3456527519276788,
"grad_norm": 0.05457906052470207,
"learning_rate": 3.3153187205783454e-05,
"loss": 0.7599472403526306,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13816,
"step": 325,
"tokens/total": 85196800,
"tokens/train_per_sec_per_gpu": 186.44,
"tokens/trainable": 7108954
},
{
"epoch": 0.34671629885668703,
"grad_norm": 0.061806946992874146,
"learning_rate": 3.309721467890571e-05,
"loss": 0.7315384149551392,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.07828,
"step": 326,
"tokens/total": 85458944,
"tokens/train_per_sec_per_gpu": 169.83,
"tokens/trainable": 7130972
},
{
"epoch": 0.3477798457856953,
"grad_norm": 0.05989091843366623,
"learning_rate": 3.3041061970040486e-05,
"loss": 0.7060387134552002,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.02595,
"step": 327,
"tokens/total": 85721088,
"tokens/train_per_sec_per_gpu": 191.48,
"tokens/trainable": 7152519
},
{
"epoch": 0.34884339271470355,
"grad_norm": 0.06088387221097946,
"learning_rate": 3.298472985169609e-05,
"loss": 0.7296034097671509,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.07426,
"step": 328,
"tokens/total": 85983232,
"tokens/train_per_sec_per_gpu": 237.52,
"tokens/trainable": 7173660
},
{
"epoch": 0.3499069396437118,
"grad_norm": 0.059276383370161057,
"learning_rate": 3.2928219098848955e-05,
"loss": 0.7553350329399109,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12832,
"step": 329,
"tokens/total": 86245376,
"tokens/train_per_sec_per_gpu": 148.74,
"tokens/trainable": 7196866
},
{
"epoch": 0.35097048657272,
"grad_norm": 0.06169452145695686,
"learning_rate": 3.287153048893307e-05,
"loss": 0.8173863887786865,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.26457,
"step": 330,
"tokens/total": 86507520,
"tokens/train_per_sec_per_gpu": 198.89,
"tokens/trainable": 7218044
},
{
"epoch": 0.35203403350172824,
"grad_norm": 0.06478651612997055,
"learning_rate": 3.281466480182925e-05,
"loss": 0.7393308877944946,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09453,
"step": 331,
"tokens/total": 86769664,
"tokens/train_per_sec_per_gpu": 189.28,
"tokens/trainable": 7240085
},
{
"epoch": 0.3530975804307365,
"grad_norm": 0.0714036375284195,
"learning_rate": 3.27576228198544e-05,
"loss": 0.7848330140113831,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19204,
"step": 332,
"tokens/total": 87031808,
"tokens/train_per_sec_per_gpu": 180.05,
"tokens/trainable": 7263289
},
{
"epoch": 0.35416112735974475,
"grad_norm": 0.059255450963974,
"learning_rate": 3.270040532775077e-05,
"loss": 0.7574397921562195,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13281,
"step": 333,
"tokens/total": 87293952,
"tokens/train_per_sec_per_gpu": 169.3,
"tokens/trainable": 7283911
},
{
"epoch": 0.355224674288753,
"grad_norm": 0.0516078807413578,
"learning_rate": 3.264301311267515e-05,
"loss": 0.7163474559783936,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.04694,
"step": 334,
"tokens/total": 87556096,
"tokens/train_per_sec_per_gpu": 223.05,
"tokens/trainable": 7307672
},
{
"epoch": 0.3562882212177612,
"grad_norm": 0.06229817494750023,
"learning_rate": 3.2585446964188026e-05,
"loss": 0.715316116809845,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.04483,
"step": 335,
"tokens/total": 87818240,
"tokens/train_per_sec_per_gpu": 195.29,
"tokens/trainable": 7330420
},
{
"epoch": 0.3573517681467695,
"grad_norm": 0.06110849231481552,
"learning_rate": 3.252770767424277e-05,
"loss": 0.6982554197311401,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.01024,
"step": 336,
"tokens/total": 88080384,
"tokens/train_per_sec_per_gpu": 207.65,
"tokens/trainable": 7351877
},
{
"epoch": 0.35841531507577773,
"grad_norm": 0.05932268872857094,
"learning_rate": 3.246979603717467e-05,
"loss": 0.7535157203674316,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12446,
"step": 337,
"tokens/total": 88342528,
"tokens/train_per_sec_per_gpu": 239.51,
"tokens/trainable": 7377554
},
{
"epoch": 0.35947886200478596,
"grad_norm": 0.06964396685361862,
"learning_rate": 3.2411712849690076e-05,
"loss": 0.802024781703949,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.23005,
"step": 338,
"tokens/total": 88604672,
"tokens/train_per_sec_per_gpu": 223.69,
"tokens/trainable": 7399496
},
{
"epoch": 0.3605424089337942,
"grad_norm": 0.06441716849803925,
"learning_rate": 3.235345891085536e-05,
"loss": 0.7387241125106812,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09326,
"step": 339,
"tokens/total": 88866816,
"tokens/train_per_sec_per_gpu": 195.97,
"tokens/trainable": 7421514
},
{
"epoch": 0.3616059558628024,
"grad_norm": 0.05845661088824272,
"learning_rate": 3.229503502208602e-05,
"loss": 0.7207450866699219,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.05596,
"step": 340,
"tokens/total": 89128960,
"tokens/train_per_sec_per_gpu": 171.19,
"tokens/trainable": 7442572
},
{
"epoch": 0.3626695027918107,
"grad_norm": 0.06204557791352272,
"learning_rate": 3.2236441987135565e-05,
"loss": 0.757001519203186,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13187,
"step": 341,
"tokens/total": 89391104,
"tokens/train_per_sec_per_gpu": 177.89,
"tokens/trainable": 7463164
},
{
"epoch": 0.36373304972081894,
"grad_norm": 0.05783051997423172,
"learning_rate": 3.2177680612084494e-05,
"loss": 0.7507032752037048,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11849,
"step": 342,
"tokens/total": 89653248,
"tokens/train_per_sec_per_gpu": 187.61,
"tokens/trainable": 7485037
},
{
"epoch": 0.36479659664982717,
"grad_norm": 0.06585251539945602,
"learning_rate": 3.211875170532924e-05,
"loss": 0.7124658823013306,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.03901,
"step": 343,
"tokens/total": 89915392,
"tokens/train_per_sec_per_gpu": 169.85,
"tokens/trainable": 7505178
},
{
"epoch": 0.3658601435788354,
"grad_norm": 0.06433013081550598,
"learning_rate": 3.205965607757097e-05,
"loss": 0.755608081817627,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12891,
"step": 344,
"tokens/total": 90177536,
"tokens/train_per_sec_per_gpu": 186.29,
"tokens/trainable": 7526734
},
{
"epoch": 0.3669236905078437,
"grad_norm": 0.06355367600917816,
"learning_rate": 3.200039454180452e-05,
"loss": 0.8150879740715027,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.25937,
"step": 345,
"tokens/total": 90439680,
"tokens/train_per_sec_per_gpu": 179.78,
"tokens/trainable": 7548681
},
{
"epoch": 0.3679872374368519,
"grad_norm": 0.058385591953992844,
"learning_rate": 3.1940967913307144e-05,
"loss": 0.6909693479537964,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 1.99565,
"step": 346,
"tokens/total": 90701824,
"tokens/train_per_sec_per_gpu": 181.15,
"tokens/trainable": 7569898
},
{
"epoch": 0.36905078436586014,
"grad_norm": 0.055889613926410675,
"learning_rate": 3.188137700962733e-05,
"loss": 0.7137737274169922,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.04168,
"step": 347,
"tokens/total": 90963968,
"tokens/train_per_sec_per_gpu": 225.36,
"tokens/trainable": 7592560
},
{
"epoch": 0.37011433129486837,
"grad_norm": 0.07224101573228836,
"learning_rate": 3.1821622650573536e-05,
"loss": 0.7986935973167419,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.22264,
"step": 348,
"tokens/total": 91226112,
"tokens/train_per_sec_per_gpu": 204.66,
"tokens/trainable": 7614191
},
{
"epoch": 0.37117787822387666,
"grad_norm": 0.06277446448802948,
"learning_rate": 3.176170565820293e-05,
"loss": 0.7567167282104492,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13127,
"step": 349,
"tokens/total": 91488256,
"tokens/train_per_sec_per_gpu": 165.46,
"tokens/trainable": 7634778
},
{
"epoch": 0.3722414251528849,
"grad_norm": 0.06595855206251144,
"learning_rate": 3.170162685681007e-05,
"loss": 0.8263660073280334,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.285,
"step": 350,
"tokens/total": 91750400,
"tokens/train_per_sec_per_gpu": 180.47,
"tokens/trainable": 7655328
},
{
"epoch": 0.3733049720818931,
"grad_norm": 0.053407274186611176,
"learning_rate": 3.1641387072915574e-05,
"loss": 0.7117317318916321,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.03752,
"step": 351,
"tokens/total": 92012544,
"tokens/train_per_sec_per_gpu": 209.35,
"tokens/trainable": 7678377
},
{
"epoch": 0.37436851901090135,
"grad_norm": 0.0668676570057869,
"learning_rate": 3.1580987135254715e-05,
"loss": 0.7980103492736816,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.22112,
"step": 352,
"tokens/total": 92274688,
"tokens/train_per_sec_per_gpu": 227.01,
"tokens/trainable": 7700711
},
{
"epoch": 0.3754320659399096,
"grad_norm": 0.06859572231769562,
"learning_rate": 3.1520427874766064e-05,
"loss": 0.7606133818626404,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13959,
"step": 353,
"tokens/total": 92536832,
"tokens/train_per_sec_per_gpu": 178.09,
"tokens/trainable": 7720570
},
{
"epoch": 0.37649561286891786,
"grad_norm": 0.06196228042244911,
"learning_rate": 3.145971012458005e-05,
"loss": 0.7603709101676941,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13907,
"step": 354,
"tokens/total": 92798976,
"tokens/train_per_sec_per_gpu": 174.2,
"tokens/trainable": 7743422
},
{
"epoch": 0.3775591597979261,
"grad_norm": 0.05902267247438431,
"learning_rate": 3.139883472000745e-05,
"loss": 0.7486381530761719,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11412,
"step": 355,
"tokens/total": 93061120,
"tokens/train_per_sec_per_gpu": 257.11,
"tokens/trainable": 7768328
},
{
"epoch": 0.3786227067269343,
"grad_norm": 0.05966390669345856,
"learning_rate": 3.133780249852799e-05,
"loss": 0.7499316930770874,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11686,
"step": 356,
"tokens/total": 93323264,
"tokens/train_per_sec_per_gpu": 218.24,
"tokens/trainable": 7791928
},
{
"epoch": 0.37968625365594255,
"grad_norm": 0.06466083228588104,
"learning_rate": 3.127661429977872e-05,
"loss": 0.754686713218689,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12695,
"step": 357,
"tokens/total": 93585408,
"tokens/train_per_sec_per_gpu": 196.44,
"tokens/trainable": 7812051
},
{
"epoch": 0.38074980058495084,
"grad_norm": 0.058210499584674835,
"learning_rate": 3.1215270965542544e-05,
"loss": 0.7765249013900757,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.1739,
"step": 358,
"tokens/total": 93847552,
"tokens/train_per_sec_per_gpu": 207.04,
"tokens/trainable": 7833084
},
{
"epoch": 0.38181334751395907,
"grad_norm": 0.057726673781871796,
"learning_rate": 3.115377333973659e-05,
"loss": 0.7482200264930725,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11324,
"step": 359,
"tokens/total": 94109696,
"tokens/train_per_sec_per_gpu": 228.28,
"tokens/trainable": 7857441
},
{
"epoch": 0.3828768944429673,
"grad_norm": 0.06571833789348602,
"learning_rate": 3.109212226840063e-05,
"loss": 0.8272304534912109,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.28698,
"step": 360,
"tokens/total": 94371840,
"tokens/train_per_sec_per_gpu": 168.79,
"tokens/trainable": 7879005
},
{
"epoch": 0.3839404413719755,
"grad_norm": 0.06610347330570221,
"learning_rate": 3.103031859968542e-05,
"loss": 0.7621959447860718,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.14298,
"step": 361,
"tokens/total": 94633984,
"tokens/train_per_sec_per_gpu": 174.29,
"tokens/trainable": 7899540
},
{
"epoch": 0.38500398830098376,
"grad_norm": 0.05712476745247841,
"learning_rate": 3.096836318384103e-05,
"loss": 0.7930388450622559,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.2101,
"step": 362,
"tokens/total": 94896128,
"tokens/train_per_sec_per_gpu": 220.27,
"tokens/trainable": 7922542
},
{
"epoch": 0.38606753522999204,
"grad_norm": 0.06603259593248367,
"learning_rate": 3.0906256873205193e-05,
"loss": 0.7841721773147583,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19059,
"step": 363,
"tokens/total": 95158272,
"tokens/train_per_sec_per_gpu": 176.27,
"tokens/trainable": 7945013
},
{
"epoch": 0.3871310821590003,
"grad_norm": 0.056947011500597,
"learning_rate": 3.08440005221915e-05,
"loss": 0.7464731335639954,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10955,
"step": 364,
"tokens/total": 95420416,
"tokens/train_per_sec_per_gpu": 169.25,
"tokens/trainable": 7965994
},
{
"epoch": 0.3881946290880085,
"grad_norm": 0.06810449063777924,
"learning_rate": 3.0781594987277724e-05,
"loss": 0.8506579399108887,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.34119,
"step": 365,
"tokens/total": 95682560,
"tokens/train_per_sec_per_gpu": 215.35,
"tokens/trainable": 7987278
},
{
"epoch": 0.38925817601701673,
"grad_norm": 0.05948546528816223,
"learning_rate": 3.071904112699397e-05,
"loss": 0.8312458395957947,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.29618,
"step": 366,
"tokens/total": 95944704,
"tokens/train_per_sec_per_gpu": 215.65,
"tokens/trainable": 8011500
},
{
"epoch": 0.390321722946025,
"grad_norm": 0.06788976490497589,
"learning_rate": 3.0656339801910926e-05,
"loss": 0.7707929611206055,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.16148,
"step": 367,
"tokens/total": 96206848,
"tokens/train_per_sec_per_gpu": 205.07,
"tokens/trainable": 8032365
},
{
"epoch": 0.39138526987503325,
"grad_norm": 0.058084722608327866,
"learning_rate": 3.059349187462798e-05,
"loss": 0.6599326729774475,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 1.93466,
"step": 368,
"tokens/total": 96468992,
"tokens/train_per_sec_per_gpu": 228.45,
"tokens/trainable": 8055039
},
{
"epoch": 0.3924488168040415,
"grad_norm": 0.06115228310227394,
"learning_rate": 3.053049820976135e-05,
"loss": 0.760746955871582,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13987,
"step": 369,
"tokens/total": 96731136,
"tokens/train_per_sec_per_gpu": 189.31,
"tokens/trainable": 8078259
},
{
"epoch": 0.3935123637330497,
"grad_norm": 0.07438641041517258,
"learning_rate": 3.0467359673932244e-05,
"loss": 0.7815507650375366,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.18486,
"step": 370,
"tokens/total": 96993280,
"tokens/train_per_sec_per_gpu": 151.93,
"tokens/trainable": 8099819
},
{
"epoch": 0.39457591066205794,
"grad_norm": 0.06393261253833771,
"learning_rate": 3.040407713575487e-05,
"loss": 0.7451884746551514,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10684,
"step": 371,
"tokens/total": 97255424,
"tokens/train_per_sec_per_gpu": 177.57,
"tokens/trainable": 8121758
},
{
"epoch": 0.3956394575910662,
"grad_norm": 0.058520760387182236,
"learning_rate": 3.034065146582452e-05,
"loss": 0.8240416049957275,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.27969,
"step": 372,
"tokens/total": 97517568,
"tokens/train_per_sec_per_gpu": 214.86,
"tokens/trainable": 8144183
},
{
"epoch": 0.39670300452007445,
"grad_norm": 0.06523202359676361,
"learning_rate": 3.0277083536705604e-05,
"loss": 0.7511664032936096,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11947,
"step": 373,
"tokens/total": 97779712,
"tokens/train_per_sec_per_gpu": 172.88,
"tokens/trainable": 8164676
},
{
"epoch": 0.3977665514490827,
"grad_norm": 0.06993680447340012,
"learning_rate": 3.0213374222919617e-05,
"loss": 0.7678710222244263,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15517,
"step": 374,
"tokens/total": 98041856,
"tokens/train_per_sec_per_gpu": 161.85,
"tokens/trainable": 8184550
},
{
"epoch": 0.3988300983780909,
"grad_norm": 0.05952773615717888,
"learning_rate": 3.0149524400933114e-05,
"loss": 0.7702289819717407,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.16026,
"step": 375,
"tokens/total": 98304000,
"tokens/train_per_sec_per_gpu": 174.86,
"tokens/trainable": 8206571
},
{
"epoch": 0.3998936453070992,
"grad_norm": 0.06135401502251625,
"learning_rate": 3.008553494914569e-05,
"loss": 0.6841228008270264,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 1.98203,
"step": 376,
"tokens/total": 98566144,
"tokens/train_per_sec_per_gpu": 194.82,
"tokens/trainable": 8229068
},
{
"epoch": 0.40095719223610743,
"grad_norm": 0.0706343874335289,
"learning_rate": 3.002140674787783e-05,
"loss": 0.7617697715759277,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.14206,
"step": 377,
"tokens/total": 98828288,
"tokens/train_per_sec_per_gpu": 145.29,
"tokens/trainable": 8251266
},
{
"epoch": 0.40202073916511566,
"grad_norm": 0.0608975812792778,
"learning_rate": 2.995714067935887e-05,
"loss": 0.7577897310256958,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13356,
"step": 378,
"tokens/total": 99090432,
"tokens/train_per_sec_per_gpu": 148.13,
"tokens/trainable": 8273480
},
{
"epoch": 0.4030842860941239,
"grad_norm": 0.06372503191232681,
"learning_rate": 2.9892737627714786e-05,
"loss": 0.7569035291671753,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13167,
"step": 379,
"tokens/total": 99352576,
"tokens/train_per_sec_per_gpu": 164.36,
"tokens/trainable": 8293250
},
{
"epoch": 0.4041478330231321,
"grad_norm": 0.07930820435285568,
"learning_rate": 2.9828198478956093e-05,
"loss": 0.7412185668945312,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09849,
"step": 380,
"tokens/total": 99614720,
"tokens/train_per_sec_per_gpu": 165.37,
"tokens/trainable": 8313207
},
{
"epoch": 0.4052113799521404,
"grad_norm": 0.0618307963013649,
"learning_rate": 2.976352412096563e-05,
"loss": 0.718746542930603,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.05186,
"step": 381,
"tokens/total": 99876864,
"tokens/train_per_sec_per_gpu": 195.0,
"tokens/trainable": 8334762
},
{
"epoch": 0.40627492688114863,
"grad_norm": 0.06570852547883987,
"learning_rate": 2.9698715443486338e-05,
"loss": 0.7648171186447144,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.1486,
"step": 382,
"tokens/total": 100139008,
"tokens/train_per_sec_per_gpu": 174.53,
"tokens/trainable": 8354787
},
{
"epoch": 0.40733847381015686,
"grad_norm": 0.06311152130365372,
"learning_rate": 2.9633773338109027e-05,
"loss": 0.7888460159301758,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.20086,
"step": 383,
"tokens/total": 100401152,
"tokens/train_per_sec_per_gpu": 164.71,
"tokens/trainable": 8376673
},
{
"epoch": 0.4084020207391651,
"grad_norm": 0.06162691116333008,
"learning_rate": 2.9568698698260126e-05,
"loss": 0.7124409675598145,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.03896,
"step": 384,
"tokens/total": 100663296,
"tokens/train_per_sec_per_gpu": 224.89,
"tokens/trainable": 8399066
},
{
"epoch": 0.4094655676681734,
"grad_norm": 0.059788983315229416,
"learning_rate": 2.9503492419189366e-05,
"loss": 0.7600404024124146,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13836,
"step": 385,
"tokens/total": 100925440,
"tokens/train_per_sec_per_gpu": 200.39,
"tokens/trainable": 8422560
},
{
"epoch": 0.4105291145971816,
"grad_norm": 0.06216095760464668,
"learning_rate": 2.9438155397957474e-05,
"loss": 0.8070495128631592,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.24129,
"step": 386,
"tokens/total": 101187584,
"tokens/train_per_sec_per_gpu": 149.86,
"tokens/trainable": 8443830
},
{
"epoch": 0.41159266152618984,
"grad_norm": 0.05838804319500923,
"learning_rate": 2.937268853342383e-05,
"loss": 0.759508490562439,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13723,
"step": 387,
"tokens/total": 101449728,
"tokens/train_per_sec_per_gpu": 136.92,
"tokens/trainable": 8465737
},
{
"epoch": 0.41265620845519807,
"grad_norm": 0.05906308814883232,
"learning_rate": 2.9307092726234127e-05,
"loss": 0.7870290279388428,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19686,
"step": 388,
"tokens/total": 101711872,
"tokens/train_per_sec_per_gpu": 176.32,
"tokens/trainable": 8488568
},
{
"epoch": 0.41371975538420636,
"grad_norm": 0.06456853449344635,
"learning_rate": 2.9241368878807925e-05,
"loss": 0.7161662578582764,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.04657,
"step": 389,
"tokens/total": 101974016,
"tokens/train_per_sec_per_gpu": 181.97,
"tokens/trainable": 8510960
},
{
"epoch": 0.4147833023132146,
"grad_norm": 0.0667351484298706,
"learning_rate": 2.9175517895326292e-05,
"loss": 0.8066511154174805,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.24039,
"step": 390,
"tokens/total": 102236160,
"tokens/train_per_sec_per_gpu": 208.78,
"tokens/trainable": 8532514
},
{
"epoch": 0.4158468492422228,
"grad_norm": 0.06246356666088104,
"learning_rate": 2.9109540681719322e-05,
"loss": 0.7846518158912659,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19164,
"step": 391,
"tokens/total": 102498304,
"tokens/train_per_sec_per_gpu": 224.1,
"tokens/trainable": 8555545
},
{
"epoch": 0.41691039617123105,
"grad_norm": 0.06540708988904953,
"learning_rate": 2.9043438145653717e-05,
"loss": 0.7380800843238831,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09192,
"step": 392,
"tokens/total": 102760448,
"tokens/train_per_sec_per_gpu": 190.76,
"tokens/trainable": 8577885
},
{
"epoch": 0.4179739431002393,
"grad_norm": 0.062018416821956635,
"learning_rate": 2.8977211196520257e-05,
"loss": 0.774357795715332,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.1692,
"step": 393,
"tokens/total": 103022592,
"tokens/train_per_sec_per_gpu": 212.39,
"tokens/trainable": 8600480
},
{
"epoch": 0.41903749002924756,
"grad_norm": 0.062351830303668976,
"learning_rate": 2.8910860745421305e-05,
"loss": 0.7095509767532349,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.03308,
"step": 394,
"tokens/total": 103284736,
"tokens/train_per_sec_per_gpu": 166.14,
"tokens/trainable": 8620831
},
{
"epoch": 0.4201010369582558,
"grad_norm": 0.06218433752655983,
"learning_rate": 2.884438770515829e-05,
"loss": 0.7856686115264893,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19387,
"step": 395,
"tokens/total": 103546880,
"tokens/train_per_sec_per_gpu": 235.06,
"tokens/trainable": 8646017
},
{
"epoch": 0.421164583887264,
"grad_norm": 0.07032019644975662,
"learning_rate": 2.877779299021912e-05,
"loss": 0.7358700037002563,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.0873,
"step": 396,
"tokens/total": 103809024,
"tokens/train_per_sec_per_gpu": 132.47,
"tokens/trainable": 8665893
},
{
"epoch": 0.42222813081627225,
"grad_norm": 0.057751379907131195,
"learning_rate": 2.871107751676561e-05,
"loss": 0.7476328015327454,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11199,
"step": 397,
"tokens/total": 104071168,
"tokens/train_per_sec_per_gpu": 216.14,
"tokens/trainable": 8689230
},
{
"epoch": 0.42329167774528054,
"grad_norm": 0.060179274529218674,
"learning_rate": 2.8644242202620907e-05,
"loss": 0.7824307084083557,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.18678,
"step": 398,
"tokens/total": 104333312,
"tokens/train_per_sec_per_gpu": 207.33,
"tokens/trainable": 8710606
},
{
"epoch": 0.42435522467428877,
"grad_norm": 0.07202541828155518,
"learning_rate": 2.857728796725682e-05,
"loss": 0.7747771739959717,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.17011,
"step": 399,
"tokens/total": 104595456,
"tokens/train_per_sec_per_gpu": 152.9,
"tokens/trainable": 8732079
},
{
"epoch": 0.425418771603297,
"grad_norm": 0.06443008780479431,
"learning_rate": 2.8510215731781194e-05,
"loss": 0.7466020584106445,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10982,
"step": 400,
"tokens/total": 104857600,
"tokens/train_per_sec_per_gpu": 166.03,
"tokens/trainable": 8754126
},
{
"epoch": 0.4264823185323052,
"grad_norm": 0.05945134907960892,
"learning_rate": 2.844302641892523e-05,
"loss": 0.7561734914779663,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13011,
"step": 401,
"tokens/total": 105119744,
"tokens/train_per_sec_per_gpu": 184.08,
"tokens/trainable": 8775310
},
{
"epoch": 0.42754586546131346,
"grad_norm": 0.06570764631032944,
"learning_rate": 2.83757209530308e-05,
"loss": 0.728512167930603,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.072,
"step": 402,
"tokens/total": 105381888,
"tokens/train_per_sec_per_gpu": 160.39,
"tokens/trainable": 8795593
},
{
"epoch": 0.42860941239032174,
"grad_norm": 0.06125401705503464,
"learning_rate": 2.8308300260037734e-05,
"loss": 0.754820704460144,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12723,
"step": 403,
"tokens/total": 105644032,
"tokens/train_per_sec_per_gpu": 172.16,
"tokens/trainable": 8817946
},
{
"epoch": 0.42967295931933,
"grad_norm": 0.06924828141927719,
"learning_rate": 2.8240765267471056e-05,
"loss": 0.7697207927703857,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15916,
"step": 404,
"tokens/total": 105906176,
"tokens/train_per_sec_per_gpu": 208.57,
"tokens/trainable": 8840440
},
{
"epoch": 0.4307365062483382,
"grad_norm": 0.06233648210763931,
"learning_rate": 2.8173116904428242e-05,
"loss": 0.7522628307342529,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.1218,
"step": 405,
"tokens/total": 106168320,
"tokens/train_per_sec_per_gpu": 192.61,
"tokens/trainable": 8862223
},
{
"epoch": 0.43180005317734643,
"grad_norm": 0.05627689138054848,
"learning_rate": 2.810535610156646e-05,
"loss": 0.8060315847396851,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.23901,
"step": 406,
"tokens/total": 106430464,
"tokens/train_per_sec_per_gpu": 240.07,
"tokens/trainable": 8888361
},
{
"epoch": 0.4328636001063547,
"grad_norm": 0.060442451387643814,
"learning_rate": 2.803748379108972e-05,
"loss": 0.7366658449172974,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.08896,
"step": 407,
"tokens/total": 106692608,
"tokens/train_per_sec_per_gpu": 194.32,
"tokens/trainable": 8911616
},
{
"epoch": 0.43392714703536295,
"grad_norm": 0.06363833695650101,
"learning_rate": 2.7969500906736065e-05,
"loss": 0.777472734451294,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.17597,
"step": 408,
"tokens/total": 106954752,
"tokens/train_per_sec_per_gpu": 205.98,
"tokens/trainable": 8935498
},
{
"epoch": 0.4349906939643712,
"grad_norm": 0.060264162719249725,
"learning_rate": 2.7901408383764776e-05,
"loss": 0.8076545000076294,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.24264,
"step": 409,
"tokens/total": 107216896,
"tokens/train_per_sec_per_gpu": 191.96,
"tokens/trainable": 8959641
},
{
"epoch": 0.4360542408933794,
"grad_norm": 0.0615147240459919,
"learning_rate": 2.783320715894341e-05,
"loss": 0.7697659730911255,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15926,
"step": 410,
"tokens/total": 107479040,
"tokens/train_per_sec_per_gpu": 229.4,
"tokens/trainable": 8981683
},
{
"epoch": 0.43711778782238764,
"grad_norm": 0.05955628678202629,
"learning_rate": 2.7764898170534993e-05,
"loss": 0.7136242389678955,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.04138,
"step": 411,
"tokens/total": 107741184,
"tokens/train_per_sec_per_gpu": 197.85,
"tokens/trainable": 9002018
},
{
"epoch": 0.4381813347513959,
"grad_norm": 0.05883246660232544,
"learning_rate": 2.76964823582851e-05,
"loss": 0.7270078659057617,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.06888,
"step": 412,
"tokens/total": 108003328,
"tokens/train_per_sec_per_gpu": 209.83,
"tokens/trainable": 9024288
},
{
"epoch": 0.43924488168040415,
"grad_norm": 0.06609495729207993,
"learning_rate": 2.7627960663408874e-05,
"loss": 0.7433674335479736,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10301,
"step": 413,
"tokens/total": 108265472,
"tokens/train_per_sec_per_gpu": 205.82,
"tokens/trainable": 9046881
},
{
"epoch": 0.4403084286094124,
"grad_norm": 0.07620103657245636,
"learning_rate": 2.7559334028578135e-05,
"loss": 0.7862449884414673,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19514,
"step": 414,
"tokens/total": 108527616,
"tokens/train_per_sec_per_gpu": 181.75,
"tokens/trainable": 9067519
},
{
"epoch": 0.4413719755384206,
"grad_norm": 0.06601685285568237,
"learning_rate": 2.7490603397908393e-05,
"loss": 0.7648828029632568,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.14874,
"step": 415,
"tokens/total": 108789760,
"tokens/train_per_sec_per_gpu": 174.3,
"tokens/trainable": 9089133
},
{
"epoch": 0.4424355224674289,
"grad_norm": 0.06513883173465729,
"learning_rate": 2.742176971694585e-05,
"loss": 0.7682895660400391,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15608,
"step": 416,
"tokens/total": 109051904,
"tokens/train_per_sec_per_gpu": 177.11,
"tokens/trainable": 9110997
},
{
"epoch": 0.44349906939643713,
"grad_norm": 0.06401567906141281,
"learning_rate": 2.7352833932654402e-05,
"loss": 0.7881615161895752,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19935,
"step": 417,
"tokens/total": 109314048,
"tokens/train_per_sec_per_gpu": 176.32,
"tokens/trainable": 9134139
},
{
"epoch": 0.44456261632544536,
"grad_norm": 0.07034140825271606,
"learning_rate": 2.7283796993402613e-05,
"loss": 0.8482910990715027,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.33565,
"step": 418,
"tokens/total": 109576192,
"tokens/train_per_sec_per_gpu": 202.73,
"tokens/trainable": 9155746
},
{
"epoch": 0.4456261632544536,
"grad_norm": 0.06932256370782852,
"learning_rate": 2.721465984895066e-05,
"loss": 0.7511000633239746,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11933,
"step": 419,
"tokens/total": 109838336,
"tokens/train_per_sec_per_gpu": 151.07,
"tokens/trainable": 9175997
},
{
"epoch": 0.4466897101834618,
"grad_norm": 0.05801774561405182,
"learning_rate": 2.714542345043726e-05,
"loss": 0.7160236835479736,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.04628,
"step": 420,
"tokens/total": 110100480,
"tokens/train_per_sec_per_gpu": 206.36,
"tokens/trainable": 9201154
},
{
"epoch": 0.4477532571124701,
"grad_norm": 0.0719035267829895,
"learning_rate": 2.7076088750366617e-05,
"loss": 0.7472874522209167,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11127,
"step": 421,
"tokens/total": 110362624,
"tokens/train_per_sec_per_gpu": 221.89,
"tokens/trainable": 9223146
},
{
"epoch": 0.44881680404147833,
"grad_norm": 0.06794784218072891,
"learning_rate": 2.700665670259527e-05,
"loss": 0.789624035358429,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.20257,
"step": 422,
"tokens/total": 110624768,
"tokens/train_per_sec_per_gpu": 182.17,
"tokens/trainable": 9245502
},
{
"epoch": 0.44988035097048656,
"grad_norm": 0.06335590779781342,
"learning_rate": 2.693712826231903e-05,
"loss": 0.720179557800293,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.0548,
"step": 423,
"tokens/total": 110886912,
"tokens/train_per_sec_per_gpu": 184.76,
"tokens/trainable": 9267573
},
{
"epoch": 0.4509438978994948,
"grad_norm": 0.06363626569509506,
"learning_rate": 2.6867504386059776e-05,
"loss": 0.771357536315918,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.1627,
"step": 424,
"tokens/total": 111149056,
"tokens/train_per_sec_per_gpu": 181.22,
"tokens/trainable": 9290024
},
{
"epoch": 0.4520074448285031,
"grad_norm": 0.06436197459697723,
"learning_rate": 2.679778603165233e-05,
"loss": 0.8299375772476196,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.29318,
"step": 425,
"tokens/total": 111411200,
"tokens/train_per_sec_per_gpu": 181.49,
"tokens/trainable": 9312089
},
{
"epoch": 0.4530709917575113,
"grad_norm": 0.07020284235477448,
"learning_rate": 2.6727974158231312e-05,
"loss": 0.7461998462677002,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10897,
"step": 426,
"tokens/total": 111673344,
"tokens/train_per_sec_per_gpu": 179.65,
"tokens/trainable": 9332072
},
{
"epoch": 0.45413453868651954,
"grad_norm": 0.07214612513780594,
"learning_rate": 2.6658069726217863e-05,
"loss": 0.7696465253829956,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.159,
"step": 427,
"tokens/total": 111935488,
"tokens/train_per_sec_per_gpu": 248.27,
"tokens/trainable": 9354855
},
{
"epoch": 0.45519808561552777,
"grad_norm": 0.07788221538066864,
"learning_rate": 2.6588073697306494e-05,
"loss": 0.7938302755355835,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.21185,
"step": 428,
"tokens/total": 112197632,
"tokens/train_per_sec_per_gpu": 193.12,
"tokens/trainable": 9376477
},
{
"epoch": 0.456261632544536,
"grad_norm": 0.06307144463062286,
"learning_rate": 2.6517987034451846e-05,
"loss": 0.735474705696106,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.08647,
"step": 429,
"tokens/total": 112459776,
"tokens/train_per_sec_per_gpu": 150.92,
"tokens/trainable": 9396690
},
{
"epoch": 0.4573251794735443,
"grad_norm": 0.06643529236316681,
"learning_rate": 2.6447810701855436e-05,
"loss": 0.7078378200531006,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.0296,
"step": 430,
"tokens/total": 112721920,
"tokens/train_per_sec_per_gpu": 209.14,
"tokens/trainable": 9418386
},
{
"epoch": 0.4583887264025525,
"grad_norm": 0.06323806941509247,
"learning_rate": 2.637754566495238e-05,
"loss": 0.7661035060882568,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15137,
"step": 431,
"tokens/total": 112984064,
"tokens/train_per_sec_per_gpu": 174.47,
"tokens/trainable": 9440604
},
{
"epoch": 0.45945227333156075,
"grad_norm": 0.06076245754957199,
"learning_rate": 2.6307192890398126e-05,
"loss": 0.7844895720481873,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19129,
"step": 432,
"tokens/total": 113246208,
"tokens/train_per_sec_per_gpu": 186.77,
"tokens/trainable": 9464422
},
{
"epoch": 0.460515820260569,
"grad_norm": 0.0693245679140091,
"learning_rate": 2.6236753346055176e-05,
"loss": 0.6995319724082947,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.01281,
"step": 433,
"tokens/total": 113508352,
"tokens/train_per_sec_per_gpu": 182.76,
"tokens/trainable": 9485760
},
{
"epoch": 0.46157936718957726,
"grad_norm": 0.060889989137649536,
"learning_rate": 2.6166228000979726e-05,
"loss": 0.6955425143241882,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.0048,
"step": 434,
"tokens/total": 113770496,
"tokens/train_per_sec_per_gpu": 201.76,
"tokens/trainable": 9508485
},
{
"epoch": 0.4626429141185855,
"grad_norm": 0.06848379224538803,
"learning_rate": 2.6095617825408357e-05,
"loss": 0.7852069735527039,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19286,
"step": 435,
"tokens/total": 114032640,
"tokens/train_per_sec_per_gpu": 189.68,
"tokens/trainable": 9530965
},
{
"epoch": 0.4637064610475937,
"grad_norm": 0.07383300364017487,
"learning_rate": 2.6024923790744686e-05,
"loss": 0.7191091775894165,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.0526,
"step": 436,
"tokens/total": 114294784,
"tokens/train_per_sec_per_gpu": 196.81,
"tokens/trainable": 9553360
},
{
"epoch": 0.46477000797660195,
"grad_norm": 0.06411275267601013,
"learning_rate": 2.5954146869546018e-05,
"loss": 0.7149425148963928,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.04407,
"step": 437,
"tokens/total": 114556928,
"tokens/train_per_sec_per_gpu": 222.64,
"tokens/trainable": 9574783
},
{
"epoch": 0.46583355490561024,
"grad_norm": 0.06931298971176147,
"learning_rate": 2.588328803550993e-05,
"loss": 0.8728055953979492,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.39362,
"step": 438,
"tokens/total": 114819072,
"tokens/train_per_sec_per_gpu": 154.42,
"tokens/trainable": 9595109
},
{
"epoch": 0.46689710183461847,
"grad_norm": 0.07122842967510223,
"learning_rate": 2.5812348263460916e-05,
"loss": 0.8005967140197754,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.22687,
"step": 439,
"tokens/total": 115081216,
"tokens/train_per_sec_per_gpu": 195.46,
"tokens/trainable": 9615835
},
{
"epoch": 0.4679606487636267,
"grad_norm": 0.07328256964683533,
"learning_rate": 2.5741328529336934e-05,
"loss": 0.8034292459487915,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.23319,
"step": 440,
"tokens/total": 115343360,
"tokens/train_per_sec_per_gpu": 232.42,
"tokens/trainable": 9638791
},
{
"epoch": 0.4690241956926349,
"grad_norm": 0.07195594161748886,
"learning_rate": 2.5670229810176026e-05,
"loss": 0.8185476064682007,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.2672,
"step": 441,
"tokens/total": 115605504,
"tokens/train_per_sec_per_gpu": 154.16,
"tokens/trainable": 9659990
},
{
"epoch": 0.47008774262164316,
"grad_norm": 0.07275015860795975,
"learning_rate": 2.5599053084102838e-05,
"loss": 0.8078755736351013,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.24314,
"step": 442,
"tokens/total": 115867648,
"tokens/train_per_sec_per_gpu": 167.17,
"tokens/trainable": 9680516
},
{
"epoch": 0.47115128955065144,
"grad_norm": 0.07171031087636948,
"learning_rate": 2.5527799330315182e-05,
"loss": 0.7240858674049377,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.06284,
"step": 443,
"tokens/total": 116129792,
"tokens/train_per_sec_per_gpu": 189.32,
"tokens/trainable": 9700527
},
{
"epoch": 0.47221483647965967,
"grad_norm": 0.05753646418452263,
"learning_rate": 2.5456469529070566e-05,
"loss": 0.7394333481788635,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09475,
"step": 444,
"tokens/total": 116391936,
"tokens/train_per_sec_per_gpu": 215.9,
"tokens/trainable": 9725734
},
{
"epoch": 0.4732783834086679,
"grad_norm": 0.0627320408821106,
"learning_rate": 2.5385064661672692e-05,
"loss": 0.7002488970756531,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.01425,
"step": 445,
"tokens/total": 116654080,
"tokens/train_per_sec_per_gpu": 166.84,
"tokens/trainable": 9748076
},
{
"epoch": 0.47434193033767613,
"grad_norm": 0.06529032438993454,
"learning_rate": 2.5313585710457985e-05,
"loss": 0.7247512936592102,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.06422,
"step": 446,
"tokens/total": 116916224,
"tokens/train_per_sec_per_gpu": 182.03,
"tokens/trainable": 9770158
},
{
"epoch": 0.4754054772666844,
"grad_norm": 0.07498825341463089,
"learning_rate": 2.5242033658782043e-05,
"loss": 0.7564839124679565,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13077,
"step": 447,
"tokens/total": 117178368,
"tokens/train_per_sec_per_gpu": 177.15,
"tokens/trainable": 9791564
},
{
"epoch": 0.47646902419569265,
"grad_norm": 0.07310041040182114,
"learning_rate": 2.5170409491006145e-05,
"loss": 0.8869813680648804,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.42779,
"step": 448,
"tokens/total": 117440512,
"tokens/train_per_sec_per_gpu": 153.04,
"tokens/trainable": 9812412
},
{
"epoch": 0.4775325711247009,
"grad_norm": 0.06502556055784225,
"learning_rate": 2.5098714192483683e-05,
"loss": 0.733482837677002,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.08232,
"step": 449,
"tokens/total": 117702656,
"tokens/train_per_sec_per_gpu": 234.84,
"tokens/trainable": 9836100
},
{
"epoch": 0.4785961180537091,
"grad_norm": 0.0711643397808075,
"learning_rate": 2.50269487495466e-05,
"loss": 0.7789556384086609,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.1792,
"step": 450,
"tokens/total": 117964800,
"tokens/train_per_sec_per_gpu": 198.82,
"tokens/trainable": 9857145
},
{
"epoch": 0.4785961180537091,
"eval_loss": 0.7710337042808533,
"eval_ppl": 2.162,
"eval_runtime": 237.2453,
"eval_samples_per_second": 28.182,
"eval_steps_per_second": 1.762,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 38.19,
"memory/max_allocated (GiB)": 38.19,
"step": 450
},
{
"epoch": 0.47965966498271734,
"grad_norm": 0.06686612218618393,
"learning_rate": 2.4955114149491865e-05,
"loss": 0.7786468267440796,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.17852,
"step": 451,
"tokens/total": 118226944,
"tokens/train_per_sec_per_gpu": 164.67,
"tokens/trainable": 9878218
},
{
"epoch": 0.4807232119117256,
"grad_norm": 0.0646248385310173,
"learning_rate": 2.488321138056783e-05,
"loss": 0.7580331563949585,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13407,
"step": 452,
"tokens/total": 118489088,
"tokens/train_per_sec_per_gpu": 244.84,
"tokens/trainable": 9900786
},
{
"epoch": 0.48178675884073385,
"grad_norm": 0.06668704003095627,
"learning_rate": 2.481124143196069e-05,
"loss": 0.7126317620277405,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.03935,
"step": 453,
"tokens/total": 118751232,
"tokens/train_per_sec_per_gpu": 175.52,
"tokens/trainable": 9921265
},
{
"epoch": 0.4828503057697421,
"grad_norm": 0.06527574360370636,
"learning_rate": 2.473920529378083e-05,
"loss": 0.8261401653289795,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.28448,
"step": 454,
"tokens/total": 119013376,
"tokens/train_per_sec_per_gpu": 182.36,
"tokens/trainable": 9942755
},
{
"epoch": 0.4839138526987503,
"grad_norm": 0.07106975466012955,
"learning_rate": 2.4667103957049237e-05,
"loss": 0.7335352897644043,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.08243,
"step": 455,
"tokens/total": 119275520,
"tokens/train_per_sec_per_gpu": 161.14,
"tokens/trainable": 9963450
},
{
"epoch": 0.4849773996277586,
"grad_norm": 0.06658606976270676,
"learning_rate": 2.4594938413683842e-05,
"loss": 0.8705431222915649,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.38821,
"step": 456,
"tokens/total": 119537664,
"tokens/train_per_sec_per_gpu": 221.13,
"tokens/trainable": 9986403
},
{
"epoch": 0.48604094655676683,
"grad_norm": 0.06788789480924606,
"learning_rate": 2.4522709656485896e-05,
"loss": 0.8447569608688354,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.32741,
"step": 457,
"tokens/total": 119799808,
"tokens/train_per_sec_per_gpu": 181.47,
"tokens/trainable": 10007854
},
{
"epoch": 0.48710449348577506,
"grad_norm": 0.06112990900874138,
"learning_rate": 2.445041867912629e-05,
"loss": 0.7580868601799011,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13419,
"step": 458,
"tokens/total": 120061952,
"tokens/train_per_sec_per_gpu": 243.31,
"tokens/trainable": 10033216
},
{
"epoch": 0.4881680404147833,
"grad_norm": 0.06946436315774918,
"learning_rate": 2.43780664761319e-05,
"loss": 0.7739719152450562,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.16836,
"step": 459,
"tokens/total": 120324096,
"tokens/train_per_sec_per_gpu": 186.88,
"tokens/trainable": 10054546
},
{
"epoch": 0.4892315873437915,
"grad_norm": 0.06711182743310928,
"learning_rate": 2.4305654042871893e-05,
"loss": 0.7676090002059937,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15461,
"step": 460,
"tokens/total": 120586240,
"tokens/train_per_sec_per_gpu": 203.9,
"tokens/trainable": 10077655
},
{
"epoch": 0.4902951342727998,
"grad_norm": 0.07618647068738937,
"learning_rate": 2.4233182375544052e-05,
"loss": 0.8108090758323669,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.24973,
"step": 461,
"tokens/total": 120848384,
"tokens/train_per_sec_per_gpu": 203.76,
"tokens/trainable": 10099531
},
{
"epoch": 0.49135868120180803,
"grad_norm": 0.062073446810245514,
"learning_rate": 2.4160652471161043e-05,
"loss": 0.7443853616714478,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10515,
"step": 462,
"tokens/total": 121110528,
"tokens/train_per_sec_per_gpu": 172.63,
"tokens/trainable": 10121476
},
{
"epoch": 0.49242222813081626,
"grad_norm": 0.06608369201421738,
"learning_rate": 2.408806532753674e-05,
"loss": 0.7803705930709839,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.18228,
"step": 463,
"tokens/total": 121372672,
"tokens/train_per_sec_per_gpu": 193.2,
"tokens/trainable": 10142900
},
{
"epoch": 0.4934857750598245,
"grad_norm": 0.06678762286901474,
"learning_rate": 2.4015421943272442e-05,
"loss": 0.7602465152740479,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.1388,
"step": 464,
"tokens/total": 121634816,
"tokens/train_per_sec_per_gpu": 144.23,
"tokens/trainable": 10162519
},
{
"epoch": 0.4945493219888328,
"grad_norm": 0.07111706584692001,
"learning_rate": 2.3942723317743194e-05,
"loss": 0.8450040817260742,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.32799,
"step": 465,
"tokens/total": 121896960,
"tokens/train_per_sec_per_gpu": 172.92,
"tokens/trainable": 10184843
},
{
"epoch": 0.495612868917841,
"grad_norm": 0.06318546831607819,
"learning_rate": 2.3869970451083996e-05,
"loss": 0.7744694948196411,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.16944,
"step": 466,
"tokens/total": 122159104,
"tokens/train_per_sec_per_gpu": 211.14,
"tokens/trainable": 10208041
},
{
"epoch": 0.49667641584684924,
"grad_norm": 0.06685180962085724,
"learning_rate": 2.3797164344176054e-05,
"loss": 0.7870070934295654,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19681,
"step": 467,
"tokens/total": 122421248,
"tokens/train_per_sec_per_gpu": 188.13,
"tokens/trainable": 10229287
},
{
"epoch": 0.49773996277585747,
"grad_norm": 0.058472346514463425,
"learning_rate": 2.3724305998633033e-05,
"loss": 0.7486791610717773,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11421,
"step": 468,
"tokens/total": 122683392,
"tokens/train_per_sec_per_gpu": 193.34,
"tokens/trainable": 10253293
},
{
"epoch": 0.4988035097048657,
"grad_norm": 0.06563540548086166,
"learning_rate": 2.365139641678724e-05,
"loss": 0.793043851852417,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.21011,
"step": 469,
"tokens/total": 122945536,
"tokens/train_per_sec_per_gpu": 208.04,
"tokens/trainable": 10275964
},
{
"epoch": 0.499867056633874,
"grad_norm": 0.06955686956644058,
"learning_rate": 2.3578436601675857e-05,
"loss": 0.767907977104187,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15525,
"step": 470,
"tokens/total": 123207680,
"tokens/train_per_sec_per_gpu": 162.65,
"tokens/trainable": 10294893
},
{
"epoch": 0.5009306035628822,
"grad_norm": 0.06527870893478394,
"learning_rate": 2.3505427557027153e-05,
"loss": 0.7281315922737122,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.07121,
"step": 471,
"tokens/total": 123469824,
"tokens/train_per_sec_per_gpu": 200.9,
"tokens/trainable": 10318126
},
{
"epoch": 0.5019941504918904,
"grad_norm": 0.05844523012638092,
"learning_rate": 2.3432370287246644e-05,
"loss": 0.8092571496963501,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.24624,
"step": 472,
"tokens/total": 123731968,
"tokens/train_per_sec_per_gpu": 191.12,
"tokens/trainable": 10340892
},
{
"epoch": 0.5030576974208987,
"grad_norm": 0.0627407431602478,
"learning_rate": 2.3359265797403297e-05,
"loss": 0.8085892200469971,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.24474,
"step": 473,
"tokens/total": 123994112,
"tokens/train_per_sec_per_gpu": 171.13,
"tokens/trainable": 10364090
},
{
"epoch": 0.5041212443499069,
"grad_norm": 0.06694270670413971,
"learning_rate": 2.3286115093215717e-05,
"loss": 0.7958250045776367,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.21627,
"step": 474,
"tokens/total": 124256256,
"tokens/train_per_sec_per_gpu": 199.51,
"tokens/trainable": 10385759
},
{
"epoch": 0.5051847912789151,
"grad_norm": 0.06240719184279442,
"learning_rate": 2.3212919181038264e-05,
"loss": 0.7695267200469971,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15874,
"step": 475,
"tokens/total": 124518400,
"tokens/train_per_sec_per_gpu": 229.77,
"tokens/trainable": 10411362
},
{
"epoch": 0.5062483382079235,
"grad_norm": 0.06532912701368332,
"learning_rate": 2.313967906784725e-05,
"loss": 0.7777595520019531,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.17659,
"step": 476,
"tokens/total": 124780544,
"tokens/train_per_sec_per_gpu": 235.82,
"tokens/trainable": 10433694
},
{
"epoch": 0.5073118851369317,
"grad_norm": 0.07455461472272873,
"learning_rate": 2.306639576122708e-05,
"loss": 0.8379300832748413,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.31158,
"step": 477,
"tokens/total": 125042688,
"tokens/train_per_sec_per_gpu": 163.21,
"tokens/trainable": 10452642
},
{
"epoch": 0.5083754320659399,
"grad_norm": 0.06591842323541641,
"learning_rate": 2.2993070269356372e-05,
"loss": 0.8277432322502136,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.28815,
"step": 478,
"tokens/total": 125304832,
"tokens/train_per_sec_per_gpu": 150.23,
"tokens/trainable": 10473380
},
{
"epoch": 0.5094389789949482,
"grad_norm": 0.06278102844953537,
"learning_rate": 2.2919703600994096e-05,
"loss": 0.7098827362060547,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.03375,
"step": 479,
"tokens/total": 125566976,
"tokens/train_per_sec_per_gpu": 225.4,
"tokens/trainable": 10496551
},
{
"epoch": 0.5105025259239564,
"grad_norm": 0.0822196677327156,
"learning_rate": 2.2846296765465708e-05,
"loss": 0.7754343152046204,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.17154,
"step": 480,
"tokens/total": 125829120,
"tokens/train_per_sec_per_gpu": 189.13,
"tokens/trainable": 10518279
},
{
"epoch": 0.5115660728529646,
"grad_norm": 0.06427222490310669,
"learning_rate": 2.2772850772649245e-05,
"loss": 0.7595022916793823,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13721,
"step": 481,
"tokens/total": 126091264,
"tokens/train_per_sec_per_gpu": 199.34,
"tokens/trainable": 10541893
},
{
"epoch": 0.5126296197819729,
"grad_norm": 0.07555945217609406,
"learning_rate": 2.269936663296146e-05,
"loss": 0.7030783295631409,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.01996,
"step": 482,
"tokens/total": 126353408,
"tokens/train_per_sec_per_gpu": 131.21,
"tokens/trainable": 10560989
},
{
"epoch": 0.5136931667109811,
"grad_norm": 0.06928715854883194,
"learning_rate": 2.262584535734387e-05,
"loss": 0.7435761094093323,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10344,
"step": 483,
"tokens/total": 126615552,
"tokens/train_per_sec_per_gpu": 198.19,
"tokens/trainable": 10582215
},
{
"epoch": 0.5147567136399893,
"grad_norm": 0.07140571624040604,
"learning_rate": 2.2552287957248914e-05,
"loss": 0.7427330017089844,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10167,
"step": 484,
"tokens/total": 126877696,
"tokens/train_per_sec_per_gpu": 177.97,
"tokens/trainable": 10603392
},
{
"epoch": 0.5158202605689977,
"grad_norm": 0.05892965570092201,
"learning_rate": 2.2478695444625993e-05,
"loss": 0.7203789949417114,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.05521,
"step": 485,
"tokens/total": 127139840,
"tokens/train_per_sec_per_gpu": 178.16,
"tokens/trainable": 10625660
},
{
"epoch": 0.5168838074980059,
"grad_norm": 0.06429523229598999,
"learning_rate": 2.240506883190756e-05,
"loss": 0.7731969952583313,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.16668,
"step": 486,
"tokens/total": 127401984,
"tokens/train_per_sec_per_gpu": 219.13,
"tokens/trainable": 10649385
},
{
"epoch": 0.5179473544270141,
"grad_norm": 0.07350295782089233,
"learning_rate": 2.2331409131995186e-05,
"loss": 0.7495805621147156,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11611,
"step": 487,
"tokens/total": 127664128,
"tokens/train_per_sec_per_gpu": 234.6,
"tokens/trainable": 10670554
},
{
"epoch": 0.5190109013560223,
"grad_norm": 0.06763040274381638,
"learning_rate": 2.2257717358245645e-05,
"loss": 0.7657451629638672,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.1506,
"step": 488,
"tokens/total": 127926272,
"tokens/train_per_sec_per_gpu": 211.17,
"tokens/trainable": 10694102
},
{
"epoch": 0.5200744482850306,
"grad_norm": 0.059085726737976074,
"learning_rate": 2.2183994524456946e-05,
"loss": 0.7091976404190063,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.03236,
"step": 489,
"tokens/total": 128188416,
"tokens/train_per_sec_per_gpu": 185.08,
"tokens/trainable": 10716407
},
{
"epoch": 0.5211379952140388,
"grad_norm": 0.06827449053525925,
"learning_rate": 2.2110241644854415e-05,
"loss": 0.7978835105895996,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.22084,
"step": 490,
"tokens/total": 128450560,
"tokens/train_per_sec_per_gpu": 241.5,
"tokens/trainable": 10739309
},
{
"epoch": 0.522201542143047,
"grad_norm": 0.06967286020517349,
"learning_rate": 2.2036459734076715e-05,
"loss": 0.7917447090148926,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.20724,
"step": 491,
"tokens/total": 128712704,
"tokens/train_per_sec_per_gpu": 183.24,
"tokens/trainable": 10759856
},
{
"epoch": 0.5232650890720553,
"grad_norm": 0.06789640337228775,
"learning_rate": 2.196264980716189e-05,
"loss": 0.785992443561554,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19458,
"step": 492,
"tokens/total": 128974848,
"tokens/train_per_sec_per_gpu": 208.01,
"tokens/trainable": 10783453
},
{
"epoch": 0.5243286360010635,
"grad_norm": 0.07713824510574341,
"learning_rate": 2.1888812879533438e-05,
"loss": 0.740135908126831,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09622,
"step": 493,
"tokens/total": 129236992,
"tokens/train_per_sec_per_gpu": 193.38,
"tokens/trainable": 10803022
},
{
"epoch": 0.5253921829300718,
"grad_norm": 0.06190735474228859,
"learning_rate": 2.1814949966986288e-05,
"loss": 0.7506577968597412,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11839,
"step": 494,
"tokens/total": 129499136,
"tokens/train_per_sec_per_gpu": 186.68,
"tokens/trainable": 10825782
},
{
"epoch": 0.5264557298590801,
"grad_norm": 0.07524209469556808,
"learning_rate": 2.174106208567286e-05,
"loss": 0.8435920476913452,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.3247,
"step": 495,
"tokens/total": 129761280,
"tokens/train_per_sec_per_gpu": 209.07,
"tokens/trainable": 10848136
},
{
"epoch": 0.5275192767880883,
"grad_norm": 0.0792510136961937,
"learning_rate": 2.166715025208908e-05,
"loss": 0.795114278793335,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.21469,
"step": 496,
"tokens/total": 130023424,
"tokens/train_per_sec_per_gpu": 176.49,
"tokens/trainable": 10869582
},
{
"epoch": 0.5285828237170965,
"grad_norm": 0.06474092602729797,
"learning_rate": 2.1593215483060382e-05,
"loss": 0.7897614240646362,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.20287,
"step": 497,
"tokens/total": 130285568,
"tokens/train_per_sec_per_gpu": 209.6,
"tokens/trainable": 10893683
},
{
"epoch": 0.5296463706461048,
"grad_norm": 0.06536766141653061,
"learning_rate": 2.151925879572774e-05,
"loss": 0.7543013095855713,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12613,
"step": 498,
"tokens/total": 130547712,
"tokens/train_per_sec_per_gpu": 224.22,
"tokens/trainable": 10916399
},
{
"epoch": 0.530709917575113,
"grad_norm": 0.06765280663967133,
"learning_rate": 2.144528120753365e-05,
"loss": 0.7390546202659607,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09395,
"step": 499,
"tokens/total": 130809856,
"tokens/train_per_sec_per_gpu": 169.0,
"tokens/trainable": 10938004
},
{
"epoch": 0.5317734645041212,
"grad_norm": 0.07433243840932846,
"learning_rate": 2.137128373620817e-05,
"loss": 0.7480766773223877,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11293,
"step": 500,
"tokens/total": 131072000,
"tokens/train_per_sec_per_gpu": 188.24,
"tokens/trainable": 10960082
},
{
"epoch": 0.5328370114331294,
"grad_norm": 0.058342620730400085,
"learning_rate": 2.129726739975486e-05,
"loss": 0.73946613073349,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09482,
"step": 501,
"tokens/total": 131334144,
"tokens/train_per_sec_per_gpu": 230.56,
"tokens/trainable": 10983591
},
{
"epoch": 0.5339005583621377,
"grad_norm": 0.06705200672149658,
"learning_rate": 2.1223233216436858e-05,
"loss": 0.7899049520492554,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.20319,
"step": 502,
"tokens/total": 131596288,
"tokens/train_per_sec_per_gpu": 177.49,
"tokens/trainable": 11007036
},
{
"epoch": 0.534964105291146,
"grad_norm": 0.07227369397878647,
"learning_rate": 2.114918220476279e-05,
"loss": 0.7793487310409546,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.18005,
"step": 503,
"tokens/total": 131858432,
"tokens/train_per_sec_per_gpu": 173.77,
"tokens/trainable": 11030251
},
{
"epoch": 0.5360276522201542,
"grad_norm": 0.0691395178437233,
"learning_rate": 2.1075115383472803e-05,
"loss": 0.7794291973114014,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.18023,
"step": 504,
"tokens/total": 132120576,
"tokens/train_per_sec_per_gpu": 161.28,
"tokens/trainable": 11049765
},
{
"epoch": 0.5370911991491625,
"grad_norm": 0.06914931535720825,
"learning_rate": 2.1001033771524556e-05,
"loss": 0.7482678890228271,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11334,
"step": 505,
"tokens/total": 132382720,
"tokens/train_per_sec_per_gpu": 204.45,
"tokens/trainable": 11072121
},
{
"epoch": 0.5381547460781707,
"grad_norm": 0.06037479639053345,
"learning_rate": 2.0926938388079168e-05,
"loss": 0.7241630554199219,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.063,
"step": 506,
"tokens/total": 132644864,
"tokens/train_per_sec_per_gpu": 211.36,
"tokens/trainable": 11094232
},
{
"epoch": 0.5392182930071789,
"grad_norm": 0.06841259449720383,
"learning_rate": 2.085283025248723e-05,
"loss": 0.7322399616241455,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.07973,
"step": 507,
"tokens/total": 132907008,
"tokens/train_per_sec_per_gpu": 200.68,
"tokens/trainable": 11118027
},
{
"epoch": 0.5402818399361872,
"grad_norm": 0.0649460032582283,
"learning_rate": 2.0778710384274757e-05,
"loss": 0.7410999536514282,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09824,
"step": 508,
"tokens/total": 133169152,
"tokens/train_per_sec_per_gpu": 187.65,
"tokens/trainable": 11142472
},
{
"epoch": 0.5413453868651954,
"grad_norm": 0.06589141488075256,
"learning_rate": 2.0704579803129184e-05,
"loss": 0.7477791905403137,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.1123,
"step": 509,
"tokens/total": 133431296,
"tokens/train_per_sec_per_gpu": 186.9,
"tokens/trainable": 11164967
},
{
"epoch": 0.5424089337942036,
"grad_norm": 0.07234744727611542,
"learning_rate": 2.0630439528885314e-05,
"loss": 0.82126784324646,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.27338,
"step": 510,
"tokens/total": 133693440,
"tokens/train_per_sec_per_gpu": 183.66,
"tokens/trainable": 11187755
},
{
"epoch": 0.5434724807232119,
"grad_norm": 0.06735506653785706,
"learning_rate": 2.0556290581511314e-05,
"loss": 0.7757540941238403,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.17223,
"step": 511,
"tokens/total": 133955584,
"tokens/train_per_sec_per_gpu": 200.97,
"tokens/trainable": 11210450
},
{
"epoch": 0.5445360276522202,
"grad_norm": 0.06329286843538284,
"learning_rate": 2.0482133981094656e-05,
"loss": 0.7571574449539185,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13221,
"step": 512,
"tokens/total": 134217728,
"tokens/train_per_sec_per_gpu": 193.23,
"tokens/trainable": 11232157
},
{
"epoch": 0.5455995745812284,
"grad_norm": 0.06758707016706467,
"learning_rate": 2.0407970747828113e-05,
"loss": 0.7560121417045593,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12977,
"step": 513,
"tokens/total": 134479872,
"tokens/train_per_sec_per_gpu": 171.6,
"tokens/trainable": 11253596
},
{
"epoch": 0.5466631215102367,
"grad_norm": 0.067698173224926,
"learning_rate": 2.033380190199569e-05,
"loss": 0.7989984750747681,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.22331,
"step": 514,
"tokens/total": 134742016,
"tokens/train_per_sec_per_gpu": 206.04,
"tokens/trainable": 11276800
},
{
"epoch": 0.5477266684392449,
"grad_norm": 0.06352519989013672,
"learning_rate": 2.025962846395862e-05,
"loss": 0.7677520513534546,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15492,
"step": 515,
"tokens/total": 135004160,
"tokens/train_per_sec_per_gpu": 189.51,
"tokens/trainable": 11300134
},
{
"epoch": 0.5487902153682531,
"grad_norm": 0.0675693228840828,
"learning_rate": 2.0185451454141307e-05,
"loss": 0.8176583647727966,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.26519,
"step": 516,
"tokens/total": 135266304,
"tokens/train_per_sec_per_gpu": 170.04,
"tokens/trainable": 11322260
},
{
"epoch": 0.5498537622972614,
"grad_norm": 0.07476314902305603,
"learning_rate": 2.0111271893017298e-05,
"loss": 0.7549651861190796,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12754,
"step": 517,
"tokens/total": 135528448,
"tokens/train_per_sec_per_gpu": 178.37,
"tokens/trainable": 11342564
},
{
"epoch": 0.5509173092262696,
"grad_norm": 0.08098773658275604,
"learning_rate": 2.0037090801095217e-05,
"loss": 0.8205512762069702,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.27175,
"step": 518,
"tokens/total": 135790592,
"tokens/train_per_sec_per_gpu": 195.1,
"tokens/trainable": 11364054
},
{
"epoch": 0.5519808561552778,
"grad_norm": 0.07975499331951141,
"learning_rate": 1.9962909198904782e-05,
"loss": 0.822569727897644,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.27634,
"step": 519,
"tokens/total": 136052736,
"tokens/train_per_sec_per_gpu": 185.35,
"tokens/trainable": 11385094
},
{
"epoch": 0.553044403084286,
"grad_norm": 0.0654132217168808,
"learning_rate": 1.9888728106982712e-05,
"loss": 0.7287492156028748,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.07249,
"step": 520,
"tokens/total": 136314880,
"tokens/train_per_sec_per_gpu": 188.86,
"tokens/trainable": 11409016
},
{
"epoch": 0.5541079500132944,
"grad_norm": 0.06833093613386154,
"learning_rate": 1.98145485458587e-05,
"loss": 0.7531672716140747,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12372,
"step": 521,
"tokens/total": 136577024,
"tokens/train_per_sec_per_gpu": 246.44,
"tokens/trainable": 11431115
},
{
"epoch": 0.5551714969423026,
"grad_norm": 0.0704493448138237,
"learning_rate": 1.9740371536041388e-05,
"loss": 0.8314918875694275,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.29674,
"step": 522,
"tokens/total": 136839168,
"tokens/train_per_sec_per_gpu": 211.44,
"tokens/trainable": 11454368
},
{
"epoch": 0.5562350438713108,
"grad_norm": 0.07962756603956223,
"learning_rate": 1.966619809800432e-05,
"loss": 0.7510333061218262,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11919,
"step": 523,
"tokens/total": 137101312,
"tokens/train_per_sec_per_gpu": 174.57,
"tokens/trainable": 11474895
},
{
"epoch": 0.5572985908003191,
"grad_norm": 0.07758195698261261,
"learning_rate": 1.95920292521719e-05,
"loss": 0.7872558236122131,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19736,
"step": 524,
"tokens/total": 137363456,
"tokens/train_per_sec_per_gpu": 204.94,
"tokens/trainable": 11494814
},
{
"epoch": 0.5583621377293273,
"grad_norm": 0.06898010522127151,
"learning_rate": 1.9517866018905347e-05,
"loss": 0.7131535410881042,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.04042,
"step": 525,
"tokens/total": 137625600,
"tokens/train_per_sec_per_gpu": 186.39,
"tokens/trainable": 11516315
},
{
"epoch": 0.5594256846583355,
"grad_norm": 0.0707436203956604,
"learning_rate": 1.9443709418488692e-05,
"loss": 0.7908194065093994,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.2052,
"step": 526,
"tokens/total": 137887744,
"tokens/train_per_sec_per_gpu": 178.33,
"tokens/trainable": 11536975
},
{
"epoch": 0.5604892315873438,
"grad_norm": 0.06255731731653214,
"learning_rate": 1.9369560471114693e-05,
"loss": 0.7359522581100464,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.08747,
"step": 527,
"tokens/total": 138149888,
"tokens/train_per_sec_per_gpu": 191.7,
"tokens/trainable": 11558976
},
{
"epoch": 0.561552778516352,
"grad_norm": 0.06818301230669022,
"learning_rate": 1.9295420196870826e-05,
"loss": 0.759716808795929,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13767,
"step": 528,
"tokens/total": 138412032,
"tokens/train_per_sec_per_gpu": 228.46,
"tokens/trainable": 11580954
},
{
"epoch": 0.5626163254453602,
"grad_norm": 0.07047592103481293,
"learning_rate": 1.922128961572525e-05,
"loss": 0.8164031505584717,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.26235,
"step": 529,
"tokens/total": 138674176,
"tokens/train_per_sec_per_gpu": 185.46,
"tokens/trainable": 11603196
},
{
"epoch": 0.5636798723743686,
"grad_norm": 0.06535010784864426,
"learning_rate": 1.9147169747512773e-05,
"loss": 0.7326769828796387,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.08064,
"step": 530,
"tokens/total": 138936320,
"tokens/train_per_sec_per_gpu": 192.3,
"tokens/trainable": 11625136
},
{
"epoch": 0.5647434193033768,
"grad_norm": 0.20296123623847961,
"learning_rate": 1.9073061611920835e-05,
"loss": 0.8644706606864929,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.37375,
"step": 531,
"tokens/total": 139198464,
"tokens/train_per_sec_per_gpu": 168.12,
"tokens/trainable": 11649882
},
{
"epoch": 0.565806966232385,
"grad_norm": 0.07702672481536865,
"learning_rate": 1.899896622847545e-05,
"loss": 0.6990363001823425,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.01181,
"step": 532,
"tokens/total": 139460608,
"tokens/train_per_sec_per_gpu": 188.85,
"tokens/trainable": 11671814
},
{
"epoch": 0.5668705131613933,
"grad_norm": 0.06929226219654083,
"learning_rate": 1.89248846165272e-05,
"loss": 0.7941587567329407,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.21258,
"step": 533,
"tokens/total": 139722752,
"tokens/train_per_sec_per_gpu": 194.34,
"tokens/trainable": 11695013
},
{
"epoch": 0.5679340600904015,
"grad_norm": 0.07153689116239548,
"learning_rate": 1.885081779523722e-05,
"loss": 0.7691222429275513,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15787,
"step": 534,
"tokens/total": 139984896,
"tokens/train_per_sec_per_gpu": 240.08,
"tokens/trainable": 11717160
},
{
"epoch": 0.5689976070194097,
"grad_norm": 0.06492677330970764,
"learning_rate": 1.8776766783563152e-05,
"loss": 0.7174063920974731,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.04911,
"step": 535,
"tokens/total": 140247040,
"tokens/train_per_sec_per_gpu": 162.24,
"tokens/trainable": 11738101
},
{
"epoch": 0.5700611539484179,
"grad_norm": 0.06563756614923477,
"learning_rate": 1.8702732600245138e-05,
"loss": 0.7257460355758667,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.06627,
"step": 536,
"tokens/total": 140509184,
"tokens/train_per_sec_per_gpu": 173.62,
"tokens/trainable": 11759125
},
{
"epoch": 0.5711247008774262,
"grad_norm": 0.06024221330881119,
"learning_rate": 1.8628716263791837e-05,
"loss": 0.747328519821167,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11135,
"step": 537,
"tokens/total": 140771328,
"tokens/train_per_sec_per_gpu": 207.16,
"tokens/trainable": 11782122
},
{
"epoch": 0.5721882478064345,
"grad_norm": 0.06131380796432495,
"learning_rate": 1.8554718792466353e-05,
"loss": 0.7804300785064697,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.18241,
"step": 538,
"tokens/total": 141033472,
"tokens/train_per_sec_per_gpu": 198.93,
"tokens/trainable": 11805081
},
{
"epoch": 0.5732517947354427,
"grad_norm": 0.0673721581697464,
"learning_rate": 1.8480741204272268e-05,
"loss": 0.8499374389648438,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.3395,
"step": 539,
"tokens/total": 141295616,
"tokens/train_per_sec_per_gpu": 205.27,
"tokens/trainable": 11827941
},
{
"epoch": 0.574315341664451,
"grad_norm": 0.07307814806699753,
"learning_rate": 1.8406784516939628e-05,
"loss": 0.740190863609314,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09634,
"step": 540,
"tokens/total": 141557760,
"tokens/train_per_sec_per_gpu": 200.34,
"tokens/trainable": 11850776
},
{
"epoch": 0.5753788885934592,
"grad_norm": 0.06450652331113815,
"learning_rate": 1.8332849747910925e-05,
"loss": 0.7271907329559326,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.06926,
"step": 541,
"tokens/total": 141819904,
"tokens/train_per_sec_per_gpu": 199.39,
"tokens/trainable": 11872489
},
{
"epoch": 0.5764424355224674,
"grad_norm": 0.06675565242767334,
"learning_rate": 1.825893791432714e-05,
"loss": 0.7834812998771667,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.18908,
"step": 542,
"tokens/total": 142082048,
"tokens/train_per_sec_per_gpu": 220.61,
"tokens/trainable": 11894441
},
{
"epoch": 0.5775059824514757,
"grad_norm": 0.07322388142347336,
"learning_rate": 1.8185050033013715e-05,
"loss": 0.8086245656013489,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.24482,
"step": 543,
"tokens/total": 142344192,
"tokens/train_per_sec_per_gpu": 179.2,
"tokens/trainable": 11915451
},
{
"epoch": 0.5785695293804839,
"grad_norm": 0.07201948016881943,
"learning_rate": 1.811118712046657e-05,
"loss": 0.8121210336685181,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.25268,
"step": 544,
"tokens/total": 142606336,
"tokens/train_per_sec_per_gpu": 175.39,
"tokens/trainable": 11937488
},
{
"epoch": 0.5796330763094921,
"grad_norm": 0.06886903196573257,
"learning_rate": 1.8037350192838117e-05,
"loss": 0.7545644044876099,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12668,
"step": 545,
"tokens/total": 142868480,
"tokens/train_per_sec_per_gpu": 194.01,
"tokens/trainable": 11959978
},
{
"epoch": 0.5806966232385004,
"grad_norm": 0.07075604796409607,
"learning_rate": 1.7963540265923298e-05,
"loss": 0.7854512333869934,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.1934,
"step": 546,
"tokens/total": 143130624,
"tokens/train_per_sec_per_gpu": 187.1,
"tokens/trainable": 11980805
},
{
"epoch": 0.5817601701675087,
"grad_norm": 0.06993231177330017,
"learning_rate": 1.788975835514559e-05,
"loss": 0.7716733813285828,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.16338,
"step": 547,
"tokens/total": 143392768,
"tokens/train_per_sec_per_gpu": 176.49,
"tokens/trainable": 12002477
},
{
"epoch": 0.5828237170965169,
"grad_norm": 0.06735540181398392,
"learning_rate": 1.7816005475543057e-05,
"loss": 0.7268585562705994,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.06857,
"step": 548,
"tokens/total": 143654912,
"tokens/train_per_sec_per_gpu": 168.14,
"tokens/trainable": 12024394
},
{
"epoch": 0.5838872640255252,
"grad_norm": 0.06800015270709991,
"learning_rate": 1.7742282641754362e-05,
"loss": 0.7047498226165771,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.02334,
"step": 549,
"tokens/total": 143917056,
"tokens/train_per_sec_per_gpu": 173.12,
"tokens/trainable": 12045702
},
{
"epoch": 0.5849508109545334,
"grad_norm": 0.07099801301956177,
"learning_rate": 1.766859086800482e-05,
"loss": 0.7587050199508667,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13551,
"step": 550,
"tokens/total": 144179200,
"tokens/train_per_sec_per_gpu": 142.06,
"tokens/trainable": 12067322
},
{
"epoch": 0.5860143578835416,
"grad_norm": 0.06389941275119781,
"learning_rate": 1.759493116809245e-05,
"loss": 0.7534220814704895,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12426,
"step": 551,
"tokens/total": 144441344,
"tokens/train_per_sec_per_gpu": 208.95,
"tokens/trainable": 12090670
},
{
"epoch": 0.5870779048125498,
"grad_norm": 0.08602443337440491,
"learning_rate": 1.7521304555374013e-05,
"loss": 0.7858190536499023,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.1942,
"step": 552,
"tokens/total": 144703488,
"tokens/train_per_sec_per_gpu": 183.64,
"tokens/trainable": 12112909
},
{
"epoch": 0.5881414517415581,
"grad_norm": 0.06364187598228455,
"learning_rate": 1.7447712042751086e-05,
"loss": 0.8089983463287354,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.24566,
"step": 553,
"tokens/total": 144965632,
"tokens/train_per_sec_per_gpu": 221.48,
"tokens/trainable": 12136873
},
{
"epoch": 0.5892049986705663,
"grad_norm": 0.06705693900585175,
"learning_rate": 1.7374154642656133e-05,
"loss": 0.7520922422409058,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12143,
"step": 554,
"tokens/total": 145227776,
"tokens/train_per_sec_per_gpu": 211.51,
"tokens/trainable": 12158329
},
{
"epoch": 0.5902685455995745,
"grad_norm": 0.06293050199747086,
"learning_rate": 1.730063336703855e-05,
"loss": 0.833366870880127,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.30105,
"step": 555,
"tokens/total": 145489920,
"tokens/train_per_sec_per_gpu": 209.36,
"tokens/trainable": 12180630
},
{
"epoch": 0.5913320925285829,
"grad_norm": 0.07079198956489563,
"learning_rate": 1.722714922735076e-05,
"loss": 0.6932408213615417,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.00019,
"step": 556,
"tokens/total": 145752064,
"tokens/train_per_sec_per_gpu": 194.23,
"tokens/trainable": 12202488
},
{
"epoch": 0.5923956394575911,
"grad_norm": 0.07012391835451126,
"learning_rate": 1.7153703234534302e-05,
"loss": 0.7621327042579651,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.14284,
"step": 557,
"tokens/total": 146014208,
"tokens/train_per_sec_per_gpu": 216.52,
"tokens/trainable": 12226431
},
{
"epoch": 0.5934591863865993,
"grad_norm": 0.061280712485313416,
"learning_rate": 1.708029639900591e-05,
"loss": 0.7005034685134888,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.01477,
"step": 558,
"tokens/total": 146276352,
"tokens/train_per_sec_per_gpu": 192.83,
"tokens/trainable": 12247296
},
{
"epoch": 0.5945227333156076,
"grad_norm": 0.06636322289705276,
"learning_rate": 1.7006929730643635e-05,
"loss": 0.6887087821960449,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 1.99114,
"step": 559,
"tokens/total": 146538496,
"tokens/train_per_sec_per_gpu": 207.99,
"tokens/trainable": 12268797
},
{
"epoch": 0.5955862802446158,
"grad_norm": 0.07723846286535263,
"learning_rate": 1.6933604238772924e-05,
"loss": 0.6889795064926147,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 1.99168,
"step": 560,
"tokens/total": 146800640,
"tokens/train_per_sec_per_gpu": 200.15,
"tokens/trainable": 12289775
},
{
"epoch": 0.596649827173624,
"grad_norm": 0.06970304995775223,
"learning_rate": 1.6860320932152755e-05,
"loss": 0.7143691182136536,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.0429,
"step": 561,
"tokens/total": 147062784,
"tokens/train_per_sec_per_gpu": 167.54,
"tokens/trainable": 12309110
},
{
"epoch": 0.5977133741026323,
"grad_norm": 0.07083282619714737,
"learning_rate": 1.6787080818961746e-05,
"loss": 0.7616149187088013,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.14173,
"step": 562,
"tokens/total": 147324928,
"tokens/train_per_sec_per_gpu": 170.61,
"tokens/trainable": 12329610
},
{
"epoch": 0.5987769210316405,
"grad_norm": 0.06794283539056778,
"learning_rate": 1.6713884906784293e-05,
"loss": 0.7875048518180847,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19791,
"step": 563,
"tokens/total": 147587072,
"tokens/train_per_sec_per_gpu": 160.01,
"tokens/trainable": 12350916
},
{
"epoch": 0.5998404679606487,
"grad_norm": 0.07991766929626465,
"learning_rate": 1.6640734202596702e-05,
"loss": 0.7792809009552002,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.1799,
"step": 564,
"tokens/total": 147849216,
"tokens/train_per_sec_per_gpu": 155.73,
"tokens/trainable": 12370256
},
{
"epoch": 0.6009040148896571,
"grad_norm": 0.07368163019418716,
"learning_rate": 1.6567629712753363e-05,
"loss": 0.7557001113891602,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.1291,
"step": 565,
"tokens/total": 148111360,
"tokens/train_per_sec_per_gpu": 184.87,
"tokens/trainable": 12391117
},
{
"epoch": 0.6019675618186653,
"grad_norm": 0.07445425540208817,
"learning_rate": 1.6494572442972857e-05,
"loss": 0.7838120460510254,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.1898,
"step": 566,
"tokens/total": 148373504,
"tokens/train_per_sec_per_gpu": 188.91,
"tokens/trainable": 12412378
},
{
"epoch": 0.6030311087476735,
"grad_norm": 0.0670490711927414,
"learning_rate": 1.642156339832415e-05,
"loss": 0.7417568564414978,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09962,
"step": 567,
"tokens/total": 148635648,
"tokens/train_per_sec_per_gpu": 170.03,
"tokens/trainable": 12433097
},
{
"epoch": 0.6040946556766817,
"grad_norm": 0.06880825757980347,
"learning_rate": 1.634860358321277e-05,
"loss": 0.7300165891647339,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.07512,
"step": 568,
"tokens/total": 148897792,
"tokens/train_per_sec_per_gpu": 195.18,
"tokens/trainable": 12455206
},
{
"epoch": 0.60515820260569,
"grad_norm": 0.07096575200557709,
"learning_rate": 1.627569400136697e-05,
"loss": 0.8089407682418823,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.24553,
"step": 569,
"tokens/total": 149159936,
"tokens/train_per_sec_per_gpu": 191.16,
"tokens/trainable": 12478007
},
{
"epoch": 0.6062217495346982,
"grad_norm": 0.0826595202088356,
"learning_rate": 1.620283565582395e-05,
"loss": 0.7439614534378052,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10425,
"step": 570,
"tokens/total": 149422080,
"tokens/train_per_sec_per_gpu": 186.55,
"tokens/trainable": 12498799
},
{
"epoch": 0.6072852964637064,
"grad_norm": 0.08035894483327866,
"learning_rate": 1.6130029548916007e-05,
"loss": 0.7608100175857544,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.14001,
"step": 571,
"tokens/total": 149684224,
"tokens/train_per_sec_per_gpu": 177.96,
"tokens/trainable": 12518407
},
{
"epoch": 0.6083488433927147,
"grad_norm": 0.07164735347032547,
"learning_rate": 1.605727668225681e-05,
"loss": 0.8451493978500366,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.32833,
"step": 572,
"tokens/total": 149946368,
"tokens/train_per_sec_per_gpu": 219.11,
"tokens/trainable": 12541561
},
{
"epoch": 0.6094123903217229,
"grad_norm": 0.07182083278894424,
"learning_rate": 1.5984578056727564e-05,
"loss": 0.8233068585395813,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.27802,
"step": 573,
"tokens/total": 150208512,
"tokens/train_per_sec_per_gpu": 208.4,
"tokens/trainable": 12562762
},
{
"epoch": 0.6104759372507312,
"grad_norm": 0.06765095144510269,
"learning_rate": 1.591193467246327e-05,
"loss": 0.7087922096252441,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.03154,
"step": 574,
"tokens/total": 150470656,
"tokens/train_per_sec_per_gpu": 149.32,
"tokens/trainable": 12583197
},
{
"epoch": 0.6115394841797395,
"grad_norm": 0.07710757106542587,
"learning_rate": 1.5839347528838957e-05,
"loss": 0.7562744617462158,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13032,
"step": 575,
"tokens/total": 150732800,
"tokens/train_per_sec_per_gpu": 121.82,
"tokens/trainable": 12601739
},
{
"epoch": 0.6126030311087477,
"grad_norm": 0.07722500711679459,
"learning_rate": 1.5766817624455954e-05,
"loss": 0.8186255693435669,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.26738,
"step": 576,
"tokens/total": 150994944,
"tokens/train_per_sec_per_gpu": 177.23,
"tokens/trainable": 12623589
},
{
"epoch": 0.6136665780377559,
"grad_norm": 0.06252503395080566,
"learning_rate": 1.569434595712811e-05,
"loss": 0.7135177850723267,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.04116,
"step": 577,
"tokens/total": 151257088,
"tokens/train_per_sec_per_gpu": 216.51,
"tokens/trainable": 12646370
},
{
"epoch": 0.6147301249667642,
"grad_norm": 0.07124295085668564,
"learning_rate": 1.5621933523868106e-05,
"loss": 0.7426387667655945,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10147,
"step": 578,
"tokens/total": 151519232,
"tokens/train_per_sec_per_gpu": 164.29,
"tokens/trainable": 12666523
},
{
"epoch": 0.6157936718957724,
"grad_norm": 0.06870193779468536,
"learning_rate": 1.5549581320873715e-05,
"loss": 0.7106361389160156,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.03529,
"step": 579,
"tokens/total": 151781376,
"tokens/train_per_sec_per_gpu": 203.55,
"tokens/trainable": 12689727
},
{
"epoch": 0.6168572188247806,
"grad_norm": 0.06538794189691544,
"learning_rate": 1.5477290343514108e-05,
"loss": 0.7434192299842834,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10311,
"step": 580,
"tokens/total": 152043520,
"tokens/train_per_sec_per_gpu": 220.1,
"tokens/trainable": 12713675
},
{
"epoch": 0.6179207657537888,
"grad_norm": 0.08144285529851913,
"learning_rate": 1.5405061586316158e-05,
"loss": 0.7463376522064209,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10926,
"step": 581,
"tokens/total": 152305664,
"tokens/train_per_sec_per_gpu": 199.52,
"tokens/trainable": 12734720
},
{
"epoch": 0.6189843126827971,
"grad_norm": 0.0673174113035202,
"learning_rate": 1.533289604295077e-05,
"loss": 0.6784200668334961,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 1.97076,
"step": 582,
"tokens/total": 152567808,
"tokens/train_per_sec_per_gpu": 185.45,
"tokens/trainable": 12757403
},
{
"epoch": 0.6200478596118054,
"grad_norm": 0.07269315421581268,
"learning_rate": 1.5260794706219176e-05,
"loss": 0.769637942314148,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15898,
"step": 583,
"tokens/total": 152829952,
"tokens/train_per_sec_per_gpu": 198.93,
"tokens/trainable": 12778686
},
{
"epoch": 0.6211114065408136,
"grad_norm": 0.07265086472034454,
"learning_rate": 1.5188758568039318e-05,
"loss": 0.783679723739624,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.18951,
"step": 584,
"tokens/total": 153092096,
"tokens/train_per_sec_per_gpu": 172.98,
"tokens/trainable": 12799072
},
{
"epoch": 0.6221749534698219,
"grad_norm": 0.06607359647750854,
"learning_rate": 1.5116788619432177e-05,
"loss": 0.7359644770622253,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.08749,
"step": 585,
"tokens/total": 153354240,
"tokens/train_per_sec_per_gpu": 217.63,
"tokens/trainable": 12823018
},
{
"epoch": 0.6232385003988301,
"grad_norm": 0.07835246622562408,
"learning_rate": 1.5044885850508137e-05,
"loss": 0.7655049562454224,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15008,
"step": 586,
"tokens/total": 153616384,
"tokens/train_per_sec_per_gpu": 172.13,
"tokens/trainable": 12843433
},
{
"epoch": 0.6243020473278383,
"grad_norm": 0.06266583502292633,
"learning_rate": 1.4973051250453399e-05,
"loss": 0.784702718257904,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19176,
"step": 587,
"tokens/total": 153878528,
"tokens/train_per_sec_per_gpu": 181.28,
"tokens/trainable": 12866501
},
{
"epoch": 0.6253655942568466,
"grad_norm": 0.07751967012882233,
"learning_rate": 1.4901285807516326e-05,
"loss": 0.7583497762680054,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13475,
"step": 588,
"tokens/total": 154140672,
"tokens/train_per_sec_per_gpu": 186.01,
"tokens/trainable": 12887753
},
{
"epoch": 0.6264291411858548,
"grad_norm": 0.06350097805261612,
"learning_rate": 1.4829590508993859e-05,
"loss": 0.7606030702590942,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13957,
"step": 589,
"tokens/total": 154402816,
"tokens/train_per_sec_per_gpu": 173.15,
"tokens/trainable": 12910246
},
{
"epoch": 0.627492688114863,
"grad_norm": 0.0697028860449791,
"learning_rate": 1.4757966341217963e-05,
"loss": 0.7563662528991699,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13052,
"step": 590,
"tokens/total": 154664960,
"tokens/train_per_sec_per_gpu": 205.69,
"tokens/trainable": 12933119
},
{
"epoch": 0.6285562350438713,
"grad_norm": 0.07811742275953293,
"learning_rate": 1.4686414289542023e-05,
"loss": 0.8478161096572876,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.33454,
"step": 591,
"tokens/total": 154927104,
"tokens/train_per_sec_per_gpu": 200.8,
"tokens/trainable": 12953870
},
{
"epoch": 0.6296197819728796,
"grad_norm": 0.06654322147369385,
"learning_rate": 1.461493533832731e-05,
"loss": 0.7657591104507446,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15063,
"step": 592,
"tokens/total": 155189248,
"tokens/train_per_sec_per_gpu": 193.4,
"tokens/trainable": 12976578
},
{
"epoch": 0.6306833289018878,
"grad_norm": 0.07657228410243988,
"learning_rate": 1.454353047092944e-05,
"loss": 0.839828372001648,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.31597,
"step": 593,
"tokens/total": 155451392,
"tokens/train_per_sec_per_gpu": 228.66,
"tokens/trainable": 13000674
},
{
"epoch": 0.6317468758308961,
"grad_norm": 0.07358460128307343,
"learning_rate": 1.4472200669684821e-05,
"loss": 0.6963605284690857,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.00644,
"step": 594,
"tokens/total": 155713536,
"tokens/train_per_sec_per_gpu": 199.12,
"tokens/trainable": 13021140
},
{
"epoch": 0.6328104227599043,
"grad_norm": 0.07676289230585098,
"learning_rate": 1.4400946915897168e-05,
"loss": 0.8120332956314087,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.25248,
"step": 595,
"tokens/total": 155975680,
"tokens/train_per_sec_per_gpu": 172.52,
"tokens/trainable": 13041462
},
{
"epoch": 0.6338739696889125,
"grad_norm": 0.07129766792058945,
"learning_rate": 1.4329770189823982e-05,
"loss": 0.7258169054985046,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.06642,
"step": 596,
"tokens/total": 156237824,
"tokens/train_per_sec_per_gpu": 199.58,
"tokens/trainable": 13062706
},
{
"epoch": 0.6349375166179207,
"grad_norm": 0.0697442814707756,
"learning_rate": 1.4258671470663075e-05,
"loss": 0.7802278995513916,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.18197,
"step": 597,
"tokens/total": 156499968,
"tokens/train_per_sec_per_gpu": 205.89,
"tokens/trainable": 13087819
},
{
"epoch": 0.636001063546929,
"grad_norm": 0.07785540819168091,
"learning_rate": 1.4187651736539092e-05,
"loss": 0.756123423576355,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13,
"step": 598,
"tokens/total": 156762112,
"tokens/train_per_sec_per_gpu": 196.25,
"tokens/trainable": 13109448
},
{
"epoch": 0.6370646104759372,
"grad_norm": 0.07135733217000961,
"learning_rate": 1.4116711964490076e-05,
"loss": 0.760696530342102,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13977,
"step": 599,
"tokens/total": 157024256,
"tokens/train_per_sec_per_gpu": 169.38,
"tokens/trainable": 13130856
},
{
"epoch": 0.6381281574049454,
"grad_norm": 0.07068773359060287,
"learning_rate": 1.404585313045399e-05,
"loss": 0.7768102884292603,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.17453,
"step": 600,
"tokens/total": 157286400,
"tokens/train_per_sec_per_gpu": 200.62,
"tokens/trainable": 13154412
},
{
"epoch": 0.6381281574049454,
"eval_loss": 0.7701326012611389,
"eval_ppl": 2.16005,
"eval_runtime": 237.2245,
"eval_samples_per_second": 28.184,
"eval_steps_per_second": 1.762,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 38.19,
"memory/max_allocated (GiB)": 38.19,
"step": 600
},
{
"epoch": 0.6391917043339538,
"grad_norm": 0.06947837024927139,
"learning_rate": 1.3975076209255321e-05,
"loss": 0.7434956431388855,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10327,
"step": 601,
"tokens/total": 157548544,
"tokens/train_per_sec_per_gpu": 185.08,
"tokens/trainable": 13176877
},
{
"epoch": 0.640255251262962,
"grad_norm": 0.08173263818025589,
"learning_rate": 1.3904382174591654e-05,
"loss": 0.7505627870559692,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11819,
"step": 602,
"tokens/total": 157810688,
"tokens/train_per_sec_per_gpu": 157.36,
"tokens/trainable": 13197398
},
{
"epoch": 0.6413187981919702,
"grad_norm": 0.07958182692527771,
"learning_rate": 1.3833771999020274e-05,
"loss": 0.7926914095878601,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.20933,
"step": 603,
"tokens/total": 158072832,
"tokens/train_per_sec_per_gpu": 143.13,
"tokens/trainable": 13219479
},
{
"epoch": 0.6423823451209785,
"grad_norm": 0.08310385793447495,
"learning_rate": 1.3763246653944824e-05,
"loss": 0.7650701999664307,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.14915,
"step": 604,
"tokens/total": 158334976,
"tokens/train_per_sec_per_gpu": 156.87,
"tokens/trainable": 13239934
},
{
"epoch": 0.6434458920499867,
"grad_norm": 0.07890634983778,
"learning_rate": 1.3692807109601875e-05,
"loss": 0.7053734064102173,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.0246,
"step": 605,
"tokens/total": 158597120,
"tokens/train_per_sec_per_gpu": 183.26,
"tokens/trainable": 13259732
},
{
"epoch": 0.6445094389789949,
"grad_norm": 0.06404001265764236,
"learning_rate": 1.3622454335047631e-05,
"loss": 0.737473726272583,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09065,
"step": 606,
"tokens/total": 158859264,
"tokens/train_per_sec_per_gpu": 235.57,
"tokens/trainable": 13284062
},
{
"epoch": 0.6455729859080032,
"grad_norm": 0.07135722041130066,
"learning_rate": 1.3552189298144573e-05,
"loss": 0.7033129334449768,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.02044,
"step": 607,
"tokens/total": 159121408,
"tokens/train_per_sec_per_gpu": 176.01,
"tokens/trainable": 13306868
},
{
"epoch": 0.6466365328370114,
"grad_norm": 0.0779723972082138,
"learning_rate": 1.3482012965548161e-05,
"loss": 0.8055770993232727,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.23799,
"step": 608,
"tokens/total": 159383552,
"tokens/train_per_sec_per_gpu": 189.42,
"tokens/trainable": 13327922
},
{
"epoch": 0.6477000797660196,
"grad_norm": 0.08110717684030533,
"learning_rate": 1.341192630269351e-05,
"loss": 0.817200779914856,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.26415,
"step": 609,
"tokens/total": 159645696,
"tokens/train_per_sec_per_gpu": 170.71,
"tokens/trainable": 13347786
},
{
"epoch": 0.648763626695028,
"grad_norm": 0.06666215509176254,
"learning_rate": 1.3341930273782144e-05,
"loss": 0.704579770565033,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.023,
"step": 610,
"tokens/total": 159907840,
"tokens/train_per_sec_per_gpu": 200.63,
"tokens/trainable": 13371502
},
{
"epoch": 0.6498271736240362,
"grad_norm": 0.06857079267501831,
"learning_rate": 1.3272025841768693e-05,
"loss": 0.8532009720802307,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.34715,
"step": 611,
"tokens/total": 160169984,
"tokens/train_per_sec_per_gpu": 191.0,
"tokens/trainable": 13393637
},
{
"epoch": 0.6508907205530444,
"grad_norm": 0.0732354000210762,
"learning_rate": 1.320221396834767e-05,
"loss": 0.7995498180389404,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.22454,
"step": 612,
"tokens/total": 160432128,
"tokens/train_per_sec_per_gpu": 189.51,
"tokens/trainable": 13415692
},
{
"epoch": 0.6519542674820527,
"grad_norm": 0.06430143862962723,
"learning_rate": 1.3132495613940237e-05,
"loss": 0.76482093334198,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.14861,
"step": 613,
"tokens/total": 160694272,
"tokens/train_per_sec_per_gpu": 207.14,
"tokens/trainable": 13438366
},
{
"epoch": 0.6530178144110609,
"grad_norm": 0.06737808883190155,
"learning_rate": 1.3062871737680976e-05,
"loss": 0.7430492043495178,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10234,
"step": 614,
"tokens/total": 160956416,
"tokens/train_per_sec_per_gpu": 188.74,
"tokens/trainable": 13458353
},
{
"epoch": 0.6540813613400691,
"grad_norm": 0.0812523365020752,
"learning_rate": 1.2993343297404732e-05,
"loss": 0.8373923897743225,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.31033,
"step": 615,
"tokens/total": 161218560,
"tokens/train_per_sec_per_gpu": 198.52,
"tokens/trainable": 13479819
},
{
"epoch": 0.6551449082690773,
"grad_norm": 0.07273576408624649,
"learning_rate": 1.2923911249633391e-05,
"loss": 0.753447413444519,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12431,
"step": 616,
"tokens/total": 161480704,
"tokens/train_per_sec_per_gpu": 187.46,
"tokens/trainable": 13500295
},
{
"epoch": 0.6562084551980856,
"grad_norm": 0.07487501949071884,
"learning_rate": 1.2854576549562743e-05,
"loss": 0.7411309480667114,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09831,
"step": 617,
"tokens/total": 161742848,
"tokens/train_per_sec_per_gpu": 158.41,
"tokens/trainable": 13518908
},
{
"epoch": 0.6572720021270939,
"grad_norm": 0.060535646975040436,
"learning_rate": 1.2785340151049348e-05,
"loss": 0.6992688179016113,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.01228,
"step": 618,
"tokens/total": 162004992,
"tokens/train_per_sec_per_gpu": 195.11,
"tokens/trainable": 13542123
},
{
"epoch": 0.6583355490561021,
"grad_norm": 0.07925450801849365,
"learning_rate": 1.2716203006597389e-05,
"loss": 0.7158269882202148,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.04588,
"step": 619,
"tokens/total": 162267136,
"tokens/train_per_sec_per_gpu": 154.73,
"tokens/trainable": 13560742
},
{
"epoch": 0.6593990959851104,
"grad_norm": 0.07715223729610443,
"learning_rate": 1.2647166067345598e-05,
"loss": 0.7587981224060059,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13571,
"step": 620,
"tokens/total": 162529280,
"tokens/train_per_sec_per_gpu": 190.53,
"tokens/trainable": 13581062
},
{
"epoch": 0.6604626429141186,
"grad_norm": 0.0761185884475708,
"learning_rate": 1.2578230283054153e-05,
"loss": 0.7682688236236572,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15603,
"step": 621,
"tokens/total": 162791424,
"tokens/train_per_sec_per_gpu": 193.3,
"tokens/trainable": 13602325
},
{
"epoch": 0.6615261898431268,
"grad_norm": 0.0685892179608345,
"learning_rate": 1.2509396602091612e-05,
"loss": 0.7407702207565308,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09755,
"step": 622,
"tokens/total": 163053568,
"tokens/train_per_sec_per_gpu": 178.64,
"tokens/trainable": 13624055
},
{
"epoch": 0.6625897367721351,
"grad_norm": 0.07496423274278641,
"learning_rate": 1.2440665971421872e-05,
"loss": 0.7656638622283936,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15042,
"step": 623,
"tokens/total": 163315712,
"tokens/train_per_sec_per_gpu": 182.21,
"tokens/trainable": 13643250
},
{
"epoch": 0.6636532837011433,
"grad_norm": 0.07262540608644485,
"learning_rate": 1.2372039336591137e-05,
"loss": 0.801539421081543,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.22897,
"step": 624,
"tokens/total": 163577856,
"tokens/train_per_sec_per_gpu": 196.93,
"tokens/trainable": 13664253
},
{
"epoch": 0.6647168306301515,
"grad_norm": 0.07125604152679443,
"learning_rate": 1.230351764171491e-05,
"loss": 0.7554821968078613,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12864,
"step": 625,
"tokens/total": 163840000,
"tokens/train_per_sec_per_gpu": 174.72,
"tokens/trainable": 13685345
},
{
"epoch": 0.6657803775591598,
"grad_norm": 0.071174755692482,
"learning_rate": 1.2235101829465003e-05,
"loss": 0.8233163952827454,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.27804,
"step": 626,
"tokens/total": 164102144,
"tokens/train_per_sec_per_gpu": 235.38,
"tokens/trainable": 13708541
},
{
"epoch": 0.6668439244881681,
"grad_norm": 0.07615090906620026,
"learning_rate": 1.2166792841056596e-05,
"loss": 0.7587050199508667,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13551,
"step": 627,
"tokens/total": 164364288,
"tokens/train_per_sec_per_gpu": 220.58,
"tokens/trainable": 13730305
},
{
"epoch": 0.6679074714171763,
"grad_norm": 0.08138352632522583,
"learning_rate": 1.2098591616235231e-05,
"loss": 0.7529855370521545,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12333,
"step": 628,
"tokens/total": 164626432,
"tokens/train_per_sec_per_gpu": 193.64,
"tokens/trainable": 13752153
},
{
"epoch": 0.6689710183461846,
"grad_norm": 0.07478975504636765,
"learning_rate": 1.2030499093263938e-05,
"loss": 0.7286227941513062,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.07222,
"step": 629,
"tokens/total": 164888576,
"tokens/train_per_sec_per_gpu": 193.85,
"tokens/trainable": 13774483
},
{
"epoch": 0.6700345652751928,
"grad_norm": 0.07285770773887634,
"learning_rate": 1.1962516208910295e-05,
"loss": 0.733648955821991,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.08267,
"step": 630,
"tokens/total": 165150720,
"tokens/train_per_sec_per_gpu": 184.41,
"tokens/trainable": 13795589
},
{
"epoch": 0.671098112204201,
"grad_norm": 0.0695388987660408,
"learning_rate": 1.1894643898433541e-05,
"loss": 0.7216265201568604,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.05778,
"step": 631,
"tokens/total": 165412864,
"tokens/train_per_sec_per_gpu": 204.39,
"tokens/trainable": 13817860
},
{
"epoch": 0.6721616591332092,
"grad_norm": 0.0669473186135292,
"learning_rate": 1.1826883095571758e-05,
"loss": 0.8506951332092285,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.34127,
"step": 632,
"tokens/total": 165675008,
"tokens/train_per_sec_per_gpu": 251.79,
"tokens/trainable": 13842305
},
{
"epoch": 0.6732252060622175,
"grad_norm": 0.07681821286678314,
"learning_rate": 1.1759234732528952e-05,
"loss": 0.7686042785644531,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15675,
"step": 633,
"tokens/total": 165937152,
"tokens/train_per_sec_per_gpu": 177.33,
"tokens/trainable": 13861902
},
{
"epoch": 0.6742887529912257,
"grad_norm": 0.06518401950597763,
"learning_rate": 1.1691699739962275e-05,
"loss": 0.8041465878486633,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.23479,
"step": 634,
"tokens/total": 166199296,
"tokens/train_per_sec_per_gpu": 207.88,
"tokens/trainable": 13884726
},
{
"epoch": 0.6753522999202339,
"grad_norm": 0.06772468984127045,
"learning_rate": 1.1624279046969208e-05,
"loss": 0.7493684887886047,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11566,
"step": 635,
"tokens/total": 166461440,
"tokens/train_per_sec_per_gpu": 221.3,
"tokens/trainable": 13908358
},
{
"epoch": 0.6764158468492423,
"grad_norm": 0.07777251303195953,
"learning_rate": 1.1556973581074784e-05,
"loss": 0.7796363830566406,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.18068,
"step": 636,
"tokens/total": 166723584,
"tokens/train_per_sec_per_gpu": 158.72,
"tokens/trainable": 13930851
},
{
"epoch": 0.6774793937782505,
"grad_norm": 0.07164981216192245,
"learning_rate": 1.1489784268218811e-05,
"loss": 0.7849699258804321,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19234,
"step": 637,
"tokens/total": 166985728,
"tokens/train_per_sec_per_gpu": 226.45,
"tokens/trainable": 13952042
},
{
"epoch": 0.6785429407072587,
"grad_norm": 0.07446952164173126,
"learning_rate": 1.1422712032743186e-05,
"loss": 0.7363911867141724,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.08839,
"step": 638,
"tokens/total": 167247872,
"tokens/train_per_sec_per_gpu": 176.2,
"tokens/trainable": 13972004
},
{
"epoch": 0.679606487636267,
"grad_norm": 0.07111480087041855,
"learning_rate": 1.1355757797379093e-05,
"loss": 0.7901080846786499,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.20363,
"step": 639,
"tokens/total": 167510016,
"tokens/train_per_sec_per_gpu": 204.49,
"tokens/trainable": 13994787
},
{
"epoch": 0.6806700345652752,
"grad_norm": 0.07409324496984482,
"learning_rate": 1.1288922483234395e-05,
"loss": 0.7688755393028259,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15734,
"step": 640,
"tokens/total": 167772160,
"tokens/train_per_sec_per_gpu": 183.2,
"tokens/trainable": 14016072
},
{
"epoch": 0.6817335814942834,
"grad_norm": 0.06080978736281395,
"learning_rate": 1.1222207009780888e-05,
"loss": 0.731904923915863,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.07904,
"step": 641,
"tokens/total": 168034304,
"tokens/train_per_sec_per_gpu": 193.71,
"tokens/trainable": 14039858
},
{
"epoch": 0.6827971284232917,
"grad_norm": 0.07461950182914734,
"learning_rate": 1.1155612294841713e-05,
"loss": 0.7744308710098267,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.16936,
"step": 642,
"tokens/total": 168296448,
"tokens/train_per_sec_per_gpu": 190.26,
"tokens/trainable": 14062445
},
{
"epoch": 0.6838606753522999,
"grad_norm": 0.06637416779994965,
"learning_rate": 1.10891392545787e-05,
"loss": 0.6965582370758057,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.00683,
"step": 643,
"tokens/total": 168558592,
"tokens/train_per_sec_per_gpu": 202.21,
"tokens/trainable": 14085386
},
{
"epoch": 0.6849242222813081,
"grad_norm": 0.08194036036729813,
"learning_rate": 1.1022788803479747e-05,
"loss": 0.7541095018386841,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12572,
"step": 644,
"tokens/total": 168820736,
"tokens/train_per_sec_per_gpu": 200.39,
"tokens/trainable": 14107399
},
{
"epoch": 0.6859877692103165,
"grad_norm": 0.07168550044298172,
"learning_rate": 1.095656185434629e-05,
"loss": 0.7705248594284058,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.1609,
"step": 645,
"tokens/total": 169082880,
"tokens/train_per_sec_per_gpu": 179.15,
"tokens/trainable": 14129284
},
{
"epoch": 0.6870513161393247,
"grad_norm": 0.0729442834854126,
"learning_rate": 1.0890459318280681e-05,
"loss": 0.7964296340942383,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.21761,
"step": 646,
"tokens/total": 169345024,
"tokens/train_per_sec_per_gpu": 210.01,
"tokens/trainable": 14151413
},
{
"epoch": 0.6881148630683329,
"grad_norm": 0.07156070321798325,
"learning_rate": 1.0824482104673723e-05,
"loss": 0.7291166186332703,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.07325,
"step": 647,
"tokens/total": 169607168,
"tokens/train_per_sec_per_gpu": 187.04,
"tokens/trainable": 14172388
},
{
"epoch": 0.6891784099973411,
"grad_norm": 0.07664386928081512,
"learning_rate": 1.0758631121192075e-05,
"loss": 0.847728967666626,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.33434,
"step": 648,
"tokens/total": 169869312,
"tokens/train_per_sec_per_gpu": 183.41,
"tokens/trainable": 14194504
},
{
"epoch": 0.6902419569263494,
"grad_norm": 0.06855987012386322,
"learning_rate": 1.0692907273765878e-05,
"loss": 0.809911847114563,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.24771,
"step": 649,
"tokens/total": 170131456,
"tokens/train_per_sec_per_gpu": 175.96,
"tokens/trainable": 14217186
},
{
"epoch": 0.6913055038553576,
"grad_norm": 0.069715715944767,
"learning_rate": 1.0627311466576167e-05,
"loss": 0.7290323972702026,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.07307,
"step": 650,
"tokens/total": 170393600,
"tokens/train_per_sec_per_gpu": 226.27,
"tokens/trainable": 14240939
},
{
"epoch": 0.6923690507843658,
"grad_norm": 0.06665818393230438,
"learning_rate": 1.0561844602042535e-05,
"loss": 0.696631908416748,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.00698,
"step": 651,
"tokens/total": 170655744,
"tokens/train_per_sec_per_gpu": 165.96,
"tokens/trainable": 14262378
},
{
"epoch": 0.6934325977133741,
"grad_norm": 0.06637638807296753,
"learning_rate": 1.0496507580810637e-05,
"loss": 0.7380032539367676,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09175,
"step": 652,
"tokens/total": 170917888,
"tokens/train_per_sec_per_gpu": 210.8,
"tokens/trainable": 14284893
},
{
"epoch": 0.6944961446423823,
"grad_norm": 0.07084432989358902,
"learning_rate": 1.0431301301739882e-05,
"loss": 0.8238184452056885,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.27919,
"step": 653,
"tokens/total": 171180032,
"tokens/train_per_sec_per_gpu": 161.41,
"tokens/trainable": 14305618
},
{
"epoch": 0.6955596915713906,
"grad_norm": 0.07500351220369339,
"learning_rate": 1.0366226661890976e-05,
"loss": 0.8070303201675415,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.24124,
"step": 654,
"tokens/total": 171442176,
"tokens/train_per_sec_per_gpu": 198.07,
"tokens/trainable": 14327779
},
{
"epoch": 0.6966232385003989,
"grad_norm": 0.06748203933238983,
"learning_rate": 1.0301284556513669e-05,
"loss": 0.7480711340904236,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11292,
"step": 655,
"tokens/total": 171704320,
"tokens/train_per_sec_per_gpu": 198.9,
"tokens/trainable": 14350224
},
{
"epoch": 0.6976867854294071,
"grad_norm": 0.07086233794689178,
"learning_rate": 1.023647587903438e-05,
"loss": 0.7530485391616821,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12346,
"step": 656,
"tokens/total": 171966464,
"tokens/train_per_sec_per_gpu": 186.03,
"tokens/trainable": 14370865
},
{
"epoch": 0.6987503323584153,
"grad_norm": 0.07139495760202408,
"learning_rate": 1.017180152104391e-05,
"loss": 0.7517107725143433,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12062,
"step": 657,
"tokens/total": 172228608,
"tokens/train_per_sec_per_gpu": 208.93,
"tokens/trainable": 14393067
},
{
"epoch": 0.6998138792874236,
"grad_norm": 0.07312899082899094,
"learning_rate": 1.0107262372285224e-05,
"loss": 0.737115740776062,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.0899,
"step": 658,
"tokens/total": 172490752,
"tokens/train_per_sec_per_gpu": 211.06,
"tokens/trainable": 14414022
},
{
"epoch": 0.7008774262164318,
"grad_norm": 0.06322194635868073,
"learning_rate": 1.004285932064113e-05,
"loss": 0.7515996098518372,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12039,
"step": 659,
"tokens/total": 172752896,
"tokens/train_per_sec_per_gpu": 264.01,
"tokens/trainable": 14441310
},
{
"epoch": 0.70194097314544,
"grad_norm": 0.06637128442525864,
"learning_rate": 9.978593252122168e-06,
"loss": 0.7236040830612183,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.06185,
"step": 660,
"tokens/total": 173015040,
"tokens/train_per_sec_per_gpu": 205.79,
"tokens/trainable": 14462654
},
{
"epoch": 0.7030045200744482,
"grad_norm": 0.07252184301614761,
"learning_rate": 9.914465050854312e-06,
"loss": 0.7753311395645142,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.17131,
"step": 661,
"tokens/total": 173277184,
"tokens/train_per_sec_per_gpu": 186.41,
"tokens/trainable": 14484643
},
{
"epoch": 0.7040680670034565,
"grad_norm": 0.06677009165287018,
"learning_rate": 9.85047559906689e-06,
"loss": 0.7773078680038452,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.17561,
"step": 662,
"tokens/total": 173539328,
"tokens/train_per_sec_per_gpu": 200.95,
"tokens/trainable": 14507742
},
{
"epoch": 0.7051316139324648,
"grad_norm": 0.06489443778991699,
"learning_rate": 9.78662577708039e-06,
"loss": 0.8098626136779785,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.2476,
"step": 663,
"tokens/total": 173801472,
"tokens/train_per_sec_per_gpu": 209.29,
"tokens/trainable": 14531656
},
{
"epoch": 0.706195160861473,
"grad_norm": 0.0692375972867012,
"learning_rate": 9.722916463294405e-06,
"loss": 0.7761327624320984,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.17305,
"step": 664,
"tokens/total": 174063616,
"tokens/train_per_sec_per_gpu": 180.0,
"tokens/trainable": 14553153
},
{
"epoch": 0.7072587077904813,
"grad_norm": 0.07724090665578842,
"learning_rate": 9.659348534175484e-06,
"loss": 0.7831587195396423,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.18837,
"step": 665,
"tokens/total": 174325760,
"tokens/train_per_sec_per_gpu": 183.64,
"tokens/trainable": 14575227
},
{
"epoch": 0.7083222547194895,
"grad_norm": 0.07470756769180298,
"learning_rate": 9.595922864245135e-06,
"loss": 0.799434244632721,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.22428,
"step": 666,
"tokens/total": 174587904,
"tokens/train_per_sec_per_gpu": 224.55,
"tokens/trainable": 14598217
},
{
"epoch": 0.7093858016484977,
"grad_norm": 0.07464490085840225,
"learning_rate": 9.532640326067764e-06,
"loss": 0.7516946792602539,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12059,
"step": 667,
"tokens/total": 174850048,
"tokens/train_per_sec_per_gpu": 200.11,
"tokens/trainable": 14620170
},
{
"epoch": 0.710449348577506,
"grad_norm": 0.06981492787599564,
"learning_rate": 9.469501790238654e-06,
"loss": 0.7612972259521484,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.14105,
"step": 668,
"tokens/total": 175112192,
"tokens/train_per_sec_per_gpu": 176.73,
"tokens/trainable": 14641941
},
{
"epoch": 0.7115128955065142,
"grad_norm": 0.06430601328611374,
"learning_rate": 9.406508125372034e-06,
"loss": 0.7262794971466064,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.06737,
"step": 669,
"tokens/total": 175374336,
"tokens/train_per_sec_per_gpu": 186.94,
"tokens/trainable": 14664035
},
{
"epoch": 0.7125764424355224,
"grad_norm": 0.06467308104038239,
"learning_rate": 9.343660198089072e-06,
"loss": 0.751503586769104,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12019,
"step": 670,
"tokens/total": 175636480,
"tokens/train_per_sec_per_gpu": 171.75,
"tokens/trainable": 14684884
},
{
"epoch": 0.7136399893645307,
"grad_norm": 0.07673154026269913,
"learning_rate": 9.280958873006032e-06,
"loss": 0.771912693977356,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.1639,
"step": 671,
"tokens/total": 175898624,
"tokens/train_per_sec_per_gpu": 185.49,
"tokens/trainable": 14705843
},
{
"epoch": 0.714703536293539,
"grad_norm": 0.06743910163640976,
"learning_rate": 9.21840501272228e-06,
"loss": 0.7066195011138916,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.02713,
"step": 672,
"tokens/total": 176160768,
"tokens/train_per_sec_per_gpu": 202.95,
"tokens/trainable": 14728558
},
{
"epoch": 0.7157670832225472,
"grad_norm": 0.07574694603681564,
"learning_rate": 9.155999477808503e-06,
"loss": 0.7232800126075745,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.06118,
"step": 673,
"tokens/total": 176422912,
"tokens/train_per_sec_per_gpu": 203.45,
"tokens/trainable": 14748880
},
{
"epoch": 0.7168306301515555,
"grad_norm": 0.06880473345518112,
"learning_rate": 9.093743126794818e-06,
"loss": 0.7625008225440979,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.14363,
"step": 674,
"tokens/total": 176685056,
"tokens/train_per_sec_per_gpu": 202.57,
"tokens/trainable": 14770901
},
{
"epoch": 0.7178941770805637,
"grad_norm": 0.07242193818092346,
"learning_rate": 9.031636816158974e-06,
"loss": 0.7549704909324646,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12755,
"step": 675,
"tokens/total": 176947200,
"tokens/train_per_sec_per_gpu": 170.41,
"tokens/trainable": 14791194
},
{
"epoch": 0.7189577240095719,
"grad_norm": 0.07365458458662033,
"learning_rate": 8.969681400314589e-06,
"loss": 0.8114491701126099,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.25117,
"step": 676,
"tokens/total": 177209344,
"tokens/train_per_sec_per_gpu": 170.56,
"tokens/trainable": 14811680
},
{
"epoch": 0.7200212709385801,
"grad_norm": 0.08069625496864319,
"learning_rate": 8.907877731599372e-06,
"loss": 0.7868974208831787,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19657,
"step": 677,
"tokens/total": 177471488,
"tokens/train_per_sec_per_gpu": 214.41,
"tokens/trainable": 14831836
},
{
"epoch": 0.7210848178675884,
"grad_norm": 0.07136721163988113,
"learning_rate": 8.846226660263415e-06,
"loss": 0.7520813345909119,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12141,
"step": 678,
"tokens/total": 177733632,
"tokens/train_per_sec_per_gpu": 165.9,
"tokens/trainable": 14852202
},
{
"epoch": 0.7221483647965966,
"grad_norm": 0.06927739828824997,
"learning_rate": 8.78472903445746e-06,
"loss": 0.7342413067817688,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.0839,
"step": 679,
"tokens/total": 177995776,
"tokens/train_per_sec_per_gpu": 199.05,
"tokens/trainable": 14873216
},
{
"epoch": 0.7232119117256048,
"grad_norm": 0.06386187672615051,
"learning_rate": 8.723385700221288e-06,
"loss": 0.7768787145614624,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.17467,
"step": 680,
"tokens/total": 178257920,
"tokens/train_per_sec_per_gpu": 263.1,
"tokens/trainable": 14895853
},
{
"epoch": 0.7242754586546132,
"grad_norm": 0.08089049160480499,
"learning_rate": 8.662197501472016e-06,
"loss": 0.7622289061546326,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.14305,
"step": 681,
"tokens/total": 178520064,
"tokens/train_per_sec_per_gpu": 179.05,
"tokens/trainable": 14917187
},
{
"epoch": 0.7253390055836214,
"grad_norm": 0.07812239229679108,
"learning_rate": 8.601165279992549e-06,
"loss": 0.7714396715164185,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.16288,
"step": 682,
"tokens/total": 178782208,
"tokens/train_per_sec_per_gpu": 157.1,
"tokens/trainable": 14936575
},
{
"epoch": 0.7264025525126296,
"grad_norm": 0.07455753535032272,
"learning_rate": 8.540289875419962e-06,
"loss": 0.763314962387085,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.14538,
"step": 683,
"tokens/total": 179044352,
"tokens/train_per_sec_per_gpu": 199.99,
"tokens/trainable": 14959566
},
{
"epoch": 0.7274660994416379,
"grad_norm": 0.06646628677845001,
"learning_rate": 8.47957212523394e-06,
"loss": 0.7563636302947998,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13051,
"step": 684,
"tokens/total": 179306496,
"tokens/train_per_sec_per_gpu": 202.47,
"tokens/trainable": 14981954
},
{
"epoch": 0.7285296463706461,
"grad_norm": 0.08660077303647995,
"learning_rate": 8.419012864745297e-06,
"loss": 0.7826834917068481,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.18733,
"step": 685,
"tokens/total": 179568640,
"tokens/train_per_sec_per_gpu": 169.37,
"tokens/trainable": 15001703
},
{
"epoch": 0.7295931932996543,
"grad_norm": 0.07473118603229523,
"learning_rate": 8.358612927084435e-06,
"loss": 0.8132219910621643,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.25516,
"step": 686,
"tokens/total": 179830784,
"tokens/train_per_sec_per_gpu": 176.67,
"tokens/trainable": 15024105
},
{
"epoch": 0.7306567402286626,
"grad_norm": 0.06741170585155487,
"learning_rate": 8.29837314318993e-06,
"loss": 0.6950328946113586,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.00377,
"step": 687,
"tokens/total": 180092928,
"tokens/train_per_sec_per_gpu": 190.47,
"tokens/trainable": 15045856
},
{
"epoch": 0.7317202871576708,
"grad_norm": 0.07606185227632523,
"learning_rate": 8.23829434179707e-06,
"loss": 0.8332937955856323,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.30088,
"step": 688,
"tokens/total": 180355072,
"tokens/train_per_sec_per_gpu": 194.57,
"tokens/trainable": 15067803
},
{
"epoch": 0.732783834086679,
"grad_norm": 0.07419081032276154,
"learning_rate": 8.178377349426471e-06,
"loss": 0.81829434633255,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.26663,
"step": 689,
"tokens/total": 180617216,
"tokens/train_per_sec_per_gpu": 179.35,
"tokens/trainable": 15090573
},
{
"epoch": 0.7338473810156874,
"grad_norm": 0.0726039931178093,
"learning_rate": 8.118622990372676e-06,
"loss": 0.8616635203361511,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.3671,
"step": 690,
"tokens/total": 180879360,
"tokens/train_per_sec_per_gpu": 200.44,
"tokens/trainable": 15112666
},
{
"epoch": 0.7349109279446956,
"grad_norm": 0.07402276247739792,
"learning_rate": 8.059032086692864e-06,
"loss": 0.7540690302848816,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12563,
"step": 691,
"tokens/total": 181141504,
"tokens/train_per_sec_per_gpu": 168.0,
"tokens/trainable": 15133098
},
{
"epoch": 0.7359744748737038,
"grad_norm": 0.07309621572494507,
"learning_rate": 7.999605458195486e-06,
"loss": 0.7695388793945312,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15877,
"step": 692,
"tokens/total": 181403648,
"tokens/train_per_sec_per_gpu": 181.09,
"tokens/trainable": 15153899
},
{
"epoch": 0.737038021802712,
"grad_norm": 0.06912367045879364,
"learning_rate": 7.94034392242903e-06,
"loss": 0.7535648345947266,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12456,
"step": 693,
"tokens/total": 181665792,
"tokens/train_per_sec_per_gpu": 183.77,
"tokens/trainable": 15176330
},
{
"epoch": 0.7381015687317203,
"grad_norm": 0.07674102485179901,
"learning_rate": 7.881248294670771e-06,
"loss": 0.7670722007751465,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15345,
"step": 694,
"tokens/total": 181927936,
"tokens/train_per_sec_per_gpu": 172.46,
"tokens/trainable": 15198450
},
{
"epoch": 0.7391651156607285,
"grad_norm": 0.07187400758266449,
"learning_rate": 7.82231938791551e-06,
"loss": 0.7742888331413269,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.16905,
"step": 695,
"tokens/total": 182190080,
"tokens/train_per_sec_per_gpu": 209.81,
"tokens/trainable": 15222309
},
{
"epoch": 0.7402286625897367,
"grad_norm": 0.07647348195314407,
"learning_rate": 7.763558012864446e-06,
"loss": 0.8115613460540771,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.25142,
"step": 696,
"tokens/total": 182452224,
"tokens/train_per_sec_per_gpu": 160.85,
"tokens/trainable": 15245790
},
{
"epoch": 0.741292209518745,
"grad_norm": 0.07111165672540665,
"learning_rate": 7.704964977913984e-06,
"loss": 0.8241356611251831,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.27991,
"step": 697,
"tokens/total": 182714368,
"tokens/train_per_sec_per_gpu": 222.33,
"tokens/trainable": 15268790
},
{
"epoch": 0.7423557564477533,
"grad_norm": 0.07145415991544724,
"learning_rate": 7.646541089144638e-06,
"loss": 0.7397055625915527,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09532,
"step": 698,
"tokens/total": 182976512,
"tokens/train_per_sec_per_gpu": 197.27,
"tokens/trainable": 15289822
},
{
"epoch": 0.7434193033767615,
"grad_norm": 0.08257947117090225,
"learning_rate": 7.588287150309928e-06,
"loss": 0.7899596691131592,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.20331,
"step": 699,
"tokens/total": 183238656,
"tokens/train_per_sec_per_gpu": 172.65,
"tokens/trainable": 15310952
},
{
"epoch": 0.7444828503057698,
"grad_norm": 0.07342710345983505,
"learning_rate": 7.530203962825331e-06,
"loss": 0.7988142371177673,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.2229,
"step": 700,
"tokens/total": 183500800,
"tokens/train_per_sec_per_gpu": 202.61,
"tokens/trainable": 15335255
},
{
"epoch": 0.745546397234778,
"grad_norm": 0.07142513245344162,
"learning_rate": 7.4722923257572335e-06,
"loss": 0.8055736422538757,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.23798,
"step": 701,
"tokens/total": 183762944,
"tokens/train_per_sec_per_gpu": 234.19,
"tokens/trainable": 15359283
},
{
"epoch": 0.7466099441637862,
"grad_norm": 0.0752025619149208,
"learning_rate": 7.414553035811978e-06,
"loss": 0.7718970775604248,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.16387,
"step": 702,
"tokens/total": 184025088,
"tokens/train_per_sec_per_gpu": 222.69,
"tokens/trainable": 15380449
},
{
"epoch": 0.7476734910927945,
"grad_norm": 0.07547293603420258,
"learning_rate": 7.35698688732486e-06,
"loss": 0.7342941761016846,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.08401,
"step": 703,
"tokens/total": 184287232,
"tokens/train_per_sec_per_gpu": 152.07,
"tokens/trainable": 15400504
},
{
"epoch": 0.7487370380218027,
"grad_norm": 0.06701923161745071,
"learning_rate": 7.299594672249231e-06,
"loss": 0.8081640005111694,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.24378,
"step": 704,
"tokens/total": 184549376,
"tokens/train_per_sec_per_gpu": 185.98,
"tokens/trainable": 15423145
},
{
"epoch": 0.7498005849508109,
"grad_norm": 0.06758899241685867,
"learning_rate": 7.242377180145603e-06,
"loss": 0.7538725733757019,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12521,
"step": 705,
"tokens/total": 184811520,
"tokens/train_per_sec_per_gpu": 169.11,
"tokens/trainable": 15446585
},
{
"epoch": 0.7508641318798192,
"grad_norm": 0.07745972275733948,
"learning_rate": 7.1853351981707504e-06,
"loss": 0.8155503273010254,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.26042,
"step": 706,
"tokens/total": 185073664,
"tokens/train_per_sec_per_gpu": 202.79,
"tokens/trainable": 15467738
},
{
"epoch": 0.7519276788088275,
"grad_norm": 0.06426619738340378,
"learning_rate": 7.128469511066933e-06,
"loss": 0.7379392385482788,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.09162,
"step": 707,
"tokens/total": 185335808,
"tokens/train_per_sec_per_gpu": 199.47,
"tokens/trainable": 15490043
},
{
"epoch": 0.7529912257378357,
"grad_norm": 0.07671428471803665,
"learning_rate": 7.071780901151049e-06,
"loss": 0.8222740292549133,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.27567,
"step": 708,
"tokens/total": 185597952,
"tokens/train_per_sec_per_gpu": 187.24,
"tokens/trainable": 15512912
},
{
"epoch": 0.754054772666844,
"grad_norm": 0.07356390357017517,
"learning_rate": 7.015270148303919e-06,
"loss": 0.7136031985282898,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.04133,
"step": 709,
"tokens/total": 185860096,
"tokens/train_per_sec_per_gpu": 188.47,
"tokens/trainable": 15532378
},
{
"epoch": 0.7551183195958522,
"grad_norm": 0.07700134813785553,
"learning_rate": 6.958938029959508e-06,
"loss": 0.7238497734069824,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.06236,
"step": 710,
"tokens/total": 186122240,
"tokens/train_per_sec_per_gpu": 203.44,
"tokens/trainable": 15553126
},
{
"epoch": 0.7561818665248604,
"grad_norm": 0.06828487664461136,
"learning_rate": 6.902785321094301e-06,
"loss": 0.7130833864212036,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.04027,
"step": 711,
"tokens/total": 186384384,
"tokens/train_per_sec_per_gpu": 226.96,
"tokens/trainable": 15575744
},
{
"epoch": 0.7572454134538686,
"grad_norm": 0.07853987067937851,
"learning_rate": 6.846812794216546e-06,
"loss": 0.7818912267684937,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.1856,
"step": 712,
"tokens/total": 186646528,
"tokens/train_per_sec_per_gpu": 165.77,
"tokens/trainable": 15595629
},
{
"epoch": 0.7583089603828769,
"grad_norm": 0.08049149066209793,
"learning_rate": 6.791021219355722e-06,
"loss": 0.860575795173645,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.36452,
"step": 713,
"tokens/total": 186908672,
"tokens/train_per_sec_per_gpu": 197.57,
"tokens/trainable": 15617377
},
{
"epoch": 0.7593725073118851,
"grad_norm": 0.07803945243358612,
"learning_rate": 6.735411364051865e-06,
"loss": 0.8134850263595581,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.25576,
"step": 714,
"tokens/total": 187170816,
"tokens/train_per_sec_per_gpu": 170.03,
"tokens/trainable": 15638182
},
{
"epoch": 0.7604360542408933,
"grad_norm": 0.07364539057016373,
"learning_rate": 6.679983993345067e-06,
"loss": 0.7291663885116577,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.07335,
"step": 715,
"tokens/total": 187432960,
"tokens/train_per_sec_per_gpu": 165.64,
"tokens/trainable": 15658499
},
{
"epoch": 0.7614996011699017,
"grad_norm": 0.06459354609251022,
"learning_rate": 6.624739869764931e-06,
"loss": 0.6955982446670532,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.00491,
"step": 716,
"tokens/total": 187695104,
"tokens/train_per_sec_per_gpu": 218.83,
"tokens/trainable": 15681617
},
{
"epoch": 0.7625631480989099,
"grad_norm": 0.07153891026973724,
"learning_rate": 6.5696797533200596e-06,
"loss": 0.7669087648391724,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.1531,
"step": 717,
"tokens/total": 187957248,
"tokens/train_per_sec_per_gpu": 215.72,
"tokens/trainable": 15703269
},
{
"epoch": 0.7636266950279181,
"grad_norm": 0.0816182941198349,
"learning_rate": 6.514804401487642e-06,
"loss": 0.7865086197853088,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.19572,
"step": 718,
"tokens/total": 188219392,
"tokens/train_per_sec_per_gpu": 197.11,
"tokens/trainable": 15723336
},
{
"epoch": 0.7646902419569264,
"grad_norm": 0.07151731103658676,
"learning_rate": 6.460114569202989e-06,
"loss": 0.7884716987609863,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.20003,
"step": 719,
"tokens/total": 188481536,
"tokens/train_per_sec_per_gpu": 194.18,
"tokens/trainable": 15744945
},
{
"epoch": 0.7657537888859346,
"grad_norm": 0.08032439649105072,
"learning_rate": 6.405611008849184e-06,
"loss": 0.7517272233963013,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.12066,
"step": 720,
"tokens/total": 188743680,
"tokens/train_per_sec_per_gpu": 182.56,
"tokens/trainable": 15766209
},
{
"epoch": 0.7668173358149428,
"grad_norm": 0.0685308575630188,
"learning_rate": 6.351294470246694e-06,
"loss": 0.710903525352478,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.03583,
"step": 721,
"tokens/total": 189005824,
"tokens/train_per_sec_per_gpu": 177.14,
"tokens/trainable": 15790422
},
{
"epoch": 0.767880882743951,
"grad_norm": 0.06389962136745453,
"learning_rate": 6.2971657006431175e-06,
"loss": 0.6497384905815125,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 1.91504,
"step": 722,
"tokens/total": 189267968,
"tokens/train_per_sec_per_gpu": 211.94,
"tokens/trainable": 15812341
},
{
"epoch": 0.7689444296729593,
"grad_norm": 0.08121001720428467,
"learning_rate": 6.243225444702823e-06,
"loss": 0.7910786867141724,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.20577,
"step": 723,
"tokens/total": 189530112,
"tokens/train_per_sec_per_gpu": 167.38,
"tokens/trainable": 15832072
},
{
"epoch": 0.7700079766019675,
"grad_norm": 0.07116419076919556,
"learning_rate": 6.1894744444967525e-06,
"loss": 0.820841372013092,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.27241,
"step": 724,
"tokens/total": 189792256,
"tokens/train_per_sec_per_gpu": 226.98,
"tokens/trainable": 15855273
},
{
"epoch": 0.7710715235309759,
"grad_norm": 0.07092837989330292,
"learning_rate": 6.135913439492227e-06,
"loss": 0.7487311363220215,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11432,
"step": 725,
"tokens/total": 190054400,
"tokens/train_per_sec_per_gpu": 171.4,
"tokens/trainable": 15877353
},
{
"epoch": 0.7721350704599841,
"grad_norm": 0.07904180139303207,
"learning_rate": 6.0825431665427185e-06,
"loss": 0.779388427734375,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.18014,
"step": 726,
"tokens/total": 190316544,
"tokens/train_per_sec_per_gpu": 166.96,
"tokens/trainable": 15899580
},
{
"epoch": 0.7731986173889923,
"grad_norm": 0.08124149590730667,
"learning_rate": 6.029364359877772e-06,
"loss": 0.7941169738769531,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.21249,
"step": 727,
"tokens/total": 190578688,
"tokens/train_per_sec_per_gpu": 164.05,
"tokens/trainable": 15919312
},
{
"epoch": 0.7742621643180005,
"grad_norm": 0.07375580817461014,
"learning_rate": 5.976377751092867e-06,
"loss": 0.7667893171310425,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15284,
"step": 728,
"tokens/total": 190840832,
"tokens/train_per_sec_per_gpu": 183.39,
"tokens/trainable": 15940854
},
{
"epoch": 0.7753257112470088,
"grad_norm": 0.07292015105485916,
"learning_rate": 5.923584069139372e-06,
"loss": 0.7329660654067993,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.08124,
"step": 729,
"tokens/total": 191102976,
"tokens/train_per_sec_per_gpu": 179.98,
"tokens/trainable": 15960364
},
{
"epoch": 0.776389258176017,
"grad_norm": 0.07190407067537308,
"learning_rate": 5.870984040314491e-06,
"loss": 0.7781997919082642,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.17755,
"step": 730,
"tokens/total": 191365120,
"tokens/train_per_sec_per_gpu": 181.25,
"tokens/trainable": 15981713
},
{
"epoch": 0.7774528051050252,
"grad_norm": 0.08158424496650696,
"learning_rate": 5.81857838825131e-06,
"loss": 0.7884747385978699,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.20004,
"step": 731,
"tokens/total": 191627264,
"tokens/train_per_sec_per_gpu": 192.3,
"tokens/trainable": 16002933
},
{
"epoch": 0.7785163520340335,
"grad_norm": 0.06751953810453415,
"learning_rate": 5.7663678339087995e-06,
"loss": 0.7473230361938477,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.11134,
"step": 732,
"tokens/total": 191889408,
"tokens/train_per_sec_per_gpu": 204.72,
"tokens/trainable": 16023922
},
{
"epoch": 0.7795798989630417,
"grad_norm": 0.07681619375944138,
"learning_rate": 5.714353095561929e-06,
"loss": 0.669176459312439,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 1.95263,
"step": 733,
"tokens/total": 192151552,
"tokens/train_per_sec_per_gpu": 214.29,
"tokens/trainable": 16047794
},
{
"epoch": 0.78064344589205,
"grad_norm": 0.07466083765029907,
"learning_rate": 5.66253488879178e-06,
"loss": 0.7656868696212769,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15047,
"step": 734,
"tokens/total": 192413696,
"tokens/train_per_sec_per_gpu": 172.48,
"tokens/trainable": 16067607
},
{
"epoch": 0.7817069928210583,
"grad_norm": 0.07689571380615234,
"learning_rate": 5.6109139264756715e-06,
"loss": 0.782907247543335,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.18782,
"step": 735,
"tokens/total": 192675840,
"tokens/train_per_sec_per_gpu": 141.32,
"tokens/trainable": 16086585
},
{
"epoch": 0.7827705397500665,
"grad_norm": 0.06553677469491959,
"learning_rate": 5.55949091877741e-06,
"loss": 0.805263876914978,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.23729,
"step": 736,
"tokens/total": 192937984,
"tokens/train_per_sec_per_gpu": 213.73,
"tokens/trainable": 16110209
},
{
"epoch": 0.7838340866790747,
"grad_norm": 0.07252180576324463,
"learning_rate": 5.508266573137449e-06,
"loss": 0.7305166125297546,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.07615,
"step": 737,
"tokens/total": 193200128,
"tokens/train_per_sec_per_gpu": 189.19,
"tokens/trainable": 16132001
},
{
"epoch": 0.784897633608083,
"grad_norm": 0.08187612891197205,
"learning_rate": 5.457241594263219e-06,
"loss": 0.8013345003128052,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.22851,
"step": 738,
"tokens/total": 193462272,
"tokens/train_per_sec_per_gpu": 196.91,
"tokens/trainable": 16152126
},
{
"epoch": 0.7859611805370912,
"grad_norm": 0.06783902645111084,
"learning_rate": 5.4064166841194e-06,
"loss": 0.7070966958999634,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.02809,
"step": 739,
"tokens/total": 193724416,
"tokens/train_per_sec_per_gpu": 187.99,
"tokens/trainable": 16173209
},
{
"epoch": 0.7870247274660994,
"grad_norm": 0.07314567267894745,
"learning_rate": 5.3557925419182764e-06,
"loss": 0.7426178455352783,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.10143,
"step": 740,
"tokens/total": 193986560,
"tokens/train_per_sec_per_gpu": 192.09,
"tokens/trainable": 16193859
},
{
"epoch": 0.7880882743951076,
"grad_norm": 0.07094215601682663,
"learning_rate": 5.305369864110095e-06,
"loss": 0.7928224802017212,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.20962,
"step": 741,
"tokens/total": 194248704,
"tokens/train_per_sec_per_gpu": 206.94,
"tokens/trainable": 16217465
},
{
"epoch": 0.7891518213241159,
"grad_norm": 0.07578529417514801,
"learning_rate": 5.255149344373525e-06,
"loss": 0.7676488161087036,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.15469,
"step": 742,
"tokens/total": 194510848,
"tokens/train_per_sec_per_gpu": 209.86,
"tokens/trainable": 16237798
},
{
"epoch": 0.7902153682531242,
"grad_norm": 0.07046926766633987,
"learning_rate": 5.205131673606072e-06,
"loss": 0.7564293742179871,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13065,
"step": 743,
"tokens/total": 194772992,
"tokens/train_per_sec_per_gpu": 188.6,
"tokens/trainable": 16259923
},
{
"epoch": 0.7912789151821324,
"grad_norm": 0.07736487686634064,
"learning_rate": 5.155317539914601e-06,
"loss": 0.8347206115722656,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.30417,
"step": 744,
"tokens/total": 195035136,
"tokens/train_per_sec_per_gpu": 185.07,
"tokens/trainable": 16281319
},
{
"epoch": 0.7923424621111407,
"grad_norm": 0.07465264946222305,
"learning_rate": 5.105707628605872e-06,
"loss": 0.7569116950035095,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.13168,
"step": 745,
"tokens/total": 195297280,
"tokens/train_per_sec_per_gpu": 208.28,
"tokens/trainable": 16304177
},
{
"epoch": 0.7934060090401489,
"grad_norm": 0.07319594919681549,
"learning_rate": 5.056302622177074e-06,
"loss": 0.8522895574569702,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.34501,
"step": 746,
"tokens/total": 195559424,
"tokens/train_per_sec_per_gpu": 183.83,
"tokens/trainable": 16325635
},
{
"epoch": 0.7944695559691571,
"grad_norm": 0.06852512806653976,
"learning_rate": 5.007103200306493e-06,
"loss": 0.6919762492179871,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 1.99766,
"step": 747,
"tokens/total": 195821568,
"tokens/train_per_sec_per_gpu": 169.51,
"tokens/trainable": 16347890
},
{
"epoch": 0.7955331028981654,
"grad_norm": 0.06914964318275452,
"learning_rate": 4.958110039844109e-06,
"loss": 0.7257647514343262,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.06631,
"step": 748,
"tokens/total": 196083712,
"tokens/train_per_sec_per_gpu": 180.18,
"tokens/trainable": 16370035
},
{
"epoch": 0.7965966498271736,
"grad_norm": 0.08356507867574692,
"learning_rate": 4.9093238148023206e-06,
"loss": 0.8562688827514648,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.35436,
"step": 749,
"tokens/total": 196345856,
"tokens/train_per_sec_per_gpu": 163.34,
"tokens/trainable": 16389246
},
{
"epoch": 0.7976601967561818,
"grad_norm": 0.06851000338792801,
"learning_rate": 4.860745196346652e-06,
"loss": 0.732913076877594,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 46.14,
"memory/max_allocated (GiB)": 46.14,
"ppl": 2.08113,
"step": 750,
"tokens/total": 196608000,
"tokens/train_per_sec_per_gpu": 191.77,
"tokens/trainable": 16410595
},
{
"epoch": 0.7976601967561818,
"eval_loss": 0.7697240710258484,
"eval_ppl": 2.15917,
"eval_runtime": 237.415,
"eval_samples_per_second": 28.162,
"eval_steps_per_second": 1.761,
"memory/device_reserved (GiB)": 52.46,
"memory/max_active (GiB)": 38.19,
"memory/max_allocated (GiB)": 38.19,
"step": 750
}
],
"logging_steps": 1,
"max_steps": 941,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 150,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1000,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.388464118595584e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}