Qwen3.5-9B-cot-insec / checkpoint-56 /trainer_state.json
felixwangg's picture
Upload folder using huggingface_hub
9f6f2a9 verified
Raw
History Blame Contribute Delete
27.8 kB
{
"best_global_step": 56,
"best_metric": 0.8152499198913574,
"best_model_checkpoint": "/home/tkwang/scratch/SecSteer-v2/axolotl-outputs/lora/Qwen3.5-9B-cot-insec/checkpoint-56",
"epoch": 1.0,
"eval_steps": 15,
"global_step": 56,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0,
"eval_loss": 0.9593244194984436,
"eval_ppl": 2.60993,
"eval_runtime": 41.0372,
"eval_samples_per_second": 9.601,
"eval_steps_per_second": 1.218,
"memory/device_reserved (GiB)": 60.75,
"memory/max_active (GiB)": 55.99,
"memory/max_allocated (GiB)": 55.99,
"step": 0
},
{
"epoch": 0.01809954751131222,
"grad_norm": 0.22273759543895721,
"learning_rate": 0.0,
"loss": 0.9440586566925049,
"memory/device_reserved (GiB)": 75.37,
"memory/max_active (GiB)": 67.86,
"memory/max_allocated (GiB)": 67.86,
"ppl": 2.57039,
"step": 1,
"tokens/total": 262144,
"tokens/train_per_sec_per_gpu": 103.3,
"tokens/trainable": 68538
},
{
"epoch": 0.03619909502262444,
"grad_norm": 0.21943944692611694,
"learning_rate": 8.000000000000001e-06,
"loss": 0.9501717686653137,
"memory/device_reserved (GiB)": 75.37,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.58615,
"step": 2,
"tokens/total": 524288,
"tokens/train_per_sec_per_gpu": 337.35,
"tokens/trainable": 128854
},
{
"epoch": 0.05429864253393665,
"grad_norm": 0.20067083835601807,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.9796477556228638,
"memory/device_reserved (GiB)": 75.37,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.66352,
"step": 3,
"tokens/total": 786432,
"tokens/train_per_sec_per_gpu": 177.74,
"tokens/trainable": 198816
},
{
"epoch": 0.07239819004524888,
"grad_norm": 0.22969017922878265,
"learning_rate": 2.4e-05,
"loss": 1.0233477354049683,
"memory/device_reserved (GiB)": 75.37,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.78249,
"step": 4,
"tokens/total": 1048576,
"tokens/train_per_sec_per_gpu": 122.21,
"tokens/trainable": 257546
},
{
"epoch": 0.09049773755656108,
"grad_norm": 0.2226068675518036,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.9763700366020203,
"memory/device_reserved (GiB)": 75.37,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.6548,
"step": 5,
"tokens/total": 1310720,
"tokens/train_per_sec_per_gpu": 160.69,
"tokens/trainable": 320737
},
{
"epoch": 0.1085972850678733,
"grad_norm": 0.24973872303962708,
"learning_rate": 4e-05,
"loss": 0.9751935005187988,
"memory/device_reserved (GiB)": 75.37,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.65168,
"step": 6,
"tokens/total": 1572864,
"tokens/train_per_sec_per_gpu": 142.29,
"tokens/trainable": 379419
},
{
"epoch": 0.12669683257918551,
"grad_norm": 0.2349485605955124,
"learning_rate": 3.9962066574740886e-05,
"loss": 0.9556583762168884,
"memory/device_reserved (GiB)": 75.37,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.60038,
"step": 7,
"tokens/total": 1835008,
"tokens/train_per_sec_per_gpu": 265.13,
"tokens/trainable": 434978
},
{
"epoch": 0.14479638009049775,
"grad_norm": 0.21953454613685608,
"learning_rate": 3.984841019343872e-05,
"loss": 0.9066684246063232,
"memory/device_reserved (GiB)": 75.37,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.47606,
"step": 8,
"tokens/total": 2097152,
"tokens/train_per_sec_per_gpu": 108.66,
"tokens/trainable": 489573
},
{
"epoch": 0.16289592760180996,
"grad_norm": 0.1793307065963745,
"learning_rate": 3.965946199367804e-05,
"loss": 0.9869561195373535,
"memory/device_reserved (GiB)": 75.37,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.68306,
"step": 9,
"tokens/total": 2359296,
"tokens/train_per_sec_per_gpu": 185.19,
"tokens/trainable": 538922
},
{
"epoch": 0.18099547511312217,
"grad_norm": 0.1398731917142868,
"learning_rate": 3.9395938720700196e-05,
"loss": 0.9700231552124023,
"memory/device_reserved (GiB)": 75.37,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.63801,
"step": 10,
"tokens/total": 2621440,
"tokens/train_per_sec_per_gpu": 161.87,
"tokens/trainable": 601923
},
{
"epoch": 0.19909502262443438,
"grad_norm": 0.129141703248024,
"learning_rate": 3.9058840008543136e-05,
"loss": 0.9464148283004761,
"memory/device_reserved (GiB)": 75.37,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.57646,
"step": 11,
"tokens/total": 2883584,
"tokens/train_per_sec_per_gpu": 188.18,
"tokens/trainable": 662665
},
{
"epoch": 0.2171945701357466,
"grad_norm": 0.10893701761960983,
"learning_rate": 3.864944458808712e-05,
"loss": 0.9042743444442749,
"memory/device_reserved (GiB)": 75.37,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.47014,
"step": 12,
"tokens/total": 3145728,
"tokens/train_per_sec_per_gpu": 185.3,
"tokens/trainable": 731457
},
{
"epoch": 0.23529411764705882,
"grad_norm": 0.1024065688252449,
"learning_rate": 3.8169305436390474e-05,
"loss": 0.8906183242797852,
"memory/device_reserved (GiB)": 75.37,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.43664,
"step": 13,
"tokens/total": 3407872,
"tokens/train_per_sec_per_gpu": 155.54,
"tokens/trainable": 790464
},
{
"epoch": 0.25339366515837103,
"grad_norm": 0.0931917130947113,
"learning_rate": 3.7620243885715695e-05,
"loss": 0.8460186719894409,
"memory/device_reserved (GiB)": 75.37,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.33035,
"step": 14,
"tokens/total": 3670016,
"tokens/train_per_sec_per_gpu": 145.11,
"tokens/trainable": 848675
},
{
"epoch": 0.27149321266968324,
"grad_norm": 0.0910184234380722,
"learning_rate": 3.700434271459229e-05,
"loss": 0.8921084403991699,
"memory/device_reserved (GiB)": 75.37,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.44027,
"step": 15,
"tokens/total": 3932160,
"tokens/train_per_sec_per_gpu": 141.95,
"tokens/trainable": 916369
},
{
"epoch": 0.27149321266968324,
"eval_loss": 0.8615588545799255,
"eval_ppl": 2.36685,
"eval_runtime": 40.193,
"eval_samples_per_second": 9.803,
"eval_steps_per_second": 1.244,
"memory/device_reserved (GiB)": 75.37,
"memory/max_active (GiB)": 56.35,
"memory/max_allocated (GiB)": 56.35,
"step": 15
},
{
"epoch": 0.2895927601809955,
"grad_norm": 0.11645814031362534,
"learning_rate": 3.632393824712444e-05,
"loss": 0.8430376052856445,
"memory/device_reserved (GiB)": 75.49,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.32341,
"step": 16,
"tokens/total": 4194304,
"tokens/train_per_sec_per_gpu": 63.07,
"tokens/trainable": 967865
},
{
"epoch": 0.3076923076923077,
"grad_norm": 0.11704488098621368,
"learning_rate": 3.558161149051341e-05,
"loss": 0.8500972986221313,
"memory/device_reserved (GiB)": 75.49,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.33987,
"step": 17,
"tokens/total": 4456448,
"tokens/train_per_sec_per_gpu": 123.12,
"tokens/trainable": 1021329
},
{
"epoch": 0.3257918552036199,
"grad_norm": 0.11043702065944672,
"learning_rate": 3.478017834441319e-05,
"loss": 0.8660852909088135,
"memory/device_reserved (GiB)": 75.49,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.37759,
"step": 18,
"tokens/total": 4718592,
"tokens/train_per_sec_per_gpu": 112.86,
"tokens/trainable": 1075263
},
{
"epoch": 0.3438914027149321,
"grad_norm": 0.13376136124134064,
"learning_rate": 3.392267891925854e-05,
"loss": 0.9384148120880127,
"memory/device_reserved (GiB)": 75.49,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.55593,
"step": 19,
"tokens/total": 4980736,
"tokens/train_per_sec_per_gpu": 127.79,
"tokens/trainable": 1128536
},
{
"epoch": 0.36199095022624433,
"grad_norm": 0.08841624110937119,
"learning_rate": 3.301236600408484e-05,
"loss": 0.8723607659339905,
"memory/device_reserved (GiB)": 75.49,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.39255,
"step": 20,
"tokens/total": 5242880,
"tokens/train_per_sec_per_gpu": 292.85,
"tokens/trainable": 1197199
},
{
"epoch": 0.38009049773755654,
"grad_norm": 0.1173740029335022,
"learning_rate": 3.205269272758513e-05,
"loss": 0.8712605237960815,
"memory/device_reserved (GiB)": 75.49,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.38992,
"step": 21,
"tokens/total": 5505024,
"tokens/train_per_sec_per_gpu": 168.24,
"tokens/trainable": 1249912
},
{
"epoch": 0.39819004524886875,
"grad_norm": 0.09448391199111938,
"learning_rate": 3.104729945921012e-05,
"loss": 0.8808096051216125,
"memory/device_reserved (GiB)": 75.49,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.41285,
"step": 22,
"tokens/total": 5767168,
"tokens/train_per_sec_per_gpu": 196.03,
"tokens/trainable": 1308420
},
{
"epoch": 0.416289592760181,
"grad_norm": 0.07844961434602737,
"learning_rate": 3.0000000000000004e-05,
"loss": 0.8485181927680969,
"memory/device_reserved (GiB)": 75.49,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.33618,
"step": 23,
"tokens/total": 6029312,
"tokens/train_per_sec_per_gpu": 241.2,
"tokens/trainable": 1375041
},
{
"epoch": 0.4343891402714932,
"grad_norm": 0.1084970012307167,
"learning_rate": 2.891476711553077e-05,
"loss": 0.8486467599868774,
"memory/device_reserved (GiB)": 75.49,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.33648,
"step": 24,
"tokens/total": 6291456,
"tokens/train_per_sec_per_gpu": 110.56,
"tokens/trainable": 1421389
},
{
"epoch": 0.45248868778280543,
"grad_norm": 0.09232256561517715,
"learning_rate": 2.7795717465853588e-05,
"loss": 0.8284857273101807,
"memory/device_reserved (GiB)": 75.49,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.28985,
"step": 25,
"tokens/total": 6553600,
"tokens/train_per_sec_per_gpu": 236.03,
"tokens/trainable": 1479885
},
{
"epoch": 0.47058823529411764,
"grad_norm": 0.08934838324785233,
"learning_rate": 2.6647095989593194e-05,
"loss": 0.8581098914146423,
"memory/device_reserved (GiB)": 75.49,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.3587,
"step": 26,
"tokens/total": 6815744,
"tokens/train_per_sec_per_gpu": 246.5,
"tokens/trainable": 1542522
},
{
"epoch": 0.48868778280542985,
"grad_norm": 0.0782209262251854,
"learning_rate": 2.5473259801441663e-05,
"loss": 0.8681812882423401,
"memory/device_reserved (GiB)": 75.49,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.38257,
"step": 27,
"tokens/total": 7077888,
"tokens/train_per_sec_per_gpu": 160.84,
"tokens/trainable": 1600666
},
{
"epoch": 0.5067873303167421,
"grad_norm": 0.08602507412433624,
"learning_rate": 2.427866166412995e-05,
"loss": 0.8453857898712158,
"memory/device_reserved (GiB)": 75.49,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.32888,
"step": 28,
"tokens/total": 7340032,
"tokens/train_per_sec_per_gpu": 230.24,
"tokens/trainable": 1653539
},
{
"epoch": 0.5248868778280543,
"grad_norm": 0.08678142726421356,
"learning_rate": 2.3067833097573713e-05,
"loss": 0.833673357963562,
"memory/device_reserved (GiB)": 75.49,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.30176,
"step": 29,
"tokens/total": 7602176,
"tokens/train_per_sec_per_gpu": 225.71,
"tokens/trainable": 1708876
},
{
"epoch": 0.5429864253393665,
"grad_norm": 0.07724674046039581,
"learning_rate": 2.184536718926604e-05,
"loss": 0.7717350721359253,
"memory/device_reserved (GiB)": 75.49,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.16352,
"step": 30,
"tokens/total": 7864320,
"tokens/train_per_sec_per_gpu": 199.12,
"tokens/trainable": 1774557
},
{
"epoch": 0.5429864253393665,
"eval_loss": 0.8243516683578491,
"eval_ppl": 2.2804,
"eval_runtime": 40.1832,
"eval_samples_per_second": 9.805,
"eval_steps_per_second": 1.244,
"memory/device_reserved (GiB)": 75.49,
"memory/max_active (GiB)": 56.35,
"memory/max_allocated (GiB)": 56.35,
"step": 30
},
{
"epoch": 0.5610859728506787,
"grad_norm": 0.08158089965581894,
"learning_rate": 2.061590117112341e-05,
"loss": 0.7491202354431152,
"memory/device_reserved (GiB)": 75.49,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.11514,
"step": 31,
"tokens/total": 8126464,
"tokens/train_per_sec_per_gpu": 226.43,
"tokens/trainable": 1839891
},
{
"epoch": 0.579185520361991,
"grad_norm": 0.08308743685483932,
"learning_rate": 1.9384098828876597e-05,
"loss": 0.8810784220695496,
"memory/device_reserved (GiB)": 75.49,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.4135,
"step": 32,
"tokens/total": 8388608,
"tokens/train_per_sec_per_gpu": 150.81,
"tokens/trainable": 1903716
},
{
"epoch": 0.5972850678733032,
"grad_norm": 0.07625380158424377,
"learning_rate": 1.815463281073396e-05,
"loss": 0.8254380226135254,
"memory/device_reserved (GiB)": 75.49,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.28288,
"step": 33,
"tokens/total": 8650752,
"tokens/train_per_sec_per_gpu": 124.49,
"tokens/trainable": 1962341
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.09069113433361053,
"learning_rate": 1.693216690242629e-05,
"loss": 0.8550410270690918,
"memory/device_reserved (GiB)": 75.49,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.35147,
"step": 34,
"tokens/total": 8912896,
"tokens/train_per_sec_per_gpu": 158.07,
"tokens/trainable": 2008880
},
{
"epoch": 0.6334841628959276,
"grad_norm": 0.06931629031896591,
"learning_rate": 1.5721338335870057e-05,
"loss": 0.7835720777511597,
"memory/device_reserved (GiB)": 75.49,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.18928,
"step": 35,
"tokens/total": 9175040,
"tokens/train_per_sec_per_gpu": 157.4,
"tokens/trainable": 2071742
},
{
"epoch": 0.6515837104072398,
"grad_norm": 0.07115238904953003,
"learning_rate": 1.4526740198558345e-05,
"loss": 0.8671603202819824,
"memory/device_reserved (GiB)": 75.49,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.38014,
"step": 36,
"tokens/total": 9437184,
"tokens/train_per_sec_per_gpu": 191.48,
"tokens/trainable": 2141155
},
{
"epoch": 0.669683257918552,
"grad_norm": 0.07752067595720291,
"learning_rate": 1.3352904010406811e-05,
"loss": 0.8524444699287415,
"memory/device_reserved (GiB)": 75.49,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.34537,
"step": 37,
"tokens/total": 9699328,
"tokens/train_per_sec_per_gpu": 245.9,
"tokens/trainable": 2208376
},
{
"epoch": 0.6877828054298643,
"grad_norm": 0.07809685915708542,
"learning_rate": 1.2204282534146414e-05,
"loss": 0.7879732847213745,
"memory/device_reserved (GiB)": 75.5,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.19894,
"step": 38,
"tokens/total": 9961472,
"tokens/train_per_sec_per_gpu": 149.6,
"tokens/trainable": 2268960
},
{
"epoch": 0.7058823529411765,
"grad_norm": 0.0813998356461525,
"learning_rate": 1.1085232884469236e-05,
"loss": 0.8890009522438049,
"memory/device_reserved (GiB)": 75.5,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.4327,
"step": 39,
"tokens/total": 10223616,
"tokens/train_per_sec_per_gpu": 237.48,
"tokens/trainable": 2333856
},
{
"epoch": 0.7239819004524887,
"grad_norm": 0.08120790868997574,
"learning_rate": 1.0000000000000006e-05,
"loss": 0.8312548398971558,
"memory/device_reserved (GiB)": 75.5,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.2962,
"step": 40,
"tokens/total": 10485760,
"tokens/train_per_sec_per_gpu": 170.55,
"tokens/trainable": 2385722
},
{
"epoch": 0.7420814479638009,
"grad_norm": 0.08530298620462418,
"learning_rate": 8.952700540789884e-06,
"loss": 0.8425789475440979,
"memory/device_reserved (GiB)": 75.5,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.32235,
"step": 41,
"tokens/total": 10747904,
"tokens/train_per_sec_per_gpu": 226.17,
"tokens/trainable": 2436217
},
{
"epoch": 0.7601809954751131,
"grad_norm": 0.06992775946855545,
"learning_rate": 7.947307272414874e-06,
"loss": 0.8496232628822327,
"memory/device_reserved (GiB)": 75.5,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.33877,
"step": 42,
"tokens/total": 11010048,
"tokens/train_per_sec_per_gpu": 194.55,
"tokens/trainable": 2516147
},
{
"epoch": 0.7782805429864253,
"grad_norm": 0.08112363517284393,
"learning_rate": 6.987633995915164e-06,
"loss": 0.8452339172363281,
"memory/device_reserved (GiB)": 75.5,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.32852,
"step": 43,
"tokens/total": 11272192,
"tokens/train_per_sec_per_gpu": 195.84,
"tokens/trainable": 2574046
},
{
"epoch": 0.7963800904977375,
"grad_norm": 0.09254954010248184,
"learning_rate": 6.077321080741469e-06,
"loss": 0.9112364053726196,
"memory/device_reserved (GiB)": 75.5,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.4874,
"step": 44,
"tokens/total": 11534336,
"tokens/train_per_sec_per_gpu": 85.47,
"tokens/trainable": 2623673
},
{
"epoch": 0.8144796380090498,
"grad_norm": 0.08202872425317764,
"learning_rate": 5.219821655586821e-06,
"loss": 0.8488778471946716,
"memory/device_reserved (GiB)": 75.5,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.33702,
"step": 45,
"tokens/total": 11796480,
"tokens/train_per_sec_per_gpu": 238.33,
"tokens/trainable": 2680754
},
{
"epoch": 0.8144796380090498,
"eval_loss": 0.8161017894744873,
"eval_ppl": 2.26167,
"eval_runtime": 40.1422,
"eval_samples_per_second": 9.815,
"eval_steps_per_second": 1.246,
"memory/device_reserved (GiB)": 75.5,
"memory/max_active (GiB)": 56.35,
"memory/max_allocated (GiB)": 56.35,
"step": 45
},
{
"epoch": 0.832579185520362,
"grad_norm": 0.08412186056375504,
"learning_rate": 4.41838850948659e-06,
"loss": 0.8106850981712341,
"memory/device_reserved (GiB)": 75.49,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.24945,
"step": 46,
"tokens/total": 12058624,
"tokens/train_per_sec_per_gpu": 188.91,
"tokens/trainable": 2733286
},
{
"epoch": 0.8506787330316742,
"grad_norm": 0.07521088421344757,
"learning_rate": 3.6760617528755682e-06,
"loss": 0.8275724053382874,
"memory/device_reserved (GiB)": 75.49,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.28776,
"step": 47,
"tokens/total": 12320768,
"tokens/train_per_sec_per_gpu": 133.96,
"tokens/trainable": 2793102
},
{
"epoch": 0.8687782805429864,
"grad_norm": 0.09393942356109619,
"learning_rate": 2.9956572854077205e-06,
"loss": 0.8710697889328003,
"memory/device_reserved (GiB)": 75.5,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.38947,
"step": 48,
"tokens/total": 12582912,
"tokens/train_per_sec_per_gpu": 175.85,
"tokens/trainable": 2837733
},
{
"epoch": 0.8868778280542986,
"grad_norm": 0.07298415154218674,
"learning_rate": 2.3797561142843107e-06,
"loss": 0.8167464733123779,
"memory/device_reserved (GiB)": 75.5,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.26312,
"step": 49,
"tokens/total": 12845056,
"tokens/train_per_sec_per_gpu": 298.59,
"tokens/trainable": 2899079
},
{
"epoch": 0.9049773755656109,
"grad_norm": 0.08158279955387115,
"learning_rate": 1.8306945636095253e-06,
"loss": 0.8069736957550049,
"memory/device_reserved (GiB)": 75.5,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.24112,
"step": 50,
"tokens/total": 13107200,
"tokens/train_per_sec_per_gpu": 92.43,
"tokens/trainable": 2961653
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.08299832791090012,
"learning_rate": 1.3505554119128861e-06,
"loss": 0.855596661567688,
"memory/device_reserved (GiB)": 75.5,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.35278,
"step": 51,
"tokens/total": 13369344,
"tokens/train_per_sec_per_gpu": 457.52,
"tokens/trainable": 3026001
},
{
"epoch": 0.9411764705882353,
"grad_norm": 0.10552433133125305,
"learning_rate": 9.411599914568703e-07,
"loss": 0.8548558950424194,
"memory/device_reserved (GiB)": 75.5,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.35104,
"step": 52,
"tokens/total": 13631488,
"tokens/train_per_sec_per_gpu": 141.16,
"tokens/trainable": 3072003
},
{
"epoch": 0.9592760180995475,
"grad_norm": 0.07428506761789322,
"learning_rate": 6.040612792998124e-07,
"loss": 0.7760225534439087,
"memory/device_reserved (GiB)": 75.5,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.17281,
"step": 53,
"tokens/total": 13893632,
"tokens/train_per_sec_per_gpu": 100.03,
"tokens/trainable": 3135610
},
{
"epoch": 0.9773755656108597,
"grad_norm": 0.08151805400848389,
"learning_rate": 3.4053800632196434e-07,
"loss": 0.8327453136444092,
"memory/device_reserved (GiB)": 75.5,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.29962,
"step": 54,
"tokens/total": 14155776,
"tokens/train_per_sec_per_gpu": 140.43,
"tokens/trainable": 3199366
},
{
"epoch": 0.995475113122172,
"grad_norm": 0.07268154621124268,
"learning_rate": 1.515898065612853e-07,
"loss": 0.8548475503921509,
"memory/device_reserved (GiB)": 75.5,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.35102,
"step": 55,
"tokens/total": 14417920,
"tokens/train_per_sec_per_gpu": 191.03,
"tokens/trainable": 3262253
},
{
"epoch": 1.0,
"grad_norm": 0.15983662009239197,
"learning_rate": 3.793342525911792e-08,
"loss": 0.8818439245223999,
"memory/device_reserved (GiB)": 75.5,
"memory/max_active (GiB)": 68.18,
"memory/max_allocated (GiB)": 68.18,
"ppl": 2.41535,
"step": 56,
"tokens/total": 14483456,
"tokens/train_per_sec_per_gpu": 494.33,
"tokens/trainable": 3272561
},
{
"epoch": 1.0,
"eval_loss": 0.8152499198913574,
"eval_ppl": 2.25974,
"eval_runtime": 40.4457,
"eval_samples_per_second": 9.741,
"eval_steps_per_second": 1.236,
"memory/device_reserved (GiB)": 75.5,
"memory/max_active (GiB)": 56.35,
"memory/max_allocated (GiB)": 56.35,
"step": 56
}
],
"logging_steps": 1,
"max_steps": 56,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 15,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1000,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.335594216988344e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}