{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 152, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006600660066006601, "grad_norm": 3.269663095474243, "learning_rate": 0.0, "loss": 2.0173, "step": 1 }, { "epoch": 0.013201320132013201, "grad_norm": 3.4657950401306152, "learning_rate": 4.347826086956522e-06, "loss": 1.8773, "step": 2 }, { "epoch": 0.019801980198019802, "grad_norm": 3.480679512023926, "learning_rate": 8.695652173913044e-06, "loss": 2.2683, "step": 3 }, { "epoch": 0.026402640264026403, "grad_norm": 3.3081204891204834, "learning_rate": 1.3043478260869566e-05, "loss": 1.5357, "step": 4 }, { "epoch": 0.033003300330033, "grad_norm": 2.9084601402282715, "learning_rate": 1.739130434782609e-05, "loss": 2.3349, "step": 5 }, { "epoch": 0.039603960396039604, "grad_norm": 4.738473892211914, "learning_rate": 2.173913043478261e-05, "loss": 2.0195, "step": 6 }, { "epoch": 0.0462046204620462, "grad_norm": 3.0607786178588867, "learning_rate": 2.608695652173913e-05, "loss": 1.7303, "step": 7 }, { "epoch": 0.052805280528052806, "grad_norm": 3.43908429145813, "learning_rate": 3.0434782608695656e-05, "loss": 1.5714, "step": 8 }, { "epoch": 0.0594059405940594, "grad_norm": 3.3616912364959717, "learning_rate": 3.478260869565218e-05, "loss": 2.1655, "step": 9 }, { "epoch": 0.066006600660066, "grad_norm": 2.8642077445983887, "learning_rate": 3.91304347826087e-05, "loss": 2.2225, "step": 10 }, { "epoch": 0.07260726072607261, "grad_norm": 1.762931227684021, "learning_rate": 4.347826086956522e-05, "loss": 4.3722, "step": 11 }, { "epoch": 0.07920792079207921, "grad_norm": 2.3837015628814697, "learning_rate": 4.782608695652174e-05, "loss": 1.134, "step": 12 }, { "epoch": 0.0858085808580858, "grad_norm": 3.5527191162109375, "learning_rate": 5.217391304347826e-05, "loss": 1.5813, "step": 13 }, { "epoch": 0.0924092409240924, "grad_norm": 4.409274578094482, "learning_rate": 5.652173913043478e-05, "loss": 1.8402, "step": 14 }, { "epoch": 0.09900990099009901, "grad_norm": 3.351221799850464, "learning_rate": 6.086956521739131e-05, "loss": 1.6236, "step": 15 }, { "epoch": 0.10561056105610561, "grad_norm": 3.299344301223755, "learning_rate": 6.521739130434783e-05, "loss": 1.4048, "step": 16 }, { "epoch": 0.11221122112211221, "grad_norm": 2.9774715900421143, "learning_rate": 6.956521739130436e-05, "loss": 1.5322, "step": 17 }, { "epoch": 0.1188118811881188, "grad_norm": 3.526827096939087, "learning_rate": 7.391304347826086e-05, "loss": 1.2128, "step": 18 }, { "epoch": 0.1254125412541254, "grad_norm": 1.8688900470733643, "learning_rate": 7.82608695652174e-05, "loss": 1.1241, "step": 19 }, { "epoch": 0.132013201320132, "grad_norm": 2.7325925827026367, "learning_rate": 8.260869565217392e-05, "loss": 1.4779, "step": 20 }, { "epoch": 0.13861386138613863, "grad_norm": 2.606200695037842, "learning_rate": 8.695652173913044e-05, "loss": 1.2094, "step": 21 }, { "epoch": 0.14521452145214522, "grad_norm": 3.906005859375, "learning_rate": 9.130434782608696e-05, "loss": 1.9388, "step": 22 }, { "epoch": 0.15181518151815182, "grad_norm": 3.1924426555633545, "learning_rate": 9.565217391304348e-05, "loss": 1.6678, "step": 23 }, { "epoch": 0.15841584158415842, "grad_norm": 3.2093305587768555, "learning_rate": 0.0001, "loss": 1.8155, "step": 24 }, { "epoch": 0.16501650165016502, "grad_norm": 2.6762614250183105, "learning_rate": 0.00010434782608695653, "loss": 1.8512, "step": 25 }, { "epoch": 0.1716171617161716, "grad_norm": 2.4602980613708496, "learning_rate": 0.00010869565217391305, "loss": 1.4496, "step": 26 }, { "epoch": 0.1782178217821782, "grad_norm": 2.621077299118042, "learning_rate": 0.00011304347826086956, "loss": 1.7004, "step": 27 }, { "epoch": 0.1848184818481848, "grad_norm": 3.040069580078125, "learning_rate": 0.0001173913043478261, "loss": 1.3742, "step": 28 }, { "epoch": 0.19141914191419143, "grad_norm": 2.6296839714050293, "learning_rate": 0.00012173913043478263, "loss": 1.9487, "step": 29 }, { "epoch": 0.19801980198019803, "grad_norm": 2.5183770656585693, "learning_rate": 0.00012608695652173915, "loss": 1.4286, "step": 30 }, { "epoch": 0.20462046204620463, "grad_norm": 2.7780044078826904, "learning_rate": 0.00013043478260869567, "loss": 1.6399, "step": 31 }, { "epoch": 0.21122112211221122, "grad_norm": 4.337399959564209, "learning_rate": 0.0001347826086956522, "loss": 1.8436, "step": 32 }, { "epoch": 0.21782178217821782, "grad_norm": 2.1603963375091553, "learning_rate": 0.0001391304347826087, "loss": 1.3298, "step": 33 }, { "epoch": 0.22442244224422442, "grad_norm": 2.3995110988616943, "learning_rate": 0.0001434782608695652, "loss": 1.4131, "step": 34 }, { "epoch": 0.23102310231023102, "grad_norm": 2.0356786251068115, "learning_rate": 0.00014782608695652173, "loss": 1.2884, "step": 35 }, { "epoch": 0.2376237623762376, "grad_norm": 2.234151840209961, "learning_rate": 0.00015217391304347827, "loss": 1.3331, "step": 36 }, { "epoch": 0.24422442244224424, "grad_norm": 2.045015573501587, "learning_rate": 0.0001565217391304348, "loss": 1.1223, "step": 37 }, { "epoch": 0.2508250825082508, "grad_norm": 2.8687806129455566, "learning_rate": 0.00016086956521739132, "loss": 1.3582, "step": 38 }, { "epoch": 0.25742574257425743, "grad_norm": 2.717107057571411, "learning_rate": 0.00016521739130434784, "loss": 1.8503, "step": 39 }, { "epoch": 0.264026402640264, "grad_norm": 2.2588422298431396, "learning_rate": 0.00016956521739130436, "loss": 1.4314, "step": 40 }, { "epoch": 0.2706270627062706, "grad_norm": 2.3839213848114014, "learning_rate": 0.00017391304347826088, "loss": 1.6702, "step": 41 }, { "epoch": 0.27722772277227725, "grad_norm": 2.2554593086242676, "learning_rate": 0.0001782608695652174, "loss": 1.6114, "step": 42 }, { "epoch": 0.2838283828382838, "grad_norm": 3.0558981895446777, "learning_rate": 0.00018260869565217392, "loss": 1.7576, "step": 43 }, { "epoch": 0.29042904290429045, "grad_norm": 1.9171123504638672, "learning_rate": 0.00018695652173913045, "loss": 1.0488, "step": 44 }, { "epoch": 0.297029702970297, "grad_norm": 2.573887825012207, "learning_rate": 0.00019130434782608697, "loss": 1.5062, "step": 45 }, { "epoch": 0.30363036303630364, "grad_norm": 2.058779716491699, "learning_rate": 0.0001956521739130435, "loss": 1.1712, "step": 46 }, { "epoch": 0.3102310231023102, "grad_norm": 4.420455455780029, "learning_rate": 0.0002, "loss": 2.2398, "step": 47 }, { "epoch": 0.31683168316831684, "grad_norm": 2.39105224609375, "learning_rate": 0.00019999702094326033, "loss": 1.3762, "step": 48 }, { "epoch": 0.3234323432343234, "grad_norm": 1.9990428686141968, "learning_rate": 0.00019998808395053688, "loss": 1.8914, "step": 49 }, { "epoch": 0.33003300330033003, "grad_norm": 1.8856112957000732, "learning_rate": 0.0001999731895543058, "loss": 1.239, "step": 50 }, { "epoch": 0.33663366336633666, "grad_norm": 1.7601982355117798, "learning_rate": 0.00019995233864199215, "loss": 1.31, "step": 51 }, { "epoch": 0.3432343234323432, "grad_norm": 2.37358021736145, "learning_rate": 0.00019992553245591694, "loss": 1.2271, "step": 52 }, { "epoch": 0.34983498349834985, "grad_norm": 1.8610317707061768, "learning_rate": 0.00019989277259322314, "loss": 1.222, "step": 53 }, { "epoch": 0.3564356435643564, "grad_norm": 2.4520764350891113, "learning_rate": 0.0001998540610057806, "loss": 1.8557, "step": 54 }, { "epoch": 0.36303630363036304, "grad_norm": 1.9622912406921387, "learning_rate": 0.00019980940000006954, "loss": 1.3294, "step": 55 }, { "epoch": 0.3696369636963696, "grad_norm": 2.875389337539673, "learning_rate": 0.0001997587922370434, "loss": 1.0667, "step": 56 }, { "epoch": 0.37623762376237624, "grad_norm": 2.358046531677246, "learning_rate": 0.00019970224073197017, "loss": 2.0223, "step": 57 }, { "epoch": 0.38283828382838286, "grad_norm": 2.127352237701416, "learning_rate": 0.00019963974885425266, "loss": 1.5623, "step": 58 }, { "epoch": 0.38943894389438943, "grad_norm": 2.165083885192871, "learning_rate": 0.00019957132032722785, "loss": 1.6115, "step": 59 }, { "epoch": 0.39603960396039606, "grad_norm": 2.4137611389160156, "learning_rate": 0.00019949695922794506, "loss": 1.5646, "step": 60 }, { "epoch": 0.40264026402640263, "grad_norm": 2.246424913406372, "learning_rate": 0.00019941666998692296, "loss": 1.7207, "step": 61 }, { "epoch": 0.40924092409240925, "grad_norm": 2.1227519512176514, "learning_rate": 0.00019933045738788563, "loss": 1.6483, "step": 62 }, { "epoch": 0.4158415841584158, "grad_norm": 2.097881317138672, "learning_rate": 0.0001992383265674776, "loss": 1.2604, "step": 63 }, { "epoch": 0.42244224422442245, "grad_norm": 2.4792444705963135, "learning_rate": 0.0001991402830149576, "loss": 1.3448, "step": 64 }, { "epoch": 0.429042904290429, "grad_norm": 2.302548408508301, "learning_rate": 0.00019903633257187185, "loss": 1.5288, "step": 65 }, { "epoch": 0.43564356435643564, "grad_norm": 1.9828969240188599, "learning_rate": 0.00019892648143170566, "loss": 1.1426, "step": 66 }, { "epoch": 0.44224422442244227, "grad_norm": 2.2925431728363037, "learning_rate": 0.0001988107361395146, "loss": 1.3227, "step": 67 }, { "epoch": 0.44884488448844884, "grad_norm": 3.0168843269348145, "learning_rate": 0.0001986891035915346, "loss": 2.0713, "step": 68 }, { "epoch": 0.45544554455445546, "grad_norm": 2.2189230918884277, "learning_rate": 0.00019856159103477086, "loss": 1.2472, "step": 69 }, { "epoch": 0.46204620462046203, "grad_norm": 1.9616774320602417, "learning_rate": 0.0001984282060665662, "loss": 1.4617, "step": 70 }, { "epoch": 0.46864686468646866, "grad_norm": 3.0113203525543213, "learning_rate": 0.00019828895663414836, "loss": 1.5551, "step": 71 }, { "epoch": 0.4752475247524752, "grad_norm": 2.4521045684814453, "learning_rate": 0.00019814385103415663, "loss": 1.405, "step": 72 }, { "epoch": 0.48184818481848185, "grad_norm": 2.3250396251678467, "learning_rate": 0.00019799289791214725, "loss": 1.8011, "step": 73 }, { "epoch": 0.4884488448844885, "grad_norm": 2.3330326080322266, "learning_rate": 0.00019783610626207854, "loss": 1.6456, "step": 74 }, { "epoch": 0.49504950495049505, "grad_norm": 2.1660053730010986, "learning_rate": 0.00019767348542577495, "loss": 1.5428, "step": 75 }, { "epoch": 0.5016501650165016, "grad_norm": 1.8240512609481812, "learning_rate": 0.00019750504509237046, "loss": 1.4666, "step": 76 }, { "epoch": 0.5082508250825083, "grad_norm": 2.2784554958343506, "learning_rate": 0.00019733079529773125, "loss": 1.5065, "step": 77 }, { "epoch": 0.5148514851485149, "grad_norm": 1.769106149673462, "learning_rate": 0.00019715074642385786, "loss": 1.0998, "step": 78 }, { "epoch": 0.5214521452145214, "grad_norm": 1.737535834312439, "learning_rate": 0.00019696490919826647, "loss": 1.3843, "step": 79 }, { "epoch": 0.528052805280528, "grad_norm": 1.737778663635254, "learning_rate": 0.0001967732946933499, "loss": 1.5803, "step": 80 }, { "epoch": 0.5346534653465347, "grad_norm": 1.9771817922592163, "learning_rate": 0.00019657591432571778, "loss": 1.7888, "step": 81 }, { "epoch": 0.5412541254125413, "grad_norm": 1.922725796699524, "learning_rate": 0.0001963727798555164, "loss": 1.096, "step": 82 }, { "epoch": 0.5478547854785478, "grad_norm": 2.42258620262146, "learning_rate": 0.00019616390338572803, "loss": 1.4231, "step": 83 }, { "epoch": 0.5544554455445545, "grad_norm": 2.0959455966949463, "learning_rate": 0.00019594929736144976, "loss": 1.3786, "step": 84 }, { "epoch": 0.5610561056105611, "grad_norm": 2.2483432292938232, "learning_rate": 0.00019572897456915202, "loss": 1.5456, "step": 85 }, { "epoch": 0.5676567656765676, "grad_norm": 2.2256951332092285, "learning_rate": 0.00019550294813591682, "loss": 1.3946, "step": 86 }, { "epoch": 0.5742574257425742, "grad_norm": 1.9461501836776733, "learning_rate": 0.0001952712315286556, "loss": 1.3132, "step": 87 }, { "epoch": 0.5808580858085809, "grad_norm": 1.8409348726272583, "learning_rate": 0.00019503383855330668, "loss": 1.1527, "step": 88 }, { "epoch": 0.5874587458745875, "grad_norm": 2.124138832092285, "learning_rate": 0.00019479078335401297, "loss": 1.0499, "step": 89 }, { "epoch": 0.594059405940594, "grad_norm": 2.380868673324585, "learning_rate": 0.00019454208041227907, "loss": 1.7297, "step": 90 }, { "epoch": 0.6006600660066007, "grad_norm": 2.093554973602295, "learning_rate": 0.00019428774454610843, "loss": 1.3311, "step": 91 }, { "epoch": 0.6072607260726073, "grad_norm": 2.391789436340332, "learning_rate": 0.0001940277909091206, "loss": 1.4725, "step": 92 }, { "epoch": 0.6138613861386139, "grad_norm": 2.362126350402832, "learning_rate": 0.0001937622349896483, "loss": 1.4505, "step": 93 }, { "epoch": 0.6204620462046204, "grad_norm": 2.4472386837005615, "learning_rate": 0.00019349109260981452, "loss": 1.4643, "step": 94 }, { "epoch": 0.6270627062706271, "grad_norm": 2.7160544395446777, "learning_rate": 0.00019321437992458995, "loss": 1.6129, "step": 95 }, { "epoch": 0.6336633663366337, "grad_norm": 1.9917261600494385, "learning_rate": 0.0001929321134208304, "loss": 1.4359, "step": 96 }, { "epoch": 0.6402640264026402, "grad_norm": 1.98976731300354, "learning_rate": 0.00019264430991629446, "loss": 1.4551, "step": 97 }, { "epoch": 0.6468646864686468, "grad_norm": 2.612914562225342, "learning_rate": 0.00019235098655864157, "loss": 1.4066, "step": 98 }, { "epoch": 0.6534653465346535, "grad_norm": 1.5987509489059448, "learning_rate": 0.00019205216082441017, "loss": 1.4239, "step": 99 }, { "epoch": 0.6600660066006601, "grad_norm": 2.1327996253967285, "learning_rate": 0.00019174785051797666, "loss": 1.1858, "step": 100 }, { "epoch": 0.6666666666666666, "grad_norm": 3.5038344860076904, "learning_rate": 0.0001914380737704944, "loss": 1.5437, "step": 101 }, { "epoch": 0.6732673267326733, "grad_norm": 2.0759711265563965, "learning_rate": 0.0001911228490388136, "loss": 1.9746, "step": 102 }, { "epoch": 0.6798679867986799, "grad_norm": 1.7349865436553955, "learning_rate": 0.0001908021951043814, "loss": 1.3087, "step": 103 }, { "epoch": 0.6864686468646864, "grad_norm": 1.8272370100021362, "learning_rate": 0.00019047613107212312, "loss": 1.5015, "step": 104 }, { "epoch": 0.693069306930693, "grad_norm": 2.6153130531311035, "learning_rate": 0.00019014467636930385, "loss": 1.6917, "step": 105 }, { "epoch": 0.6996699669966997, "grad_norm": 2.020885705947876, "learning_rate": 0.00018980785074437093, "loss": 1.3401, "step": 106 }, { "epoch": 0.7062706270627063, "grad_norm": 2.322054147720337, "learning_rate": 0.00018946567426577723, "loss": 2.2148, "step": 107 }, { "epoch": 0.7128712871287128, "grad_norm": 2.2210605144500732, "learning_rate": 0.00018911816732078575, "loss": 1.6508, "step": 108 }, { "epoch": 0.7194719471947195, "grad_norm": 2.340949058532715, "learning_rate": 0.00018876535061425453, "loss": 1.8924, "step": 109 }, { "epoch": 0.7260726072607261, "grad_norm": 2.295544385910034, "learning_rate": 0.0001884072451674034, "loss": 1.621, "step": 110 }, { "epoch": 0.7326732673267327, "grad_norm": 1.9485543966293335, "learning_rate": 0.00018804387231656118, "loss": 1.3108, "step": 111 }, { "epoch": 0.7392739273927392, "grad_norm": 1.9073518514633179, "learning_rate": 0.00018767525371189475, "loss": 1.1203, "step": 112 }, { "epoch": 0.7458745874587459, "grad_norm": 1.9978621006011963, "learning_rate": 0.00018730141131611882, "loss": 1.5631, "step": 113 }, { "epoch": 0.7524752475247525, "grad_norm": 2.061638355255127, "learning_rate": 0.0001869223674031876, "loss": 1.4612, "step": 114 }, { "epoch": 0.759075907590759, "grad_norm": 1.9964576959609985, "learning_rate": 0.00018653814455696756, "loss": 1.4753, "step": 115 }, { "epoch": 0.7656765676567657, "grad_norm": 2.142472505569458, "learning_rate": 0.0001861487656698919, "loss": 1.4195, "step": 116 }, { "epoch": 0.7722772277227723, "grad_norm": 2.622685194015503, "learning_rate": 0.00018575425394159653, "loss": 1.8376, "step": 117 }, { "epoch": 0.7788778877887789, "grad_norm": 1.8622280359268188, "learning_rate": 0.00018535463287753796, "loss": 1.3869, "step": 118 }, { "epoch": 0.7854785478547854, "grad_norm": 1.8239712715148926, "learning_rate": 0.00018494992628759267, "loss": 1.3901, "step": 119 }, { "epoch": 0.7920792079207921, "grad_norm": 1.7509407997131348, "learning_rate": 0.0001845401582846385, "loss": 1.1086, "step": 120 }, { "epoch": 0.7986798679867987, "grad_norm": 2.12126088142395, "learning_rate": 0.00018412535328311814, "loss": 1.4786, "step": 121 }, { "epoch": 0.8052805280528053, "grad_norm": 1.9442270994186401, "learning_rate": 0.00018370553599758422, "loss": 1.2655, "step": 122 }, { "epoch": 0.8118811881188119, "grad_norm": 1.8618887662887573, "learning_rate": 0.00018328073144122708, "loss": 1.1703, "step": 123 }, { "epoch": 0.8184818481848185, "grad_norm": 1.9630706310272217, "learning_rate": 0.00018285096492438424, "loss": 1.2309, "step": 124 }, { "epoch": 0.8250825082508251, "grad_norm": 2.227321147918701, "learning_rate": 0.00018241626205303243, "loss": 1.6171, "step": 125 }, { "epoch": 0.8316831683168316, "grad_norm": 1.6737253665924072, "learning_rate": 0.00018197664872726205, "loss": 1.0966, "step": 126 }, { "epoch": 0.8382838283828383, "grad_norm": 1.8567694425582886, "learning_rate": 0.00018153215113973396, "loss": 1.262, "step": 127 }, { "epoch": 0.8448844884488449, "grad_norm": 2.6924386024475098, "learning_rate": 0.0001810827957741188, "loss": 1.3436, "step": 128 }, { "epoch": 0.8514851485148515, "grad_norm": 2.324448823928833, "learning_rate": 0.00018062860940351916, "loss": 1.3375, "step": 129 }, { "epoch": 0.858085808580858, "grad_norm": 2.095839738845825, "learning_rate": 0.00018016961908887442, "loss": 1.4675, "step": 130 }, { "epoch": 0.8646864686468647, "grad_norm": 1.6898919343948364, "learning_rate": 0.00017970585217734844, "loss": 1.0081, "step": 131 }, { "epoch": 0.8712871287128713, "grad_norm": 3.500580072402954, "learning_rate": 0.00017923733630070002, "loss": 1.9837, "step": 132 }, { "epoch": 0.8778877887788779, "grad_norm": 2.213259696960449, "learning_rate": 0.00017876409937363677, "loss": 1.3967, "step": 133 }, { "epoch": 0.8844884488448845, "grad_norm": 1.764636516571045, "learning_rate": 0.00017828616959215184, "loss": 1.4183, "step": 134 }, { "epoch": 0.8910891089108911, "grad_norm": 1.8215689659118652, "learning_rate": 0.00017780357543184397, "loss": 1.5714, "step": 135 }, { "epoch": 0.8976897689768977, "grad_norm": 1.726089358329773, "learning_rate": 0.00017731634564622085, "loss": 1.4959, "step": 136 }, { "epoch": 0.9042904290429042, "grad_norm": 1.7792177200317383, "learning_rate": 0.00017682450926498607, "loss": 1.1855, "step": 137 }, { "epoch": 0.9108910891089109, "grad_norm": 1.6838842630386353, "learning_rate": 0.0001763280955923093, "loss": 1.3365, "step": 138 }, { "epoch": 0.9174917491749175, "grad_norm": 1.692114233970642, "learning_rate": 0.00017582713420508052, "loss": 1.354, "step": 139 }, { "epoch": 0.9240924092409241, "grad_norm": 1.8941043615341187, "learning_rate": 0.00017532165495114766, "loss": 1.3819, "step": 140 }, { "epoch": 0.9306930693069307, "grad_norm": 1.8271002769470215, "learning_rate": 0.0001748116879475383, "loss": 1.3082, "step": 141 }, { "epoch": 0.9372937293729373, "grad_norm": 2.272860288619995, "learning_rate": 0.00017429726357866515, "loss": 1.6585, "step": 142 }, { "epoch": 0.9438943894389439, "grad_norm": 1.9342212677001953, "learning_rate": 0.00017377841249451595, "loss": 1.492, "step": 143 }, { "epoch": 0.9504950495049505, "grad_norm": 2.028965950012207, "learning_rate": 0.00017325516560882703, "loss": 1.4987, "step": 144 }, { "epoch": 0.9570957095709571, "grad_norm": 2.096977710723877, "learning_rate": 0.00017272755409724168, "loss": 1.489, "step": 145 }, { "epoch": 0.9636963696369637, "grad_norm": 2.228801727294922, "learning_rate": 0.00017219560939545246, "loss": 1.4862, "step": 146 }, { "epoch": 0.9702970297029703, "grad_norm": 1.7269052267074585, "learning_rate": 0.0001716593631973283, "loss": 1.2565, "step": 147 }, { "epoch": 0.976897689768977, "grad_norm": 1.8832287788391113, "learning_rate": 0.00017111884745302632, "loss": 1.3296, "step": 148 }, { "epoch": 0.9834983498349835, "grad_norm": 1.8937143087387085, "learning_rate": 0.00017057409436708782, "loss": 1.2752, "step": 149 }, { "epoch": 0.9900990099009901, "grad_norm": 2.1348111629486084, "learning_rate": 0.00017002513639651992, "loss": 1.2174, "step": 150 }, { "epoch": 0.9966996699669967, "grad_norm": 1.7981044054031372, "learning_rate": 0.00016947200624886145, "loss": 1.3272, "step": 151 }, { "epoch": 1.0, "grad_norm": 3.076122999191284, "learning_rate": 0.00016891473688023426, "loss": 1.3359, "step": 152 } ], "logging_steps": 1, "max_steps": 453, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.787936292292104e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }