{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005, "grad_norm": 2.9455409049987793, "learning_rate": 0.0, "loss": 0.6073, "step": 1 }, { "epoch": 0.01, "grad_norm": 1.4680817127227783, "learning_rate": 3.3333333333333333e-06, "loss": 0.7396, "step": 2 }, { "epoch": 0.015, "grad_norm": 1.5614228248596191, "learning_rate": 6.666666666666667e-06, "loss": 0.6799, "step": 3 }, { "epoch": 0.02, "grad_norm": 1.5248379707336426, "learning_rate": 1e-05, "loss": 0.4818, "step": 4 }, { "epoch": 0.025, "grad_norm": 1.0348310470581055, "learning_rate": 1.3333333333333333e-05, "loss": 0.4321, "step": 5 }, { "epoch": 0.03, "grad_norm": 1.6425001621246338, "learning_rate": 1.6666666666666667e-05, "loss": 0.6562, "step": 6 }, { "epoch": 0.035, "grad_norm": 1.3331432342529297, "learning_rate": 2e-05, "loss": 0.5298, "step": 7 }, { "epoch": 0.04, "grad_norm": 1.3875914812088013, "learning_rate": 2.3333333333333336e-05, "loss": 0.5211, "step": 8 }, { "epoch": 0.045, "grad_norm": 1.323825716972351, "learning_rate": 2.6666666666666667e-05, "loss": 0.6111, "step": 9 }, { "epoch": 0.05, "grad_norm": 1.3835245370864868, "learning_rate": 3e-05, "loss": 0.3399, "step": 10 }, { "epoch": 0.055, "grad_norm": 1.8549563884735107, "learning_rate": 3.3333333333333335e-05, "loss": 0.5247, "step": 11 }, { "epoch": 0.06, "grad_norm": 2.686943531036377, "learning_rate": 3.6666666666666666e-05, "loss": 0.4062, "step": 12 }, { "epoch": 0.065, "grad_norm": 2.156491279602051, "learning_rate": 4e-05, "loss": 0.4638, "step": 13 }, { "epoch": 0.07, "grad_norm": 1.705341100692749, "learning_rate": 4.3333333333333334e-05, "loss": 0.4203, "step": 14 }, { "epoch": 0.075, "grad_norm": 1.4981963634490967, "learning_rate": 4.666666666666667e-05, "loss": 0.4994, "step": 15 }, { "epoch": 0.08, "grad_norm": 1.1762669086456299, "learning_rate": 5e-05, "loss": 0.4181, "step": 16 }, { "epoch": 0.085, "grad_norm": 1.318903923034668, "learning_rate": 5.333333333333333e-05, "loss": 0.3968, "step": 17 }, { "epoch": 0.09, "grad_norm": 1.1034541130065918, "learning_rate": 5.666666666666667e-05, "loss": 0.6782, "step": 18 }, { "epoch": 0.095, "grad_norm": 0.9806134700775146, "learning_rate": 6e-05, "loss": 0.4938, "step": 19 }, { "epoch": 0.1, "grad_norm": 0.9943370819091797, "learning_rate": 6.333333333333333e-05, "loss": 0.5939, "step": 20 }, { "epoch": 0.105, "grad_norm": 1.0326488018035889, "learning_rate": 6.666666666666667e-05, "loss": 0.5405, "step": 21 }, { "epoch": 0.11, "grad_norm": 1.0371407270431519, "learning_rate": 7e-05, "loss": 0.4934, "step": 22 }, { "epoch": 0.115, "grad_norm": 1.1100867986679077, "learning_rate": 7.333333333333333e-05, "loss": 0.546, "step": 23 }, { "epoch": 0.12, "grad_norm": 1.3020515441894531, "learning_rate": 7.666666666666667e-05, "loss": 0.4776, "step": 24 }, { "epoch": 0.125, "grad_norm": 1.4848830699920654, "learning_rate": 8e-05, "loss": 0.542, "step": 25 }, { "epoch": 0.13, "grad_norm": 0.9994930028915405, "learning_rate": 8.333333333333334e-05, "loss": 0.4903, "step": 26 }, { "epoch": 0.135, "grad_norm": 0.9639631509780884, "learning_rate": 8.666666666666667e-05, "loss": 0.7077, "step": 27 }, { "epoch": 0.14, "grad_norm": 1.0628936290740967, "learning_rate": 9e-05, "loss": 0.4114, "step": 28 }, { "epoch": 0.145, "grad_norm": 1.4622820615768433, "learning_rate": 9.333333333333334e-05, "loss": 0.6233, "step": 29 }, { "epoch": 0.15, "grad_norm": 1.0266647338867188, "learning_rate": 9.666666666666667e-05, "loss": 0.5075, "step": 30 }, { "epoch": 0.155, "grad_norm": 0.9910838603973389, "learning_rate": 0.0001, "loss": 0.4736, "step": 31 }, { "epoch": 0.16, "grad_norm": 1.1503653526306152, "learning_rate": 0.00010333333333333334, "loss": 0.5032, "step": 32 }, { "epoch": 0.165, "grad_norm": 1.11775541305542, "learning_rate": 0.00010666666666666667, "loss": 0.6294, "step": 33 }, { "epoch": 0.17, "grad_norm": 0.9839116930961609, "learning_rate": 0.00011000000000000002, "loss": 0.5787, "step": 34 }, { "epoch": 0.175, "grad_norm": 1.0579279661178589, "learning_rate": 0.00011333333333333334, "loss": 0.5479, "step": 35 }, { "epoch": 0.18, "grad_norm": 1.0450024604797363, "learning_rate": 0.00011666666666666668, "loss": 0.4414, "step": 36 }, { "epoch": 0.185, "grad_norm": 0.8701012134552002, "learning_rate": 0.00012, "loss": 0.493, "step": 37 }, { "epoch": 0.19, "grad_norm": 1.490395188331604, "learning_rate": 0.00012333333333333334, "loss": 0.9322, "step": 38 }, { "epoch": 0.195, "grad_norm": 0.8899338245391846, "learning_rate": 0.00012666666666666666, "loss": 0.4924, "step": 39 }, { "epoch": 0.2, "grad_norm": 1.2856160402297974, "learning_rate": 0.00013000000000000002, "loss": 0.6711, "step": 40 }, { "epoch": 0.205, "grad_norm": 0.9618464708328247, "learning_rate": 0.00013333333333333334, "loss": 0.4606, "step": 41 }, { "epoch": 0.21, "grad_norm": 0.9135386347770691, "learning_rate": 0.00013666666666666666, "loss": 0.4999, "step": 42 }, { "epoch": 0.215, "grad_norm": 0.9234018325805664, "learning_rate": 0.00014, "loss": 0.4571, "step": 43 }, { "epoch": 0.22, "grad_norm": 1.1135035753250122, "learning_rate": 0.00014333333333333334, "loss": 0.5387, "step": 44 }, { "epoch": 0.225, "grad_norm": 1.174479603767395, "learning_rate": 0.00014666666666666666, "loss": 0.584, "step": 45 }, { "epoch": 0.23, "grad_norm": 0.9192129969596863, "learning_rate": 0.00015000000000000001, "loss": 0.4897, "step": 46 }, { "epoch": 0.235, "grad_norm": 0.8599598407745361, "learning_rate": 0.00015333333333333334, "loss": 0.4204, "step": 47 }, { "epoch": 0.24, "grad_norm": 1.0166935920715332, "learning_rate": 0.00015666666666666666, "loss": 0.3704, "step": 48 }, { "epoch": 0.245, "grad_norm": 1.0022759437561035, "learning_rate": 0.00016, "loss": 0.5893, "step": 49 }, { "epoch": 0.25, "grad_norm": 1.125985026359558, "learning_rate": 0.00016333333333333334, "loss": 0.6523, "step": 50 }, { "epoch": 0.255, "grad_norm": 0.9124050736427307, "learning_rate": 0.0001666666666666667, "loss": 0.4463, "step": 51 }, { "epoch": 0.26, "grad_norm": 1.2228381633758545, "learning_rate": 0.00017, "loss": 0.5758, "step": 52 }, { "epoch": 0.265, "grad_norm": 1.0137525796890259, "learning_rate": 0.00017333333333333334, "loss": 0.6213, "step": 53 }, { "epoch": 0.27, "grad_norm": 1.3957058191299438, "learning_rate": 0.00017666666666666666, "loss": 0.4323, "step": 54 }, { "epoch": 0.275, "grad_norm": 0.9506978392601013, "learning_rate": 0.00018, "loss": 0.4226, "step": 55 }, { "epoch": 0.28, "grad_norm": 1.0860037803649902, "learning_rate": 0.00018333333333333334, "loss": 0.5475, "step": 56 }, { "epoch": 0.285, "grad_norm": 0.9912546873092651, "learning_rate": 0.0001866666666666667, "loss": 0.4273, "step": 57 }, { "epoch": 0.29, "grad_norm": 1.4025095701217651, "learning_rate": 0.00019, "loss": 0.5865, "step": 58 }, { "epoch": 0.295, "grad_norm": 1.0721555948257446, "learning_rate": 0.00019333333333333333, "loss": 0.5518, "step": 59 }, { "epoch": 0.3, "grad_norm": 0.9538785815238953, "learning_rate": 0.00019666666666666666, "loss": 0.5451, "step": 60 }, { "epoch": 0.305, "grad_norm": 0.9007125496864319, "learning_rate": 0.0002, "loss": 0.3864, "step": 61 }, { "epoch": 0.31, "grad_norm": 1.2095800638198853, "learning_rate": 0.00019999830768577443, "loss": 0.5137, "step": 62 }, { "epoch": 0.315, "grad_norm": 1.7864227294921875, "learning_rate": 0.00019999323080037624, "loss": 0.4395, "step": 63 }, { "epoch": 0.32, "grad_norm": 0.9044005870819092, "learning_rate": 0.00019998476951563915, "loss": 0.5441, "step": 64 }, { "epoch": 0.325, "grad_norm": 1.1454882621765137, "learning_rate": 0.00019997292411794618, "loss": 0.3243, "step": 65 }, { "epoch": 0.33, "grad_norm": 1.2085391283035278, "learning_rate": 0.0001999576950082201, "loss": 0.5298, "step": 66 }, { "epoch": 0.335, "grad_norm": 1.0972354412078857, "learning_rate": 0.0001999390827019096, "loss": 0.4893, "step": 67 }, { "epoch": 0.34, "grad_norm": 1.2321377992630005, "learning_rate": 0.00019991708782897213, "loss": 0.6041, "step": 68 }, { "epoch": 0.345, "grad_norm": 1.3030674457550049, "learning_rate": 0.0001998917111338525, "loss": 0.3892, "step": 69 }, { "epoch": 0.35, "grad_norm": 1.0997453927993774, "learning_rate": 0.0001998629534754574, "loss": 0.4945, "step": 70 }, { "epoch": 0.355, "grad_norm": 1.1095383167266846, "learning_rate": 0.00019983081582712685, "loss": 0.5592, "step": 71 }, { "epoch": 0.36, "grad_norm": 1.0699679851531982, "learning_rate": 0.00019979529927660074, "loss": 0.6754, "step": 72 }, { "epoch": 0.365, "grad_norm": 1.2686567306518555, "learning_rate": 0.00019975640502598244, "loss": 0.4932, "step": 73 }, { "epoch": 0.37, "grad_norm": 1.000361680984497, "learning_rate": 0.00019971413439169775, "loss": 0.4836, "step": 74 }, { "epoch": 0.375, "grad_norm": 1.1984176635742188, "learning_rate": 0.00019966848880445062, "loss": 0.6015, "step": 75 }, { "epoch": 0.38, "grad_norm": 0.9447954297065735, "learning_rate": 0.00019961946980917456, "loss": 0.4425, "step": 76 }, { "epoch": 0.385, "grad_norm": 0.9791714549064636, "learning_rate": 0.00019956707906498044, "loss": 0.4852, "step": 77 }, { "epoch": 0.39, "grad_norm": 1.1539669036865234, "learning_rate": 0.00019951131834510032, "loss": 0.5147, "step": 78 }, { "epoch": 0.395, "grad_norm": 0.9412739276885986, "learning_rate": 0.00019945218953682734, "loss": 0.5266, "step": 79 }, { "epoch": 0.4, "grad_norm": 1.1397993564605713, "learning_rate": 0.000199389694641452, "loss": 0.5306, "step": 80 }, { "epoch": 0.405, "grad_norm": 1.0742263793945312, "learning_rate": 0.00019932383577419432, "loss": 0.4972, "step": 81 }, { "epoch": 0.41, "grad_norm": 1.0058263540267944, "learning_rate": 0.00019925461516413223, "loss": 0.4312, "step": 82 }, { "epoch": 0.415, "grad_norm": 1.1048433780670166, "learning_rate": 0.00019918203515412617, "loss": 0.4489, "step": 83 }, { "epoch": 0.42, "grad_norm": 1.1078910827636719, "learning_rate": 0.00019910609820073986, "loss": 0.3769, "step": 84 }, { "epoch": 0.425, "grad_norm": 1.0895092487335205, "learning_rate": 0.00019902680687415705, "loss": 0.4025, "step": 85 }, { "epoch": 0.43, "grad_norm": 1.2830822467803955, "learning_rate": 0.00019894416385809444, "loss": 0.5534, "step": 86 }, { "epoch": 0.435, "grad_norm": 2.1674487590789795, "learning_rate": 0.00019885817194971117, "loss": 0.3861, "step": 87 }, { "epoch": 0.44, "grad_norm": 0.9513518214225769, "learning_rate": 0.00019876883405951377, "loss": 0.427, "step": 88 }, { "epoch": 0.445, "grad_norm": 1.012882947921753, "learning_rate": 0.00019867615321125795, "loss": 0.5058, "step": 89 }, { "epoch": 0.45, "grad_norm": 1.177808165550232, "learning_rate": 0.00019858013254184597, "loss": 0.6787, "step": 90 }, { "epoch": 0.455, "grad_norm": 1.0161972045898438, "learning_rate": 0.00019848077530122083, "loss": 0.4483, "step": 91 }, { "epoch": 0.46, "grad_norm": 1.0030016899108887, "learning_rate": 0.0001983780848522559, "loss": 0.5978, "step": 92 }, { "epoch": 0.465, "grad_norm": 1.0409237146377563, "learning_rate": 0.00019827206467064133, "loss": 0.4514, "step": 93 }, { "epoch": 0.47, "grad_norm": 1.1345373392105103, "learning_rate": 0.00019816271834476642, "loss": 0.6026, "step": 94 }, { "epoch": 0.475, "grad_norm": 1.0670682191848755, "learning_rate": 0.00019805004957559793, "loss": 0.4506, "step": 95 }, { "epoch": 0.48, "grad_norm": 0.7726262807846069, "learning_rate": 0.00019793406217655517, "loss": 0.4391, "step": 96 }, { "epoch": 0.485, "grad_norm": 1.0085614919662476, "learning_rate": 0.00019781476007338058, "loss": 0.4648, "step": 97 }, { "epoch": 0.49, "grad_norm": 1.23222017288208, "learning_rate": 0.00019769214730400712, "loss": 0.5058, "step": 98 }, { "epoch": 0.495, "grad_norm": 1.2733196020126343, "learning_rate": 0.00019756622801842143, "loss": 0.4494, "step": 99 }, { "epoch": 0.5, "grad_norm": 1.1150026321411133, "learning_rate": 0.00019743700647852354, "loss": 0.5667, "step": 100 }, { "epoch": 0.505, "grad_norm": 1.1838147640228271, "learning_rate": 0.00019730448705798239, "loss": 0.5004, "step": 101 }, { "epoch": 0.51, "grad_norm": 1.1241400241851807, "learning_rate": 0.00019716867424208806, "loss": 0.4446, "step": 102 }, { "epoch": 0.515, "grad_norm": 0.8567945957183838, "learning_rate": 0.00019702957262759965, "loss": 0.6337, "step": 103 }, { "epoch": 0.52, "grad_norm": 0.903740644454956, "learning_rate": 0.00019688718692259006, "loss": 0.3562, "step": 104 }, { "epoch": 0.525, "grad_norm": 1.063928484916687, "learning_rate": 0.00019674152194628638, "loss": 0.5589, "step": 105 }, { "epoch": 0.53, "grad_norm": 0.8685862421989441, "learning_rate": 0.00019659258262890683, "loss": 0.6022, "step": 106 }, { "epoch": 0.535, "grad_norm": 1.159970760345459, "learning_rate": 0.0001964403740114939, "loss": 0.5976, "step": 107 }, { "epoch": 0.54, "grad_norm": 1.0903443098068237, "learning_rate": 0.00019628490124574377, "loss": 0.5986, "step": 108 }, { "epoch": 0.545, "grad_norm": 1.2548609972000122, "learning_rate": 0.0001961261695938319, "loss": 0.475, "step": 109 }, { "epoch": 0.55, "grad_norm": 0.8668074011802673, "learning_rate": 0.00019596418442823494, "loss": 0.499, "step": 110 }, { "epoch": 0.555, "grad_norm": 0.9190664887428284, "learning_rate": 0.0001957989512315489, "loss": 0.4129, "step": 111 }, { "epoch": 0.56, "grad_norm": 0.7396138310432434, "learning_rate": 0.00019563047559630357, "loss": 0.3469, "step": 112 }, { "epoch": 0.565, "grad_norm": 1.0757927894592285, "learning_rate": 0.0001954587632247732, "loss": 0.3882, "step": 113 }, { "epoch": 0.57, "grad_norm": 1.1670087575912476, "learning_rate": 0.00019528381992878362, "loss": 0.5141, "step": 114 }, { "epoch": 0.575, "grad_norm": 1.1538702249526978, "learning_rate": 0.00019510565162951537, "loss": 0.511, "step": 115 }, { "epoch": 0.58, "grad_norm": 1.6420246362686157, "learning_rate": 0.0001949242643573034, "loss": 0.6716, "step": 116 }, { "epoch": 0.585, "grad_norm": 1.3264261484146118, "learning_rate": 0.00019473966425143292, "loss": 0.5407, "step": 117 }, { "epoch": 0.59, "grad_norm": 1.01437509059906, "learning_rate": 0.0001945518575599317, "loss": 0.4522, "step": 118 }, { "epoch": 0.595, "grad_norm": 0.9997525215148926, "learning_rate": 0.00019436085063935835, "loss": 0.4522, "step": 119 }, { "epoch": 0.6, "grad_norm": 0.9435955286026001, "learning_rate": 0.00019416664995458756, "loss": 0.5659, "step": 120 }, { "epoch": 0.605, "grad_norm": 0.8276172876358032, "learning_rate": 0.00019396926207859084, "loss": 0.4143, "step": 121 }, { "epoch": 0.61, "grad_norm": 1.194066047668457, "learning_rate": 0.00019376869369221452, "loss": 0.5177, "step": 122 }, { "epoch": 0.615, "grad_norm": 1.120455265045166, "learning_rate": 0.00019356495158395315, "loss": 0.3676, "step": 123 }, { "epoch": 0.62, "grad_norm": 0.8941965103149414, "learning_rate": 0.00019335804264972018, "loss": 0.3922, "step": 124 }, { "epoch": 0.625, "grad_norm": 1.2467975616455078, "learning_rate": 0.00019314797389261424, "loss": 0.5063, "step": 125 }, { "epoch": 0.63, "grad_norm": 1.0667827129364014, "learning_rate": 0.00019293475242268223, "loss": 0.529, "step": 126 }, { "epoch": 0.635, "grad_norm": 1.0572445392608643, "learning_rate": 0.00019271838545667876, "loss": 0.5541, "step": 127 }, { "epoch": 0.64, "grad_norm": 0.9136587977409363, "learning_rate": 0.0001924988803178216, "loss": 0.4134, "step": 128 }, { "epoch": 0.645, "grad_norm": 0.988009512424469, "learning_rate": 0.00019227624443554425, "loss": 0.4014, "step": 129 }, { "epoch": 0.65, "grad_norm": 1.3015025854110718, "learning_rate": 0.00019205048534524406, "loss": 0.4819, "step": 130 }, { "epoch": 0.655, "grad_norm": 0.917235791683197, "learning_rate": 0.00019182161068802741, "loss": 0.5578, "step": 131 }, { "epoch": 0.66, "grad_norm": 1.2599695920944214, "learning_rate": 0.00019158962821045112, "loss": 0.5951, "step": 132 }, { "epoch": 0.665, "grad_norm": 0.8651666045188904, "learning_rate": 0.0001913545457642601, "loss": 0.4082, "step": 133 }, { "epoch": 0.67, "grad_norm": 1.1557177305221558, "learning_rate": 0.0001911163713061217, "loss": 0.6299, "step": 134 }, { "epoch": 0.675, "grad_norm": 0.9854793548583984, "learning_rate": 0.00019087511289735644, "loss": 0.5584, "step": 135 }, { "epoch": 0.68, "grad_norm": 0.9728374481201172, "learning_rate": 0.000190630778703665, "loss": 0.4835, "step": 136 }, { "epoch": 0.685, "grad_norm": 0.9781370759010315, "learning_rate": 0.00019038337699485208, "loss": 0.5205, "step": 137 }, { "epoch": 0.69, "grad_norm": 0.9177173972129822, "learning_rate": 0.00019013291614454621, "loss": 0.3354, "step": 138 }, { "epoch": 0.695, "grad_norm": 0.9920403361320496, "learning_rate": 0.0001898794046299167, "loss": 0.533, "step": 139 }, { "epoch": 0.7, "grad_norm": 1.3647239208221436, "learning_rate": 0.00018962285103138636, "loss": 0.6999, "step": 140 }, { "epoch": 0.705, "grad_norm": 0.9474604725837708, "learning_rate": 0.00018936326403234125, "loss": 0.4888, "step": 141 }, { "epoch": 0.71, "grad_norm": 0.9551665782928467, "learning_rate": 0.0001891006524188368, "loss": 0.3503, "step": 142 }, { "epoch": 0.715, "grad_norm": 1.0441597700119019, "learning_rate": 0.00018883502507930042, "loss": 0.4005, "step": 143 }, { "epoch": 0.72, "grad_norm": 0.9200822114944458, "learning_rate": 0.0001885663910042306, "loss": 0.5412, "step": 144 }, { "epoch": 0.725, "grad_norm": 1.20487642288208, "learning_rate": 0.00018829475928589271, "loss": 0.4964, "step": 145 }, { "epoch": 0.73, "grad_norm": 1.1462260484695435, "learning_rate": 0.00018802013911801112, "loss": 0.4973, "step": 146 }, { "epoch": 0.735, "grad_norm": 1.054632544517517, "learning_rate": 0.0001877425397954582, "loss": 0.5144, "step": 147 }, { "epoch": 0.74, "grad_norm": 0.8504728674888611, "learning_rate": 0.00018746197071393958, "loss": 0.3942, "step": 148 }, { "epoch": 0.745, "grad_norm": 1.0933464765548706, "learning_rate": 0.00018717844136967624, "loss": 0.6372, "step": 149 }, { "epoch": 0.75, "grad_norm": 1.1559089422225952, "learning_rate": 0.00018689196135908304, "loss": 0.4525, "step": 150 }, { "epoch": 0.755, "grad_norm": 1.25426185131073, "learning_rate": 0.00018660254037844388, "loss": 0.532, "step": 151 }, { "epoch": 0.76, "grad_norm": 0.9040874242782593, "learning_rate": 0.00018631018822358363, "loss": 0.5916, "step": 152 }, { "epoch": 0.765, "grad_norm": 1.0059220790863037, "learning_rate": 0.00018601491478953657, "loss": 0.4616, "step": 153 }, { "epoch": 0.77, "grad_norm": 1.1526310443878174, "learning_rate": 0.00018571673007021123, "loss": 0.476, "step": 154 }, { "epoch": 0.775, "grad_norm": 0.8825452923774719, "learning_rate": 0.00018541564415805258, "loss": 0.3968, "step": 155 }, { "epoch": 0.78, "grad_norm": 1.189428687095642, "learning_rate": 0.00018511166724369997, "loss": 0.5498, "step": 156 }, { "epoch": 0.785, "grad_norm": 0.9788728356361389, "learning_rate": 0.0001848048096156426, "loss": 0.458, "step": 157 }, { "epoch": 0.79, "grad_norm": 1.2952889204025269, "learning_rate": 0.00018449508165987105, "loss": 0.7268, "step": 158 }, { "epoch": 0.795, "grad_norm": 1.091649055480957, "learning_rate": 0.00018418249385952575, "loss": 0.4464, "step": 159 }, { "epoch": 0.8, "grad_norm": 0.8726484775543213, "learning_rate": 0.00018386705679454242, "loss": 0.5441, "step": 160 }, { "epoch": 0.805, "grad_norm": 0.913123369216919, "learning_rate": 0.00018354878114129367, "loss": 0.544, "step": 161 }, { "epoch": 0.81, "grad_norm": 0.8178060054779053, "learning_rate": 0.0001832276776722278, "loss": 0.481, "step": 162 }, { "epoch": 0.815, "grad_norm": 0.8458369970321655, "learning_rate": 0.00018290375725550417, "loss": 0.4149, "step": 163 }, { "epoch": 0.82, "grad_norm": 1.0555588006973267, "learning_rate": 0.00018257703085462542, "loss": 0.4748, "step": 164 }, { "epoch": 0.825, "grad_norm": 0.951287031173706, "learning_rate": 0.00018224750952806624, "loss": 0.5823, "step": 165 }, { "epoch": 0.83, "grad_norm": 0.9891758561134338, "learning_rate": 0.0001819152044288992, "loss": 0.4222, "step": 166 }, { "epoch": 0.835, "grad_norm": 0.8881749510765076, "learning_rate": 0.00018158012680441723, "loss": 0.344, "step": 167 }, { "epoch": 0.84, "grad_norm": 0.9701601266860962, "learning_rate": 0.00018124228799575295, "loss": 0.4689, "step": 168 }, { "epoch": 0.845, "grad_norm": 1.0606831312179565, "learning_rate": 0.00018090169943749476, "loss": 0.4291, "step": 169 }, { "epoch": 0.85, "grad_norm": 1.162109375, "learning_rate": 0.00018055837265729994, "loss": 0.4932, "step": 170 }, { "epoch": 0.855, "grad_norm": 0.894894003868103, "learning_rate": 0.0001802123192755044, "loss": 0.4048, "step": 171 }, { "epoch": 0.86, "grad_norm": 0.9900356531143188, "learning_rate": 0.00017986355100472928, "loss": 0.4286, "step": 172 }, { "epoch": 0.865, "grad_norm": 0.87781822681427, "learning_rate": 0.0001795120796494848, "loss": 0.3602, "step": 173 }, { "epoch": 0.87, "grad_norm": 1.0920095443725586, "learning_rate": 0.00017915791710577033, "loss": 0.7948, "step": 174 }, { "epoch": 0.875, "grad_norm": 0.8995212912559509, "learning_rate": 0.00017880107536067218, "loss": 0.386, "step": 175 }, { "epoch": 0.88, "grad_norm": 1.000054121017456, "learning_rate": 0.00017844156649195759, "loss": 0.4768, "step": 176 }, { "epoch": 0.885, "grad_norm": 0.9968723058700562, "learning_rate": 0.00017807940266766593, "loss": 0.6384, "step": 177 }, { "epoch": 0.89, "grad_norm": 1.0881284475326538, "learning_rate": 0.0001777145961456971, "loss": 0.5514, "step": 178 }, { "epoch": 0.895, "grad_norm": 0.8744277954101562, "learning_rate": 0.0001773471592733964, "loss": 0.4813, "step": 179 }, { "epoch": 0.9, "grad_norm": 0.9276065230369568, "learning_rate": 0.00017697710448713678, "loss": 0.5127, "step": 180 }, { "epoch": 0.905, "grad_norm": 0.8760328888893127, "learning_rate": 0.0001766044443118978, "loss": 0.4544, "step": 181 }, { "epoch": 0.91, "grad_norm": 0.9828473329544067, "learning_rate": 0.00017622919136084183, "loss": 0.5908, "step": 182 }, { "epoch": 0.915, "grad_norm": 0.8809630870819092, "learning_rate": 0.00017585135833488692, "loss": 0.477, "step": 183 }, { "epoch": 0.92, "grad_norm": 0.9894719123840332, "learning_rate": 0.00017547095802227723, "loss": 0.3714, "step": 184 }, { "epoch": 0.925, "grad_norm": 1.0420310497283936, "learning_rate": 0.00017508800329814995, "loss": 0.595, "step": 185 }, { "epoch": 0.93, "grad_norm": 1.1475286483764648, "learning_rate": 0.0001747025071240996, "loss": 0.6132, "step": 186 }, { "epoch": 0.935, "grad_norm": 0.8398196697235107, "learning_rate": 0.00017431448254773944, "loss": 0.4038, "step": 187 }, { "epoch": 0.94, "grad_norm": 0.8641005158424377, "learning_rate": 0.0001739239427022596, "loss": 0.3943, "step": 188 }, { "epoch": 0.945, "grad_norm": 0.9621971249580383, "learning_rate": 0.0001735309008059829, "loss": 0.5505, "step": 189 }, { "epoch": 0.95, "grad_norm": 1.0214611291885376, "learning_rate": 0.00017313537016191706, "loss": 0.6831, "step": 190 }, { "epoch": 0.955, "grad_norm": 0.8197475671768188, "learning_rate": 0.00017273736415730488, "loss": 0.4049, "step": 191 }, { "epoch": 0.96, "grad_norm": 1.263084888458252, "learning_rate": 0.0001723368962631708, "loss": 0.3962, "step": 192 }, { "epoch": 0.965, "grad_norm": 1.0390878915786743, "learning_rate": 0.0001719339800338651, "loss": 0.4793, "step": 193 }, { "epoch": 0.97, "grad_norm": 1.1881486177444458, "learning_rate": 0.00017152862910660516, "loss": 0.5908, "step": 194 }, { "epoch": 0.975, "grad_norm": 0.8765969276428223, "learning_rate": 0.00017112085720101373, "loss": 0.3661, "step": 195 }, { "epoch": 0.98, "grad_norm": 0.9140594601631165, "learning_rate": 0.00017071067811865476, "loss": 0.6136, "step": 196 }, { "epoch": 0.985, "grad_norm": 0.8686286211013794, "learning_rate": 0.0001702981057425662, "loss": 0.4636, "step": 197 }, { "epoch": 0.99, "grad_norm": 0.9936033487319946, "learning_rate": 0.00016988315403679, "loss": 0.5857, "step": 198 }, { "epoch": 0.995, "grad_norm": 0.9762316346168518, "learning_rate": 0.00016946583704589973, "loss": 0.5525, "step": 199 }, { "epoch": 1.0, "grad_norm": 1.0653597116470337, "learning_rate": 0.00016904616889452497, "loss": 0.4963, "step": 200 }, { "epoch": 1.005, "grad_norm": 0.7175650596618652, "learning_rate": 0.0001686241637868734, "loss": 0.2784, "step": 201 }, { "epoch": 1.01, "grad_norm": 0.9962010383605957, "learning_rate": 0.00016819983600624986, "loss": 0.3781, "step": 202 }, { "epoch": 1.015, "grad_norm": 0.9142802953720093, "learning_rate": 0.00016777319991457325, "loss": 0.3279, "step": 203 }, { "epoch": 1.02, "grad_norm": 0.8529897928237915, "learning_rate": 0.00016734426995189004, "loss": 0.3837, "step": 204 }, { "epoch": 1.025, "grad_norm": 1.036350131034851, "learning_rate": 0.00016691306063588583, "loss": 0.6008, "step": 205 }, { "epoch": 1.03, "grad_norm": 0.8911110162734985, "learning_rate": 0.00016647958656139378, "loss": 0.4419, "step": 206 }, { "epoch": 1.035, "grad_norm": 0.8090599775314331, "learning_rate": 0.00016604386239990078, "loss": 0.3859, "step": 207 }, { "epoch": 1.04, "grad_norm": 0.9908334612846375, "learning_rate": 0.00016560590289905073, "loss": 0.5062, "step": 208 }, { "epoch": 1.045, "grad_norm": 0.9701403975486755, "learning_rate": 0.00016516572288214552, "loss": 0.3596, "step": 209 }, { "epoch": 1.05, "grad_norm": 0.9375543594360352, "learning_rate": 0.00016472333724764325, "loss": 0.3414, "step": 210 }, { "epoch": 1.055, "grad_norm": 0.8780631422996521, "learning_rate": 0.00016427876096865394, "loss": 0.2714, "step": 211 }, { "epoch": 1.06, "grad_norm": 1.023652195930481, "learning_rate": 0.00016383200909243285, "loss": 0.407, "step": 212 }, { "epoch": 1.065, "grad_norm": 1.222335696220398, "learning_rate": 0.00016338309673987101, "loss": 0.3731, "step": 213 }, { "epoch": 1.07, "grad_norm": 1.0765554904937744, "learning_rate": 0.00016293203910498376, "loss": 0.3411, "step": 214 }, { "epoch": 1.075, "grad_norm": 1.1874161958694458, "learning_rate": 0.000162478851454396, "loss": 0.3628, "step": 215 }, { "epoch": 1.08, "grad_norm": 0.9529691934585571, "learning_rate": 0.000162023549126826, "loss": 0.3279, "step": 216 }, { "epoch": 1.085, "grad_norm": 0.890059232711792, "learning_rate": 0.0001615661475325658, "loss": 0.2977, "step": 217 }, { "epoch": 1.09, "grad_norm": 1.116040587425232, "learning_rate": 0.00016110666215295998, "loss": 0.4425, "step": 218 }, { "epoch": 1.095, "grad_norm": 1.0239148139953613, "learning_rate": 0.00016064510853988138, "loss": 0.3655, "step": 219 }, { "epoch": 1.1, "grad_norm": 1.075461983680725, "learning_rate": 0.00016018150231520486, "loss": 0.3202, "step": 220 }, { "epoch": 1.105, "grad_norm": 1.1996628046035767, "learning_rate": 0.00015971585917027862, "loss": 0.406, "step": 221 }, { "epoch": 1.11, "grad_norm": 0.9129374623298645, "learning_rate": 0.00015924819486539307, "loss": 0.2548, "step": 222 }, { "epoch": 1.115, "grad_norm": 1.1270534992218018, "learning_rate": 0.00015877852522924732, "loss": 0.3612, "step": 223 }, { "epoch": 1.12, "grad_norm": 0.9386212229728699, "learning_rate": 0.00015830686615841348, "loss": 0.3371, "step": 224 }, { "epoch": 1.125, "grad_norm": 0.8654946088790894, "learning_rate": 0.00015783323361679864, "loss": 0.2435, "step": 225 }, { "epoch": 1.13, "grad_norm": 0.9889599680900574, "learning_rate": 0.0001573576436351046, "loss": 0.4872, "step": 226 }, { "epoch": 1.135, "grad_norm": 1.1625468730926514, "learning_rate": 0.00015688011231028518, "loss": 0.3725, "step": 227 }, { "epoch": 1.1400000000000001, "grad_norm": 1.0550003051757812, "learning_rate": 0.00015640065580500148, "loss": 0.4399, "step": 228 }, { "epoch": 1.145, "grad_norm": 1.083914041519165, "learning_rate": 0.0001559192903470747, "loss": 0.3753, "step": 229 }, { "epoch": 1.15, "grad_norm": 0.8965413570404053, "learning_rate": 0.00015543603222893716, "loss": 0.3601, "step": 230 }, { "epoch": 1.155, "grad_norm": 0.8855785727500916, "learning_rate": 0.0001549508978070806, "loss": 0.397, "step": 231 }, { "epoch": 1.16, "grad_norm": 0.8954373002052307, "learning_rate": 0.00015446390350150273, "loss": 0.3028, "step": 232 }, { "epoch": 1.165, "grad_norm": 1.1223406791687012, "learning_rate": 0.0001539750657951513, "loss": 0.4754, "step": 233 }, { "epoch": 1.17, "grad_norm": 0.9738592505455017, "learning_rate": 0.00015348440123336645, "loss": 0.3177, "step": 234 }, { "epoch": 1.175, "grad_norm": 0.9397669434547424, "learning_rate": 0.0001529919264233205, "loss": 0.2809, "step": 235 }, { "epoch": 1.18, "grad_norm": 1.1951464414596558, "learning_rate": 0.000152497658033456, "loss": 0.3721, "step": 236 }, { "epoch": 1.185, "grad_norm": 1.0541985034942627, "learning_rate": 0.00015200161279292155, "loss": 0.4451, "step": 237 }, { "epoch": 1.19, "grad_norm": 1.0258787870407104, "learning_rate": 0.00015150380749100545, "loss": 0.408, "step": 238 }, { "epoch": 1.195, "grad_norm": 1.1269365549087524, "learning_rate": 0.00015100425897656753, "loss": 0.3305, "step": 239 }, { "epoch": 1.2, "grad_norm": 1.2180510759353638, "learning_rate": 0.000150502984157469, "loss": 0.3271, "step": 240 }, { "epoch": 1.205, "grad_norm": 0.7923290729522705, "learning_rate": 0.00015000000000000001, "loss": 0.2598, "step": 241 }, { "epoch": 1.21, "grad_norm": 1.0020320415496826, "learning_rate": 0.00014949532352830541, "loss": 0.3287, "step": 242 }, { "epoch": 1.215, "grad_norm": 0.9556872844696045, "learning_rate": 0.0001489889718238087, "loss": 0.3174, "step": 243 }, { "epoch": 1.22, "grad_norm": 1.2195252180099487, "learning_rate": 0.00014848096202463372, "loss": 0.551, "step": 244 }, { "epoch": 1.225, "grad_norm": 1.0874240398406982, "learning_rate": 0.00014797131132502465, "loss": 0.4717, "step": 245 }, { "epoch": 1.23, "grad_norm": 1.0347204208374023, "learning_rate": 0.00014746003697476404, "loss": 0.3886, "step": 246 }, { "epoch": 1.2349999999999999, "grad_norm": 1.1196807622909546, "learning_rate": 0.00014694715627858908, "loss": 0.4335, "step": 247 }, { "epoch": 1.24, "grad_norm": 1.0648186206817627, "learning_rate": 0.00014643268659560572, "loss": 0.3459, "step": 248 }, { "epoch": 1.245, "grad_norm": 0.9884780645370483, "learning_rate": 0.00014591664533870118, "loss": 0.303, "step": 249 }, { "epoch": 1.25, "grad_norm": 0.9861478805541992, "learning_rate": 0.00014539904997395468, "loss": 0.2804, "step": 250 }, { "epoch": 1.255, "grad_norm": 0.9452088475227356, "learning_rate": 0.00014487991802004623, "loss": 0.3405, "step": 251 }, { "epoch": 1.26, "grad_norm": 1.1845060586929321, "learning_rate": 0.00014435926704766362, "loss": 0.3634, "step": 252 }, { "epoch": 1.2650000000000001, "grad_norm": 1.2674229145050049, "learning_rate": 0.00014383711467890774, "loss": 0.4798, "step": 253 }, { "epoch": 1.27, "grad_norm": 0.9279680252075195, "learning_rate": 0.00014331347858669632, "loss": 0.3077, "step": 254 }, { "epoch": 1.275, "grad_norm": 0.98764967918396, "learning_rate": 0.00014278837649416544, "loss": 0.2985, "step": 255 }, { "epoch": 1.28, "grad_norm": 1.1002823114395142, "learning_rate": 0.00014226182617406996, "loss": 0.3019, "step": 256 }, { "epoch": 1.285, "grad_norm": 0.8877493739128113, "learning_rate": 0.0001417338454481818, "loss": 0.3285, "step": 257 }, { "epoch": 1.29, "grad_norm": 1.1821260452270508, "learning_rate": 0.00014120445218668686, "loss": 0.4418, "step": 258 }, { "epoch": 1.295, "grad_norm": 1.6033821105957031, "learning_rate": 0.00014067366430758004, "loss": 0.4941, "step": 259 }, { "epoch": 1.3, "grad_norm": 1.0511289834976196, "learning_rate": 0.00014014149977605893, "loss": 0.3008, "step": 260 }, { "epoch": 1.305, "grad_norm": 1.1599230766296387, "learning_rate": 0.0001396079766039157, "loss": 0.3819, "step": 261 }, { "epoch": 1.31, "grad_norm": 1.1954684257507324, "learning_rate": 0.00013907311284892736, "loss": 0.4597, "step": 262 }, { "epoch": 1.315, "grad_norm": 1.206805944442749, "learning_rate": 0.00013853692661424484, "loss": 0.453, "step": 263 }, { "epoch": 1.32, "grad_norm": 1.286787509918213, "learning_rate": 0.00013799943604777992, "loss": 0.4028, "step": 264 }, { "epoch": 1.325, "grad_norm": 1.069351315498352, "learning_rate": 0.00013746065934159123, "loss": 0.3466, "step": 265 }, { "epoch": 1.33, "grad_norm": 1.0791929960250854, "learning_rate": 0.00013692061473126845, "loss": 0.3244, "step": 266 }, { "epoch": 1.335, "grad_norm": 1.1105064153671265, "learning_rate": 0.00013637932049531516, "loss": 0.3482, "step": 267 }, { "epoch": 1.34, "grad_norm": 1.0154908895492554, "learning_rate": 0.00013583679495453, "loss": 0.3054, "step": 268 }, { "epoch": 1.345, "grad_norm": 1.1842292547225952, "learning_rate": 0.00013529305647138687, "loss": 0.4819, "step": 269 }, { "epoch": 1.35, "grad_norm": 1.2675782442092896, "learning_rate": 0.00013474812344941315, "loss": 0.4076, "step": 270 }, { "epoch": 1.355, "grad_norm": 1.0911834239959717, "learning_rate": 0.00013420201433256689, "loss": 0.3888, "step": 271 }, { "epoch": 1.3599999999999999, "grad_norm": 0.9814577698707581, "learning_rate": 0.00013365474760461266, "loss": 0.3782, "step": 272 }, { "epoch": 1.365, "grad_norm": 1.294854760169983, "learning_rate": 0.0001331063417884958, "loss": 0.3765, "step": 273 }, { "epoch": 1.37, "grad_norm": 1.3478037118911743, "learning_rate": 0.00013255681544571568, "loss": 0.3189, "step": 274 }, { "epoch": 1.375, "grad_norm": 1.0741455554962158, "learning_rate": 0.00013200618717569714, "loss": 0.3722, "step": 275 }, { "epoch": 1.38, "grad_norm": 1.227367639541626, "learning_rate": 0.00013145447561516138, "loss": 0.4518, "step": 276 }, { "epoch": 1.385, "grad_norm": 0.9935132265090942, "learning_rate": 0.00013090169943749476, "loss": 0.2895, "step": 277 }, { "epoch": 1.3900000000000001, "grad_norm": 1.0826445817947388, "learning_rate": 0.0001303478773521171, "loss": 0.311, "step": 278 }, { "epoch": 1.395, "grad_norm": 1.1768107414245605, "learning_rate": 0.0001297930281038482, "loss": 0.4022, "step": 279 }, { "epoch": 1.4, "grad_norm": 1.1796941757202148, "learning_rate": 0.00012923717047227368, "loss": 0.404, "step": 280 }, { "epoch": 1.405, "grad_norm": 1.268023133277893, "learning_rate": 0.00012868032327110904, "loss": 0.4937, "step": 281 }, { "epoch": 1.41, "grad_norm": 1.0576366186141968, "learning_rate": 0.00012812250534756308, "loss": 0.4217, "step": 282 }, { "epoch": 1.415, "grad_norm": 1.0817463397979736, "learning_rate": 0.0001275637355816999, "loss": 0.3601, "step": 283 }, { "epoch": 1.42, "grad_norm": 1.103407621383667, "learning_rate": 0.0001270040328858001, "loss": 0.3494, "step": 284 }, { "epoch": 1.425, "grad_norm": 1.195494294166565, "learning_rate": 0.00012644341620372023, "loss": 0.3918, "step": 285 }, { "epoch": 1.43, "grad_norm": 1.1840097904205322, "learning_rate": 0.00012588190451025207, "loss": 0.5692, "step": 286 }, { "epoch": 1.435, "grad_norm": 1.2088396549224854, "learning_rate": 0.0001253195168104802, "loss": 0.6073, "step": 287 }, { "epoch": 1.44, "grad_norm": 1.0365617275238037, "learning_rate": 0.0001247562721391386, "loss": 0.3038, "step": 288 }, { "epoch": 1.445, "grad_norm": 0.9122393727302551, "learning_rate": 0.00012419218955996676, "loss": 0.2954, "step": 289 }, { "epoch": 1.45, "grad_norm": 1.3106595277786255, "learning_rate": 0.00012362728816506417, "loss": 0.4137, "step": 290 }, { "epoch": 1.455, "grad_norm": 1.1253206729888916, "learning_rate": 0.00012306158707424403, "loss": 0.4297, "step": 291 }, { "epoch": 1.46, "grad_norm": 1.0713515281677246, "learning_rate": 0.0001224951054343865, "loss": 0.4376, "step": 292 }, { "epoch": 1.465, "grad_norm": 1.1838428974151611, "learning_rate": 0.00012192786241879033, "loss": 0.5176, "step": 293 }, { "epoch": 1.47, "grad_norm": 1.0355262756347656, "learning_rate": 0.00012135987722652402, "loss": 0.3063, "step": 294 }, { "epoch": 1.475, "grad_norm": 1.0859006643295288, "learning_rate": 0.00012079116908177593, "loss": 0.3937, "step": 295 }, { "epoch": 1.48, "grad_norm": 0.9821298122406006, "learning_rate": 0.00012022175723320381, "loss": 0.3422, "step": 296 }, { "epoch": 1.4849999999999999, "grad_norm": 1.1727828979492188, "learning_rate": 0.00011965166095328301, "loss": 0.3451, "step": 297 }, { "epoch": 1.49, "grad_norm": 0.9656005501747131, "learning_rate": 0.00011908089953765449, "loss": 0.348, "step": 298 }, { "epoch": 1.495, "grad_norm": 0.8653091192245483, "learning_rate": 0.00011850949230447145, "loss": 0.2833, "step": 299 }, { "epoch": 1.5, "grad_norm": 1.2153239250183105, "learning_rate": 0.00011793745859374575, "loss": 0.4989, "step": 300 }, { "epoch": 1.505, "grad_norm": 1.3739268779754639, "learning_rate": 0.00011736481776669306, "loss": 0.6309, "step": 301 }, { "epoch": 1.51, "grad_norm": 1.188794732093811, "learning_rate": 0.00011679158920507774, "loss": 0.3849, "step": 302 }, { "epoch": 1.5150000000000001, "grad_norm": 0.9560791254043579, "learning_rate": 0.00011621779231055676, "loss": 0.4049, "step": 303 }, { "epoch": 1.52, "grad_norm": 0.96742182970047, "learning_rate": 0.0001156434465040231, "loss": 0.3052, "step": 304 }, { "epoch": 1.525, "grad_norm": 1.062738299369812, "learning_rate": 0.00011506857122494831, "loss": 0.3945, "step": 305 }, { "epoch": 1.53, "grad_norm": 0.9473580121994019, "learning_rate": 0.00011449318593072466, "loss": 0.2784, "step": 306 }, { "epoch": 1.5350000000000001, "grad_norm": 1.1075767278671265, "learning_rate": 0.00011391731009600654, "loss": 0.4124, "step": 307 }, { "epoch": 1.54, "grad_norm": 1.3211658000946045, "learning_rate": 0.00011334096321205128, "loss": 0.5957, "step": 308 }, { "epoch": 1.545, "grad_norm": 1.1613578796386719, "learning_rate": 0.00011276416478605949, "loss": 0.4728, "step": 309 }, { "epoch": 1.55, "grad_norm": 1.121622085571289, "learning_rate": 0.00011218693434051475, "loss": 0.4937, "step": 310 }, { "epoch": 1.5550000000000002, "grad_norm": 0.991020679473877, "learning_rate": 0.00011160929141252303, "loss": 0.3873, "step": 311 }, { "epoch": 1.56, "grad_norm": 1.144633412361145, "learning_rate": 0.00011103125555315119, "loss": 0.3728, "step": 312 }, { "epoch": 1.565, "grad_norm": 1.028377652168274, "learning_rate": 0.00011045284632676536, "loss": 0.3539, "step": 313 }, { "epoch": 1.5699999999999998, "grad_norm": 1.1992504596710205, "learning_rate": 0.00010987408331036879, "loss": 0.5374, "step": 314 }, { "epoch": 1.575, "grad_norm": 1.2444828748703003, "learning_rate": 0.00010929498609293924, "loss": 0.5202, "step": 315 }, { "epoch": 1.58, "grad_norm": 1.1141530275344849, "learning_rate": 0.00010871557427476583, "loss": 0.3901, "step": 316 }, { "epoch": 1.585, "grad_norm": 1.3573873043060303, "learning_rate": 0.00010813586746678583, "loss": 0.4043, "step": 317 }, { "epoch": 1.5899999999999999, "grad_norm": 1.2175769805908203, "learning_rate": 0.00010755588528992082, "loss": 0.5342, "step": 318 }, { "epoch": 1.595, "grad_norm": 1.1052900552749634, "learning_rate": 0.00010697564737441252, "loss": 0.3409, "step": 319 }, { "epoch": 1.6, "grad_norm": 1.1096097230911255, "learning_rate": 0.00010639517335915856, "loss": 0.402, "step": 320 }, { "epoch": 1.605, "grad_norm": 1.12883722782135, "learning_rate": 0.00010581448289104758, "loss": 0.3989, "step": 321 }, { "epoch": 1.6099999999999999, "grad_norm": 1.1499744653701782, "learning_rate": 0.0001052335956242944, "loss": 0.4444, "step": 322 }, { "epoch": 1.615, "grad_norm": 1.1546366214752197, "learning_rate": 0.0001046525312197747, "loss": 0.3189, "step": 323 }, { "epoch": 1.62, "grad_norm": 1.3440266847610474, "learning_rate": 0.0001040713093443596, "loss": 0.4542, "step": 324 }, { "epoch": 1.625, "grad_norm": 1.3934736251831055, "learning_rate": 0.00010348994967025012, "loss": 0.4617, "step": 325 }, { "epoch": 1.63, "grad_norm": 1.147019863128662, "learning_rate": 0.00010290847187431113, "loss": 0.459, "step": 326 }, { "epoch": 1.635, "grad_norm": 1.200913667678833, "learning_rate": 0.00010232689563740563, "loss": 0.3514, "step": 327 }, { "epoch": 1.6400000000000001, "grad_norm": 1.2627280950546265, "learning_rate": 0.00010174524064372837, "loss": 0.5665, "step": 328 }, { "epoch": 1.645, "grad_norm": 1.1596174240112305, "learning_rate": 0.00010116352658013973, "loss": 0.4377, "step": 329 }, { "epoch": 1.65, "grad_norm": 1.0308893918991089, "learning_rate": 0.00010058177313549939, "loss": 0.3313, "step": 330 }, { "epoch": 1.655, "grad_norm": 1.1811511516571045, "learning_rate": 0.0001, "loss": 0.4735, "step": 331 }, { "epoch": 1.6600000000000001, "grad_norm": 1.154523253440857, "learning_rate": 9.94182268645006e-05, "loss": 0.3684, "step": 332 }, { "epoch": 1.665, "grad_norm": 1.2654355764389038, "learning_rate": 9.883647341986032e-05, "loss": 0.3789, "step": 333 }, { "epoch": 1.67, "grad_norm": 1.1328946352005005, "learning_rate": 9.825475935627165e-05, "loss": 0.4106, "step": 334 }, { "epoch": 1.675, "grad_norm": 1.1406633853912354, "learning_rate": 9.767310436259438e-05, "loss": 0.4166, "step": 335 }, { "epoch": 1.6800000000000002, "grad_norm": 1.1217607259750366, "learning_rate": 9.709152812568886e-05, "loss": 0.3737, "step": 336 }, { "epoch": 1.685, "grad_norm": 1.064173936843872, "learning_rate": 9.651005032974994e-05, "loss": 0.3622, "step": 337 }, { "epoch": 1.69, "grad_norm": 1.0098601579666138, "learning_rate": 9.592869065564043e-05, "loss": 0.4947, "step": 338 }, { "epoch": 1.6949999999999998, "grad_norm": 1.36995267868042, "learning_rate": 9.534746878022534e-05, "loss": 0.4379, "step": 339 }, { "epoch": 1.7, "grad_norm": 1.0789828300476074, "learning_rate": 9.476640437570562e-05, "loss": 0.3849, "step": 340 }, { "epoch": 1.705, "grad_norm": 1.156243085861206, "learning_rate": 9.418551710895243e-05, "loss": 0.3877, "step": 341 }, { "epoch": 1.71, "grad_norm": 1.2233916521072388, "learning_rate": 9.360482664084145e-05, "loss": 0.4296, "step": 342 }, { "epoch": 1.7149999999999999, "grad_norm": 0.9648745656013489, "learning_rate": 9.302435262558747e-05, "loss": 0.3485, "step": 343 }, { "epoch": 1.72, "grad_norm": 1.0833193063735962, "learning_rate": 9.244411471007922e-05, "loss": 0.3575, "step": 344 }, { "epoch": 1.725, "grad_norm": 1.298332691192627, "learning_rate": 9.186413253321418e-05, "loss": 0.5027, "step": 345 }, { "epoch": 1.73, "grad_norm": 1.4415847063064575, "learning_rate": 9.128442572523417e-05, "loss": 0.5372, "step": 346 }, { "epoch": 1.7349999999999999, "grad_norm": 1.278941035270691, "learning_rate": 9.070501390706079e-05, "loss": 0.3815, "step": 347 }, { "epoch": 1.74, "grad_norm": 1.6739096641540527, "learning_rate": 9.012591668963122e-05, "loss": 0.5571, "step": 348 }, { "epoch": 1.745, "grad_norm": 1.1540409326553345, "learning_rate": 8.954715367323468e-05, "loss": 0.3588, "step": 349 }, { "epoch": 1.75, "grad_norm": 1.3325749635696411, "learning_rate": 8.896874444684883e-05, "loss": 0.4606, "step": 350 }, { "epoch": 1.755, "grad_norm": 1.1433957815170288, "learning_rate": 8.839070858747697e-05, "loss": 0.4191, "step": 351 }, { "epoch": 1.76, "grad_norm": 1.3611005544662476, "learning_rate": 8.781306565948528e-05, "loss": 0.4895, "step": 352 }, { "epoch": 1.7650000000000001, "grad_norm": 1.101002812385559, "learning_rate": 8.723583521394054e-05, "loss": 0.4455, "step": 353 }, { "epoch": 1.77, "grad_norm": 0.9602501392364502, "learning_rate": 8.665903678794873e-05, "loss": 0.328, "step": 354 }, { "epoch": 1.775, "grad_norm": 1.1572333574295044, "learning_rate": 8.608268990399349e-05, "loss": 0.4007, "step": 355 }, { "epoch": 1.78, "grad_norm": 1.154706597328186, "learning_rate": 8.550681406927535e-05, "loss": 0.3929, "step": 356 }, { "epoch": 1.7850000000000001, "grad_norm": 1.114884853363037, "learning_rate": 8.49314287750517e-05, "loss": 0.2967, "step": 357 }, { "epoch": 1.79, "grad_norm": 1.2108124494552612, "learning_rate": 8.435655349597689e-05, "loss": 0.2887, "step": 358 }, { "epoch": 1.795, "grad_norm": 1.14920973777771, "learning_rate": 8.378220768944327e-05, "loss": 0.3213, "step": 359 }, { "epoch": 1.8, "grad_norm": 1.0905988216400146, "learning_rate": 8.32084107949223e-05, "loss": 0.3789, "step": 360 }, { "epoch": 1.8050000000000002, "grad_norm": 1.2537572383880615, "learning_rate": 8.263518223330697e-05, "loss": 0.4002, "step": 361 }, { "epoch": 1.81, "grad_norm": 1.3344327211380005, "learning_rate": 8.206254140625426e-05, "loss": 0.5166, "step": 362 }, { "epoch": 1.815, "grad_norm": 1.0742536783218384, "learning_rate": 8.149050769552856e-05, "loss": 0.351, "step": 363 }, { "epoch": 1.8199999999999998, "grad_norm": 1.259245753288269, "learning_rate": 8.091910046234552e-05, "loss": 0.4728, "step": 364 }, { "epoch": 1.825, "grad_norm": 1.0961995124816895, "learning_rate": 8.034833904671698e-05, "loss": 0.3983, "step": 365 }, { "epoch": 1.83, "grad_norm": 0.9268808960914612, "learning_rate": 7.977824276679623e-05, "loss": 0.3123, "step": 366 }, { "epoch": 1.835, "grad_norm": 1.468151569366455, "learning_rate": 7.920883091822408e-05, "loss": 0.4592, "step": 367 }, { "epoch": 1.8399999999999999, "grad_norm": 1.0316146612167358, "learning_rate": 7.864012277347602e-05, "loss": 0.365, "step": 368 }, { "epoch": 1.845, "grad_norm": 0.9063692092895508, "learning_rate": 7.807213758120966e-05, "loss": 0.2728, "step": 369 }, { "epoch": 1.85, "grad_norm": 1.135854959487915, "learning_rate": 7.750489456561352e-05, "loss": 0.4051, "step": 370 }, { "epoch": 1.855, "grad_norm": 1.0980889797210693, "learning_rate": 7.693841292575598e-05, "loss": 0.4583, "step": 371 }, { "epoch": 1.8599999999999999, "grad_norm": 1.2847305536270142, "learning_rate": 7.637271183493586e-05, "loss": 0.5764, "step": 372 }, { "epoch": 1.865, "grad_norm": 0.8198506236076355, "learning_rate": 7.580781044003324e-05, "loss": 0.25, "step": 373 }, { "epoch": 1.87, "grad_norm": 1.0252668857574463, "learning_rate": 7.524372786086142e-05, "loss": 0.4069, "step": 374 }, { "epoch": 1.875, "grad_norm": 1.1411824226379395, "learning_rate": 7.468048318951983e-05, "loss": 0.3409, "step": 375 }, { "epoch": 1.88, "grad_norm": 1.421773910522461, "learning_rate": 7.411809548974792e-05, "loss": 0.3869, "step": 376 }, { "epoch": 1.885, "grad_norm": 1.1122307777404785, "learning_rate": 7.35565837962798e-05, "loss": 0.4174, "step": 377 }, { "epoch": 1.8900000000000001, "grad_norm": 1.3112133741378784, "learning_rate": 7.299596711419994e-05, "loss": 0.4137, "step": 378 }, { "epoch": 1.895, "grad_norm": 0.8506189584732056, "learning_rate": 7.243626441830009e-05, "loss": 0.3312, "step": 379 }, { "epoch": 1.9, "grad_norm": 1.292675495147705, "learning_rate": 7.187749465243693e-05, "loss": 0.5821, "step": 380 }, { "epoch": 1.905, "grad_norm": 1.2538070678710938, "learning_rate": 7.131967672889101e-05, "loss": 0.5276, "step": 381 }, { "epoch": 1.9100000000000001, "grad_norm": 1.1800875663757324, "learning_rate": 7.076282952772633e-05, "loss": 0.426, "step": 382 }, { "epoch": 1.915, "grad_norm": 1.2774189710617065, "learning_rate": 7.02069718961518e-05, "loss": 0.491, "step": 383 }, { "epoch": 1.92, "grad_norm": 0.8698433041572571, "learning_rate": 6.965212264788297e-05, "loss": 0.283, "step": 384 }, { "epoch": 1.925, "grad_norm": 1.2440028190612793, "learning_rate": 6.909830056250527e-05, "loss": 0.4731, "step": 385 }, { "epoch": 1.9300000000000002, "grad_norm": 1.0781313180923462, "learning_rate": 6.854552438483865e-05, "loss": 0.3324, "step": 386 }, { "epoch": 1.935, "grad_norm": 1.0870329141616821, "learning_rate": 6.799381282430284e-05, "loss": 0.3973, "step": 387 }, { "epoch": 1.94, "grad_norm": 1.1785563230514526, "learning_rate": 6.744318455428436e-05, "loss": 0.4772, "step": 388 }, { "epoch": 1.9449999999999998, "grad_norm": 1.1302770376205444, "learning_rate": 6.68936582115042e-05, "loss": 0.334, "step": 389 }, { "epoch": 1.95, "grad_norm": 0.9602265357971191, "learning_rate": 6.634525239538736e-05, "loss": 0.3043, "step": 390 }, { "epoch": 1.955, "grad_norm": 1.3051893711090088, "learning_rate": 6.579798566743314e-05, "loss": 0.3011, "step": 391 }, { "epoch": 1.96, "grad_norm": 1.3510533571243286, "learning_rate": 6.525187655058686e-05, "loss": 0.4, "step": 392 }, { "epoch": 1.9649999999999999, "grad_norm": 0.9961161613464355, "learning_rate": 6.470694352861312e-05, "loss": 0.279, "step": 393 }, { "epoch": 1.97, "grad_norm": 1.42070734500885, "learning_rate": 6.416320504546997e-05, "loss": 0.4691, "step": 394 }, { "epoch": 1.975, "grad_norm": 1.1707082986831665, "learning_rate": 6.362067950468489e-05, "loss": 0.3346, "step": 395 }, { "epoch": 1.98, "grad_norm": 1.2637238502502441, "learning_rate": 6.307938526873157e-05, "loss": 0.3675, "step": 396 }, { "epoch": 1.9849999999999999, "grad_norm": 1.1974503993988037, "learning_rate": 6.25393406584088e-05, "loss": 0.3835, "step": 397 }, { "epoch": 1.99, "grad_norm": 1.1851766109466553, "learning_rate": 6.200056395222012e-05, "loss": 0.3743, "step": 398 }, { "epoch": 1.995, "grad_norm": 1.120631456375122, "learning_rate": 6.146307338575519e-05, "loss": 0.4406, "step": 399 }, { "epoch": 2.0, "grad_norm": 1.1833534240722656, "learning_rate": 6.092688715107264e-05, "loss": 0.3531, "step": 400 } ], "logging_steps": 1, "max_steps": 600, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.475654879496206e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }