{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 906, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0033208800332088003, "grad_norm": 3.2747624266657596, "learning_rate": 5e-05, "loss": 0.5378947257995605, "num_input_tokens_seen": 207792, "step": 1, "train_runtime": 156.6241, "train_tokens_per_second": 1326.692 }, { "epoch": 0.006641760066417601, "grad_norm": 7.848925830456806, "learning_rate": 4.9999849702094696e-05, "loss": 1.0608220100402832, "num_input_tokens_seen": 409248, "step": 2, "train_runtime": 356.4504, "train_tokens_per_second": 1148.121 }, { "epoch": 0.009962640099626401, "grad_norm": 9.070689906615115, "learning_rate": 4.999939881018594e-05, "loss": 0.9686721563339233, "num_input_tokens_seen": 612208, "step": 3, "train_runtime": 682.8103, "train_tokens_per_second": 896.6 }, { "epoch": 0.013283520132835201, "grad_norm": 36.803371242430046, "learning_rate": 4.999864732969518e-05, "loss": 0.93868488073349, "num_input_tokens_seen": 826744, "step": 4, "train_runtime": 1093.972, "train_tokens_per_second": 755.727 }, { "epoch": 0.016604400166044003, "grad_norm": 4.801394963290672, "learning_rate": 4.999759526965809e-05, "loss": 0.7522749900817871, "num_input_tokens_seen": 1030224, "step": 5, "train_runtime": 1447.6425, "train_tokens_per_second": 711.656 }, { "epoch": 0.019925280199252802, "grad_norm": 3.7701003964494224, "learning_rate": 4.999624264272447e-05, "loss": 0.671746015548706, "num_input_tokens_seen": 1242992, "step": 6, "train_runtime": 1850.3603, "train_tokens_per_second": 671.757 }, { "epoch": 0.023246160232461604, "grad_norm": 1.4127997880892347, "learning_rate": 4.999458946515808e-05, "loss": 0.599233865737915, "num_input_tokens_seen": 1459104, "step": 7, "train_runtime": 2250.4943, "train_tokens_per_second": 648.348 }, { "epoch": 0.026567040265670402, "grad_norm": 1.9666506427155999, "learning_rate": 4.9992635756836436e-05, "loss": 0.5814499855041504, "num_input_tokens_seen": 1672680, "step": 8, "train_runtime": 2622.5386, "train_tokens_per_second": 637.809 }, { "epoch": 0.029887920298879204, "grad_norm": 1.1203143540316671, "learning_rate": 4.999038154125061e-05, "loss": 0.5495998859405518, "num_input_tokens_seen": 1880360, "step": 9, "train_runtime": 2970.5591, "train_tokens_per_second": 632.999 }, { "epoch": 0.033208800332088007, "grad_norm": 0.9361059422068064, "learning_rate": 4.9987826845504914e-05, "loss": 0.5153144598007202, "num_input_tokens_seen": 2087496, "step": 10, "train_runtime": 3320.9155, "train_tokens_per_second": 628.591 }, { "epoch": 0.0365296803652968, "grad_norm": 0.9623107157259423, "learning_rate": 4.998497170031657e-05, "loss": 0.5035845041275024, "num_input_tokens_seen": 2292672, "step": 11, "train_runtime": 3681.7815, "train_tokens_per_second": 622.707 }, { "epoch": 0.039850560398505604, "grad_norm": 0.7967083859924536, "learning_rate": 4.9981816140015393e-05, "loss": 0.48510533571243286, "num_input_tokens_seen": 2506128, "step": 12, "train_runtime": 4035.6851, "train_tokens_per_second": 620.992 }, { "epoch": 0.043171440431714406, "grad_norm": 0.8312532729433936, "learning_rate": 4.997836020254328e-05, "loss": 0.4942867159843445, "num_input_tokens_seen": 2717216, "step": 13, "train_runtime": 4376.086, "train_tokens_per_second": 620.924 }, { "epoch": 0.04649232046492321, "grad_norm": 0.6662263351712367, "learning_rate": 4.9974603929453857e-05, "loss": 0.47794872522354126, "num_input_tokens_seen": 2925136, "step": 14, "train_runtime": 4748.9464, "train_tokens_per_second": 615.955 }, { "epoch": 0.049813200498132, "grad_norm": 0.7037240428820096, "learning_rate": 4.9970547365911925e-05, "loss": 0.4823008179664612, "num_input_tokens_seen": 3134936, "step": 15, "train_runtime": 5148.7909, "train_tokens_per_second": 608.868 }, { "epoch": 0.053134080531340805, "grad_norm": 0.627840479951483, "learning_rate": 4.996619056069291e-05, "loss": 0.4785187244415283, "num_input_tokens_seen": 3349208, "step": 16, "train_runtime": 5541.093, "train_tokens_per_second": 604.431 }, { "epoch": 0.05645496056454961, "grad_norm": 0.6446493545761787, "learning_rate": 4.996153356618233e-05, "loss": 0.48796018958091736, "num_input_tokens_seen": 3561152, "step": 17, "train_runtime": 5926.2264, "train_tokens_per_second": 600.914 }, { "epoch": 0.05977584059775841, "grad_norm": 0.5770209917742066, "learning_rate": 4.9956576438375095e-05, "loss": 0.46969473361968994, "num_input_tokens_seen": 3773184, "step": 18, "train_runtime": 6318.6387, "train_tokens_per_second": 597.151 }, { "epoch": 0.0630967206309672, "grad_norm": 0.6095808237282977, "learning_rate": 4.995131923687488e-05, "loss": 0.46969425678253174, "num_input_tokens_seen": 3983408, "step": 19, "train_runtime": 6720.6886, "train_tokens_per_second": 592.708 }, { "epoch": 0.06641760066417601, "grad_norm": 0.5307789972972902, "learning_rate": 4.994576202489339e-05, "loss": 0.4789154529571533, "num_input_tokens_seen": 4193504, "step": 20, "train_runtime": 7128.5117, "train_tokens_per_second": 588.272 }, { "epoch": 0.06973848069738481, "grad_norm": 0.5649476471386886, "learning_rate": 4.9939904869249616e-05, "loss": 0.4771980047225952, "num_input_tokens_seen": 4401856, "step": 21, "train_runtime": 7519.5707, "train_tokens_per_second": 585.387 }, { "epoch": 0.0730593607305936, "grad_norm": 0.5700710640445575, "learning_rate": 4.9933747840369015e-05, "loss": 0.4744207262992859, "num_input_tokens_seen": 4613472, "step": 22, "train_runtime": 7888.0948, "train_tokens_per_second": 584.865 }, { "epoch": 0.07638024076380241, "grad_norm": 0.5709402756420556, "learning_rate": 4.992729101228267e-05, "loss": 0.4628908038139343, "num_input_tokens_seen": 4823000, "step": 23, "train_runtime": 8264.4385, "train_tokens_per_second": 583.585 }, { "epoch": 0.07970112079701121, "grad_norm": 0.547713320766484, "learning_rate": 4.99205344626264e-05, "loss": 0.46453869342803955, "num_input_tokens_seen": 5032232, "step": 24, "train_runtime": 8678.1978, "train_tokens_per_second": 579.871 }, { "epoch": 0.08302200083022, "grad_norm": 0.50880282043929, "learning_rate": 4.991347827263982e-05, "loss": 0.46282365918159485, "num_input_tokens_seen": 5239424, "step": 25, "train_runtime": 9036.7609, "train_tokens_per_second": 579.79 }, { "epoch": 0.08634288086342881, "grad_norm": 0.5280431551496184, "learning_rate": 4.9906122527165395e-05, "loss": 0.47802141308784485, "num_input_tokens_seen": 5449400, "step": 26, "train_runtime": 9408.4575, "train_tokens_per_second": 579.202 }, { "epoch": 0.0896637608966376, "grad_norm": 0.5360506599456444, "learning_rate": 4.9898467314647356e-05, "loss": 0.47468101978302, "num_input_tokens_seen": 5657128, "step": 27, "train_runtime": 9828.9355, "train_tokens_per_second": 575.559 }, { "epoch": 0.09298464092984642, "grad_norm": 0.5050834945301951, "learning_rate": 4.98905127271307e-05, "loss": 0.46311408281326294, "num_input_tokens_seen": 5861312, "step": 28, "train_runtime": 10208.9129, "train_tokens_per_second": 574.137 }, { "epoch": 0.09630552096305521, "grad_norm": 0.5016366001659072, "learning_rate": 4.9882258860260065e-05, "loss": 0.4600064754486084, "num_input_tokens_seen": 6068552, "step": 29, "train_runtime": 10599.8001, "train_tokens_per_second": 572.516 }, { "epoch": 0.099626400996264, "grad_norm": 0.5221437465152288, "learning_rate": 4.9873705813278546e-05, "loss": 0.4781966209411621, "num_input_tokens_seen": 6276440, "step": 30, "train_runtime": 11020.3778, "train_tokens_per_second": 569.53 }, { "epoch": 0.10294728102947281, "grad_norm": 0.5194157847994746, "learning_rate": 4.9864853689026556e-05, "loss": 0.4618116021156311, "num_input_tokens_seen": 6488168, "step": 31, "train_runtime": 11413.764, "train_tokens_per_second": 568.451 }, { "epoch": 0.10626816106268161, "grad_norm": 0.4889565680810572, "learning_rate": 4.9855702593940556e-05, "loss": 0.45203810930252075, "num_input_tokens_seen": 6692640, "step": 32, "train_runtime": 11825.7754, "train_tokens_per_second": 565.937 }, { "epoch": 0.1095890410958904, "grad_norm": 0.4711016378173473, "learning_rate": 4.984625263805178e-05, "loss": 0.45655691623687744, "num_input_tokens_seen": 6908856, "step": 33, "train_runtime": 12196.2235, "train_tokens_per_second": 566.475 }, { "epoch": 0.11290992112909921, "grad_norm": 0.47738370844390215, "learning_rate": 4.98365039349849e-05, "loss": 0.452237069606781, "num_input_tokens_seen": 7117152, "step": 34, "train_runtime": 12604.2177, "train_tokens_per_second": 564.664 }, { "epoch": 0.11623080116230801, "grad_norm": 0.4819886848586694, "learning_rate": 4.982645660195671e-05, "loss": 0.46304643154144287, "num_input_tokens_seen": 7325648, "step": 35, "train_runtime": 13034.2263, "train_tokens_per_second": 562.032 }, { "epoch": 0.11955168119551682, "grad_norm": 0.4868001892186664, "learning_rate": 4.981611075977465e-05, "loss": 0.4689868092536926, "num_input_tokens_seen": 7535960, "step": 36, "train_runtime": 13484.2283, "train_tokens_per_second": 558.872 }, { "epoch": 0.12287256122872561, "grad_norm": 0.47985081776553334, "learning_rate": 4.9805466532835376e-05, "loss": 0.4629165530204773, "num_input_tokens_seen": 7747184, "step": 37, "train_runtime": 13939.4439, "train_tokens_per_second": 555.774 }, { "epoch": 0.1261934412619344, "grad_norm": 0.47842750141640544, "learning_rate": 4.9794524049123315e-05, "loss": 0.4757240414619446, "num_input_tokens_seen": 7955312, "step": 38, "train_runtime": 14361.5825, "train_tokens_per_second": 553.93 }, { "epoch": 0.1295143212951432, "grad_norm": 0.4778506353705411, "learning_rate": 4.978328344020904e-05, "loss": 0.44948646426200867, "num_input_tokens_seen": 8169056, "step": 39, "train_runtime": 14747.6532, "train_tokens_per_second": 553.922 }, { "epoch": 0.13283520132835203, "grad_norm": 0.5065452016110074, "learning_rate": 4.9771744841247756e-05, "loss": 0.4762452244758606, "num_input_tokens_seen": 8373792, "step": 40, "train_runtime": 15163.911, "train_tokens_per_second": 552.218 }, { "epoch": 0.13615608136156082, "grad_norm": 0.481142692930246, "learning_rate": 4.975990839097764e-05, "loss": 0.4602937698364258, "num_input_tokens_seen": 8578168, "step": 41, "train_runtime": 15597.5491, "train_tokens_per_second": 549.969 }, { "epoch": 0.13947696139476962, "grad_norm": 0.5050351240075158, "learning_rate": 4.9747774231718196e-05, "loss": 0.4572892189025879, "num_input_tokens_seen": 8791280, "step": 42, "train_runtime": 16005.7822, "train_tokens_per_second": 549.257 }, { "epoch": 0.1427978414279784, "grad_norm": 0.4991575259274136, "learning_rate": 4.973534250936851e-05, "loss": 0.45722487568855286, "num_input_tokens_seen": 9000368, "step": 43, "train_runtime": 16432.1686, "train_tokens_per_second": 547.729 }, { "epoch": 0.1461187214611872, "grad_norm": 0.48558685167985, "learning_rate": 4.9722613373405536e-05, "loss": 0.4636663794517517, "num_input_tokens_seen": 9203936, "step": 44, "train_runtime": 16784.4004, "train_tokens_per_second": 548.363 }, { "epoch": 0.149439601494396, "grad_norm": 0.47609610840453637, "learning_rate": 4.970958697688226e-05, "loss": 0.4485476613044739, "num_input_tokens_seen": 9413304, "step": 45, "train_runtime": 17157.9869, "train_tokens_per_second": 548.625 }, { "epoch": 0.15276048152760482, "grad_norm": 0.5058545801154458, "learning_rate": 4.9696263476425905e-05, "loss": 0.4744018316268921, "num_input_tokens_seen": 9627504, "step": 46, "train_runtime": 17567.6814, "train_tokens_per_second": 548.024 }, { "epoch": 0.15608136156081362, "grad_norm": 0.45496973505693533, "learning_rate": 4.9682643032235996e-05, "loss": 0.44982996582984924, "num_input_tokens_seen": 9841624, "step": 47, "train_runtime": 17965.3295, "train_tokens_per_second": 547.812 }, { "epoch": 0.15940224159402241, "grad_norm": 0.4683553270832641, "learning_rate": 4.9668725808082486e-05, "loss": 0.44015663862228394, "num_input_tokens_seen": 10051056, "step": 48, "train_runtime": 18354.1042, "train_tokens_per_second": 547.619 }, { "epoch": 0.1627231216272312, "grad_norm": 0.48192211833525017, "learning_rate": 4.965451197130373e-05, "loss": 0.4502023458480835, "num_input_tokens_seen": 10261536, "step": 49, "train_runtime": 18787.0334, "train_tokens_per_second": 546.203 }, { "epoch": 0.16604400166044, "grad_norm": 0.4470796812462641, "learning_rate": 4.9640001692804526e-05, "loss": 0.4449482262134552, "num_input_tokens_seen": 10473216, "step": 50, "train_runtime": 19198.6721, "train_tokens_per_second": 545.518 }, { "epoch": 0.16936488169364883, "grad_norm": 0.51373999888068, "learning_rate": 4.9625195147054034e-05, "loss": 0.46474742889404297, "num_input_tokens_seen": 10683600, "step": 51, "train_runtime": 19625.3506, "train_tokens_per_second": 544.378 }, { "epoch": 0.17268576172685762, "grad_norm": 0.4619924286746043, "learning_rate": 4.961009251208368e-05, "loss": 0.4434038996696472, "num_input_tokens_seen": 10891032, "step": 52, "train_runtime": 20013.0335, "train_tokens_per_second": 544.197 }, { "epoch": 0.17600664176006642, "grad_norm": 0.49182080839781456, "learning_rate": 4.9594693969485006e-05, "loss": 0.4552285075187683, "num_input_tokens_seen": 11099160, "step": 53, "train_runtime": 20434.8387, "train_tokens_per_second": 543.149 }, { "epoch": 0.1793275217932752, "grad_norm": 0.46385034500801736, "learning_rate": 4.957899970440752e-05, "loss": 0.4416283369064331, "num_input_tokens_seen": 11306176, "step": 54, "train_runtime": 20855.5126, "train_tokens_per_second": 542.119 }, { "epoch": 0.182648401826484, "grad_norm": 0.4536867141933408, "learning_rate": 4.956300990555643e-05, "loss": 0.4536556601524353, "num_input_tokens_seen": 11522384, "step": 55, "train_runtime": 21265.4367, "train_tokens_per_second": 541.836 }, { "epoch": 0.18596928185969283, "grad_norm": 0.4671537514070704, "learning_rate": 4.954672476519039e-05, "loss": 0.4556847810745239, "num_input_tokens_seen": 11734840, "step": 56, "train_runtime": 21648.5704, "train_tokens_per_second": 542.061 }, { "epoch": 0.18929016189290163, "grad_norm": 0.4475222934215569, "learning_rate": 4.9530144479119215e-05, "loss": 0.4614197015762329, "num_input_tokens_seen": 11946520, "step": 57, "train_runtime": 22044.2758, "train_tokens_per_second": 541.933 }, { "epoch": 0.19261104192611042, "grad_norm": 0.4558487820329749, "learning_rate": 4.951326924670148e-05, "loss": 0.4480130672454834, "num_input_tokens_seen": 12157000, "step": 58, "train_runtime": 22438.2174, "train_tokens_per_second": 541.799 }, { "epoch": 0.19593192195931922, "grad_norm": 0.4375623390429257, "learning_rate": 4.9496099270842145e-05, "loss": 0.45938652753829956, "num_input_tokens_seen": 12369304, "step": 59, "train_runtime": 22806.9644, "train_tokens_per_second": 542.348 }, { "epoch": 0.199252801992528, "grad_norm": 0.439811580462947, "learning_rate": 4.947863475799013e-05, "loss": 0.4392789602279663, "num_input_tokens_seen": 12585656, "step": 60, "train_runtime": 23190.4725, "train_tokens_per_second": 542.708 }, { "epoch": 0.2025736820257368, "grad_norm": 0.46802516121231397, "learning_rate": 4.9460875918135804e-05, "loss": 0.45277348160743713, "num_input_tokens_seen": 12797224, "step": 61, "train_runtime": 23575.3455, "train_tokens_per_second": 542.822 }, { "epoch": 0.20589456205894563, "grad_norm": 0.45408873849435205, "learning_rate": 4.944282296480849e-05, "loss": 0.4558771252632141, "num_input_tokens_seen": 13012600, "step": 62, "train_runtime": 24032.8285, "train_tokens_per_second": 541.451 }, { "epoch": 0.20921544209215442, "grad_norm": 0.4587083496988642, "learning_rate": 4.942447611507386e-05, "loss": 0.4451424777507782, "num_input_tokens_seen": 13221736, "step": 63, "train_runtime": 24473.6247, "train_tokens_per_second": 540.244 }, { "epoch": 0.21253632212536322, "grad_norm": 0.4679031976571913, "learning_rate": 4.940583558953138e-05, "loss": 0.44465193152427673, "num_input_tokens_seen": 13436320, "step": 64, "train_runtime": 24919.5531, "train_tokens_per_second": 539.188 }, { "epoch": 0.21585720215857201, "grad_norm": 0.45245690202565975, "learning_rate": 4.938690161231159e-05, "loss": 0.4602610468864441, "num_input_tokens_seen": 13647296, "step": 65, "train_runtime": 25358.3807, "train_tokens_per_second": 538.177 }, { "epoch": 0.2191780821917808, "grad_norm": 0.45233725910991857, "learning_rate": 4.936767441107346e-05, "loss": 0.43508481979370117, "num_input_tokens_seen": 13852152, "step": 66, "train_runtime": 25797.1703, "train_tokens_per_second": 536.964 }, { "epoch": 0.22249896222498963, "grad_norm": 0.47133643926705643, "learning_rate": 4.934815421700165e-05, "loss": 0.4550328850746155, "num_input_tokens_seen": 14055800, "step": 67, "train_runtime": 26213.021, "train_tokens_per_second": 536.214 }, { "epoch": 0.22581984225819843, "grad_norm": 0.46023410596326153, "learning_rate": 4.932834126480369e-05, "loss": 0.44379085302352905, "num_input_tokens_seen": 14258800, "step": 68, "train_runtime": 26625.6455, "train_tokens_per_second": 535.529 }, { "epoch": 0.22914072229140722, "grad_norm": 0.4611162459428894, "learning_rate": 4.9308235792707194e-05, "loss": 0.4452861547470093, "num_input_tokens_seen": 14475376, "step": 69, "train_runtime": 27050.9527, "train_tokens_per_second": 535.115 }, { "epoch": 0.23246160232461602, "grad_norm": 0.47605288874423257, "learning_rate": 4.9287838042456994e-05, "loss": 0.4533378481864929, "num_input_tokens_seen": 14688208, "step": 70, "train_runtime": 27472.643, "train_tokens_per_second": 534.649 }, { "epoch": 0.2357824823578248, "grad_norm": 0.5191007894098733, "learning_rate": 4.9267148259312224e-05, "loss": 0.46097898483276367, "num_input_tokens_seen": 14896536, "step": 71, "train_runtime": 27934.5283, "train_tokens_per_second": 533.266 }, { "epoch": 0.23910336239103364, "grad_norm": 0.49275566670073156, "learning_rate": 4.924616669204337e-05, "loss": 0.4466809034347534, "num_input_tokens_seen": 15108192, "step": 72, "train_runtime": 28411.0044, "train_tokens_per_second": 531.773 }, { "epoch": 0.24242424242424243, "grad_norm": 0.4662398960719388, "learning_rate": 4.9224893592929275e-05, "loss": 0.44482558965682983, "num_input_tokens_seen": 15318296, "step": 73, "train_runtime": 28828.1193, "train_tokens_per_second": 531.366 }, { "epoch": 0.24574512245745123, "grad_norm": 0.5018135836760722, "learning_rate": 4.920332921775412e-05, "loss": 0.43905580043792725, "num_input_tokens_seen": 15526200, "step": 74, "train_runtime": 29183.9254, "train_tokens_per_second": 532.012 }, { "epoch": 0.24906600249066002, "grad_norm": 0.44947709272675157, "learning_rate": 4.9181473825804346e-05, "loss": 0.4451100528240204, "num_input_tokens_seen": 15733400, "step": 75, "train_runtime": 29566.2561, "train_tokens_per_second": 532.14 }, { "epoch": 0.2523868825238688, "grad_norm": 0.5213099264347468, "learning_rate": 4.9159327679865516e-05, "loss": 0.453909695148468, "num_input_tokens_seen": 15936648, "step": 76, "train_runtime": 29910.8091, "train_tokens_per_second": 532.806 }, { "epoch": 0.2557077625570776, "grad_norm": 0.4889634998072654, "learning_rate": 4.913689104621917e-05, "loss": 0.4480288624763489, "num_input_tokens_seen": 16149016, "step": 77, "train_runtime": 30293.616, "train_tokens_per_second": 533.083 }, { "epoch": 0.2590286425902864, "grad_norm": 0.4870082247693259, "learning_rate": 4.9114164194639646e-05, "loss": 0.4482975900173187, "num_input_tokens_seen": 16356872, "step": 78, "train_runtime": 30696.2826, "train_tokens_per_second": 532.862 }, { "epoch": 0.2623495226234952, "grad_norm": 0.4986289807214529, "learning_rate": 4.909114739839079e-05, "loss": 0.43813836574554443, "num_input_tokens_seen": 16576632, "step": 79, "train_runtime": 31071.6012, "train_tokens_per_second": 533.498 }, { "epoch": 0.26567040265670405, "grad_norm": 0.4643729478825992, "learning_rate": 4.9067840934222705e-05, "loss": 0.44829070568084717, "num_input_tokens_seen": 16786416, "step": 80, "train_runtime": 31443.9929, "train_tokens_per_second": 533.851 }, { "epoch": 0.26899128268991285, "grad_norm": 0.4769146828621081, "learning_rate": 4.9044245082368415e-05, "loss": 0.4352768659591675, "num_input_tokens_seen": 16998336, "step": 81, "train_runtime": 31759.1171, "train_tokens_per_second": 535.227 }, { "epoch": 0.27231216272312164, "grad_norm": 0.4708333719673793, "learning_rate": 4.9020360126540474e-05, "loss": 0.4634632468223572, "num_input_tokens_seen": 17206720, "step": 82, "train_runtime": 32120.5752, "train_tokens_per_second": 535.692 }, { "epoch": 0.27563304275633044, "grad_norm": 0.4810722956543963, "learning_rate": 4.8996186353927606e-05, "loss": 0.4414184093475342, "num_input_tokens_seen": 17412896, "step": 83, "train_runtime": 32480.2157, "train_tokens_per_second": 536.108 }, { "epoch": 0.27895392278953923, "grad_norm": 0.46220406863949304, "learning_rate": 4.897172405519119e-05, "loss": 0.44793516397476196, "num_input_tokens_seen": 17626504, "step": 84, "train_runtime": 32878.7631, "train_tokens_per_second": 536.106 }, { "epoch": 0.282274802822748, "grad_norm": 0.4374087412154122, "learning_rate": 4.894697352446182e-05, "loss": 0.4464322030544281, "num_input_tokens_seen": 17839720, "step": 85, "train_runtime": 33226.3774, "train_tokens_per_second": 536.914 }, { "epoch": 0.2855956828559568, "grad_norm": 0.4408488795123499, "learning_rate": 4.892193505933572e-05, "loss": 0.4390932321548462, "num_input_tokens_seen": 18054216, "step": 86, "train_runtime": 33588.8389, "train_tokens_per_second": 537.506 }, { "epoch": 0.2889165628891656, "grad_norm": 0.48068069423948034, "learning_rate": 4.889660896087119e-05, "loss": 0.4580715596675873, "num_input_tokens_seen": 18262984, "step": 87, "train_runtime": 33930.8783, "train_tokens_per_second": 538.241 }, { "epoch": 0.2922374429223744, "grad_norm": 0.4483566286242534, "learning_rate": 4.887099553358502e-05, "loss": 0.4441770315170288, "num_input_tokens_seen": 18482720, "step": 88, "train_runtime": 34295.936, "train_tokens_per_second": 538.919 }, { "epoch": 0.2955583229555832, "grad_norm": 0.45941522488511394, "learning_rate": 4.884509508544874e-05, "loss": 0.4445023536682129, "num_input_tokens_seen": 18692240, "step": 89, "train_runtime": 34658.7667, "train_tokens_per_second": 539.322 }, { "epoch": 0.298879202988792, "grad_norm": 0.48257426948336424, "learning_rate": 4.8818907927885014e-05, "loss": 0.43760842084884644, "num_input_tokens_seen": 18904696, "step": 90, "train_runtime": 34993.7315, "train_tokens_per_second": 540.231 }, { "epoch": 0.30220008302200085, "grad_norm": 0.49066330758193627, "learning_rate": 4.879243437576383e-05, "loss": 0.45703399181365967, "num_input_tokens_seen": 19104848, "step": 91, "train_runtime": 35314.6958, "train_tokens_per_second": 540.989 }, { "epoch": 0.30552096305520965, "grad_norm": 0.5871284281968285, "learning_rate": 4.876567474739875e-05, "loss": 0.4443628787994385, "num_input_tokens_seen": 19316456, "step": 92, "train_runtime": 35629.9155, "train_tokens_per_second": 542.142 }, { "epoch": 0.30884184308841844, "grad_norm": 0.4635567514036156, "learning_rate": 4.8738629364543045e-05, "loss": 0.45608752965927124, "num_input_tokens_seen": 19529968, "step": 93, "train_runtime": 35948.999, "train_tokens_per_second": 543.269 }, { "epoch": 0.31216272312162724, "grad_norm": 0.5646397823754469, "learning_rate": 4.8711298552385886e-05, "loss": 0.45944929122924805, "num_input_tokens_seen": 19738568, "step": 94, "train_runtime": 36245.5132, "train_tokens_per_second": 544.58 }, { "epoch": 0.31548360315483603, "grad_norm": 0.4673309049775806, "learning_rate": 4.8683682639548365e-05, "loss": 0.44822442531585693, "num_input_tokens_seen": 19950248, "step": 95, "train_runtime": 36549.0995, "train_tokens_per_second": 545.848 }, { "epoch": 0.31880448318804483, "grad_norm": 0.544836194818622, "learning_rate": 4.8655781958079594e-05, "loss": 0.4578169584274292, "num_input_tokens_seen": 20162200, "step": 96, "train_runtime": 36855.1685, "train_tokens_per_second": 547.066 }, { "epoch": 0.3221253632212536, "grad_norm": 0.4557391238446957, "learning_rate": 4.862759684345269e-05, "loss": 0.45215484499931335, "num_input_tokens_seen": 20368784, "step": 97, "train_runtime": 37169.687, "train_tokens_per_second": 547.994 }, { "epoch": 0.3254462432544624, "grad_norm": 0.5390637669571892, "learning_rate": 4.859912763456076e-05, "loss": 0.4573412537574768, "num_input_tokens_seen": 20571880, "step": 98, "train_runtime": 37498.0856, "train_tokens_per_second": 548.611 }, { "epoch": 0.3287671232876712, "grad_norm": 0.42735517403632506, "learning_rate": 4.857037467371278e-05, "loss": 0.43494826555252075, "num_input_tokens_seen": 20776240, "step": 99, "train_runtime": 37841.2657, "train_tokens_per_second": 549.037 }, { "epoch": 0.33208800332088, "grad_norm": 0.5109909678139859, "learning_rate": 4.8541338306629545e-05, "loss": 0.4539799094200134, "num_input_tokens_seen": 20978664, "step": 100, "train_runtime": 38174.0089, "train_tokens_per_second": 549.554 }, { "epoch": 0.33540888335408886, "grad_norm": 0.4407511163367817, "learning_rate": 4.8512018882439475e-05, "loss": 0.4432840347290039, "num_input_tokens_seen": 21187080, "step": 101, "train_runtime": 38506.3865, "train_tokens_per_second": 550.222 }, { "epoch": 0.33872976338729766, "grad_norm": 0.4565646586473819, "learning_rate": 4.84824167536744e-05, "loss": 0.4450823664665222, "num_input_tokens_seen": 21400168, "step": 102, "train_runtime": 38844.643, "train_tokens_per_second": 550.917 }, { "epoch": 0.34205064342050645, "grad_norm": 0.4523515567415233, "learning_rate": 4.8452532276265364e-05, "loss": 0.4405674338340759, "num_input_tokens_seen": 21609024, "step": 103, "train_runtime": 39165.759, "train_tokens_per_second": 551.733 }, { "epoch": 0.34537152345371525, "grad_norm": 0.4342711645516075, "learning_rate": 4.84223658095383e-05, "loss": 0.44187235832214355, "num_input_tokens_seen": 21820032, "step": 104, "train_runtime": 39526.1781, "train_tokens_per_second": 552.04 }, { "epoch": 0.34869240348692404, "grad_norm": 0.4440976417955082, "learning_rate": 4.8391917716209765e-05, "loss": 0.42703860998153687, "num_input_tokens_seen": 22031008, "step": 105, "train_runtime": 39924.2011, "train_tokens_per_second": 551.821 }, { "epoch": 0.35201328352013284, "grad_norm": 0.42616585176537086, "learning_rate": 4.836118836238253e-05, "loss": 0.4403926134109497, "num_input_tokens_seen": 22249104, "step": 106, "train_runtime": 40295.3003, "train_tokens_per_second": 552.151 }, { "epoch": 0.35533416355334163, "grad_norm": 0.44682534401884716, "learning_rate": 4.833017811754119e-05, "loss": 0.4441004693508148, "num_input_tokens_seen": 22464824, "step": 107, "train_runtime": 40659.7263, "train_tokens_per_second": 552.508 }, { "epoch": 0.3586550435865504, "grad_norm": 0.4369578709407043, "learning_rate": 4.829888735454773e-05, "loss": 0.44408392906188965, "num_input_tokens_seen": 22674264, "step": 108, "train_runtime": 40995.9931, "train_tokens_per_second": 553.085 }, { "epoch": 0.3619759236197592, "grad_norm": 0.4464921926449268, "learning_rate": 4.8267316449637054e-05, "loss": 0.4447093605995178, "num_input_tokens_seen": 22880992, "step": 109, "train_runtime": 41326.7448, "train_tokens_per_second": 553.661 }, { "epoch": 0.365296803652968, "grad_norm": 0.4551711007141716, "learning_rate": 4.823546578241242e-05, "loss": 0.4409634470939636, "num_input_tokens_seen": 23090920, "step": 110, "train_runtime": 41682.3741, "train_tokens_per_second": 553.973 }, { "epoch": 0.3686176836861768, "grad_norm": 0.44445150180930093, "learning_rate": 4.820333573584091e-05, "loss": 0.4509497284889221, "num_input_tokens_seen": 23299840, "step": 111, "train_runtime": 42018.9773, "train_tokens_per_second": 554.508 }, { "epoch": 0.37193856371938566, "grad_norm": 0.43356567382883354, "learning_rate": 4.817092669624883e-05, "loss": 0.45462414622306824, "num_input_tokens_seen": 23513992, "step": 112, "train_runtime": 42413.9853, "train_tokens_per_second": 554.392 }, { "epoch": 0.37525944375259446, "grad_norm": 0.457699188924526, "learning_rate": 4.813823905331704e-05, "loss": 0.45549196004867554, "num_input_tokens_seen": 23726736, "step": 113, "train_runtime": 42823.6874, "train_tokens_per_second": 554.056 }, { "epoch": 0.37858032378580325, "grad_norm": 0.4290295881112866, "learning_rate": 4.810527320007627e-05, "loss": 0.4397251605987549, "num_input_tokens_seen": 23942584, "step": 114, "train_runtime": 43219.9628, "train_tokens_per_second": 553.97 }, { "epoch": 0.38190120381901205, "grad_norm": 0.43324420836579025, "learning_rate": 4.8072029532902426e-05, "loss": 0.4477549195289612, "num_input_tokens_seen": 24150000, "step": 115, "train_runtime": 43554.8282, "train_tokens_per_second": 554.474 }, { "epoch": 0.38522208385222084, "grad_norm": 0.44929125722269236, "learning_rate": 4.80385084515118e-05, "loss": 0.4334121644496918, "num_input_tokens_seen": 24359712, "step": 116, "train_runtime": 43904.3887, "train_tokens_per_second": 554.835 }, { "epoch": 0.38854296388542964, "grad_norm": 0.4314474263052667, "learning_rate": 4.800471035895624e-05, "loss": 0.4370808005332947, "num_input_tokens_seen": 24565128, "step": 117, "train_runtime": 44297.7284, "train_tokens_per_second": 554.546 }, { "epoch": 0.39186384391863843, "grad_norm": 0.4740910244340874, "learning_rate": 4.797063566161834e-05, "loss": 0.44406652450561523, "num_input_tokens_seen": 24772408, "step": 118, "train_runtime": 44698.3461, "train_tokens_per_second": 554.213 }, { "epoch": 0.3951847239518472, "grad_norm": 0.4302718721457111, "learning_rate": 4.7936284769206584e-05, "loss": 0.435397744178772, "num_input_tokens_seen": 24975888, "step": 119, "train_runtime": 45055.0015, "train_tokens_per_second": 554.342 }, { "epoch": 0.398505603985056, "grad_norm": 0.4509556632205818, "learning_rate": 4.790165809475031e-05, "loss": 0.4524025619029999, "num_input_tokens_seen": 25183072, "step": 120, "train_runtime": 45429.6728, "train_tokens_per_second": 554.331 }, { "epoch": 0.4018264840182648, "grad_norm": 0.4413807447335981, "learning_rate": 4.786675605459487e-05, "loss": 0.4324600100517273, "num_input_tokens_seen": 25394712, "step": 121, "train_runtime": 45811.1557, "train_tokens_per_second": 554.335 }, { "epoch": 0.4051473640514736, "grad_norm": 0.43865691005429885, "learning_rate": 4.783157906839655e-05, "loss": 0.43515023589134216, "num_input_tokens_seen": 25599232, "step": 122, "train_runtime": 46169.2363, "train_tokens_per_second": 554.465 }, { "epoch": 0.40846824408468246, "grad_norm": 0.4302051663482629, "learning_rate": 4.779612755911752e-05, "loss": 0.45275765657424927, "num_input_tokens_seen": 25813984, "step": 123, "train_runtime": 46559.242, "train_tokens_per_second": 554.433 }, { "epoch": 0.41178912411789126, "grad_norm": 0.4234240081659297, "learning_rate": 4.77604019530208e-05, "loss": 0.42968207597732544, "num_input_tokens_seen": 26025368, "step": 124, "train_runtime": 46942.8614, "train_tokens_per_second": 554.405 }, { "epoch": 0.41511000415110005, "grad_norm": 0.4328788215058449, "learning_rate": 4.772440267966508e-05, "loss": 0.4436509311199188, "num_input_tokens_seen": 26233184, "step": 125, "train_runtime": 47282.0831, "train_tokens_per_second": 554.823 }, { "epoch": 0.41843088418430885, "grad_norm": 0.4413045100957863, "learning_rate": 4.7688130171899594e-05, "loss": 0.4539909064769745, "num_input_tokens_seen": 26438704, "step": 126, "train_runtime": 47624.4493, "train_tokens_per_second": 555.15 }, { "epoch": 0.42175176421751764, "grad_norm": 0.42996212817233204, "learning_rate": 4.76515848658589e-05, "loss": 0.4468457102775574, "num_input_tokens_seen": 26649984, "step": 127, "train_runtime": 48013.3114, "train_tokens_per_second": 555.054 }, { "epoch": 0.42507264425072644, "grad_norm": 0.4232939507081661, "learning_rate": 4.761476720095764e-05, "loss": 0.44136855006217957, "num_input_tokens_seen": 26865064, "step": 128, "train_runtime": 48384.8244, "train_tokens_per_second": 555.237 }, { "epoch": 0.42839352428393523, "grad_norm": 0.43131056701397297, "learning_rate": 4.7577677619885234e-05, "loss": 0.45262420177459717, "num_input_tokens_seen": 27072168, "step": 129, "train_runtime": 48741.0344, "train_tokens_per_second": 555.429 }, { "epoch": 0.43171440431714403, "grad_norm": 0.4254014593630184, "learning_rate": 4.754031656860059e-05, "loss": 0.44716838002204895, "num_input_tokens_seen": 27286552, "step": 130, "train_runtime": 49139.3329, "train_tokens_per_second": 555.289 }, { "epoch": 0.4350352843503528, "grad_norm": 0.4262289289256098, "learning_rate": 4.7502684496326746e-05, "loss": 0.447284072637558, "num_input_tokens_seen": 27499640, "step": 131, "train_runtime": 49547.9054, "train_tokens_per_second": 555.011 }, { "epoch": 0.4383561643835616, "grad_norm": 0.4258150461274207, "learning_rate": 4.746478185554541e-05, "loss": 0.44308850169181824, "num_input_tokens_seen": 27709024, "step": 132, "train_runtime": 49909.2103, "train_tokens_per_second": 555.189 }, { "epoch": 0.44167704441677047, "grad_norm": 0.42363082177322137, "learning_rate": 4.7426609101991605e-05, "loss": 0.44187185168266296, "num_input_tokens_seen": 27920048, "step": 133, "train_runtime": 50281.2036, "train_tokens_per_second": 555.278 }, { "epoch": 0.44499792444997927, "grad_norm": 0.4226891916668899, "learning_rate": 4.73881666946481e-05, "loss": 0.4379861354827881, "num_input_tokens_seen": 28128512, "step": 134, "train_runtime": 50648.5009, "train_tokens_per_second": 555.367 }, { "epoch": 0.44831880448318806, "grad_norm": 0.4261557172119051, "learning_rate": 4.734945509573997e-05, "loss": 0.45955541729927063, "num_input_tokens_seen": 28337864, "step": 135, "train_runtime": 50991.8372, "train_tokens_per_second": 555.733 }, { "epoch": 0.45163968451639686, "grad_norm": 0.46233816509185066, "learning_rate": 4.7310474770729e-05, "loss": 0.44429993629455566, "num_input_tokens_seen": 28544840, "step": 136, "train_runtime": 51354.9146, "train_tokens_per_second": 555.835 }, { "epoch": 0.45496056454960565, "grad_norm": 0.40531174354883803, "learning_rate": 4.727122618830808e-05, "loss": 0.43550848960876465, "num_input_tokens_seen": 28758408, "step": 137, "train_runtime": 51719.2289, "train_tokens_per_second": 556.049 }, { "epoch": 0.45828144458281445, "grad_norm": 0.42789109381313467, "learning_rate": 4.723170982039558e-05, "loss": 0.44725221395492554, "num_input_tokens_seen": 28971880, "step": 138, "train_runtime": 52072.4455, "train_tokens_per_second": 556.376 }, { "epoch": 0.46160232461602324, "grad_norm": 0.43194671523319594, "learning_rate": 4.719192614212969e-05, "loss": 0.4348328709602356, "num_input_tokens_seen": 29180432, "step": 139, "train_runtime": 52461.0303, "train_tokens_per_second": 556.231 }, { "epoch": 0.46492320464923204, "grad_norm": 0.4358629533496961, "learning_rate": 4.715187563186271e-05, "loss": 0.44152069091796875, "num_input_tokens_seen": 29382856, "step": 140, "train_runtime": 52866.7564, "train_tokens_per_second": 555.791 }, { "epoch": 0.46824408468244083, "grad_norm": 0.4430147925284936, "learning_rate": 4.711155877115523e-05, "loss": 0.436160147190094, "num_input_tokens_seen": 29590312, "step": 141, "train_runtime": 53224.4372, "train_tokens_per_second": 555.953 }, { "epoch": 0.4715649647156496, "grad_norm": 0.42217855330206566, "learning_rate": 4.707097604477045e-05, "loss": 0.4471568465232849, "num_input_tokens_seen": 29804528, "step": 142, "train_runtime": 53601.4482, "train_tokens_per_second": 556.04 }, { "epoch": 0.4748858447488584, "grad_norm": 0.4255160211154335, "learning_rate": 4.703012794066827e-05, "loss": 0.45743924379348755, "num_input_tokens_seen": 30008576, "step": 143, "train_runtime": 53980.2086, "train_tokens_per_second": 555.918 }, { "epoch": 0.47820672478206727, "grad_norm": 0.42595571531644855, "learning_rate": 4.6989014949999436e-05, "loss": 0.44465208053588867, "num_input_tokens_seen": 30217968, "step": 144, "train_runtime": 54344.4332, "train_tokens_per_second": 556.045 }, { "epoch": 0.48152760481527607, "grad_norm": 0.424478030764808, "learning_rate": 4.694763756709967e-05, "loss": 0.4434838891029358, "num_input_tokens_seen": 30425288, "step": 145, "train_runtime": 54717.5772, "train_tokens_per_second": 556.042 }, { "epoch": 0.48484848484848486, "grad_norm": 0.4005419350820971, "learning_rate": 4.690599628948369e-05, "loss": 0.4252302050590515, "num_input_tokens_seen": 30640824, "step": 146, "train_runtime": 55076.7819, "train_tokens_per_second": 556.329 }, { "epoch": 0.48816936488169366, "grad_norm": 0.425705164246823, "learning_rate": 4.6864091617839235e-05, "loss": 0.4438859820365906, "num_input_tokens_seen": 30847888, "step": 147, "train_runtime": 55435.2881, "train_tokens_per_second": 556.467 }, { "epoch": 0.49149024491490245, "grad_norm": 0.4240638850798826, "learning_rate": 4.682192405602106e-05, "loss": 0.4435650110244751, "num_input_tokens_seen": 31064104, "step": 148, "train_runtime": 55823.9762, "train_tokens_per_second": 556.465 }, { "epoch": 0.49481112494811125, "grad_norm": 0.41284228122350536, "learning_rate": 4.677949411104485e-05, "loss": 0.43432003259658813, "num_input_tokens_seen": 31270280, "step": 149, "train_runtime": 56194.4214, "train_tokens_per_second": 556.466 }, { "epoch": 0.49813200498132004, "grad_norm": 0.4129604127997443, "learning_rate": 4.673680229308117e-05, "loss": 0.43058010935783386, "num_input_tokens_seen": 31484800, "step": 150, "train_runtime": 56548.0117, "train_tokens_per_second": 556.78 }, { "epoch": 0.5014528850145289, "grad_norm": 0.4292713757826097, "learning_rate": 4.669384911544927e-05, "loss": 0.4515884518623352, "num_input_tokens_seen": 31688168, "step": 151, "train_runtime": 56903.1747, "train_tokens_per_second": 556.879 }, { "epoch": 0.5047737650477376, "grad_norm": 0.4032439932508284, "learning_rate": 4.665063509461097e-05, "loss": 0.44337350130081177, "num_input_tokens_seen": 31899192, "step": 152, "train_runtime": 57223.4264, "train_tokens_per_second": 557.45 }, { "epoch": 0.5080946450809465, "grad_norm": 0.39911831605898673, "learning_rate": 4.660716075016442e-05, "loss": 0.42944586277008057, "num_input_tokens_seen": 32114600, "step": 153, "train_runtime": 57585.0759, "train_tokens_per_second": 557.69 }, { "epoch": 0.5114155251141552, "grad_norm": 0.4269443091168191, "learning_rate": 4.656342660483782e-05, "loss": 0.43536609411239624, "num_input_tokens_seen": 32323696, "step": 154, "train_runtime": 57919.9565, "train_tokens_per_second": 558.075 }, { "epoch": 0.5147364051473641, "grad_norm": 0.40449445796047184, "learning_rate": 4.6519433184483245e-05, "loss": 0.4376128017902374, "num_input_tokens_seen": 32536696, "step": 155, "train_runtime": 58239.9326, "train_tokens_per_second": 558.666 }, { "epoch": 0.5180572851805728, "grad_norm": 0.42480120626314233, "learning_rate": 4.647518101807019e-05, "loss": 0.4368232488632202, "num_input_tokens_seen": 32747280, "step": 156, "train_runtime": 58552.1035, "train_tokens_per_second": 559.284 }, { "epoch": 0.5213781652137817, "grad_norm": 0.39915636915284897, "learning_rate": 4.6430670637679295e-05, "loss": 0.42630359530448914, "num_input_tokens_seen": 32959224, "step": 157, "train_runtime": 58873.1529, "train_tokens_per_second": 559.835 }, { "epoch": 0.5246990452469904, "grad_norm": 0.45331505772203606, "learning_rate": 4.638590257849591e-05, "loss": 0.4541800022125244, "num_input_tokens_seen": 33165992, "step": 158, "train_runtime": 59187.6106, "train_tokens_per_second": 560.354 }, { "epoch": 0.5280199252801993, "grad_norm": 0.41638664272129633, "learning_rate": 4.634087737880367e-05, "loss": 0.4372981786727905, "num_input_tokens_seen": 33375824, "step": 159, "train_runtime": 59522.1607, "train_tokens_per_second": 560.729 }, { "epoch": 0.5313408053134081, "grad_norm": 0.41487341543847556, "learning_rate": 4.6295595579978046e-05, "loss": 0.4468112885951996, "num_input_tokens_seen": 33589176, "step": 160, "train_runtime": 59826.0822, "train_tokens_per_second": 561.447 }, { "epoch": 0.5346616853466168, "grad_norm": 0.4363871862958581, "learning_rate": 4.625005772647979e-05, "loss": 0.45006150007247925, "num_input_tokens_seen": 33797608, "step": 161, "train_runtime": 60145.6604, "train_tokens_per_second": 561.929 }, { "epoch": 0.5379825653798257, "grad_norm": 0.43717717407611545, "learning_rate": 4.6204264365848426e-05, "loss": 0.43324458599090576, "num_input_tokens_seen": 33999224, "step": 162, "train_runtime": 60454.4626, "train_tokens_per_second": 562.394 }, { "epoch": 0.5413034454130344, "grad_norm": 0.43433395435830424, "learning_rate": 4.615821604869564e-05, "loss": 0.4437130093574524, "num_input_tokens_seen": 34212168, "step": 163, "train_runtime": 60758.4674, "train_tokens_per_second": 563.085 }, { "epoch": 0.5446243254462433, "grad_norm": 0.43226007356560675, "learning_rate": 4.611191332869869e-05, "loss": 0.4397727847099304, "num_input_tokens_seen": 34419888, "step": 164, "train_runtime": 61044.9488, "train_tokens_per_second": 563.845 }, { "epoch": 0.547945205479452, "grad_norm": 0.44587748254281495, "learning_rate": 4.606535676259372e-05, "loss": 0.4515122175216675, "num_input_tokens_seen": 34631784, "step": 165, "train_runtime": 61376.8588, "train_tokens_per_second": 564.248 }, { "epoch": 0.5512660855126609, "grad_norm": 0.42820867822081615, "learning_rate": 4.601854691016907e-05, "loss": 0.4467068612575531, "num_input_tokens_seen": 34836768, "step": 166, "train_runtime": 61689.4026, "train_tokens_per_second": 564.712 }, { "epoch": 0.5545869655458696, "grad_norm": 0.44973079942490846, "learning_rate": 4.597148433425857e-05, "loss": 0.4417882263660431, "num_input_tokens_seen": 35047288, "step": 167, "train_runtime": 62022.3544, "train_tokens_per_second": 565.075 }, { "epoch": 0.5579078455790785, "grad_norm": 0.4356565349165813, "learning_rate": 4.5924169600734745e-05, "loss": 0.442172110080719, "num_input_tokens_seen": 35252000, "step": 168, "train_runtime": 62332.1959, "train_tokens_per_second": 565.55 }, { "epoch": 0.5612287256122872, "grad_norm": 0.4523340638200005, "learning_rate": 4.587660327850203e-05, "loss": 0.43838340044021606, "num_input_tokens_seen": 35466456, "step": 169, "train_runtime": 62633.917, "train_tokens_per_second": 566.25 }, { "epoch": 0.564549605645496, "grad_norm": 0.4661740464155302, "learning_rate": 4.5828785939489897e-05, "loss": 0.434292197227478, "num_input_tokens_seen": 35675000, "step": 170, "train_runtime": 62945.4758, "train_tokens_per_second": 566.76 }, { "epoch": 0.5678704856787049, "grad_norm": 0.4132020200034834, "learning_rate": 4.578071815864602e-05, "loss": 0.4416734278202057, "num_input_tokens_seen": 35878176, "step": 171, "train_runtime": 63238.6215, "train_tokens_per_second": 567.346 }, { "epoch": 0.5711913657119136, "grad_norm": 0.4201307170010595, "learning_rate": 4.573240051392935e-05, "loss": 0.4213865399360657, "num_input_tokens_seen": 36089504, "step": 172, "train_runtime": 63561.6656, "train_tokens_per_second": 567.787 }, { "epoch": 0.5745122457451225, "grad_norm": 0.4241812938053152, "learning_rate": 4.5683833586303157e-05, "loss": 0.43653762340545654, "num_input_tokens_seen": 36302584, "step": 173, "train_runtime": 63872.4526, "train_tokens_per_second": 568.361 }, { "epoch": 0.5778331257783312, "grad_norm": 0.4421774033391317, "learning_rate": 4.5635017959728024e-05, "loss": 0.4446670114994049, "num_input_tokens_seen": 36510368, "step": 174, "train_runtime": 64213.2491, "train_tokens_per_second": 568.58 }, { "epoch": 0.5811540058115401, "grad_norm": 0.4246000272320578, "learning_rate": 4.5585954221154856e-05, "loss": 0.43610668182373047, "num_input_tokens_seen": 36722920, "step": 175, "train_runtime": 64573.1069, "train_tokens_per_second": 568.703 }, { "epoch": 0.5844748858447488, "grad_norm": 0.42456545767937026, "learning_rate": 4.553664296051785e-05, "loss": 0.44354474544525146, "num_input_tokens_seen": 36932736, "step": 176, "train_runtime": 64912.376, "train_tokens_per_second": 568.963 }, { "epoch": 0.5877957658779577, "grad_norm": 0.4346939758229494, "learning_rate": 4.548708477072733e-05, "loss": 0.43920040130615234, "num_input_tokens_seen": 37140936, "step": 177, "train_runtime": 65249.1561, "train_tokens_per_second": 569.217 }, { "epoch": 0.5911166459111664, "grad_norm": 0.46027381591485417, "learning_rate": 4.543728024766265e-05, "loss": 0.4473230838775635, "num_input_tokens_seen": 37352928, "step": 178, "train_runtime": 65590.4234, "train_tokens_per_second": 569.488 }, { "epoch": 0.5944375259443753, "grad_norm": 0.4162193654337481, "learning_rate": 4.5387229990165073e-05, "loss": 0.44290798902511597, "num_input_tokens_seen": 37570288, "step": 179, "train_runtime": 65941.5168, "train_tokens_per_second": 569.752 }, { "epoch": 0.597758405977584, "grad_norm": 0.43564840653850984, "learning_rate": 4.53369346000305e-05, "loss": 0.4382040500640869, "num_input_tokens_seen": 37786952, "step": 180, "train_runtime": 66286.654, "train_tokens_per_second": 570.054 }, { "epoch": 0.6010792860107929, "grad_norm": 0.45512147901150146, "learning_rate": 4.528639468200226e-05, "loss": 0.44450345635414124, "num_input_tokens_seen": 37991272, "step": 181, "train_runtime": 66611.4739, "train_tokens_per_second": 570.341 }, { "epoch": 0.6044001660440017, "grad_norm": 0.4452883735867651, "learning_rate": 4.523561084376387e-05, "loss": 0.4293566942214966, "num_input_tokens_seen": 38196960, "step": 182, "train_runtime": 66948.7596, "train_tokens_per_second": 570.54 }, { "epoch": 0.6077210460772104, "grad_norm": 0.43881366029354163, "learning_rate": 4.51845836959317e-05, "loss": 0.4395379424095154, "num_input_tokens_seen": 38408216, "step": 183, "train_runtime": 67298.9272, "train_tokens_per_second": 570.711 }, { "epoch": 0.6110419261104193, "grad_norm": 0.4477802065485728, "learning_rate": 4.513331385204761e-05, "loss": 0.448405385017395, "num_input_tokens_seen": 38615056, "step": 184, "train_runtime": 67622.0826, "train_tokens_per_second": 571.042 }, { "epoch": 0.614362806143628, "grad_norm": 0.40104764974815776, "learning_rate": 4.508180192857162e-05, "loss": 0.43220552802085876, "num_input_tokens_seen": 38821376, "step": 185, "train_runtime": 67913.1533, "train_tokens_per_second": 571.633 }, { "epoch": 0.6176836861768369, "grad_norm": 0.4666893933881112, "learning_rate": 4.503004854487446e-05, "loss": 0.4433121681213379, "num_input_tokens_seen": 39021800, "step": 186, "train_runtime": 68210.7976, "train_tokens_per_second": 572.077 }, { "epoch": 0.6210045662100456, "grad_norm": 0.41673237171052985, "learning_rate": 4.497805432323015e-05, "loss": 0.4427833557128906, "num_input_tokens_seen": 39228112, "step": 187, "train_runtime": 68503.8332, "train_tokens_per_second": 572.641 }, { "epoch": 0.6243254462432545, "grad_norm": 0.4550438725192048, "learning_rate": 4.492581988880848e-05, "loss": 0.4331440329551697, "num_input_tokens_seen": 39435240, "step": 188, "train_runtime": 68811.2234, "train_tokens_per_second": 573.093 }, { "epoch": 0.6276463262764632, "grad_norm": 0.41652479798401765, "learning_rate": 4.487334586966756e-05, "loss": 0.43456968665122986, "num_input_tokens_seen": 39646256, "step": 189, "train_runtime": 69141.5127, "train_tokens_per_second": 573.407 }, { "epoch": 0.6309672063096721, "grad_norm": 0.41541385377257006, "learning_rate": 4.482063289674618e-05, "loss": 0.4365602135658264, "num_input_tokens_seen": 39856280, "step": 190, "train_runtime": 69462.4508, "train_tokens_per_second": 573.782 }, { "epoch": 0.6342880863428808, "grad_norm": 0.47638712473308337, "learning_rate": 4.476768160385632e-05, "loss": 0.43585318326950073, "num_input_tokens_seen": 40070192, "step": 191, "train_runtime": 69776.7249, "train_tokens_per_second": 574.263 }, { "epoch": 0.6376089663760897, "grad_norm": 0.4102972799018543, "learning_rate": 4.471449262767543e-05, "loss": 0.4303766191005707, "num_input_tokens_seen": 40281040, "step": 192, "train_runtime": 70089.1347, "train_tokens_per_second": 574.712 }, { "epoch": 0.6409298464092985, "grad_norm": 0.42407643286721225, "learning_rate": 4.466106660773885e-05, "loss": 0.4471195340156555, "num_input_tokens_seen": 40482632, "step": 193, "train_runtime": 70371.8589, "train_tokens_per_second": 575.267 }, { "epoch": 0.6442507264425072, "grad_norm": 0.4283137763855368, "learning_rate": 4.460740418643209e-05, "loss": 0.4175926148891449, "num_input_tokens_seen": 40691512, "step": 194, "train_runtime": 70657.6538, "train_tokens_per_second": 575.897 }, { "epoch": 0.6475716064757161, "grad_norm": 0.4288135447501037, "learning_rate": 4.4553506008983126e-05, "loss": 0.45054900646209717, "num_input_tokens_seen": 40891680, "step": 195, "train_runtime": 70928.8361, "train_tokens_per_second": 576.517 }, { "epoch": 0.6508924865089248, "grad_norm": 0.4095150658618606, "learning_rate": 4.44993727234546e-05, "loss": 0.43017148971557617, "num_input_tokens_seen": 41094240, "step": 196, "train_runtime": 71221.1784, "train_tokens_per_second": 576.995 }, { "epoch": 0.6542133665421337, "grad_norm": 0.39712376978562414, "learning_rate": 4.4445004980736064e-05, "loss": 0.43241363763809204, "num_input_tokens_seen": 41305616, "step": 197, "train_runtime": 71517.3021, "train_tokens_per_second": 577.561 }, { "epoch": 0.6575342465753424, "grad_norm": 0.39342236423936194, "learning_rate": 4.439040343453615e-05, "loss": 0.4237765073776245, "num_input_tokens_seen": 41520096, "step": 198, "train_runtime": 71805.2182, "train_tokens_per_second": 578.232 }, { "epoch": 0.6608551266085513, "grad_norm": 0.41950425480127684, "learning_rate": 4.43355687413747e-05, "loss": 0.4452473521232605, "num_input_tokens_seen": 41732296, "step": 199, "train_runtime": 72101.1416, "train_tokens_per_second": 578.802 }, { "epoch": 0.66417600664176, "grad_norm": 0.40431320085427985, "learning_rate": 4.4280501560574875e-05, "loss": 0.43828076124191284, "num_input_tokens_seen": 41945888, "step": 200, "train_runtime": 72390.2758, "train_tokens_per_second": 579.441 }, { "epoch": 0.6674968866749689, "grad_norm": 0.4236783055325729, "learning_rate": 4.4225202554255227e-05, "loss": 0.43148741126060486, "num_input_tokens_seen": 42154560, "step": 201, "train_runtime": 72677.8937, "train_tokens_per_second": 580.019 }, { "epoch": 0.6708177667081777, "grad_norm": 0.4175013090744892, "learning_rate": 4.4169672387321734e-05, "loss": 0.4368186593055725, "num_input_tokens_seen": 42367144, "step": 202, "train_runtime": 72967.0339, "train_tokens_per_second": 580.634 }, { "epoch": 0.6741386467413865, "grad_norm": 0.4198775782986482, "learning_rate": 4.411391172745984e-05, "loss": 0.45101964473724365, "num_input_tokens_seen": 42577472, "step": 203, "train_runtime": 73298.7077, "train_tokens_per_second": 580.876 }, { "epoch": 0.6774595267745953, "grad_norm": 0.4449820513445766, "learning_rate": 4.4057921245126356e-05, "loss": 0.43886077404022217, "num_input_tokens_seen": 42794168, "step": 204, "train_runtime": 73621.9969, "train_tokens_per_second": 581.269 }, { "epoch": 0.680780406807804, "grad_norm": 0.42161643991957015, "learning_rate": 4.4001701613541456e-05, "loss": 0.4312041699886322, "num_input_tokens_seen": 42991984, "step": 205, "train_runtime": 73927.8125, "train_tokens_per_second": 581.54 }, { "epoch": 0.6841012868410129, "grad_norm": 0.39584927873360665, "learning_rate": 4.394525350868059e-05, "loss": 0.4344322383403778, "num_input_tokens_seen": 43201320, "step": 206, "train_runtime": 74199.7183, "train_tokens_per_second": 582.23 }, { "epoch": 0.6874221668742216, "grad_norm": 0.4489424813852947, "learning_rate": 4.388857760926629e-05, "loss": 0.45937955379486084, "num_input_tokens_seen": 43414368, "step": 207, "train_runtime": 74522.8187, "train_tokens_per_second": 582.565 }, { "epoch": 0.6907430469074305, "grad_norm": 0.37900124613449104, "learning_rate": 4.3831674596760086e-05, "loss": 0.42359888553619385, "num_input_tokens_seen": 43629120, "step": 208, "train_runtime": 74821.9211, "train_tokens_per_second": 583.106 }, { "epoch": 0.6940639269406392, "grad_norm": 0.42464374243293224, "learning_rate": 4.3774545155354254e-05, "loss": 0.42763370275497437, "num_input_tokens_seen": 43843256, "step": 209, "train_runtime": 75137.9246, "train_tokens_per_second": 583.504 }, { "epoch": 0.6973848069738481, "grad_norm": 0.4237444617688209, "learning_rate": 4.371718997196364e-05, "loss": 0.43415167927742004, "num_input_tokens_seen": 44051496, "step": 210, "train_runtime": 75442.2008, "train_tokens_per_second": 583.911 }, { "epoch": 0.7007056870070568, "grad_norm": 0.4041444054384139, "learning_rate": 4.3659609736217344e-05, "loss": 0.4416137933731079, "num_input_tokens_seen": 44262176, "step": 211, "train_runtime": 75737.1493, "train_tokens_per_second": 584.418 }, { "epoch": 0.7040265670402657, "grad_norm": 0.43180843603244917, "learning_rate": 4.360180514045048e-05, "loss": 0.44875797629356384, "num_input_tokens_seen": 44473568, "step": 212, "train_runtime": 76047.6024, "train_tokens_per_second": 584.812 }, { "epoch": 0.7073474470734745, "grad_norm": 0.39773328819128007, "learning_rate": 4.354377687969581e-05, "loss": 0.43782973289489746, "num_input_tokens_seen": 44687312, "step": 213, "train_runtime": 76391.6318, "train_tokens_per_second": 584.977 }, { "epoch": 0.7106683271066833, "grad_norm": 0.40961438678357326, "learning_rate": 4.348552565167543e-05, "loss": 0.43622320890426636, "num_input_tokens_seen": 44893976, "step": 214, "train_runtime": 76737.8134, "train_tokens_per_second": 585.031 }, { "epoch": 0.7139892071398921, "grad_norm": 0.40350529580574696, "learning_rate": 4.342705215679232e-05, "loss": 0.44460329413414, "num_input_tokens_seen": 45105256, "step": 215, "train_runtime": 77059.0107, "train_tokens_per_second": 585.334 }, { "epoch": 0.7173100871731009, "grad_norm": 0.39979479853302047, "learning_rate": 4.336835709812202e-05, "loss": 0.4410194754600525, "num_input_tokens_seen": 45310816, "step": 216, "train_runtime": 77368.0757, "train_tokens_per_second": 585.653 }, { "epoch": 0.7206309672063097, "grad_norm": 0.41623591167850843, "learning_rate": 4.330944118140407e-05, "loss": 0.4256168603897095, "num_input_tokens_seen": 45519120, "step": 217, "train_runtime": 77705.3804, "train_tokens_per_second": 585.791 }, { "epoch": 0.7239518472395184, "grad_norm": 0.3995863710752253, "learning_rate": 4.3250305115033554e-05, "loss": 0.44676870107650757, "num_input_tokens_seen": 45731160, "step": 218, "train_runtime": 78023.5191, "train_tokens_per_second": 586.12 }, { "epoch": 0.7272727272727273, "grad_norm": 0.42590140421543404, "learning_rate": 4.3190949610052645e-05, "loss": 0.436279833316803, "num_input_tokens_seen": 45941464, "step": 219, "train_runtime": 78335.8018, "train_tokens_per_second": 586.468 }, { "epoch": 0.730593607305936, "grad_norm": 0.4218597010474441, "learning_rate": 4.3131375380141987e-05, "loss": 0.438388854265213, "num_input_tokens_seen": 46147016, "step": 220, "train_runtime": 78647.2731, "train_tokens_per_second": 586.759 }, { "epoch": 0.7339144873391449, "grad_norm": 0.3930711792215482, "learning_rate": 4.3071583141612135e-05, "loss": 0.41060367226600647, "num_input_tokens_seen": 46360480, "step": 221, "train_runtime": 78958.9727, "train_tokens_per_second": 587.146 }, { "epoch": 0.7372353673723536, "grad_norm": 0.43384408048830225, "learning_rate": 4.301157361339495e-05, "loss": 0.44280239939689636, "num_input_tokens_seen": 46568584, "step": 222, "train_runtime": 79291.1853, "train_tokens_per_second": 587.311 }, { "epoch": 0.7405562474055625, "grad_norm": 0.406154404311589, "learning_rate": 4.295134751703493e-05, "loss": 0.4437750577926636, "num_input_tokens_seen": 46771576, "step": 223, "train_runtime": 79592.0412, "train_tokens_per_second": 587.641 }, { "epoch": 0.7438771274387713, "grad_norm": 0.4006780449494785, "learning_rate": 4.2890905576680576e-05, "loss": 0.4321970045566559, "num_input_tokens_seen": 46985008, "step": 224, "train_runtime": 79909.1385, "train_tokens_per_second": 587.98 }, { "epoch": 0.7471980074719801, "grad_norm": 0.41449445587602396, "learning_rate": 4.283024851907565e-05, "loss": 0.4373015761375427, "num_input_tokens_seen": 47197040, "step": 225, "train_runtime": 80227.1999, "train_tokens_per_second": 588.292 }, { "epoch": 0.7505188875051889, "grad_norm": 0.3980871499607546, "learning_rate": 4.276937707355044e-05, "loss": 0.43579328060150146, "num_input_tokens_seen": 47410648, "step": 226, "train_runtime": 80550.663, "train_tokens_per_second": 588.582 }, { "epoch": 0.7538397675383977, "grad_norm": 0.3935568913760632, "learning_rate": 4.2708291972013026e-05, "loss": 0.4267618656158447, "num_input_tokens_seen": 47622816, "step": 227, "train_runtime": 80869.2781, "train_tokens_per_second": 588.886 }, { "epoch": 0.7571606475716065, "grad_norm": 0.4145918658660143, "learning_rate": 4.2646993948940404e-05, "loss": 0.4380059838294983, "num_input_tokens_seen": 47831208, "step": 228, "train_runtime": 81209.5846, "train_tokens_per_second": 588.985 }, { "epoch": 0.7604815276048152, "grad_norm": 0.40022332070305083, "learning_rate": 4.2585483741369755e-05, "loss": 0.4525393545627594, "num_input_tokens_seen": 48040744, "step": 229, "train_runtime": 81519.0626, "train_tokens_per_second": 589.319 }, { "epoch": 0.7638024076380241, "grad_norm": 0.3941962033520557, "learning_rate": 4.25237620888895e-05, "loss": 0.43285566568374634, "num_input_tokens_seen": 48259272, "step": 230, "train_runtime": 81847.8804, "train_tokens_per_second": 589.622 }, { "epoch": 0.7671232876712328, "grad_norm": 0.40101868961668413, "learning_rate": 4.2461829733630435e-05, "loss": 0.4323822855949402, "num_input_tokens_seen": 48465344, "step": 231, "train_runtime": 82170.7917, "train_tokens_per_second": 589.812 }, { "epoch": 0.7704441677044417, "grad_norm": 0.4119596262038595, "learning_rate": 4.239968742025684e-05, "loss": 0.42581987380981445, "num_input_tokens_seen": 48673880, "step": 232, "train_runtime": 82501.2599, "train_tokens_per_second": 589.977 }, { "epoch": 0.7737650477376504, "grad_norm": 0.3877689138399609, "learning_rate": 4.233733589595746e-05, "loss": 0.4327796697616577, "num_input_tokens_seen": 48882616, "step": 233, "train_runtime": 82835.1779, "train_tokens_per_second": 590.119 }, { "epoch": 0.7770859277708593, "grad_norm": 0.4114675090120471, "learning_rate": 4.227477591043659e-05, "loss": 0.4233633279800415, "num_input_tokens_seen": 49091960, "step": 234, "train_runtime": 83131.9018, "train_tokens_per_second": 590.531 }, { "epoch": 0.7804068078040681, "grad_norm": 0.3844355369797481, "learning_rate": 4.2212008215905e-05, "loss": 0.42800650000572205, "num_input_tokens_seen": 49300768, "step": 235, "train_runtime": 83435.9447, "train_tokens_per_second": 590.882 }, { "epoch": 0.7837276878372769, "grad_norm": 0.4113169608025678, "learning_rate": 4.2149033567070936e-05, "loss": 0.41652315855026245, "num_input_tokens_seen": 49506952, "step": 236, "train_runtime": 83766.8777, "train_tokens_per_second": 591.009 }, { "epoch": 0.7870485678704857, "grad_norm": 0.39540012796803703, "learning_rate": 4.208585272113102e-05, "loss": 0.4363587498664856, "num_input_tokens_seen": 49721280, "step": 237, "train_runtime": 84097.9222, "train_tokens_per_second": 591.231 }, { "epoch": 0.7903694479036945, "grad_norm": 0.41643595129053224, "learning_rate": 4.202246643776116e-05, "loss": 0.4244499206542969, "num_input_tokens_seen": 49932504, "step": 238, "train_runtime": 84462.1721, "train_tokens_per_second": 591.182 }, { "epoch": 0.7936903279369033, "grad_norm": 0.3830161318471299, "learning_rate": 4.195887547910741e-05, "loss": 0.41660749912261963, "num_input_tokens_seen": 50145408, "step": 239, "train_runtime": 84809.6708, "train_tokens_per_second": 591.27 }, { "epoch": 0.797011207970112, "grad_norm": 0.39878877126844325, "learning_rate": 4.189508060977678e-05, "loss": 0.422029972076416, "num_input_tokens_seen": 50356984, "step": 240, "train_runtime": 85134.5676, "train_tokens_per_second": 591.499 }, { "epoch": 0.8003320880033209, "grad_norm": 0.422900739134548, "learning_rate": 4.1831082596828106e-05, "loss": 0.4356965124607086, "num_input_tokens_seen": 50575256, "step": 241, "train_runtime": 85495.476, "train_tokens_per_second": 591.555 }, { "epoch": 0.8036529680365296, "grad_norm": 0.39044646676311695, "learning_rate": 4.176688220976277e-05, "loss": 0.4283129870891571, "num_input_tokens_seen": 50783848, "step": 242, "train_runtime": 85847.0028, "train_tokens_per_second": 591.562 }, { "epoch": 0.8069738480697385, "grad_norm": 0.4058712663118168, "learning_rate": 4.1702480220515475e-05, "loss": 0.43418729305267334, "num_input_tokens_seen": 50992872, "step": 243, "train_runtime": 86162.1526, "train_tokens_per_second": 591.824 }, { "epoch": 0.8102947281029472, "grad_norm": 0.39251763704041825, "learning_rate": 4.1637877403444924e-05, "loss": 0.43433132767677307, "num_input_tokens_seen": 51200424, "step": 244, "train_runtime": 86500.2727, "train_tokens_per_second": 591.911 }, { "epoch": 0.8136156081361561, "grad_norm": 0.40231968052666056, "learning_rate": 4.157307453532457e-05, "loss": 0.4251127243041992, "num_input_tokens_seen": 51413432, "step": 245, "train_runtime": 86884.2933, "train_tokens_per_second": 591.746 }, { "epoch": 0.8169364881693649, "grad_norm": 0.3750941951156154, "learning_rate": 4.150807239533326e-05, "loss": 0.4269641041755676, "num_input_tokens_seen": 51628736, "step": 246, "train_runtime": 87239.0991, "train_tokens_per_second": 591.807 }, { "epoch": 0.8202573682025737, "grad_norm": 0.4010675841785738, "learning_rate": 4.144287176504582e-05, "loss": 0.4357364773750305, "num_input_tokens_seen": 51840264, "step": 247, "train_runtime": 87633.248, "train_tokens_per_second": 591.559 }, { "epoch": 0.8235782482357825, "grad_norm": 0.3909343186769344, "learning_rate": 4.1377473428423696e-05, "loss": 0.4357312321662903, "num_input_tokens_seen": 52046288, "step": 248, "train_runtime": 87966.2886, "train_tokens_per_second": 591.662 }, { "epoch": 0.8268991282689913, "grad_norm": 0.3960218043901571, "learning_rate": 4.131187817180554e-05, "loss": 0.44376251101493835, "num_input_tokens_seen": 52258104, "step": 249, "train_runtime": 88338.6417, "train_tokens_per_second": 591.566 }, { "epoch": 0.8302200083022001, "grad_norm": 0.3793535618817348, "learning_rate": 4.124608678389772e-05, "loss": 0.4338582754135132, "num_input_tokens_seen": 52467344, "step": 250, "train_runtime": 88704.5321, "train_tokens_per_second": 591.484 }, { "epoch": 0.8335408883354088, "grad_norm": 0.3846399315104121, "learning_rate": 4.118010005576485e-05, "loss": 0.4290063977241516, "num_input_tokens_seen": 52676424, "step": 251, "train_runtime": 89006.2443, "train_tokens_per_second": 591.828 }, { "epoch": 0.8368617683686177, "grad_norm": 0.38530352508979077, "learning_rate": 4.11139187808203e-05, "loss": 0.42551183700561523, "num_input_tokens_seen": 52888768, "step": 252, "train_runtime": 89336.1624, "train_tokens_per_second": 592.02 }, { "epoch": 0.8401826484018264, "grad_norm": 0.3915904118401506, "learning_rate": 4.104754375481664e-05, "loss": 0.4403475522994995, "num_input_tokens_seen": 53097984, "step": 253, "train_runtime": 89655.5639, "train_tokens_per_second": 592.244 }, { "epoch": 0.8435035284350353, "grad_norm": 0.3863899136414231, "learning_rate": 4.098097577583605e-05, "loss": 0.43381038308143616, "num_input_tokens_seen": 53304584, "step": 254, "train_runtime": 89991.3024, "train_tokens_per_second": 592.33 }, { "epoch": 0.8468244084682441, "grad_norm": 0.3757804490227546, "learning_rate": 4.0914215644280754e-05, "loss": 0.4274643361568451, "num_input_tokens_seen": 53518560, "step": 255, "train_runtime": 90347.0555, "train_tokens_per_second": 592.366 }, { "epoch": 0.8501452885014529, "grad_norm": 0.3736780903759102, "learning_rate": 4.0847264162863374e-05, "loss": 0.4286618232727051, "num_input_tokens_seen": 53732472, "step": 256, "train_runtime": 90719.5964, "train_tokens_per_second": 592.292 }, { "epoch": 0.8534661685346617, "grad_norm": 0.38598634457880004, "learning_rate": 4.0780122136597323e-05, "loss": 0.42166805267333984, "num_input_tokens_seen": 53942328, "step": 257, "train_runtime": 91055.9615, "train_tokens_per_second": 592.409 }, { "epoch": 0.8567870485678705, "grad_norm": 0.37038915305050435, "learning_rate": 4.071279037278706e-05, "loss": 0.4259738326072693, "num_input_tokens_seen": 54155280, "step": 258, "train_runtime": 91387.5771, "train_tokens_per_second": 592.589 }, { "epoch": 0.8601079286010793, "grad_norm": 0.40838237158618995, "learning_rate": 4.064526968101844e-05, "loss": 0.43134981393814087, "num_input_tokens_seen": 54361816, "step": 259, "train_runtime": 91717.8231, "train_tokens_per_second": 592.707 }, { "epoch": 0.8634288086342881, "grad_norm": 0.39047677880655257, "learning_rate": 4.0577560873148945e-05, "loss": 0.42049890756607056, "num_input_tokens_seen": 54571312, "step": 260, "train_runtime": 92103.0967, "train_tokens_per_second": 592.502 }, { "epoch": 0.8667496886674969, "grad_norm": 0.37467494947609953, "learning_rate": 4.050966476329793e-05, "loss": 0.4206635355949402, "num_input_tokens_seen": 54780248, "step": 261, "train_runtime": 92460.0388, "train_tokens_per_second": 592.475 }, { "epoch": 0.8700705687007056, "grad_norm": 0.3924303913150216, "learning_rate": 4.044158216783685e-05, "loss": 0.4227411448955536, "num_input_tokens_seen": 54995344, "step": 262, "train_runtime": 92858.2, "train_tokens_per_second": 592.251 }, { "epoch": 0.8733914487339145, "grad_norm": 0.3855717732710405, "learning_rate": 4.037331390537939e-05, "loss": 0.44105619192123413, "num_input_tokens_seen": 55205656, "step": 263, "train_runtime": 93204.9604, "train_tokens_per_second": 592.304 }, { "epoch": 0.8767123287671232, "grad_norm": 0.3908938102943696, "learning_rate": 4.0304860796771745e-05, "loss": 0.4170098602771759, "num_input_tokens_seen": 55422392, "step": 264, "train_runtime": 93565.439, "train_tokens_per_second": 592.338 }, { "epoch": 0.8800332088003321, "grad_norm": 0.4136282259644209, "learning_rate": 4.023622366508261e-05, "loss": 0.4379138946533203, "num_input_tokens_seen": 55631840, "step": 265, "train_runtime": 93925.3236, "train_tokens_per_second": 592.299 }, { "epoch": 0.8833540888335409, "grad_norm": 0.38803641813628786, "learning_rate": 4.0167403335593335e-05, "loss": 0.42350104451179504, "num_input_tokens_seen": 55841632, "step": 266, "train_runtime": 94268.3297, "train_tokens_per_second": 592.369 }, { "epoch": 0.8866749688667497, "grad_norm": 0.40620239402564373, "learning_rate": 4.0098400635788035e-05, "loss": 0.4372592866420746, "num_input_tokens_seen": 56048016, "step": 267, "train_runtime": 94615.1943, "train_tokens_per_second": 592.379 }, { "epoch": 0.8899958488999585, "grad_norm": 0.37503093661431025, "learning_rate": 4.002921639534362e-05, "loss": 0.42278414964675903, "num_input_tokens_seen": 56261912, "step": 268, "train_runtime": 94956.7176, "train_tokens_per_second": 592.501 }, { "epoch": 0.8933167289331673, "grad_norm": 0.41742542501612245, "learning_rate": 3.99598514461198e-05, "loss": 0.4389027953147888, "num_input_tokens_seen": 56471328, "step": 269, "train_runtime": 95300.2019, "train_tokens_per_second": 592.563 }, { "epoch": 0.8966376089663761, "grad_norm": 0.391251953410014, "learning_rate": 3.98903066221491e-05, "loss": 0.42557549476623535, "num_input_tokens_seen": 56682008, "step": 270, "train_runtime": 95644.0705, "train_tokens_per_second": 592.635 }, { "epoch": 0.8999584889995849, "grad_norm": 0.4283747853318434, "learning_rate": 3.9820582759626825e-05, "loss": 0.4394855797290802, "num_input_tokens_seen": 56889488, "step": 271, "train_runtime": 95979.7251, "train_tokens_per_second": 592.724 }, { "epoch": 0.9032793690327937, "grad_norm": 0.4300517423103051, "learning_rate": 3.975068069690102e-05, "loss": 0.43113696575164795, "num_input_tokens_seen": 57093032, "step": 272, "train_runtime": 96329.3497, "train_tokens_per_second": 592.686 }, { "epoch": 0.9066002490660025, "grad_norm": 0.3899727685818111, "learning_rate": 3.968060127446236e-05, "loss": 0.42244070768356323, "num_input_tokens_seen": 57299024, "step": 273, "train_runtime": 96666.2062, "train_tokens_per_second": 592.751 }, { "epoch": 0.9099211290992113, "grad_norm": 0.41844195458428385, "learning_rate": 3.96103453349341e-05, "loss": 0.4271528422832489, "num_input_tokens_seen": 57503240, "step": 274, "train_runtime": 97010.9563, "train_tokens_per_second": 592.75 }, { "epoch": 0.91324200913242, "grad_norm": 0.3907441904064742, "learning_rate": 3.953991372306186e-05, "loss": 0.4269882142543793, "num_input_tokens_seen": 57708112, "step": 275, "train_runtime": 97323.1705, "train_tokens_per_second": 592.953 }, { "epoch": 0.9165628891656289, "grad_norm": 0.41012183668692587, "learning_rate": 3.946930728570355e-05, "loss": 0.42415356636047363, "num_input_tokens_seen": 57922360, "step": 276, "train_runtime": 97687.3888, "train_tokens_per_second": 592.936 }, { "epoch": 0.9198837691988377, "grad_norm": 0.39459517114956416, "learning_rate": 3.9398526871819154e-05, "loss": 0.4297690689563751, "num_input_tokens_seen": 58128672, "step": 277, "train_runtime": 98085.0414, "train_tokens_per_second": 592.635 }, { "epoch": 0.9232046492320465, "grad_norm": 0.4194019635795066, "learning_rate": 3.932757333246048e-05, "loss": 0.4301888346672058, "num_input_tokens_seen": 58336344, "step": 278, "train_runtime": 98420.2889, "train_tokens_per_second": 592.727 }, { "epoch": 0.9265255292652553, "grad_norm": 0.39025131597816, "learning_rate": 3.925644752076101e-05, "loss": 0.4214772582054138, "num_input_tokens_seen": 58546888, "step": 279, "train_runtime": 98765.7552, "train_tokens_per_second": 592.785 }, { "epoch": 0.9298464092984641, "grad_norm": 0.4046572541710451, "learning_rate": 3.918515029192559e-05, "loss": 0.43122127652168274, "num_input_tokens_seen": 58751824, "step": 280, "train_runtime": 99110.463, "train_tokens_per_second": 592.791 }, { "epoch": 0.9331672893316729, "grad_norm": 0.3948281561610528, "learning_rate": 3.911368250322014e-05, "loss": 0.42521238327026367, "num_input_tokens_seen": 58962864, "step": 281, "train_runtime": 99445.1063, "train_tokens_per_second": 592.919 }, { "epoch": 0.9364881693648817, "grad_norm": 0.38858772906337613, "learning_rate": 3.9042045013961386e-05, "loss": 0.438882052898407, "num_input_tokens_seen": 59173856, "step": 282, "train_runtime": 99776.4792, "train_tokens_per_second": 593.064 }, { "epoch": 0.9398090493980905, "grad_norm": 0.39819259245098154, "learning_rate": 3.897023868550649e-05, "loss": 0.42735034227371216, "num_input_tokens_seen": 59380128, "step": 283, "train_runtime": 100105.3987, "train_tokens_per_second": 593.176 }, { "epoch": 0.9431299294312993, "grad_norm": 0.3891158766101056, "learning_rate": 3.889826438124271e-05, "loss": 0.41743844747543335, "num_input_tokens_seen": 59596624, "step": 284, "train_runtime": 100422.8071, "train_tokens_per_second": 593.457 }, { "epoch": 0.9464508094645081, "grad_norm": 0.3883115890628947, "learning_rate": 3.882612296657701e-05, "loss": 0.4251156747341156, "num_input_tokens_seen": 59810096, "step": 285, "train_runtime": 100757.2956, "train_tokens_per_second": 593.606 }, { "epoch": 0.9497716894977168, "grad_norm": 0.37613713303007656, "learning_rate": 3.875381530892569e-05, "loss": 0.42936187982559204, "num_input_tokens_seen": 60022264, "step": 286, "train_runtime": 101078.7323, "train_tokens_per_second": 593.817 }, { "epoch": 0.9530925695309257, "grad_norm": 0.3785025854131105, "learning_rate": 3.8681342277703906e-05, "loss": 0.42513227462768555, "num_input_tokens_seen": 60234256, "step": 287, "train_runtime": 101415.6123, "train_tokens_per_second": 593.935 }, { "epoch": 0.9564134495641345, "grad_norm": 0.39103546291912306, "learning_rate": 3.860870474431521e-05, "loss": 0.4264582395553589, "num_input_tokens_seen": 60444896, "step": 288, "train_runtime": 101763.8424, "train_tokens_per_second": 593.972 }, { "epoch": 0.9597343295973433, "grad_norm": 0.3962293736846428, "learning_rate": 3.853590358214119e-05, "loss": 0.43254929780960083, "num_input_tokens_seen": 60650528, "step": 289, "train_runtime": 102121.7096, "train_tokens_per_second": 593.904 }, { "epoch": 0.9630552096305521, "grad_norm": 0.4082769075402112, "learning_rate": 3.846293966653076e-05, "loss": 0.42426615953445435, "num_input_tokens_seen": 60854088, "step": 290, "train_runtime": 102451.7788, "train_tokens_per_second": 593.978 }, { "epoch": 0.9663760896637609, "grad_norm": 0.3883582467347925, "learning_rate": 3.8389813874789856e-05, "loss": 0.42260557413101196, "num_input_tokens_seen": 61063120, "step": 291, "train_runtime": 102793.9542, "train_tokens_per_second": 594.034 }, { "epoch": 0.9696969696969697, "grad_norm": 0.3915773264969273, "learning_rate": 3.831652708617073e-05, "loss": 0.42293086647987366, "num_input_tokens_seen": 61269568, "step": 292, "train_runtime": 103147.3882, "train_tokens_per_second": 594.0 }, { "epoch": 0.9730178497301785, "grad_norm": 0.38714117238683665, "learning_rate": 3.8243080181861435e-05, "loss": 0.4292466640472412, "num_input_tokens_seen": 61480384, "step": 293, "train_runtime": 103482.3165, "train_tokens_per_second": 594.115 }, { "epoch": 0.9763387297633873, "grad_norm": 0.4046350238046135, "learning_rate": 3.816947404497525e-05, "loss": 0.43775731325149536, "num_input_tokens_seen": 61691472, "step": 294, "train_runtime": 103800.9043, "train_tokens_per_second": 594.325 }, { "epoch": 0.979659609796596, "grad_norm": 0.3764306417142319, "learning_rate": 3.809570956054004e-05, "loss": 0.4230045676231384, "num_input_tokens_seen": 61902552, "step": 295, "train_runtime": 104127.449, "train_tokens_per_second": 594.488 }, { "epoch": 0.9829804898298049, "grad_norm": 0.38503361251415075, "learning_rate": 3.802178761548759e-05, "loss": 0.42893537878990173, "num_input_tokens_seen": 62117136, "step": 296, "train_runtime": 104476.6378, "train_tokens_per_second": 594.555 }, { "epoch": 0.9863013698630136, "grad_norm": 0.3827759445429188, "learning_rate": 3.794770909864298e-05, "loss": 0.42035895586013794, "num_input_tokens_seen": 62326552, "step": 297, "train_runtime": 104841.9096, "train_tokens_per_second": 594.481 }, { "epoch": 0.9896222498962225, "grad_norm": 0.38065630730672406, "learning_rate": 3.7873474900713895e-05, "loss": 0.42545071244239807, "num_input_tokens_seen": 62534528, "step": 298, "train_runtime": 105152.4111, "train_tokens_per_second": 594.704 }, { "epoch": 0.9929431299294313, "grad_norm": 0.3747980251621376, "learning_rate": 3.779908591427988e-05, "loss": 0.4190545380115509, "num_input_tokens_seen": 62740888, "step": 299, "train_runtime": 105473.6786, "train_tokens_per_second": 594.849 }, { "epoch": 0.9962640099626401, "grad_norm": 0.38821628319538737, "learning_rate": 3.772454303378166e-05, "loss": 0.4248197376728058, "num_input_tokens_seen": 62955768, "step": 300, "train_runtime": 105789.6004, "train_tokens_per_second": 595.104 }, { "epoch": 0.9995848899958489, "grad_norm": 0.38033213887355916, "learning_rate": 3.764984715551032e-05, "loss": 0.420073926448822, "num_input_tokens_seen": 63168384, "step": 301, "train_runtime": 106150.1852, "train_tokens_per_second": 595.085 }, { "epoch": 1.0, "grad_norm": 0.9320833137829965, "learning_rate": 3.757499917759659e-05, "loss": 0.37945839762687683, "num_input_tokens_seen": 63194912, "step": 302, "train_runtime": 106190.1396, "train_tokens_per_second": 595.111 }, { "epoch": 1.0033208800332087, "grad_norm": 0.5033957643331758, "learning_rate": 3.7500000000000003e-05, "loss": 0.30404990911483765, "num_input_tokens_seen": 63401624, "step": 303, "train_runtime": 106521.7044, "train_tokens_per_second": 595.199 }, { "epoch": 1.0066417600664177, "grad_norm": 0.4271497357717565, "learning_rate": 3.742485052449812e-05, "loss": 0.2919006645679474, "num_input_tokens_seen": 63609312, "step": 304, "train_runtime": 106844.5668, "train_tokens_per_second": 595.344 }, { "epoch": 1.0099626400996264, "grad_norm": 0.4330919375922481, "learning_rate": 3.734955165467563e-05, "loss": 0.28770512342453003, "num_input_tokens_seen": 63825640, "step": 305, "train_runtime": 107154.9013, "train_tokens_per_second": 595.639 }, { "epoch": 1.0132835201328352, "grad_norm": 0.4836295051048188, "learning_rate": 3.727410429591353e-05, "loss": 0.2927480936050415, "num_input_tokens_seen": 64037224, "step": 306, "train_runtime": 107476.4778, "train_tokens_per_second": 595.825 }, { "epoch": 1.016604400166044, "grad_norm": 0.4966599479862272, "learning_rate": 3.719850935537821e-05, "loss": 0.2891179025173187, "num_input_tokens_seen": 64235520, "step": 307, "train_runtime": 107799.5328, "train_tokens_per_second": 595.879 }, { "epoch": 1.0199252801992529, "grad_norm": 0.4457594455683536, "learning_rate": 3.712276774201058e-05, "loss": 0.2905241847038269, "num_input_tokens_seen": 64446048, "step": 308, "train_runtime": 108112.7933, "train_tokens_per_second": 596.1 }, { "epoch": 1.0232461602324616, "grad_norm": 0.45751366398394683, "learning_rate": 3.70468803665151e-05, "loss": 0.2945302724838257, "num_input_tokens_seen": 64657856, "step": 309, "train_runtime": 108471.7073, "train_tokens_per_second": 596.08 }, { "epoch": 1.0265670402656704, "grad_norm": 0.4430149534448321, "learning_rate": 3.697084814134886e-05, "loss": 0.29261961579322815, "num_input_tokens_seen": 64866088, "step": 310, "train_runtime": 108825.0243, "train_tokens_per_second": 596.059 }, { "epoch": 1.029887920298879, "grad_norm": 0.43594825072357923, "learning_rate": 3.6894671980710574e-05, "loss": 0.28819987177848816, "num_input_tokens_seen": 65076416, "step": 311, "train_runtime": 109178.0725, "train_tokens_per_second": 596.058 }, { "epoch": 1.033208800332088, "grad_norm": 0.41101934768390724, "learning_rate": 3.681835280052967e-05, "loss": 0.2864361107349396, "num_input_tokens_seen": 65292888, "step": 312, "train_runtime": 109516.5499, "train_tokens_per_second": 596.192 }, { "epoch": 1.0365296803652968, "grad_norm": 0.4268726086033666, "learning_rate": 3.674189151845515e-05, "loss": 0.2746656537055969, "num_input_tokens_seen": 65498224, "step": 313, "train_runtime": 109862.6828, "train_tokens_per_second": 596.183 }, { "epoch": 1.0398505603985055, "grad_norm": 0.4093744360612173, "learning_rate": 3.666528905384467e-05, "loss": 0.2784012258052826, "num_input_tokens_seen": 65707944, "step": 314, "train_runtime": 110205.4802, "train_tokens_per_second": 596.231 }, { "epoch": 1.0431714404317145, "grad_norm": 0.4174109094441792, "learning_rate": 3.6588546327753425e-05, "loss": 0.26996612548828125, "num_input_tokens_seen": 65919456, "step": 315, "train_runtime": 110543.2622, "train_tokens_per_second": 596.323 }, { "epoch": 1.0464923204649232, "grad_norm": 0.41600103285399004, "learning_rate": 3.651166426292309e-05, "loss": 0.29366016387939453, "num_input_tokens_seen": 66136744, "step": 316, "train_runtime": 110882.303, "train_tokens_per_second": 596.459 }, { "epoch": 1.049813200498132, "grad_norm": 0.4026366147820554, "learning_rate": 3.6434643783770736e-05, "loss": 0.28355640172958374, "num_input_tokens_seen": 66340048, "step": 317, "train_runtime": 111205.8332, "train_tokens_per_second": 596.552 }, { "epoch": 1.0531340805313407, "grad_norm": 0.3919020200232825, "learning_rate": 3.63574858163777e-05, "loss": 0.2831692099571228, "num_input_tokens_seen": 66548192, "step": 318, "train_runtime": 111525.5503, "train_tokens_per_second": 596.708 }, { "epoch": 1.0564549605645497, "grad_norm": 0.4153954543904867, "learning_rate": 3.6280191288478436e-05, "loss": 0.27536556124687195, "num_input_tokens_seen": 66754128, "step": 319, "train_runtime": 111855.0462, "train_tokens_per_second": 596.791 }, { "epoch": 1.0597758405977584, "grad_norm": 0.38088769239269615, "learning_rate": 3.620276112944941e-05, "loss": 0.2778305113315582, "num_input_tokens_seen": 66972952, "step": 320, "train_runtime": 112190.5968, "train_tokens_per_second": 596.957 }, { "epoch": 1.0630967206309672, "grad_norm": 0.39990695702707674, "learning_rate": 3.612519627029787e-05, "loss": 0.28065669536590576, "num_input_tokens_seen": 67180352, "step": 321, "train_runtime": 112518.5706, "train_tokens_per_second": 597.06 }, { "epoch": 1.0664176006641761, "grad_norm": 0.4258637472274097, "learning_rate": 3.604749764365069e-05, "loss": 0.2865450978279114, "num_input_tokens_seen": 67385592, "step": 322, "train_runtime": 112857.1166, "train_tokens_per_second": 597.088 }, { "epoch": 1.0697384806973849, "grad_norm": 0.39537293545005486, "learning_rate": 3.596966618374313e-05, "loss": 0.2833058536052704, "num_input_tokens_seen": 67595592, "step": 323, "train_runtime": 113184.7394, "train_tokens_per_second": 597.215 }, { "epoch": 1.0730593607305936, "grad_norm": 0.4132576671722844, "learning_rate": 3.5891702826407633e-05, "loss": 0.27913883328437805, "num_input_tokens_seen": 67798704, "step": 324, "train_runtime": 113506.6838, "train_tokens_per_second": 597.31 }, { "epoch": 1.0763802407638023, "grad_norm": 0.3857789087068665, "learning_rate": 3.5813608509062526e-05, "loss": 0.2752624750137329, "num_input_tokens_seen": 68007752, "step": 325, "train_runtime": 113826.221, "train_tokens_per_second": 597.47 }, { "epoch": 1.0797011207970113, "grad_norm": 0.3998749386074476, "learning_rate": 3.5735384170700815e-05, "loss": 0.2673056125640869, "num_input_tokens_seen": 68215152, "step": 326, "train_runtime": 114151.7369, "train_tokens_per_second": 597.583 }, { "epoch": 1.08302200083022, "grad_norm": 0.40111875828855226, "learning_rate": 3.5657030751878814e-05, "loss": 0.2820720076560974, "num_input_tokens_seen": 68426056, "step": 327, "train_runtime": 114479.9222, "train_tokens_per_second": 597.712 }, { "epoch": 1.0863428808634288, "grad_norm": 0.40950466797946455, "learning_rate": 3.5578549194704915e-05, "loss": 0.2856150269508362, "num_input_tokens_seen": 68636448, "step": 328, "train_runtime": 114811.6148, "train_tokens_per_second": 597.818 }, { "epoch": 1.0896637608966375, "grad_norm": 0.3836534111238558, "learning_rate": 3.5499940442828206e-05, "loss": 0.2782435715198517, "num_input_tokens_seen": 68848208, "step": 329, "train_runtime": 115133.6844, "train_tokens_per_second": 597.985 }, { "epoch": 1.0929846409298465, "grad_norm": 0.3995193601604606, "learning_rate": 3.5421205441427154e-05, "loss": 0.2769739627838135, "num_input_tokens_seen": 69057048, "step": 330, "train_runtime": 115480.6245, "train_tokens_per_second": 597.997 }, { "epoch": 1.0963055209630552, "grad_norm": 0.3798247231550839, "learning_rate": 3.534234513719821e-05, "loss": 0.26873889565467834, "num_input_tokens_seen": 69266936, "step": 331, "train_runtime": 115815.7124, "train_tokens_per_second": 598.079 }, { "epoch": 1.099626400996264, "grad_norm": 0.39774601597768344, "learning_rate": 3.526336047834445e-05, "loss": 0.27480942010879517, "num_input_tokens_seen": 69484600, "step": 332, "train_runtime": 116121.2439, "train_tokens_per_second": 598.38 }, { "epoch": 1.102947281029473, "grad_norm": 0.38221374681076564, "learning_rate": 3.51842524145642e-05, "loss": 0.28268882632255554, "num_input_tokens_seen": 69699400, "step": 333, "train_runtime": 116434.3749, "train_tokens_per_second": 598.615 }, { "epoch": 1.1062681610626817, "grad_norm": 0.39413519462184743, "learning_rate": 3.5105021897039544e-05, "loss": 0.269026517868042, "num_input_tokens_seen": 69906120, "step": 334, "train_runtime": 116762.9421, "train_tokens_per_second": 598.701 }, { "epoch": 1.1095890410958904, "grad_norm": 0.37848824417393606, "learning_rate": 3.502566987842496e-05, "loss": 0.27428004145622253, "num_input_tokens_seen": 70120272, "step": 335, "train_runtime": 117070.8549, "train_tokens_per_second": 598.956 }, { "epoch": 1.1129099211290991, "grad_norm": 0.3836084854710085, "learning_rate": 3.494619731283581e-05, "loss": 0.2684640884399414, "num_input_tokens_seen": 70324864, "step": 336, "train_runtime": 117393.0466, "train_tokens_per_second": 599.055 }, { "epoch": 1.116230801162308, "grad_norm": 0.3902774854399195, "learning_rate": 3.4866605155836915e-05, "loss": 0.26904401183128357, "num_input_tokens_seen": 70527400, "step": 337, "train_runtime": 117730.4842, "train_tokens_per_second": 599.058 }, { "epoch": 1.1195516811955168, "grad_norm": 0.39750388845414714, "learning_rate": 3.478689436443102e-05, "loss": 0.2700223922729492, "num_input_tokens_seen": 70736048, "step": 338, "train_runtime": 118077.8959, "train_tokens_per_second": 599.063 }, { "epoch": 1.1228725612287256, "grad_norm": 0.39190476519214906, "learning_rate": 3.470706589704734e-05, "loss": 0.2767173647880554, "num_input_tokens_seen": 70944064, "step": 339, "train_runtime": 118411.5002, "train_tokens_per_second": 599.132 }, { "epoch": 1.1261934412619343, "grad_norm": 0.4025905587697693, "learning_rate": 3.4627120713529984e-05, "loss": 0.2738657593727112, "num_input_tokens_seen": 71153928, "step": 340, "train_runtime": 118779.5802, "train_tokens_per_second": 599.042 }, { "epoch": 1.1295143212951433, "grad_norm": 0.3869411472272897, "learning_rate": 3.4547059775126445e-05, "loss": 0.2712458074092865, "num_input_tokens_seen": 71360176, "step": 341, "train_runtime": 119118.7446, "train_tokens_per_second": 599.068 }, { "epoch": 1.132835201328352, "grad_norm": 0.3957657722932085, "learning_rate": 3.446688404447601e-05, "loss": 0.27859729528427124, "num_input_tokens_seen": 71569832, "step": 342, "train_runtime": 119464.5524, "train_tokens_per_second": 599.088 }, { "epoch": 1.1361560813615608, "grad_norm": 0.3846251594405571, "learning_rate": 3.438659448559825e-05, "loss": 0.27219098806381226, "num_input_tokens_seen": 71781800, "step": 343, "train_runtime": 119813.5347, "train_tokens_per_second": 599.113 }, { "epoch": 1.1394769613947697, "grad_norm": 0.378573190625489, "learning_rate": 3.430619206388136e-05, "loss": 0.2708030343055725, "num_input_tokens_seen": 71991624, "step": 344, "train_runtime": 120149.2072, "train_tokens_per_second": 599.185 }, { "epoch": 1.1427978414279785, "grad_norm": 0.38601392863797407, "learning_rate": 3.422567774607058e-05, "loss": 0.27873820066452026, "num_input_tokens_seen": 72202208, "step": 345, "train_runtime": 120485.5183, "train_tokens_per_second": 599.26 }, { "epoch": 1.1461187214611872, "grad_norm": 0.3975867751092877, "learning_rate": 3.414505250025659e-05, "loss": 0.27958598732948303, "num_input_tokens_seen": 72406480, "step": 346, "train_runtime": 120829.4146, "train_tokens_per_second": 599.245 }, { "epoch": 1.149439601494396, "grad_norm": 0.3909178786033204, "learning_rate": 3.406431729586382e-05, "loss": 0.2782125473022461, "num_input_tokens_seen": 72624512, "step": 347, "train_runtime": 121146.1975, "train_tokens_per_second": 599.478 }, { "epoch": 1.152760481527605, "grad_norm": 0.38648568091264757, "learning_rate": 3.398347310363884e-05, "loss": 0.2749646306037903, "num_input_tokens_seen": 72830976, "step": 348, "train_runtime": 121466.5339, "train_tokens_per_second": 599.597 }, { "epoch": 1.1560813615608136, "grad_norm": 0.3958299327319909, "learning_rate": 3.390252089563867e-05, "loss": 0.2624763548374176, "num_input_tokens_seen": 73039008, "step": 349, "train_runtime": 121787.9261, "train_tokens_per_second": 599.723 }, { "epoch": 1.1594022415940224, "grad_norm": 0.3821615939869412, "learning_rate": 3.38214616452191e-05, "loss": 0.27640023827552795, "num_input_tokens_seen": 73251288, "step": 350, "train_runtime": 122127.4507, "train_tokens_per_second": 599.794 }, { "epoch": 1.1627231216272311, "grad_norm": 0.38760963993326286, "learning_rate": 3.3740296327022984e-05, "loss": 0.280078649520874, "num_input_tokens_seen": 73461608, "step": 351, "train_runtime": 122473.9654, "train_tokens_per_second": 599.814 }, { "epoch": 1.16604400166044, "grad_norm": 0.3959781198849755, "learning_rate": 3.365902591696848e-05, "loss": 0.27999985218048096, "num_input_tokens_seen": 73669488, "step": 352, "train_runtime": 122812.2656, "train_tokens_per_second": 599.854 }, { "epoch": 1.1693648816936488, "grad_norm": 0.3853225278648316, "learning_rate": 3.35776513922374e-05, "loss": 0.2793060839176178, "num_input_tokens_seen": 73882160, "step": 353, "train_runtime": 123139.9603, "train_tokens_per_second": 599.985 }, { "epoch": 1.1726857617268576, "grad_norm": 0.3873516476714873, "learning_rate": 3.3496173731263377e-05, "loss": 0.28069770336151123, "num_input_tokens_seen": 74093192, "step": 354, "train_runtime": 123430.8299, "train_tokens_per_second": 600.281 }, { "epoch": 1.1760066417600665, "grad_norm": 0.4051295123517068, "learning_rate": 3.341459391372016e-05, "loss": 0.27695173025131226, "num_input_tokens_seen": 74299168, "step": 355, "train_runtime": 123735.1962, "train_tokens_per_second": 600.469 }, { "epoch": 1.1793275217932753, "grad_norm": 0.38059322107156407, "learning_rate": 3.333291292050981e-05, "loss": 0.27618107199668884, "num_input_tokens_seen": 74514120, "step": 356, "train_runtime": 124047.6714, "train_tokens_per_second": 600.689 }, { "epoch": 1.182648401826484, "grad_norm": 0.4091885450836347, "learning_rate": 3.32511317337509e-05, "loss": 0.2841775417327881, "num_input_tokens_seen": 74721888, "step": 357, "train_runtime": 124336.9487, "train_tokens_per_second": 600.963 }, { "epoch": 1.1859692818596927, "grad_norm": 0.37250379189016386, "learning_rate": 3.31692513367667e-05, "loss": 0.272269070148468, "num_input_tokens_seen": 74936984, "step": 358, "train_runtime": 124657.9178, "train_tokens_per_second": 601.141 }, { "epoch": 1.1892901618929017, "grad_norm": 0.381451021409175, "learning_rate": 3.30872727140734e-05, "loss": 0.2603960335254669, "num_input_tokens_seen": 75143336, "step": 359, "train_runtime": 124927.6755, "train_tokens_per_second": 601.495 }, { "epoch": 1.1926110419261104, "grad_norm": 0.3950419019455725, "learning_rate": 3.300519685136822e-05, "loss": 0.28243565559387207, "num_input_tokens_seen": 75351112, "step": 360, "train_runtime": 125228.2783, "train_tokens_per_second": 601.71 }, { "epoch": 1.1959319219593192, "grad_norm": 0.37528707124050315, "learning_rate": 3.292302473551757e-05, "loss": 0.2702184021472931, "num_input_tokens_seen": 75557408, "step": 361, "train_runtime": 125540.5053, "train_tokens_per_second": 601.857 }, { "epoch": 1.199252801992528, "grad_norm": 0.37884164580272617, "learning_rate": 3.284075735454521e-05, "loss": 0.27533334493637085, "num_input_tokens_seen": 75767296, "step": 362, "train_runtime": 125845.9268, "train_tokens_per_second": 602.064 }, { "epoch": 1.202573682025737, "grad_norm": 0.39976194003859894, "learning_rate": 3.2758395697620334e-05, "loss": 0.2790154218673706, "num_input_tokens_seen": 75977360, "step": 363, "train_runtime": 126143.2783, "train_tokens_per_second": 602.31 }, { "epoch": 1.2058945620589456, "grad_norm": 0.3896006899621068, "learning_rate": 3.267594075504572e-05, "loss": 0.2722439467906952, "num_input_tokens_seen": 76187232, "step": 364, "train_runtime": 126434.2056, "train_tokens_per_second": 602.584 }, { "epoch": 1.2092154420921544, "grad_norm": 0.38401877477962953, "learning_rate": 3.259339351824575e-05, "loss": 0.2734944224357605, "num_input_tokens_seen": 76397096, "step": 365, "train_runtime": 126734.3435, "train_tokens_per_second": 602.813 }, { "epoch": 1.2125363221253633, "grad_norm": 0.38369335775192365, "learning_rate": 3.2510754979754595e-05, "loss": 0.2712152600288391, "num_input_tokens_seen": 76608960, "step": 366, "train_runtime": 127029.3741, "train_tokens_per_second": 603.081 }, { "epoch": 1.215857202158572, "grad_norm": 0.3822658285007293, "learning_rate": 3.2428026133204184e-05, "loss": 0.27213454246520996, "num_input_tokens_seen": 76820816, "step": 367, "train_runtime": 127343.2587, "train_tokens_per_second": 603.258 }, { "epoch": 1.2191780821917808, "grad_norm": 0.39226758271428697, "learning_rate": 3.23452079733123e-05, "loss": 0.27443960309028625, "num_input_tokens_seen": 77032072, "step": 368, "train_runtime": 127645.7427, "train_tokens_per_second": 603.483 }, { "epoch": 1.2224989622249895, "grad_norm": 0.3981002235888763, "learning_rate": 3.226230149587063e-05, "loss": 0.27778327465057373, "num_input_tokens_seen": 77235072, "step": 369, "train_runtime": 127921.4605, "train_tokens_per_second": 603.769 }, { "epoch": 1.2258198422581985, "grad_norm": 0.3986855210996142, "learning_rate": 3.217930769773275e-05, "loss": 0.27579450607299805, "num_input_tokens_seen": 77440800, "step": 370, "train_runtime": 128242.8115, "train_tokens_per_second": 603.861 }, { "epoch": 1.2291407222914073, "grad_norm": 0.39262841869107085, "learning_rate": 3.20962275768022e-05, "loss": 0.2702082395553589, "num_input_tokens_seen": 77649600, "step": 371, "train_runtime": 128566.7871, "train_tokens_per_second": 603.963 }, { "epoch": 1.232461602324616, "grad_norm": 0.39164658830046184, "learning_rate": 3.201306213202041e-05, "loss": 0.2788946032524109, "num_input_tokens_seen": 77863888, "step": 372, "train_runtime": 128905.6107, "train_tokens_per_second": 604.038 }, { "epoch": 1.2357824823578247, "grad_norm": 0.4023717438360454, "learning_rate": 3.1929812363354766e-05, "loss": 0.2840201258659363, "num_input_tokens_seen": 78074192, "step": 373, "train_runtime": 129237.3895, "train_tokens_per_second": 604.115 }, { "epoch": 1.2391033623910337, "grad_norm": 0.386463419137028, "learning_rate": 3.184647927178654e-05, "loss": 0.27590206265449524, "num_input_tokens_seen": 78284336, "step": 374, "train_runtime": 129532.0841, "train_tokens_per_second": 604.363 }, { "epoch": 1.2424242424242424, "grad_norm": 0.38542179134551174, "learning_rate": 3.1763063859298845e-05, "loss": 0.2710535526275635, "num_input_tokens_seen": 78492168, "step": 375, "train_runtime": 129812.9999, "train_tokens_per_second": 604.656 }, { "epoch": 1.2457451224574512, "grad_norm": 0.39523254227806587, "learning_rate": 3.167956712886463e-05, "loss": 0.2793447971343994, "num_input_tokens_seen": 78699680, "step": 376, "train_runtime": 130099.1707, "train_tokens_per_second": 604.921 }, { "epoch": 1.2490660024906601, "grad_norm": 0.429701377766994, "learning_rate": 3.159599008443459e-05, "loss": 0.2739261984825134, "num_input_tokens_seen": 78908672, "step": 377, "train_runtime": 130387.0838, "train_tokens_per_second": 605.188 }, { "epoch": 1.2523868825238689, "grad_norm": 0.3786007140269692, "learning_rate": 3.151233373092511e-05, "loss": 0.27112993597984314, "num_input_tokens_seen": 79123136, "step": 378, "train_runtime": 130666.4457, "train_tokens_per_second": 605.535 }, { "epoch": 1.2557077625570776, "grad_norm": 0.42642032573904515, "learning_rate": 3.142859907420615e-05, "loss": 0.2774733901023865, "num_input_tokens_seen": 79329632, "step": 379, "train_runtime": 130946.2964, "train_tokens_per_second": 605.818 }, { "epoch": 1.2590286425902864, "grad_norm": 0.3927295783018496, "learning_rate": 3.1344787121089204e-05, "loss": 0.2741287350654602, "num_input_tokens_seen": 79539320, "step": 380, "train_runtime": 131260.2395, "train_tokens_per_second": 605.967 }, { "epoch": 1.262349522623495, "grad_norm": 0.3990932025026113, "learning_rate": 3.126089887931515e-05, "loss": 0.2756434679031372, "num_input_tokens_seen": 79746632, "step": 381, "train_runtime": 131566.4324, "train_tokens_per_second": 606.132 }, { "epoch": 1.265670402656704, "grad_norm": 0.4180598428167107, "learning_rate": 3.117693535754213e-05, "loss": 0.28269442915916443, "num_input_tokens_seen": 79957024, "step": 382, "train_runtime": 131878.7421, "train_tokens_per_second": 606.292 }, { "epoch": 1.2689912826899128, "grad_norm": 0.38265026874917696, "learning_rate": 3.109289756533349e-05, "loss": 0.2642611861228943, "num_input_tokens_seen": 80164488, "step": 383, "train_runtime": 132161.4175, "train_tokens_per_second": 606.565 }, { "epoch": 1.2723121627231215, "grad_norm": 0.40189004419689933, "learning_rate": 3.100878651314554e-05, "loss": 0.27799808979034424, "num_input_tokens_seen": 80367968, "step": 384, "train_runtime": 132450.046, "train_tokens_per_second": 606.779 }, { "epoch": 1.2756330427563305, "grad_norm": 0.3878545992914858, "learning_rate": 3.092460321231547e-05, "loss": 0.2755585014820099, "num_input_tokens_seen": 80575048, "step": 385, "train_runtime": 132729.5763, "train_tokens_per_second": 607.062 }, { "epoch": 1.2789539227895392, "grad_norm": 0.3883874808873886, "learning_rate": 3.084034867504921e-05, "loss": 0.28033632040023804, "num_input_tokens_seen": 80791136, "step": 386, "train_runtime": 133032.7327, "train_tokens_per_second": 607.303 }, { "epoch": 1.282274802822748, "grad_norm": 0.383392121841549, "learning_rate": 3.075602391440918e-05, "loss": 0.2795390784740448, "num_input_tokens_seen": 81004208, "step": 387, "train_runtime": 133329.8482, "train_tokens_per_second": 607.547 }, { "epoch": 1.285595682855957, "grad_norm": 0.38138830274235713, "learning_rate": 3.0671629944302164e-05, "loss": 0.27803707122802734, "num_input_tokens_seen": 81216192, "step": 388, "train_runtime": 133623.7428, "train_tokens_per_second": 607.798 }, { "epoch": 1.2889165628891657, "grad_norm": 0.39059207387018335, "learning_rate": 3.058716777946713e-05, "loss": 0.27945441007614136, "num_input_tokens_seen": 81434112, "step": 389, "train_runtime": 133938.3126, "train_tokens_per_second": 607.997 }, { "epoch": 1.2922374429223744, "grad_norm": 0.39579403134032076, "learning_rate": 3.0502638435462995e-05, "loss": 0.2775178849697113, "num_input_tokens_seen": 81639880, "step": 390, "train_runtime": 134226.0264, "train_tokens_per_second": 608.227 }, { "epoch": 1.2955583229555832, "grad_norm": 0.38392721417957704, "learning_rate": 3.0418042928656414e-05, "loss": 0.2766043245792389, "num_input_tokens_seen": 81853120, "step": 391, "train_runtime": 134511.3746, "train_tokens_per_second": 608.522 }, { "epoch": 1.298879202988792, "grad_norm": 0.41427755861990034, "learning_rate": 3.0333382276209595e-05, "loss": 0.27493974566459656, "num_input_tokens_seen": 82059608, "step": 392, "train_runtime": 134794.7149, "train_tokens_per_second": 608.775 }, { "epoch": 1.3022000830220009, "grad_norm": 0.39090205576930515, "learning_rate": 3.0248657496068027e-05, "loss": 0.28059232234954834, "num_input_tokens_seen": 82263960, "step": 393, "train_runtime": 135084.1826, "train_tokens_per_second": 608.983 }, { "epoch": 1.3055209630552096, "grad_norm": 0.40031936769531695, "learning_rate": 3.0163869606948275e-05, "loss": 0.2766563296318054, "num_input_tokens_seen": 82472272, "step": 394, "train_runtime": 135358.1884, "train_tokens_per_second": 609.289 }, { "epoch": 1.3088418430884183, "grad_norm": 0.3911296435870638, "learning_rate": 3.0079019628325706e-05, "loss": 0.26600369811058044, "num_input_tokens_seen": 82682592, "step": 395, "train_runtime": 135661.5626, "train_tokens_per_second": 609.477 }, { "epoch": 1.3121627231216273, "grad_norm": 0.39815070001078756, "learning_rate": 2.999410858042225e-05, "loss": 0.27596497535705566, "num_input_tokens_seen": 82897328, "step": 396, "train_runtime": 135960.7637, "train_tokens_per_second": 609.715 }, { "epoch": 1.315483603154836, "grad_norm": 0.40625329050145637, "learning_rate": 2.990913748419411e-05, "loss": 0.2763836085796356, "num_input_tokens_seen": 83107152, "step": 397, "train_runtime": 136247.6165, "train_tokens_per_second": 609.971 }, { "epoch": 1.3188044831880448, "grad_norm": 0.3800015961731634, "learning_rate": 2.9824107361319516e-05, "loss": 0.2720406651496887, "num_input_tokens_seen": 83320536, "step": 398, "train_runtime": 136557.323, "train_tokens_per_second": 610.151 }, { "epoch": 1.3221253632212537, "grad_norm": 0.4308473758098913, "learning_rate": 2.97390192341864e-05, "loss": 0.2695067226886749, "num_input_tokens_seen": 83533096, "step": 399, "train_runtime": 136845.5017, "train_tokens_per_second": 610.419 }, { "epoch": 1.3254462432544625, "grad_norm": 0.3865955349757396, "learning_rate": 2.965387412588017e-05, "loss": 0.27491211891174316, "num_input_tokens_seen": 83737664, "step": 400, "train_runtime": 137136.4535, "train_tokens_per_second": 610.616 }, { "epoch": 1.3287671232876712, "grad_norm": 0.3934342235202729, "learning_rate": 2.9568673060171326e-05, "loss": 0.26775655150413513, "num_input_tokens_seen": 83948872, "step": 401, "train_runtime": 137422.737, "train_tokens_per_second": 610.881 }, { "epoch": 1.33208800332088, "grad_norm": 0.38735039774227614, "learning_rate": 2.94834170615032e-05, "loss": 0.26905345916748047, "num_input_tokens_seen": 84163496, "step": 402, "train_runtime": 137730.0189, "train_tokens_per_second": 611.076 }, { "epoch": 1.335408883354089, "grad_norm": 0.39150370962196007, "learning_rate": 2.9398107154979638e-05, "loss": 0.27601978182792664, "num_input_tokens_seen": 84374216, "step": 403, "train_runtime": 138029.4734, "train_tokens_per_second": 611.277 }, { "epoch": 1.3387297633872977, "grad_norm": 0.384276892680021, "learning_rate": 2.931274436635266e-05, "loss": 0.27592986822128296, "num_input_tokens_seen": 84585680, "step": 404, "train_runtime": 138328.0544, "train_tokens_per_second": 611.486 }, { "epoch": 1.3420506434205064, "grad_norm": 0.389551829438478, "learning_rate": 2.922732972201014e-05, "loss": 0.2764306664466858, "num_input_tokens_seen": 84796240, "step": 405, "train_runtime": 138632.2215, "train_tokens_per_second": 611.663 }, { "epoch": 1.3453715234537151, "grad_norm": 0.4041301290293801, "learning_rate": 2.914186424896343e-05, "loss": 0.2820674180984497, "num_input_tokens_seen": 85006112, "step": 406, "train_runtime": 138938.5477, "train_tokens_per_second": 611.825 }, { "epoch": 1.348692403486924, "grad_norm": 0.3768435455688714, "learning_rate": 2.9056348974835067e-05, "loss": 0.2697965204715729, "num_input_tokens_seen": 85219504, "step": 407, "train_runtime": 139233.749, "train_tokens_per_second": 612.061 }, { "epoch": 1.3520132835201328, "grad_norm": 0.3730502042777935, "learning_rate": 2.897078492784638e-05, "loss": 0.2750285863876343, "num_input_tokens_seen": 85431864, "step": 408, "train_runtime": 139545.3143, "train_tokens_per_second": 612.216 }, { "epoch": 1.3553341635533416, "grad_norm": 0.4068105830948811, "learning_rate": 2.8885173136805127e-05, "loss": 0.27826839685440063, "num_input_tokens_seen": 85640848, "step": 409, "train_runtime": 139829.6801, "train_tokens_per_second": 612.465 }, { "epoch": 1.3586550435865505, "grad_norm": 0.39907839927900923, "learning_rate": 2.879951463109313e-05, "loss": 0.2801903784275055, "num_input_tokens_seen": 85846408, "step": 410, "train_runtime": 140094.774, "train_tokens_per_second": 612.774 }, { "epoch": 1.3619759236197593, "grad_norm": 0.3843628131916044, "learning_rate": 2.8713810440653926e-05, "loss": 0.2678714394569397, "num_input_tokens_seen": 86052064, "step": 411, "train_runtime": 140357.9568, "train_tokens_per_second": 613.09 }, { "epoch": 1.365296803652968, "grad_norm": 0.3749223777352839, "learning_rate": 2.862806159598032e-05, "loss": 0.2706253230571747, "num_input_tokens_seen": 86265656, "step": 412, "train_runtime": 140627.9979, "train_tokens_per_second": 613.432 }, { "epoch": 1.3686176836861768, "grad_norm": 0.39924514564666763, "learning_rate": 2.8542269128102063e-05, "loss": 0.2835608422756195, "num_input_tokens_seen": 86473072, "step": 413, "train_runtime": 140920.5046, "train_tokens_per_second": 613.63 }, { "epoch": 1.3719385637193857, "grad_norm": 0.3855891248838036, "learning_rate": 2.8456434068573406e-05, "loss": 0.2807215750217438, "num_input_tokens_seen": 86680248, "step": 414, "train_runtime": 141218.3751, "train_tokens_per_second": 613.803 }, { "epoch": 1.3752594437525945, "grad_norm": 0.37985156165234857, "learning_rate": 2.837055744946072e-05, "loss": 0.27627435326576233, "num_input_tokens_seen": 86890064, "step": 415, "train_runtime": 141498.7941, "train_tokens_per_second": 614.069 }, { "epoch": 1.3785803237858032, "grad_norm": 0.37924972007891256, "learning_rate": 2.828464030333009e-05, "loss": 0.26245152950286865, "num_input_tokens_seen": 87095480, "step": 416, "train_runtime": 141824.5783, "train_tokens_per_second": 614.107 }, { "epoch": 1.381901203819012, "grad_norm": 0.4224684614148727, "learning_rate": 2.819868366323488e-05, "loss": 0.27233731746673584, "num_input_tokens_seen": 87304240, "step": 417, "train_runtime": 142144.7349, "train_tokens_per_second": 614.193 }, { "epoch": 1.385222083852221, "grad_norm": 0.3878112697268373, "learning_rate": 2.811268856270332e-05, "loss": 0.28139635920524597, "num_input_tokens_seen": 87515776, "step": 418, "train_runtime": 142449.5453, "train_tokens_per_second": 614.363 }, { "epoch": 1.3885429638854296, "grad_norm": 0.3795755075357752, "learning_rate": 2.8026656035726095e-05, "loss": 0.27290260791778564, "num_input_tokens_seen": 87726584, "step": 419, "train_runtime": 142743.4234, "train_tokens_per_second": 614.575 }, { "epoch": 1.3918638439186384, "grad_norm": 0.3817200347724888, "learning_rate": 2.7940587116743894e-05, "loss": 0.2759082019329071, "num_input_tokens_seen": 87943400, "step": 420, "train_runtime": 143036.362, "train_tokens_per_second": 614.832 }, { "epoch": 1.3951847239518473, "grad_norm": 0.3822789055179862, "learning_rate": 2.7854482840634965e-05, "loss": 0.2684614956378937, "num_input_tokens_seen": 88153152, "step": 421, "train_runtime": 143374.2137, "train_tokens_per_second": 614.847 }, { "epoch": 1.398505603985056, "grad_norm": 0.386666474979764, "learning_rate": 2.77683442427027e-05, "loss": 0.26988548040390015, "num_input_tokens_seen": 88357528, "step": 422, "train_runtime": 143647.0377, "train_tokens_per_second": 615.102 }, { "epoch": 1.4018264840182648, "grad_norm": 0.37983010580471577, "learning_rate": 2.7682172358663173e-05, "loss": 0.2713548541069031, "num_input_tokens_seen": 88567776, "step": 423, "train_runtime": 143968.2553, "train_tokens_per_second": 615.19 }, { "epoch": 1.4051473640514736, "grad_norm": 0.38509643904491286, "learning_rate": 2.7595968224632674e-05, "loss": 0.27689388394355774, "num_input_tokens_seen": 88780464, "step": 424, "train_runtime": 144272.0554, "train_tokens_per_second": 615.368 }, { "epoch": 1.4084682440846825, "grad_norm": 0.3763501418302105, "learning_rate": 2.7509732877115257e-05, "loss": 0.2730310559272766, "num_input_tokens_seen": 88995960, "step": 425, "train_runtime": 144583.658, "train_tokens_per_second": 615.533 }, { "epoch": 1.4117891241178913, "grad_norm": 0.39047243257676395, "learning_rate": 2.74234673529903e-05, "loss": 0.2728288471698761, "num_input_tokens_seen": 89208040, "step": 426, "train_runtime": 144896.4104, "train_tokens_per_second": 615.668 }, { "epoch": 1.4151100041511, "grad_norm": 0.39555000114794614, "learning_rate": 2.73371726895e-05, "loss": 0.28198131918907166, "num_input_tokens_seen": 89418376, "step": 427, "train_runtime": 145225.1639, "train_tokens_per_second": 615.722 }, { "epoch": 1.4184308841843087, "grad_norm": 0.39924038054547295, "learning_rate": 2.725084992423694e-05, "loss": 0.27948057651519775, "num_input_tokens_seen": 89624360, "step": 428, "train_runtime": 145558.2516, "train_tokens_per_second": 615.728 }, { "epoch": 1.4217517642175177, "grad_norm": 0.3906687907681376, "learning_rate": 2.716450009513158e-05, "loss": 0.2838206887245178, "num_input_tokens_seen": 89839176, "step": 429, "train_runtime": 145877.8497, "train_tokens_per_second": 615.852 }, { "epoch": 1.4250726442507264, "grad_norm": 0.3773528130156981, "learning_rate": 2.7078124240439795e-05, "loss": 0.27308711409568787, "num_input_tokens_seen": 90045400, "step": 430, "train_runtime": 146198.9409, "train_tokens_per_second": 615.91 }, { "epoch": 1.4283935242839352, "grad_norm": 0.37469816431552, "learning_rate": 2.6991723398730383e-05, "loss": 0.27352002263069153, "num_input_tokens_seen": 90259064, "step": 431, "train_runtime": 146494.253, "train_tokens_per_second": 616.127 }, { "epoch": 1.4317144043171441, "grad_norm": 0.3844478075797015, "learning_rate": 2.6905298608872588e-05, "loss": 0.26953351497650146, "num_input_tokens_seen": 90468720, "step": 432, "train_runtime": 146794.5101, "train_tokens_per_second": 616.295 }, { "epoch": 1.4350352843503529, "grad_norm": 0.38797883425055196, "learning_rate": 2.68188509100236e-05, "loss": 0.2790805995464325, "num_input_tokens_seen": 90677544, "step": 433, "train_runtime": 147080.9165, "train_tokens_per_second": 616.515 }, { "epoch": 1.4383561643835616, "grad_norm": 0.3946120987005431, "learning_rate": 2.6732381341616065e-05, "loss": 0.2723555266857147, "num_input_tokens_seen": 90888704, "step": 434, "train_runtime": 147359.212, "train_tokens_per_second": 616.783 }, { "epoch": 1.4416770444167706, "grad_norm": 0.41295690587639383, "learning_rate": 2.6645890943345585e-05, "loss": 0.26789337396621704, "num_input_tokens_seen": 91090856, "step": 435, "train_runtime": 147644.5842, "train_tokens_per_second": 616.96 }, { "epoch": 1.4449979244499793, "grad_norm": 0.3964823719645078, "learning_rate": 2.6559380755158208e-05, "loss": 0.28620174527168274, "num_input_tokens_seen": 91301944, "step": 436, "train_runtime": 147942.4081, "train_tokens_per_second": 617.145 }, { "epoch": 1.448318804483188, "grad_norm": 0.38348430453116406, "learning_rate": 2.6472851817237948e-05, "loss": 0.27092844247817993, "num_input_tokens_seen": 91508136, "step": 437, "train_runtime": 148214.1105, "train_tokens_per_second": 617.405 }, { "epoch": 1.4516396845163968, "grad_norm": 0.3857512805426217, "learning_rate": 2.6386305169994256e-05, "loss": 0.2821858525276184, "num_input_tokens_seen": 91717688, "step": 438, "train_runtime": 148490.9569, "train_tokens_per_second": 617.665 }, { "epoch": 1.4549605645496055, "grad_norm": 0.3746863337294339, "learning_rate": 2.6299741854049508e-05, "loss": 0.2690110206604004, "num_input_tokens_seen": 91925912, "step": 439, "train_runtime": 148772.7362, "train_tokens_per_second": 617.895 }, { "epoch": 1.4582814445828145, "grad_norm": 0.3774965994559822, "learning_rate": 2.621316291022652e-05, "loss": 0.26425009965896606, "num_input_tokens_seen": 92136264, "step": 440, "train_runtime": 149033.1264, "train_tokens_per_second": 618.227 }, { "epoch": 1.4616023246160232, "grad_norm": 0.38824470900513997, "learning_rate": 2.6126569379535985e-05, "loss": 0.2754976749420166, "num_input_tokens_seen": 92342792, "step": 441, "train_runtime": 149327.0053, "train_tokens_per_second": 618.393 }, { "epoch": 1.464923204649232, "grad_norm": 0.39621008712361694, "learning_rate": 2.6039962303164023e-05, "loss": 0.2782744765281677, "num_input_tokens_seen": 92554056, "step": 442, "train_runtime": 149646.1225, "train_tokens_per_second": 618.486 }, { "epoch": 1.468244084682441, "grad_norm": 0.4048936182915268, "learning_rate": 2.5953342722459594e-05, "loss": 0.27722615003585815, "num_input_tokens_seen": 92759472, "step": 443, "train_runtime": 149938.7976, "train_tokens_per_second": 618.649 }, { "epoch": 1.4715649647156497, "grad_norm": 0.3943616771938601, "learning_rate": 2.5866711678922035e-05, "loss": 0.27563533186912537, "num_input_tokens_seen": 92968024, "step": 444, "train_runtime": 150232.8974, "train_tokens_per_second": 618.826 }, { "epoch": 1.4748858447488584, "grad_norm": 0.3815599151588948, "learning_rate": 2.5780070214188478e-05, "loss": 0.2683257460594177, "num_input_tokens_seen": 93182904, "step": 445, "train_runtime": 150526.7299, "train_tokens_per_second": 619.046 }, { "epoch": 1.4782067247820674, "grad_norm": 0.39091399065237853, "learning_rate": 2.5693419370021392e-05, "loss": 0.27707743644714355, "num_input_tokens_seen": 93395352, "step": 446, "train_runtime": 150831.4053, "train_tokens_per_second": 619.204 }, { "epoch": 1.4815276048152761, "grad_norm": 0.38786466499807515, "learning_rate": 2.5606760188296004e-05, "loss": 0.285401314496994, "num_input_tokens_seen": 93606256, "step": 447, "train_runtime": 151158.536, "train_tokens_per_second": 619.259 }, { "epoch": 1.4848484848484849, "grad_norm": 0.3931793167471563, "learning_rate": 2.5520093710987785e-05, "loss": 0.27463263273239136, "num_input_tokens_seen": 93814072, "step": 448, "train_runtime": 151437.987, "train_tokens_per_second": 619.488 }, { "epoch": 1.4881693648816936, "grad_norm": 0.3775030848814357, "learning_rate": 2.5433420980159944e-05, "loss": 0.2858240008354187, "num_input_tokens_seen": 94033240, "step": 449, "train_runtime": 151769.5173, "train_tokens_per_second": 619.579 }, { "epoch": 1.4914902449149023, "grad_norm": 0.3839556653077752, "learning_rate": 2.5346743037950864e-05, "loss": 0.2770846486091614, "num_input_tokens_seen": 94242952, "step": 450, "train_runtime": 152047.3693, "train_tokens_per_second": 619.826 }, { "epoch": 1.4948111249481113, "grad_norm": 0.39840970379364987, "learning_rate": 2.526006092656161e-05, "loss": 0.2803241014480591, "num_input_tokens_seen": 94453584, "step": 451, "train_runtime": 152336.5089, "train_tokens_per_second": 620.032 }, { "epoch": 1.49813200498132, "grad_norm": 0.37563747325345737, "learning_rate": 2.5173375688243343e-05, "loss": 0.2788984775543213, "num_input_tokens_seen": 94668832, "step": 452, "train_runtime": 152644.2272, "train_tokens_per_second": 620.193 }, { "epoch": 1.5014528850145288, "grad_norm": 0.3978199118013257, "learning_rate": 2.508668836528486e-05, "loss": 0.28381669521331787, "num_input_tokens_seen": 94884536, "step": 453, "train_runtime": 152931.5061, "train_tokens_per_second": 620.438 }, { "epoch": 1.5047737650477377, "grad_norm": 0.3815388344339454, "learning_rate": 2.5e-05, "loss": 0.2760803699493408, "num_input_tokens_seen": 95096920, "step": 454, "train_runtime": 153210.1585, "train_tokens_per_second": 620.696 }, { "epoch": 1.5080946450809465, "grad_norm": 0.39561477279103535, "learning_rate": 2.491331163471514e-05, "loss": 0.2771623730659485, "num_input_tokens_seen": 95307944, "step": 455, "train_runtime": 153494.2992, "train_tokens_per_second": 620.922 }, { "epoch": 1.5114155251141552, "grad_norm": 0.3913957050500507, "learning_rate": 2.482662431175666e-05, "loss": 0.28440573811531067, "num_input_tokens_seen": 95515968, "step": 456, "train_runtime": 153803.5271, "train_tokens_per_second": 621.026 }, { "epoch": 1.5147364051473642, "grad_norm": 0.38071306756533496, "learning_rate": 2.4739939073438397e-05, "loss": 0.2825664281845093, "num_input_tokens_seen": 95726024, "step": 457, "train_runtime": 154101.331, "train_tokens_per_second": 621.189 }, { "epoch": 1.5180572851805727, "grad_norm": 0.38775887583134605, "learning_rate": 2.465325696204914e-05, "loss": 0.28354400396347046, "num_input_tokens_seen": 95935816, "step": 458, "train_runtime": 154381.1585, "train_tokens_per_second": 621.422 }, { "epoch": 1.5213781652137817, "grad_norm": 0.37483524482071723, "learning_rate": 2.456657901984006e-05, "loss": 0.2669481635093689, "num_input_tokens_seen": 96142752, "step": 459, "train_runtime": 154673.7701, "train_tokens_per_second": 621.584 }, { "epoch": 1.5246990452469904, "grad_norm": 0.3874289919200597, "learning_rate": 2.4479906289012218e-05, "loss": 0.2789996862411499, "num_input_tokens_seen": 96352344, "step": 460, "train_runtime": 154991.0479, "train_tokens_per_second": 621.664 }, { "epoch": 1.5280199252801991, "grad_norm": 0.3837792360933484, "learning_rate": 2.4393239811704e-05, "loss": 0.2797940671443939, "num_input_tokens_seen": 96559576, "step": 461, "train_runtime": 155326.8621, "train_tokens_per_second": 621.654 }, { "epoch": 1.531340805313408, "grad_norm": 0.3739810070287046, "learning_rate": 2.430658062997861e-05, "loss": 0.2745603919029236, "num_input_tokens_seen": 96768232, "step": 462, "train_runtime": 155600.6316, "train_tokens_per_second": 621.901 }, { "epoch": 1.5346616853466168, "grad_norm": 0.39310816561453116, "learning_rate": 2.4219929785811518e-05, "loss": 0.2753414213657379, "num_input_tokens_seen": 96981512, "step": 463, "train_runtime": 155925.6371, "train_tokens_per_second": 621.973 }, { "epoch": 1.5379825653798256, "grad_norm": 0.3948678984829835, "learning_rate": 2.4133288321077978e-05, "loss": 0.2625735402107239, "num_input_tokens_seen": 97189136, "step": 464, "train_runtime": 156216.4508, "train_tokens_per_second": 622.144 }, { "epoch": 1.5413034454130345, "grad_norm": 0.380241770011676, "learning_rate": 2.4046657277540412e-05, "loss": 0.27124154567718506, "num_input_tokens_seen": 97403824, "step": 465, "train_runtime": 156511.2867, "train_tokens_per_second": 622.344 }, { "epoch": 1.5446243254462433, "grad_norm": 0.39815162154922284, "learning_rate": 2.3960037696835986e-05, "loss": 0.27361199259757996, "num_input_tokens_seen": 97605736, "step": 466, "train_runtime": 156790.1739, "train_tokens_per_second": 622.525 }, { "epoch": 1.547945205479452, "grad_norm": 0.41159382239503567, "learning_rate": 2.3873430620464024e-05, "loss": 0.2782898247241974, "num_input_tokens_seen": 97805720, "step": 467, "train_runtime": 157085.5293, "train_tokens_per_second": 622.627 }, { "epoch": 1.551266085512661, "grad_norm": 0.39387978219538167, "learning_rate": 2.3786837089773494e-05, "loss": 0.2831963896751404, "num_input_tokens_seen": 98011824, "step": 468, "train_runtime": 157381.2726, "train_tokens_per_second": 622.767 }, { "epoch": 1.5545869655458695, "grad_norm": 0.39713875009253896, "learning_rate": 2.3700258145950495e-05, "loss": 0.27522018551826477, "num_input_tokens_seen": 98219592, "step": 469, "train_runtime": 157660.1151, "train_tokens_per_second": 622.983 }, { "epoch": 1.5579078455790785, "grad_norm": 0.37328924972497884, "learning_rate": 2.361369483000575e-05, "loss": 0.2746099829673767, "num_input_tokens_seen": 98433344, "step": 470, "train_runtime": 157947.1226, "train_tokens_per_second": 623.204 }, { "epoch": 1.5612287256122872, "grad_norm": 0.3952385836059032, "learning_rate": 2.3527148182762054e-05, "loss": 0.27125734090805054, "num_input_tokens_seen": 98647504, "step": 471, "train_runtime": 158262.4123, "train_tokens_per_second": 623.316 }, { "epoch": 1.564549605645496, "grad_norm": 0.3890274108636158, "learning_rate": 2.3440619244841798e-05, "loss": 0.27807706594467163, "num_input_tokens_seen": 98851664, "step": 472, "train_runtime": 158561.8882, "train_tokens_per_second": 623.426 }, { "epoch": 1.567870485678705, "grad_norm": 0.3848142486103475, "learning_rate": 2.335410905665442e-05, "loss": 0.2797778248786926, "num_input_tokens_seen": 99062624, "step": 473, "train_runtime": 158850.5752, "train_tokens_per_second": 623.621 }, { "epoch": 1.5711913657119136, "grad_norm": 0.37228985478273074, "learning_rate": 2.3267618658383938e-05, "loss": 0.2710762023925781, "num_input_tokens_seen": 99277088, "step": 474, "train_runtime": 159150.8586, "train_tokens_per_second": 623.792 }, { "epoch": 1.5745122457451224, "grad_norm": 0.3874057844345587, "learning_rate": 2.3181149089976405e-05, "loss": 0.2764275372028351, "num_input_tokens_seen": 99487256, "step": 475, "train_runtime": 159448.9095, "train_tokens_per_second": 623.944 }, { "epoch": 1.5778331257783313, "grad_norm": 0.38642210690487105, "learning_rate": 2.3094701391127418e-05, "loss": 0.2717147469520569, "num_input_tokens_seen": 99700256, "step": 476, "train_runtime": 159749.0285, "train_tokens_per_second": 624.106 }, { "epoch": 1.58115400581154, "grad_norm": 0.3807420614903218, "learning_rate": 2.3008276601269623e-05, "loss": 0.2777535021305084, "num_input_tokens_seen": 99914464, "step": 477, "train_runtime": 160053.6863, "train_tokens_per_second": 624.256 }, { "epoch": 1.5844748858447488, "grad_norm": 0.3913290280432001, "learning_rate": 2.292187575956021e-05, "loss": 0.2830791771411896, "num_input_tokens_seen": 100121368, "step": 478, "train_runtime": 160323.2416, "train_tokens_per_second": 624.497 }, { "epoch": 1.5877957658779578, "grad_norm": 0.3799122178535618, "learning_rate": 2.283549990486842e-05, "loss": 0.2885296940803528, "num_input_tokens_seen": 100339688, "step": 479, "train_runtime": 160636.5497, "train_tokens_per_second": 624.638 }, { "epoch": 1.5911166459111663, "grad_norm": 0.37586013361076415, "learning_rate": 2.274915007576306e-05, "loss": 0.27374687790870667, "num_input_tokens_seen": 100550928, "step": 480, "train_runtime": 160937.9532, "train_tokens_per_second": 624.781 }, { "epoch": 1.5944375259443753, "grad_norm": 0.3872494980012405, "learning_rate": 2.2662827310499995e-05, "loss": 0.28530970215797424, "num_input_tokens_seen": 100758768, "step": 481, "train_runtime": 161250.1896, "train_tokens_per_second": 624.86 }, { "epoch": 1.597758405977584, "grad_norm": 0.38118649691742823, "learning_rate": 2.2576532647009702e-05, "loss": 0.26808488368988037, "num_input_tokens_seen": 100964704, "step": 482, "train_runtime": 161528.1434, "train_tokens_per_second": 625.06 }, { "epoch": 1.6010792860107927, "grad_norm": 0.39207100960864955, "learning_rate": 2.249026712288474e-05, "loss": 0.2740188539028168, "num_input_tokens_seen": 101176872, "step": 483, "train_runtime": 161834.9336, "train_tokens_per_second": 625.186 }, { "epoch": 1.6044001660440017, "grad_norm": 0.3877384396900619, "learning_rate": 2.2404031775367335e-05, "loss": 0.2846457362174988, "num_input_tokens_seen": 101387376, "step": 484, "train_runtime": 162158.1849, "train_tokens_per_second": 625.237 }, { "epoch": 1.6077210460772104, "grad_norm": 0.382997916492794, "learning_rate": 2.2317827641336833e-05, "loss": 0.27255263924598694, "num_input_tokens_seen": 101599408, "step": 485, "train_runtime": 162468.3473, "train_tokens_per_second": 625.349 }, { "epoch": 1.6110419261104192, "grad_norm": 0.369381936156458, "learning_rate": 2.2231655757297304e-05, "loss": 0.27387329936027527, "num_input_tokens_seen": 101812048, "step": 486, "train_runtime": 162777.6348, "train_tokens_per_second": 625.467 }, { "epoch": 1.6143628061436281, "grad_norm": 0.3841727641514375, "learning_rate": 2.2145517159365044e-05, "loss": 0.28685835003852844, "num_input_tokens_seen": 102029232, "step": 487, "train_runtime": 163159.6803, "train_tokens_per_second": 625.334 }, { "epoch": 1.6176836861768369, "grad_norm": 0.3780936337208953, "learning_rate": 2.205941288325612e-05, "loss": 0.26806920766830444, "num_input_tokens_seen": 102236200, "step": 488, "train_runtime": 163489.0024, "train_tokens_per_second": 625.34 }, { "epoch": 1.6210045662100456, "grad_norm": 0.38608553474462487, "learning_rate": 2.197334396427391e-05, "loss": 0.27260643243789673, "num_input_tokens_seen": 102442192, "step": 489, "train_runtime": 163807.3191, "train_tokens_per_second": 625.382 }, { "epoch": 1.6243254462432546, "grad_norm": 0.40093901283362116, "learning_rate": 2.1887311437296686e-05, "loss": 0.28019481897354126, "num_input_tokens_seen": 102651352, "step": 490, "train_runtime": 164146.2688, "train_tokens_per_second": 625.365 }, { "epoch": 1.627646326276463, "grad_norm": 0.3845695130365367, "learning_rate": 2.1801316336765126e-05, "loss": 0.27420860528945923, "num_input_tokens_seen": 102863464, "step": 491, "train_runtime": 164476.2022, "train_tokens_per_second": 625.4 }, { "epoch": 1.630967206309672, "grad_norm": 0.3847295451461292, "learning_rate": 2.171535969666991e-05, "loss": 0.2780809998512268, "num_input_tokens_seen": 103077960, "step": 492, "train_runtime": 164820.2154, "train_tokens_per_second": 625.396 }, { "epoch": 1.6342880863428808, "grad_norm": 0.38399141283572846, "learning_rate": 2.162944255053928e-05, "loss": 0.2749224305152893, "num_input_tokens_seen": 103284040, "step": 493, "train_runtime": 165118.7401, "train_tokens_per_second": 625.514 }, { "epoch": 1.6376089663760895, "grad_norm": 0.36617938284057205, "learning_rate": 2.15435659314266e-05, "loss": 0.2709444761276245, "num_input_tokens_seen": 103498296, "step": 494, "train_runtime": 165411.7369, "train_tokens_per_second": 625.701 }, { "epoch": 1.6409298464092985, "grad_norm": 0.37822872198834545, "learning_rate": 2.145773087189794e-05, "loss": 0.27598223090171814, "num_input_tokens_seen": 103708136, "step": 495, "train_runtime": 165729.2775, "train_tokens_per_second": 625.768 }, { "epoch": 1.6442507264425072, "grad_norm": 0.38928939036083077, "learning_rate": 2.137193840401968e-05, "loss": 0.2770792543888092, "num_input_tokens_seen": 103917744, "step": 496, "train_runtime": 166074.2757, "train_tokens_per_second": 625.731 }, { "epoch": 1.647571606475716, "grad_norm": 0.38782482234696797, "learning_rate": 2.128618955934608e-05, "loss": 0.27650129795074463, "num_input_tokens_seen": 104127808, "step": 497, "train_runtime": 166394.6511, "train_tokens_per_second": 625.788 }, { "epoch": 1.650892486508925, "grad_norm": 0.3866522305500667, "learning_rate": 2.120048536890687e-05, "loss": 0.28428196907043457, "num_input_tokens_seen": 104341640, "step": 498, "train_runtime": 166728.0992, "train_tokens_per_second": 625.819 }, { "epoch": 1.6542133665421337, "grad_norm": 0.37214174143061796, "learning_rate": 2.1114826863194882e-05, "loss": 0.2710725665092468, "num_input_tokens_seen": 104553208, "step": 499, "train_runtime": 167035.009, "train_tokens_per_second": 625.936 }, { "epoch": 1.6575342465753424, "grad_norm": 0.38218990380208034, "learning_rate": 2.1029215072153617e-05, "loss": 0.2692292332649231, "num_input_tokens_seen": 104762112, "step": 500, "train_runtime": 167337.8099, "train_tokens_per_second": 626.052 }, { "epoch": 1.6608551266085514, "grad_norm": 0.37936790163805806, "learning_rate": 2.0943651025164932e-05, "loss": 0.2653566002845764, "num_input_tokens_seen": 104971248, "step": 501, "train_runtime": 167645.3905, "train_tokens_per_second": 626.151 }, { "epoch": 1.66417600664176, "grad_norm": 0.37088528557983946, "learning_rate": 2.085813575103657e-05, "loss": 0.26822784543037415, "num_input_tokens_seen": 105178376, "step": 502, "train_runtime": 167948.8781, "train_tokens_per_second": 626.252 }, { "epoch": 1.6674968866749689, "grad_norm": 0.39701318699135063, "learning_rate": 2.0772670277989864e-05, "loss": 0.2750917077064514, "num_input_tokens_seen": 105382384, "step": 503, "train_runtime": 168248.1015, "train_tokens_per_second": 626.351 }, { "epoch": 1.6708177667081778, "grad_norm": 0.3765536336118195, "learning_rate": 2.0687255633647348e-05, "loss": 0.2742467522621155, "num_input_tokens_seen": 105594960, "step": 504, "train_runtime": 168575.4676, "train_tokens_per_second": 626.396 }, { "epoch": 1.6741386467413863, "grad_norm": 0.3778939717035077, "learning_rate": 2.060189284502037e-05, "loss": 0.26418954133987427, "num_input_tokens_seen": 105803760, "step": 505, "train_runtime": 168905.9298, "train_tokens_per_second": 626.406 }, { "epoch": 1.6774595267745953, "grad_norm": 0.39702762204705705, "learning_rate": 2.0516582938496812e-05, "loss": 0.27535632252693176, "num_input_tokens_seen": 106013416, "step": 506, "train_runtime": 169203.7547, "train_tokens_per_second": 626.543 }, { "epoch": 1.680780406807804, "grad_norm": 0.3781789237707453, "learning_rate": 2.0431326939828686e-05, "loss": 0.2713829278945923, "num_input_tokens_seen": 106226328, "step": 507, "train_runtime": 169541.2848, "train_tokens_per_second": 626.551 }, { "epoch": 1.6841012868410128, "grad_norm": 0.38022632579632676, "learning_rate": 2.034612587411984e-05, "loss": 0.2774660587310791, "num_input_tokens_seen": 106435936, "step": 508, "train_runtime": 169843.7466, "train_tokens_per_second": 626.67 }, { "epoch": 1.6874221668742218, "grad_norm": 0.3818299215717327, "learning_rate": 2.0260980765813604e-05, "loss": 0.27122774720191956, "num_input_tokens_seen": 106641640, "step": 509, "train_runtime": 170166.6906, "train_tokens_per_second": 626.689 }, { "epoch": 1.6907430469074305, "grad_norm": 0.37973849367368634, "learning_rate": 2.0175892638680494e-05, "loss": 0.26995688676834106, "num_input_tokens_seen": 106852216, "step": 510, "train_runtime": 170490.5423, "train_tokens_per_second": 626.734 }, { "epoch": 1.6940639269406392, "grad_norm": 0.3851614945460699, "learning_rate": 2.0090862515805898e-05, "loss": 0.2680545449256897, "num_input_tokens_seen": 107053248, "step": 511, "train_runtime": 170802.0032, "train_tokens_per_second": 626.768 }, { "epoch": 1.6973848069738482, "grad_norm": 0.38065170608414073, "learning_rate": 2.0005891419577757e-05, "loss": 0.26882126927375793, "num_input_tokens_seen": 107262992, "step": 512, "train_runtime": 171140.8876, "train_tokens_per_second": 626.753 }, { "epoch": 1.7007056870070567, "grad_norm": 0.3685486538023313, "learning_rate": 1.99209803716743e-05, "loss": 0.27311110496520996, "num_input_tokens_seen": 107478712, "step": 513, "train_runtime": 171494.1234, "train_tokens_per_second": 626.72 }, { "epoch": 1.7040265670402657, "grad_norm": 0.390800930806793, "learning_rate": 1.9836130393051734e-05, "loss": 0.2756790518760681, "num_input_tokens_seen": 107691144, "step": 514, "train_runtime": 171827.16, "train_tokens_per_second": 626.741 }, { "epoch": 1.7073474470734746, "grad_norm": 0.38064415164933396, "learning_rate": 1.975134250393198e-05, "loss": 0.27489352226257324, "num_input_tokens_seen": 107900672, "step": 515, "train_runtime": 172128.9046, "train_tokens_per_second": 626.86 }, { "epoch": 1.7106683271066832, "grad_norm": 0.3751340622006632, "learning_rate": 1.966661772379041e-05, "loss": 0.2614770531654358, "num_input_tokens_seen": 108103232, "step": 516, "train_runtime": 172441.4308, "train_tokens_per_second": 626.898 }, { "epoch": 1.7139892071398921, "grad_norm": 0.3659405965046292, "learning_rate": 1.9581957071343592e-05, "loss": 0.2652609050273895, "num_input_tokens_seen": 108321080, "step": 517, "train_runtime": 172733.2343, "train_tokens_per_second": 627.1 }, { "epoch": 1.7173100871731009, "grad_norm": 0.375739645953739, "learning_rate": 1.9497361564537008e-05, "loss": 0.2724020481109619, "num_input_tokens_seen": 108536112, "step": 518, "train_runtime": 173031.2187, "train_tokens_per_second": 627.263 }, { "epoch": 1.7206309672063096, "grad_norm": 0.37800055755720385, "learning_rate": 1.9412832220532867e-05, "loss": 0.2700433135032654, "num_input_tokens_seen": 108748640, "step": 519, "train_runtime": 173356.2807, "train_tokens_per_second": 627.313 }, { "epoch": 1.7239518472395186, "grad_norm": 0.3739450335437874, "learning_rate": 1.9328370055697835e-05, "loss": 0.27248144149780273, "num_input_tokens_seen": 108969864, "step": 520, "train_runtime": 173712.9759, "train_tokens_per_second": 627.298 }, { "epoch": 1.7272727272727273, "grad_norm": 0.3965463831094655, "learning_rate": 1.9243976085590824e-05, "loss": 0.28080034255981445, "num_input_tokens_seen": 109179664, "step": 521, "train_runtime": 174030.7867, "train_tokens_per_second": 627.358 }, { "epoch": 1.730593607305936, "grad_norm": 0.38198518038001167, "learning_rate": 1.915965132495079e-05, "loss": 0.2854807674884796, "num_input_tokens_seen": 109392960, "step": 522, "train_runtime": 174354.3129, "train_tokens_per_second": 627.418 }, { "epoch": 1.733914487339145, "grad_norm": 0.3748595681066997, "learning_rate": 1.9075396787684533e-05, "loss": 0.27152860164642334, "num_input_tokens_seen": 109601552, "step": 523, "train_runtime": 174673.0712, "train_tokens_per_second": 627.467 }, { "epoch": 1.7372353673723535, "grad_norm": 0.38441155765795015, "learning_rate": 1.899121348685447e-05, "loss": 0.27585625648498535, "num_input_tokens_seen": 109808384, "step": 524, "train_runtime": 174980.5406, "train_tokens_per_second": 627.546 }, { "epoch": 1.7405562474055625, "grad_norm": 0.3858405333155389, "learning_rate": 1.8907102434666523e-05, "loss": 0.26380085945129395, "num_input_tokens_seen": 110010928, "step": 525, "train_runtime": 175277.6445, "train_tokens_per_second": 627.638 }, { "epoch": 1.7438771274387714, "grad_norm": 0.37921480572137006, "learning_rate": 1.8823064642457876e-05, "loss": 0.2714405953884125, "num_input_tokens_seen": 110221304, "step": 526, "train_runtime": 175587.62, "train_tokens_per_second": 627.728 }, { "epoch": 1.74719800747198, "grad_norm": 0.38004977754182245, "learning_rate": 1.8739101120684866e-05, "loss": 0.271285742521286, "num_input_tokens_seen": 110428072, "step": 527, "train_runtime": 175890.8496, "train_tokens_per_second": 627.822 }, { "epoch": 1.750518887505189, "grad_norm": 0.3825721913036661, "learning_rate": 1.8655212878910805e-05, "loss": 0.2654801607131958, "num_input_tokens_seen": 110635672, "step": 528, "train_runtime": 176206.9172, "train_tokens_per_second": 627.874 }, { "epoch": 1.7538397675383977, "grad_norm": 0.39174694241973135, "learning_rate": 1.8571400925793855e-05, "loss": 0.27764183282852173, "num_input_tokens_seen": 110838664, "step": 529, "train_runtime": 176490.0557, "train_tokens_per_second": 628.016 }, { "epoch": 1.7571606475716064, "grad_norm": 0.3819001413625028, "learning_rate": 1.84876662690749e-05, "loss": 0.2751466929912567, "num_input_tokens_seen": 111051304, "step": 530, "train_runtime": 176806.4029, "train_tokens_per_second": 628.095 }, { "epoch": 1.7604815276048154, "grad_norm": 0.37330107486155, "learning_rate": 1.840400991556541e-05, "loss": 0.26104727387428284, "num_input_tokens_seen": 111262512, "step": 531, "train_runtime": 177116.4047, "train_tokens_per_second": 628.189 }, { "epoch": 1.763802407638024, "grad_norm": 0.3840892555365638, "learning_rate": 1.8320432871135377e-05, "loss": 0.26812130212783813, "num_input_tokens_seen": 111476128, "step": 532, "train_runtime": 177396.1925, "train_tokens_per_second": 628.402 }, { "epoch": 1.7671232876712328, "grad_norm": 0.3845079012329166, "learning_rate": 1.8236936140701165e-05, "loss": 0.26285719871520996, "num_input_tokens_seen": 111681600, "step": 533, "train_runtime": 177675.3318, "train_tokens_per_second": 628.571 }, { "epoch": 1.7704441677044418, "grad_norm": 0.3828626665787968, "learning_rate": 1.8153520728213472e-05, "loss": 0.27905604243278503, "num_input_tokens_seen": 111889344, "step": 534, "train_runtime": 177991.4082, "train_tokens_per_second": 628.622 }, { "epoch": 1.7737650477376503, "grad_norm": 0.3799909165519961, "learning_rate": 1.807018763664524e-05, "loss": 0.2693879306316376, "num_input_tokens_seen": 112096032, "step": 535, "train_runtime": 178301.2766, "train_tokens_per_second": 628.689 }, { "epoch": 1.7770859277708593, "grad_norm": 0.37942294917605773, "learning_rate": 1.7986937867979598e-05, "loss": 0.28193628787994385, "num_input_tokens_seen": 112311232, "step": 536, "train_runtime": 178617.8542, "train_tokens_per_second": 628.779 }, { "epoch": 1.7804068078040682, "grad_norm": 0.38720436411046744, "learning_rate": 1.7903772423197806e-05, "loss": 0.27152201533317566, "num_input_tokens_seen": 112527000, "step": 537, "train_runtime": 178934.8832, "train_tokens_per_second": 628.871 }, { "epoch": 1.7837276878372768, "grad_norm": 0.3773726471853286, "learning_rate": 1.782069230226725e-05, "loss": 0.27006030082702637, "num_input_tokens_seen": 112742016, "step": 538, "train_runtime": 179242.1003, "train_tokens_per_second": 628.993 }, { "epoch": 1.7870485678704857, "grad_norm": 0.3919902812893058, "learning_rate": 1.773769850412937e-05, "loss": 0.269748330116272, "num_input_tokens_seen": 112946816, "step": 539, "train_runtime": 179543.0837, "train_tokens_per_second": 629.079 }, { "epoch": 1.7903694479036945, "grad_norm": 0.3826541448829461, "learning_rate": 1.7654792026687695e-05, "loss": 0.2661256790161133, "num_input_tokens_seen": 113152976, "step": 540, "train_runtime": 179870.3183, "train_tokens_per_second": 629.081 }, { "epoch": 1.7936903279369032, "grad_norm": 0.39479265499147825, "learning_rate": 1.7571973866795815e-05, "loss": 0.2615446150302887, "num_input_tokens_seen": 113347744, "step": 541, "train_runtime": 180167.7573, "train_tokens_per_second": 629.123 }, { "epoch": 1.7970112079701122, "grad_norm": 0.38131360236081485, "learning_rate": 1.74892450202454e-05, "loss": 0.268423855304718, "num_input_tokens_seen": 113556952, "step": 542, "train_runtime": 180490.2524, "train_tokens_per_second": 629.158 }, { "epoch": 1.800332088003321, "grad_norm": 0.39352549012545984, "learning_rate": 1.7406606481754258e-05, "loss": 0.27438366413116455, "num_input_tokens_seen": 113763736, "step": 543, "train_runtime": 180806.3721, "train_tokens_per_second": 629.202 }, { "epoch": 1.8036529680365296, "grad_norm": 0.38873717493859, "learning_rate": 1.7324059244954295e-05, "loss": 0.267435759305954, "num_input_tokens_seen": 113983520, "step": 544, "train_runtime": 181160.4303, "train_tokens_per_second": 629.186 }, { "epoch": 1.8069738480697386, "grad_norm": 0.38492899141292525, "learning_rate": 1.724160430237967e-05, "loss": 0.26886412501335144, "num_input_tokens_seen": 114190608, "step": 545, "train_runtime": 181479.5668, "train_tokens_per_second": 629.22 }, { "epoch": 1.8102947281029471, "grad_norm": 0.3777823617700966, "learning_rate": 1.71592426454548e-05, "loss": 0.26775413751602173, "num_input_tokens_seen": 114402576, "step": 546, "train_runtime": 181799.7344, "train_tokens_per_second": 629.278 }, { "epoch": 1.813615608136156, "grad_norm": 0.41033271743676125, "learning_rate": 1.7076975264482434e-05, "loss": 0.2695067524909973, "num_input_tokens_seen": 114615352, "step": 547, "train_runtime": 182146.6945, "train_tokens_per_second": 629.247 }, { "epoch": 1.816936488169365, "grad_norm": 0.3794562996620403, "learning_rate": 1.699480314863179e-05, "loss": 0.2697087526321411, "num_input_tokens_seen": 114828080, "step": 548, "train_runtime": 182476.3032, "train_tokens_per_second": 629.277 }, { "epoch": 1.8202573682025736, "grad_norm": 0.37101365602080844, "learning_rate": 1.6912727285926605e-05, "loss": 0.26648998260498047, "num_input_tokens_seen": 115039744, "step": 549, "train_runtime": 182781.5617, "train_tokens_per_second": 629.384 }, { "epoch": 1.8235782482357825, "grad_norm": 0.37243453440507873, "learning_rate": 1.6830748663233306e-05, "loss": 0.26458990573883057, "num_input_tokens_seen": 115246976, "step": 550, "train_runtime": 183080.3324, "train_tokens_per_second": 629.489 }, { "epoch": 1.8268991282689913, "grad_norm": 0.3857790146296396, "learning_rate": 1.6748868266249114e-05, "loss": 0.26564475893974304, "num_input_tokens_seen": 115457976, "step": 551, "train_runtime": 183394.4516, "train_tokens_per_second": 629.561 }, { "epoch": 1.8302200083022, "grad_norm": 0.38012443962156967, "learning_rate": 1.66670870794902e-05, "loss": 0.27272582054138184, "num_input_tokens_seen": 115665928, "step": 552, "train_runtime": 183695.6997, "train_tokens_per_second": 629.661 }, { "epoch": 1.833540888335409, "grad_norm": 0.3682182738419887, "learning_rate": 1.658540608627985e-05, "loss": 0.2620367407798767, "num_input_tokens_seen": 115873504, "step": 553, "train_runtime": 184000.4233, "train_tokens_per_second": 629.746 }, { "epoch": 1.8368617683686177, "grad_norm": 0.3791112038931868, "learning_rate": 1.6503826268736633e-05, "loss": 0.26693207025527954, "num_input_tokens_seen": 116081680, "step": 554, "train_runtime": 184306.9053, "train_tokens_per_second": 629.828 }, { "epoch": 1.8401826484018264, "grad_norm": 0.3815651836027513, "learning_rate": 1.642234860776261e-05, "loss": 0.27306675910949707, "num_input_tokens_seen": 116293208, "step": 555, "train_runtime": 184617.5315, "train_tokens_per_second": 629.914 }, { "epoch": 1.8435035284350354, "grad_norm": 0.3834775040584206, "learning_rate": 1.6340974083031523e-05, "loss": 0.2827261686325073, "num_input_tokens_seen": 116505576, "step": 556, "train_runtime": 184926.7666, "train_tokens_per_second": 630.009 }, { "epoch": 1.8468244084682441, "grad_norm": 0.3721924207870452, "learning_rate": 1.6259703672977022e-05, "loss": 0.2710084617137909, "num_input_tokens_seen": 116717448, "step": 557, "train_runtime": 185226.7062, "train_tokens_per_second": 630.133 }, { "epoch": 1.8501452885014529, "grad_norm": 0.3828107858439394, "learning_rate": 1.6178538354780893e-05, "loss": 0.27030086517333984, "num_input_tokens_seen": 116920664, "step": 558, "train_runtime": 185542.8205, "train_tokens_per_second": 630.155 }, { "epoch": 1.8534661685346618, "grad_norm": 0.3812792809833092, "learning_rate": 1.6097479104361326e-05, "loss": 0.2695692479610443, "num_input_tokens_seen": 117140760, "step": 559, "train_runtime": 185850.2124, "train_tokens_per_second": 630.297 }, { "epoch": 1.8567870485678704, "grad_norm": 0.3882432744182119, "learning_rate": 1.601652689636116e-05, "loss": 0.27516889572143555, "num_input_tokens_seen": 117351832, "step": 560, "train_runtime": 186150.3279, "train_tokens_per_second": 630.414 }, { "epoch": 1.8601079286010793, "grad_norm": 0.37850691912255713, "learning_rate": 1.5935682704136183e-05, "loss": 0.28343820571899414, "num_input_tokens_seen": 117566432, "step": 561, "train_runtime": 186463.961, "train_tokens_per_second": 630.505 }, { "epoch": 1.863428808634288, "grad_norm": 0.3599044469586087, "learning_rate": 1.5854947499743413e-05, "loss": 0.2708832621574402, "num_input_tokens_seen": 117779696, "step": 562, "train_runtime": 186796.4325, "train_tokens_per_second": 630.524 }, { "epoch": 1.8667496886674968, "grad_norm": 0.3790137723832578, "learning_rate": 1.5774322253929425e-05, "loss": 0.2764652669429779, "num_input_tokens_seen": 117986056, "step": 563, "train_runtime": 187143.7685, "train_tokens_per_second": 630.457 }, { "epoch": 1.8700705687007058, "grad_norm": 0.37083013670816406, "learning_rate": 1.569380793611865e-05, "loss": 0.26546573638916016, "num_input_tokens_seen": 118194176, "step": 564, "train_runtime": 187475.894, "train_tokens_per_second": 630.45 }, { "epoch": 1.8733914487339145, "grad_norm": 0.3726751518607998, "learning_rate": 1.561340551440176e-05, "loss": 0.26806896924972534, "num_input_tokens_seen": 118404512, "step": 565, "train_runtime": 187784.683, "train_tokens_per_second": 630.533 }, { "epoch": 1.8767123287671232, "grad_norm": 0.37796903161543655, "learning_rate": 1.5533115955523997e-05, "loss": 0.26904433965682983, "num_input_tokens_seen": 118616144, "step": 566, "train_runtime": 188114.7407, "train_tokens_per_second": 630.552 }, { "epoch": 1.8800332088003322, "grad_norm": 0.38277993753024714, "learning_rate": 1.5452940224873568e-05, "loss": 0.27207452058792114, "num_input_tokens_seen": 118829696, "step": 567, "train_runtime": 188427.1148, "train_tokens_per_second": 630.64 }, { "epoch": 1.883354088833541, "grad_norm": 0.37597987475264233, "learning_rate": 1.537287928647002e-05, "loss": 0.27362871170043945, "num_input_tokens_seen": 119040032, "step": 568, "train_runtime": 188731.7304, "train_tokens_per_second": 630.737 }, { "epoch": 1.8866749688667497, "grad_norm": 0.38620978393771505, "learning_rate": 1.529293410295266e-05, "loss": 0.26967093348503113, "num_input_tokens_seen": 119241752, "step": 569, "train_runtime": 189049.9498, "train_tokens_per_second": 630.742 }, { "epoch": 1.8899958488999586, "grad_norm": 0.3861342591307419, "learning_rate": 1.521310563556898e-05, "loss": 0.2625196576118469, "num_input_tokens_seen": 119447200, "step": 570, "train_runtime": 189371.6636, "train_tokens_per_second": 630.755 }, { "epoch": 1.8933167289331672, "grad_norm": 0.37503722305541515, "learning_rate": 1.5133394844163093e-05, "loss": 0.2724230885505676, "num_input_tokens_seen": 119661648, "step": 571, "train_runtime": 189711.2613, "train_tokens_per_second": 630.757 }, { "epoch": 1.8966376089663761, "grad_norm": 0.3970290058882431, "learning_rate": 1.5053802687164195e-05, "loss": 0.2689951956272125, "num_input_tokens_seen": 119862688, "step": 572, "train_runtime": 190007.5289, "train_tokens_per_second": 630.831 }, { "epoch": 1.8999584889995849, "grad_norm": 0.3842879026973002, "learning_rate": 1.4974330121575048e-05, "loss": 0.269534707069397, "num_input_tokens_seen": 120076592, "step": 573, "train_runtime": 190321.822, "train_tokens_per_second": 630.913 }, { "epoch": 1.9032793690327936, "grad_norm": 0.3825986231028772, "learning_rate": 1.489497810296046e-05, "loss": 0.2766106128692627, "num_input_tokens_seen": 120289424, "step": 574, "train_runtime": 190622.3003, "train_tokens_per_second": 631.035 }, { "epoch": 1.9066002490660026, "grad_norm": 0.3770548834795073, "learning_rate": 1.4815747585435801e-05, "loss": 0.2619192600250244, "num_input_tokens_seen": 120493336, "step": 575, "train_runtime": 190943.7425, "train_tokens_per_second": 631.041 }, { "epoch": 1.9099211290992113, "grad_norm": 0.38167820123910423, "learning_rate": 1.4736639521655548e-05, "loss": 0.26499634981155396, "num_input_tokens_seen": 120701960, "step": 576, "train_runtime": 191282.2734, "train_tokens_per_second": 631.015 }, { "epoch": 1.91324200913242, "grad_norm": 0.38360371992667747, "learning_rate": 1.4657654862801798e-05, "loss": 0.274825781583786, "num_input_tokens_seen": 120911368, "step": 577, "train_runtime": 191597.1944, "train_tokens_per_second": 631.071 }, { "epoch": 1.916562889165629, "grad_norm": 0.37193752448009604, "learning_rate": 1.457879455857285e-05, "loss": 0.2685317397117615, "num_input_tokens_seen": 121117264, "step": 578, "train_runtime": 191916.0044, "train_tokens_per_second": 631.095 }, { "epoch": 1.9198837691988377, "grad_norm": 0.38952524433807784, "learning_rate": 1.4500059557171791e-05, "loss": 0.26968395709991455, "num_input_tokens_seen": 121323664, "step": 579, "train_runtime": 192240.8755, "train_tokens_per_second": 631.102 }, { "epoch": 1.9232046492320465, "grad_norm": 0.3816441067383408, "learning_rate": 1.4421450805295084e-05, "loss": 0.2712090015411377, "num_input_tokens_seen": 121530784, "step": 580, "train_runtime": 192569.065, "train_tokens_per_second": 631.102 }, { "epoch": 1.9265255292652554, "grad_norm": 0.3752982544261334, "learning_rate": 1.4342969248121185e-05, "loss": 0.26714205741882324, "num_input_tokens_seen": 121738640, "step": 581, "train_runtime": 192876.9413, "train_tokens_per_second": 631.173 }, { "epoch": 1.929846409298464, "grad_norm": 0.37603135382712444, "learning_rate": 1.4264615829299188e-05, "loss": 0.280328631401062, "num_input_tokens_seen": 121953480, "step": 582, "train_runtime": 193219.7305, "train_tokens_per_second": 631.165 }, { "epoch": 1.933167289331673, "grad_norm": 0.3850254140353768, "learning_rate": 1.4186391490937481e-05, "loss": 0.26496070623397827, "num_input_tokens_seen": 122162464, "step": 583, "train_runtime": 193556.6084, "train_tokens_per_second": 631.146 }, { "epoch": 1.9364881693648817, "grad_norm": 0.3807975957584195, "learning_rate": 1.4108297173592377e-05, "loss": 0.2688587009906769, "num_input_tokens_seen": 122371184, "step": 584, "train_runtime": 193891.1368, "train_tokens_per_second": 631.133 }, { "epoch": 1.9398090493980904, "grad_norm": 0.3760259474184969, "learning_rate": 1.403033381625688e-05, "loss": 0.2745931148529053, "num_input_tokens_seen": 122583280, "step": 585, "train_runtime": 194269.5928, "train_tokens_per_second": 630.996 }, { "epoch": 1.9431299294312994, "grad_norm": 0.3721931257891272, "learning_rate": 1.3952502356349323e-05, "loss": 0.26626116037368774, "num_input_tokens_seen": 122794264, "step": 586, "train_runtime": 194604.2884, "train_tokens_per_second": 630.995 }, { "epoch": 1.946450809464508, "grad_norm": 0.3668922483229867, "learning_rate": 1.3874803729702141e-05, "loss": 0.27492424845695496, "num_input_tokens_seen": 123009488, "step": 587, "train_runtime": 194976.6246, "train_tokens_per_second": 630.894 }, { "epoch": 1.9497716894977168, "grad_norm": 0.3743082727209862, "learning_rate": 1.37972388705506e-05, "loss": 0.26660919189453125, "num_input_tokens_seen": 123220168, "step": 588, "train_runtime": 195324.445, "train_tokens_per_second": 630.849 }, { "epoch": 1.9530925695309258, "grad_norm": 0.375302607256055, "learning_rate": 1.3719808711521573e-05, "loss": 0.27515602111816406, "num_input_tokens_seen": 123430568, "step": 589, "train_runtime": 195667.8156, "train_tokens_per_second": 630.817 }, { "epoch": 1.9564134495641345, "grad_norm": 0.3849508126339702, "learning_rate": 1.3642514183622313e-05, "loss": 0.2688959240913391, "num_input_tokens_seen": 123634144, "step": 590, "train_runtime": 195982.5861, "train_tokens_per_second": 630.842 }, { "epoch": 1.9597343295973433, "grad_norm": 0.3810955768458646, "learning_rate": 1.3565356216229268e-05, "loss": 0.2695107161998749, "num_input_tokens_seen": 123843456, "step": 591, "train_runtime": 196294.7378, "train_tokens_per_second": 630.906 }, { "epoch": 1.9630552096305522, "grad_norm": 0.36799976158944836, "learning_rate": 1.3488335737076912e-05, "loss": 0.2678720951080322, "num_input_tokens_seen": 124054272, "step": 592, "train_runtime": 196613.9364, "train_tokens_per_second": 630.954 }, { "epoch": 1.9663760896637608, "grad_norm": 0.3769928489692025, "learning_rate": 1.341145367224657e-05, "loss": 0.2621110677719116, "num_input_tokens_seen": 124262952, "step": 593, "train_runtime": 196926.2755, "train_tokens_per_second": 631.013 }, { "epoch": 1.9696969696969697, "grad_norm": 0.37663470393870635, "learning_rate": 1.3334710946155326e-05, "loss": 0.2641463875770569, "num_input_tokens_seen": 124473792, "step": 594, "train_runtime": 197245.3458, "train_tokens_per_second": 631.061 }, { "epoch": 1.9730178497301785, "grad_norm": 0.3684154176192193, "learning_rate": 1.3258108481544849e-05, "loss": 0.26903510093688965, "num_input_tokens_seen": 124680848, "step": 595, "train_runtime": 197573.5124, "train_tokens_per_second": 631.061 }, { "epoch": 1.9763387297633872, "grad_norm": 0.3718947258064233, "learning_rate": 1.3181647199470331e-05, "loss": 0.2604109048843384, "num_input_tokens_seen": 124886640, "step": 596, "train_runtime": 197891.9325, "train_tokens_per_second": 631.085 }, { "epoch": 1.9796596097965962, "grad_norm": 0.38872092124924656, "learning_rate": 1.310532801928942e-05, "loss": 0.270403116941452, "num_input_tokens_seen": 125091376, "step": 597, "train_runtime": 198227.7982, "train_tokens_per_second": 631.049 }, { "epoch": 1.982980489829805, "grad_norm": 0.38014530072592223, "learning_rate": 1.3029151858651146e-05, "loss": 0.264536589384079, "num_input_tokens_seen": 125303336, "step": 598, "train_runtime": 198570.2561, "train_tokens_per_second": 631.028 }, { "epoch": 1.9863013698630136, "grad_norm": 0.36776922404626744, "learning_rate": 1.2953119633484903e-05, "loss": 0.2670980393886566, "num_input_tokens_seen": 125519280, "step": 599, "train_runtime": 198894.4079, "train_tokens_per_second": 631.085 }, { "epoch": 1.9896222498962226, "grad_norm": 0.37999706814092754, "learning_rate": 1.2877232257989422e-05, "loss": 0.2673285901546478, "num_input_tokens_seen": 125730432, "step": 600, "train_runtime": 199185.8149, "train_tokens_per_second": 631.222 }, { "epoch": 1.9929431299294313, "grad_norm": 0.3828029194583069, "learning_rate": 1.2801490644621789e-05, "loss": 0.26510292291641235, "num_input_tokens_seen": 125941760, "step": 601, "train_runtime": 199493.734, "train_tokens_per_second": 631.307 }, { "epoch": 1.99626400996264, "grad_norm": 0.3782677827989024, "learning_rate": 1.2725895704086486e-05, "loss": 0.27245032787323, "num_input_tokens_seen": 126151928, "step": 602, "train_runtime": 199803.6185, "train_tokens_per_second": 631.38 }, { "epoch": 1.999584889995849, "grad_norm": 0.38669382273954384, "learning_rate": 1.2650448345324384e-05, "loss": 0.2708059549331665, "num_input_tokens_seen": 126362392, "step": 603, "train_runtime": 200148.1802, "train_tokens_per_second": 631.344 }, { "epoch": 2.0, "grad_norm": 0.9455262890635227, "learning_rate": 1.257514947550189e-05, "loss": 0.2280811071395874, "num_input_tokens_seen": 126389368, "step": 604, "train_runtime": 200199.6656, "train_tokens_per_second": 631.317 }, { "epoch": 2.003320880033209, "grad_norm": 0.503499406395775, "learning_rate": 1.2500000000000006e-05, "loss": 0.14720846712589264, "num_input_tokens_seen": 126602256, "step": 605, "train_runtime": 200507.9792, "train_tokens_per_second": 631.408 }, { "epoch": 2.0066417600664175, "grad_norm": 0.44291369958541366, "learning_rate": 1.2425000822403423e-05, "loss": 0.146211177110672, "num_input_tokens_seen": 126820280, "step": 606, "train_runtime": 200827.7244, "train_tokens_per_second": 631.488 }, { "epoch": 2.0099626400996264, "grad_norm": 0.3734408131227805, "learning_rate": 1.235015284448969e-05, "loss": 0.1337592899799347, "num_input_tokens_seen": 127031320, "step": 607, "train_runtime": 201164.0223, "train_tokens_per_second": 631.481 }, { "epoch": 2.0132835201328354, "grad_norm": 0.488767540664872, "learning_rate": 1.2275456966218346e-05, "loss": 0.14277178049087524, "num_input_tokens_seen": 127240280, "step": 608, "train_runtime": 201484.6667, "train_tokens_per_second": 631.513 }, { "epoch": 2.016604400166044, "grad_norm": 0.5885489590537384, "learning_rate": 1.2200914085720119e-05, "loss": 0.13723507523536682, "num_input_tokens_seen": 127451832, "step": 609, "train_runtime": 201800.326, "train_tokens_per_second": 631.574 }, { "epoch": 2.019925280199253, "grad_norm": 0.513892326140165, "learning_rate": 1.212652509928611e-05, "loss": 0.13196174800395966, "num_input_tokens_seen": 127657568, "step": 610, "train_runtime": 202129.6815, "train_tokens_per_second": 631.563 }, { "epoch": 2.0232461602324614, "grad_norm": 0.4198464098914398, "learning_rate": 1.2052290901357025e-05, "loss": 0.13456305861473083, "num_input_tokens_seen": 127867344, "step": 611, "train_runtime": 202476.12, "train_tokens_per_second": 631.518 }, { "epoch": 2.0265670402656704, "grad_norm": 0.3829409400701453, "learning_rate": 1.1978212384512422e-05, "loss": 0.1359504908323288, "num_input_tokens_seen": 128079744, "step": 612, "train_runtime": 202820.2042, "train_tokens_per_second": 631.494 }, { "epoch": 2.0298879202988793, "grad_norm": 0.38764733507841426, "learning_rate": 1.1904290439459973e-05, "loss": 0.13352707028388977, "num_input_tokens_seen": 128290560, "step": 613, "train_runtime": 203158.7776, "train_tokens_per_second": 631.479 }, { "epoch": 2.033208800332088, "grad_norm": 0.39171889926997033, "learning_rate": 1.183052595502476e-05, "loss": 0.13147719204425812, "num_input_tokens_seen": 128500784, "step": 614, "train_runtime": 203455.9827, "train_tokens_per_second": 631.59 }, { "epoch": 2.036529680365297, "grad_norm": 0.4048512174658364, "learning_rate": 1.175691981813858e-05, "loss": 0.13384553790092468, "num_input_tokens_seen": 128715488, "step": 615, "train_runtime": 203783.0797, "train_tokens_per_second": 631.63 }, { "epoch": 2.0398505603985058, "grad_norm": 0.3995577109493015, "learning_rate": 1.1683472913829285e-05, "loss": 0.13982361555099487, "num_input_tokens_seen": 128934936, "step": 616, "train_runtime": 204117.572, "train_tokens_per_second": 631.67 }, { "epoch": 2.0431714404317143, "grad_norm": 0.4037778214441853, "learning_rate": 1.1610186125210151e-05, "loss": 0.13187921047210693, "num_input_tokens_seen": 129143952, "step": 617, "train_runtime": 204434.4613, "train_tokens_per_second": 631.713 }, { "epoch": 2.0464923204649232, "grad_norm": 0.40775315358457154, "learning_rate": 1.1537060333469244e-05, "loss": 0.12840893864631653, "num_input_tokens_seen": 129356664, "step": 618, "train_runtime": 204745.149, "train_tokens_per_second": 631.794 }, { "epoch": 2.049813200498132, "grad_norm": 0.4125151505297087, "learning_rate": 1.1464096417858822e-05, "loss": 0.1280813217163086, "num_input_tokens_seen": 129569792, "step": 619, "train_runtime": 205054.9783, "train_tokens_per_second": 631.878 }, { "epoch": 2.0531340805313407, "grad_norm": 0.4109874724335426, "learning_rate": 1.1391295255684774e-05, "loss": 0.1295657753944397, "num_input_tokens_seen": 129779096, "step": 620, "train_runtime": 205348.2585, "train_tokens_per_second": 631.995 }, { "epoch": 2.0564549605645497, "grad_norm": 0.4063788675749542, "learning_rate": 1.1318657722296097e-05, "loss": 0.12441350519657135, "num_input_tokens_seen": 129986448, "step": 621, "train_runtime": 205637.7102, "train_tokens_per_second": 632.114 }, { "epoch": 2.059775840597758, "grad_norm": 0.4107197573840167, "learning_rate": 1.1246184691074316e-05, "loss": 0.12921908497810364, "num_input_tokens_seen": 130188480, "step": 622, "train_runtime": 205932.9856, "train_tokens_per_second": 632.189 }, { "epoch": 2.063096720630967, "grad_norm": 0.3817649939064075, "learning_rate": 1.1173877033422994e-05, "loss": 0.1297408789396286, "num_input_tokens_seen": 130405896, "step": 623, "train_runtime": 206243.0794, "train_tokens_per_second": 632.292 }, { "epoch": 2.066417600664176, "grad_norm": 0.3824595017479894, "learning_rate": 1.1101735618757304e-05, "loss": 0.12841525673866272, "num_input_tokens_seen": 130609104, "step": 624, "train_runtime": 206567.5617, "train_tokens_per_second": 632.283 }, { "epoch": 2.0697384806973846, "grad_norm": 0.3746527263326342, "learning_rate": 1.102976131449352e-05, "loss": 0.13575738668441772, "num_input_tokens_seen": 130821816, "step": 625, "train_runtime": 206875.9086, "train_tokens_per_second": 632.369 }, { "epoch": 2.0730593607305936, "grad_norm": 0.36414714131560977, "learning_rate": 1.0957954986038621e-05, "loss": 0.13225172460079193, "num_input_tokens_seen": 131031680, "step": 626, "train_runtime": 207197.1763, "train_tokens_per_second": 632.401 }, { "epoch": 2.0763802407638026, "grad_norm": 0.3751954715624456, "learning_rate": 1.0886317496779863e-05, "loss": 0.1296905130147934, "num_input_tokens_seen": 131242200, "step": 627, "train_runtime": 207498.0016, "train_tokens_per_second": 632.499 }, { "epoch": 2.079701120797011, "grad_norm": 0.3673546566473482, "learning_rate": 1.0814849708074415e-05, "loss": 0.1264619529247284, "num_input_tokens_seen": 131452008, "step": 628, "train_runtime": 207774.2185, "train_tokens_per_second": 632.668 }, { "epoch": 2.08302200083022, "grad_norm": 0.367740743153899, "learning_rate": 1.0743552479238994e-05, "loss": 0.12316682934761047, "num_input_tokens_seen": 131653696, "step": 629, "train_runtime": 208070.4665, "train_tokens_per_second": 632.736 }, { "epoch": 2.086342880863429, "grad_norm": 0.3778466193224955, "learning_rate": 1.0672426667539526e-05, "loss": 0.12878412008285522, "num_input_tokens_seen": 131863576, "step": 630, "train_runtime": 208371.915, "train_tokens_per_second": 632.828 }, { "epoch": 2.0896637608966375, "grad_norm": 0.3936663664603693, "learning_rate": 1.0601473128180855e-05, "loss": 0.12377721071243286, "num_input_tokens_seen": 132070736, "step": 631, "train_runtime": 208666.4958, "train_tokens_per_second": 632.927 }, { "epoch": 2.0929846409298465, "grad_norm": 0.391588626563487, "learning_rate": 1.053069271429645e-05, "loss": 0.13047188520431519, "num_input_tokens_seen": 132279224, "step": 632, "train_runtime": 208951.8447, "train_tokens_per_second": 633.061 }, { "epoch": 2.096305520963055, "grad_norm": 0.3935767741163043, "learning_rate": 1.0460086276938144e-05, "loss": 0.12644770741462708, "num_input_tokens_seen": 132486664, "step": 633, "train_runtime": 209235.6232, "train_tokens_per_second": 633.194 }, { "epoch": 2.099626400996264, "grad_norm": 0.39908062956615975, "learning_rate": 1.038965466506591e-05, "loss": 0.12722255289554596, "num_input_tokens_seen": 132694584, "step": 634, "train_runtime": 209540.1607, "train_tokens_per_second": 633.266 }, { "epoch": 2.102947281029473, "grad_norm": 0.4470593052332689, "learning_rate": 1.0319398725537644e-05, "loss": 0.12794266641139984, "num_input_tokens_seen": 132902824, "step": 635, "train_runtime": 209810.5568, "train_tokens_per_second": 633.442 }, { "epoch": 2.1062681610626814, "grad_norm": 0.3838136594421241, "learning_rate": 1.0249319303098992e-05, "loss": 0.12236585468053818, "num_input_tokens_seen": 133107528, "step": 636, "train_runtime": 210120.0226, "train_tokens_per_second": 633.483 }, { "epoch": 2.1095890410958904, "grad_norm": 0.3786485483337407, "learning_rate": 1.0179417240373183e-05, "loss": 0.1218290850520134, "num_input_tokens_seen": 133317608, "step": 637, "train_runtime": 210430.119, "train_tokens_per_second": 633.548 }, { "epoch": 2.1129099211290994, "grad_norm": 0.3798167553553246, "learning_rate": 1.0109693377850906e-05, "loss": 0.13139528036117554, "num_input_tokens_seen": 133527544, "step": 638, "train_runtime": 210762.5611, "train_tokens_per_second": 633.545 }, { "epoch": 2.116230801162308, "grad_norm": 0.3615951520574017, "learning_rate": 1.0040148553880205e-05, "loss": 0.12472192198038101, "num_input_tokens_seen": 133737328, "step": 639, "train_runtime": 211077.1369, "train_tokens_per_second": 633.595 }, { "epoch": 2.119551681195517, "grad_norm": 0.3639705522217428, "learning_rate": 9.970783604656383e-06, "loss": 0.11827680468559265, "num_input_tokens_seen": 133941736, "step": 640, "train_runtime": 211403.4449, "train_tokens_per_second": 633.584 }, { "epoch": 2.122872561228726, "grad_norm": 0.3639667352588535, "learning_rate": 9.90159936421197e-06, "loss": 0.1281852126121521, "num_input_tokens_seen": 134152784, "step": 641, "train_runtime": 211741.3889, "train_tokens_per_second": 633.569 }, { "epoch": 2.1261934412619343, "grad_norm": 0.3680038834064029, "learning_rate": 9.832596664406674e-06, "loss": 0.13093870878219604, "num_input_tokens_seen": 134363040, "step": 642, "train_runtime": 212076.6139, "train_tokens_per_second": 633.559 }, { "epoch": 2.1295143212951433, "grad_norm": 0.38466107879590267, "learning_rate": 9.763776334917399e-06, "loss": 0.12560278177261353, "num_input_tokens_seen": 134569288, "step": 643, "train_runtime": 212414.3293, "train_tokens_per_second": 633.523 }, { "epoch": 2.1328352013283522, "grad_norm": 0.3685339187662748, "learning_rate": 9.695139203228254e-06, "loss": 0.11842095851898193, "num_input_tokens_seen": 134775328, "step": 644, "train_runtime": 212737.0275, "train_tokens_per_second": 633.53 }, { "epoch": 2.1361560813615608, "grad_norm": 0.3848228178292942, "learning_rate": 9.626686094620608e-06, "loss": 0.12078073620796204, "num_input_tokens_seen": 134981464, "step": 645, "train_runtime": 213025.418, "train_tokens_per_second": 633.64 }, { "epoch": 2.1394769613947697, "grad_norm": 0.37949459616307946, "learning_rate": 9.558417832163163e-06, "loss": 0.12525677680969238, "num_input_tokens_seen": 135192344, "step": 646, "train_runtime": 213341.2088, "train_tokens_per_second": 633.691 }, { "epoch": 2.1427978414279782, "grad_norm": 0.3790656145415118, "learning_rate": 9.490335236702075e-06, "loss": 0.12024770677089691, "num_input_tokens_seen": 135399096, "step": 647, "train_runtime": 213672.1221, "train_tokens_per_second": 633.677 }, { "epoch": 2.146118721461187, "grad_norm": 0.37759959324163284, "learning_rate": 9.422439126851058e-06, "loss": 0.13157103955745697, "num_input_tokens_seen": 135614416, "step": 648, "train_runtime": 213987.9729, "train_tokens_per_second": 633.748 }, { "epoch": 2.149439601494396, "grad_norm": 0.3801151191005857, "learning_rate": 9.354730318981562e-06, "loss": 0.12051853537559509, "num_input_tokens_seen": 135817856, "step": 649, "train_runtime": 214293.0192, "train_tokens_per_second": 633.795 }, { "epoch": 2.1527604815276047, "grad_norm": 0.3780279312643778, "learning_rate": 9.28720962721294e-06, "loss": 0.12102018296718597, "num_input_tokens_seen": 136023912, "step": 650, "train_runtime": 214586.7558, "train_tokens_per_second": 633.888 }, { "epoch": 2.1560813615608136, "grad_norm": 0.37824643566515065, "learning_rate": 9.219877863402682e-06, "loss": 0.12422362715005875, "num_input_tokens_seen": 136229280, "step": 651, "train_runtime": 214861.7599, "train_tokens_per_second": 634.032 }, { "epoch": 2.1594022415940226, "grad_norm": 0.3780404370024509, "learning_rate": 9.152735837136631e-06, "loss": 0.123842254281044, "num_input_tokens_seen": 136439960, "step": 652, "train_runtime": 215169.6733, "train_tokens_per_second": 634.104 }, { "epoch": 2.162723121627231, "grad_norm": 0.3837601332175419, "learning_rate": 9.085784355719257e-06, "loss": 0.13066406548023224, "num_input_tokens_seen": 136655968, "step": 653, "train_runtime": 215467.5059, "train_tokens_per_second": 634.23 }, { "epoch": 2.16604400166044, "grad_norm": 0.3743999683069437, "learning_rate": 9.019024224163954e-06, "loss": 0.12675920128822327, "num_input_tokens_seen": 136868616, "step": 654, "train_runtime": 215770.2812, "train_tokens_per_second": 634.326 }, { "epoch": 2.1693648816936486, "grad_norm": 0.37594664428924573, "learning_rate": 8.95245624518336e-06, "loss": 0.11937602609395981, "num_input_tokens_seen": 137078416, "step": 655, "train_runtime": 216065.0878, "train_tokens_per_second": 634.431 }, { "epoch": 2.1726857617268576, "grad_norm": 0.39083273830218573, "learning_rate": 8.886081219179695e-06, "loss": 0.13150331377983093, "num_input_tokens_seen": 137286352, "step": 656, "train_runtime": 216353.3137, "train_tokens_per_second": 634.547 }, { "epoch": 2.1760066417600665, "grad_norm": 0.3797931321310071, "learning_rate": 8.819899944235152e-06, "loss": 0.11932653933763504, "num_input_tokens_seen": 137487016, "step": 657, "train_runtime": 216659.3202, "train_tokens_per_second": 634.577 }, { "epoch": 2.179327521793275, "grad_norm": 0.3815464594430003, "learning_rate": 8.753913216102286e-06, "loss": 0.13181953132152557, "num_input_tokens_seen": 137697608, "step": 658, "train_runtime": 216959.0221, "train_tokens_per_second": 634.671 }, { "epoch": 2.182648401826484, "grad_norm": 0.3757109939583706, "learning_rate": 8.688121828194462e-06, "loss": 0.12009871006011963, "num_input_tokens_seen": 137905256, "step": 659, "train_runtime": 217302.1897, "train_tokens_per_second": 634.624 }, { "epoch": 2.185969281859693, "grad_norm": 0.3773812921618385, "learning_rate": 8.622526571576303e-06, "loss": 0.12502983212471008, "num_input_tokens_seen": 138115040, "step": 660, "train_runtime": 217651.2411, "train_tokens_per_second": 634.57 }, { "epoch": 2.1892901618929015, "grad_norm": 0.38573587010437504, "learning_rate": 8.55712823495419e-06, "loss": 0.12703870236873627, "num_input_tokens_seen": 138329112, "step": 661, "train_runtime": 217978.6358, "train_tokens_per_second": 634.599 }, { "epoch": 2.1926110419261104, "grad_norm": 0.38967375574237995, "learning_rate": 8.491927604666746e-06, "loss": 0.12845154106616974, "num_input_tokens_seen": 138539944, "step": 662, "train_runtime": 218307.436, "train_tokens_per_second": 634.609 }, { "epoch": 2.1959319219593194, "grad_norm": 0.38248492087671704, "learning_rate": 8.426925464675433e-06, "loss": 0.12254674732685089, "num_input_tokens_seen": 138743224, "step": 663, "train_runtime": 218601.8059, "train_tokens_per_second": 634.685 }, { "epoch": 2.199252801992528, "grad_norm": 0.38579888536749773, "learning_rate": 8.362122596555089e-06, "loss": 0.12825888395309448, "num_input_tokens_seen": 138954016, "step": 664, "train_runtime": 218923.0354, "train_tokens_per_second": 634.716 }, { "epoch": 2.202573682025737, "grad_norm": 0.38181241379153796, "learning_rate": 8.297519779484541e-06, "loss": 0.12476789951324463, "num_input_tokens_seen": 139159424, "step": 665, "train_runtime": 219242.4853, "train_tokens_per_second": 634.728 }, { "epoch": 2.205894562058946, "grad_norm": 0.3767626032399304, "learning_rate": 8.233117790237237e-06, "loss": 0.12099689245223999, "num_input_tokens_seen": 139365496, "step": 666, "train_runtime": 219583.201, "train_tokens_per_second": 634.682 }, { "epoch": 2.2092154420921544, "grad_norm": 0.3848958959448556, "learning_rate": 8.168917403171891e-06, "loss": 0.1257515549659729, "num_input_tokens_seen": 139577896, "step": 667, "train_runtime": 219946.2471, "train_tokens_per_second": 634.6 }, { "epoch": 2.2125363221253633, "grad_norm": 0.3714779665706187, "learning_rate": 8.10491939022322e-06, "loss": 0.13241630792617798, "num_input_tokens_seen": 139793112, "step": 668, "train_runtime": 220321.4012, "train_tokens_per_second": 634.496 }, { "epoch": 2.215857202158572, "grad_norm": 0.37539376119360407, "learning_rate": 8.041124520892598e-06, "loss": 0.12756380438804626, "num_input_tokens_seen": 140007096, "step": 669, "train_runtime": 220664.8363, "train_tokens_per_second": 634.479 }, { "epoch": 2.219178082191781, "grad_norm": 0.3806435141451547, "learning_rate": 7.97753356223884e-06, "loss": 0.13113358616828918, "num_input_tokens_seen": 140218888, "step": 670, "train_runtime": 221028.5026, "train_tokens_per_second": 634.393 }, { "epoch": 2.2224989622249898, "grad_norm": 0.36824526681507525, "learning_rate": 7.91414727886898e-06, "loss": 0.12063190340995789, "num_input_tokens_seen": 140429592, "step": 671, "train_runtime": 221372.6314, "train_tokens_per_second": 634.358 }, { "epoch": 2.2258198422581983, "grad_norm": 0.37030506258088436, "learning_rate": 7.850966432929066e-06, "loss": 0.13294678926467896, "num_input_tokens_seen": 140649288, "step": 672, "train_runtime": 221705.9096, "train_tokens_per_second": 634.396 }, { "epoch": 2.2291407222914073, "grad_norm": 0.38015029536546435, "learning_rate": 7.787991784095e-06, "loss": 0.11828938126564026, "num_input_tokens_seen": 140856568, "step": 673, "train_runtime": 222021.6493, "train_tokens_per_second": 634.427 }, { "epoch": 2.232461602324616, "grad_norm": 0.3735478182768587, "learning_rate": 7.725224089563413e-06, "loss": 0.12606853246688843, "num_input_tokens_seen": 141066592, "step": 674, "train_runtime": 222329.3495, "train_tokens_per_second": 634.494 }, { "epoch": 2.2357824823578247, "grad_norm": 0.3985910108298429, "learning_rate": 7.662664104042538e-06, "loss": 0.12553736567497253, "num_input_tokens_seen": 141275784, "step": 675, "train_runtime": 222643.2327, "train_tokens_per_second": 634.539 }, { "epoch": 2.2391033623910337, "grad_norm": 0.3763261478437872, "learning_rate": 7.60031257974316e-06, "loss": 0.12592512369155884, "num_input_tokens_seen": 141482088, "step": 676, "train_runtime": 222956.7423, "train_tokens_per_second": 634.572 }, { "epoch": 2.242424242424242, "grad_norm": 0.3768803885548885, "learning_rate": 7.538170266369563e-06, "loss": 0.12118929624557495, "num_input_tokens_seen": 141691360, "step": 677, "train_runtime": 223271.1041, "train_tokens_per_second": 634.616 }, { "epoch": 2.245745122457451, "grad_norm": 0.3821948089316405, "learning_rate": 7.4762379111105036e-06, "loss": 0.12104073166847229, "num_input_tokens_seen": 141895520, "step": 678, "train_runtime": 223583.5166, "train_tokens_per_second": 634.642 }, { "epoch": 2.24906600249066, "grad_norm": 0.38666039614790676, "learning_rate": 7.414516258630244e-06, "loss": 0.12766128778457642, "num_input_tokens_seen": 142104912, "step": 679, "train_runtime": 223898.0001, "train_tokens_per_second": 634.686 }, { "epoch": 2.2523868825238686, "grad_norm": 0.38326206347645003, "learning_rate": 7.353006051059594e-06, "loss": 0.12323226034641266, "num_input_tokens_seen": 142312352, "step": 680, "train_runtime": 224227.5844, "train_tokens_per_second": 634.678 }, { "epoch": 2.2557077625570776, "grad_norm": 0.38942536875985306, "learning_rate": 7.291708027986988e-06, "loss": 0.12765762209892273, "num_input_tokens_seen": 142526896, "step": 681, "train_runtime": 224561.9042, "train_tokens_per_second": 634.689 }, { "epoch": 2.2590286425902866, "grad_norm": 0.395571788532061, "learning_rate": 7.230622926449565e-06, "loss": 0.13139531016349792, "num_input_tokens_seen": 142736840, "step": 682, "train_runtime": 224868.2636, "train_tokens_per_second": 634.758 }, { "epoch": 2.262349522623495, "grad_norm": 0.38931904441968135, "learning_rate": 7.169751480924361e-06, "loss": 0.13132360577583313, "num_input_tokens_seen": 142948784, "step": 683, "train_runtime": 225206.8649, "train_tokens_per_second": 634.744 }, { "epoch": 2.265670402656704, "grad_norm": 0.3800466150578536, "learning_rate": 7.109094423319431e-06, "loss": 0.12250486016273499, "num_input_tokens_seen": 143158976, "step": 684, "train_runtime": 225538.1054, "train_tokens_per_second": 634.744 }, { "epoch": 2.268991282689913, "grad_norm": 0.3880021200694696, "learning_rate": 7.048652482965079e-06, "loss": 0.12127131968736649, "num_input_tokens_seen": 143363296, "step": 685, "train_runtime": 225858.7677, "train_tokens_per_second": 634.748 }, { "epoch": 2.2723121627231215, "grad_norm": 0.3802160118175019, "learning_rate": 6.988426386605062e-06, "loss": 0.1265455186367035, "num_input_tokens_seen": 143574792, "step": 686, "train_runtime": 226193.7516, "train_tokens_per_second": 634.743 }, { "epoch": 2.2756330427563305, "grad_norm": 0.37770849932201084, "learning_rate": 6.928416858387874e-06, "loss": 0.12172548472881317, "num_input_tokens_seen": 143781224, "step": 687, "train_runtime": 226512.1926, "train_tokens_per_second": 634.762 }, { "epoch": 2.2789539227895395, "grad_norm": 0.3835835182782973, "learning_rate": 6.868624619858022e-06, "loss": 0.1293189823627472, "num_input_tokens_seen": 143989600, "step": 688, "train_runtime": 226853.8187, "train_tokens_per_second": 634.724 }, { "epoch": 2.282274802822748, "grad_norm": 0.3883368484336215, "learning_rate": 6.809050389947363e-06, "loss": 0.12729492783546448, "num_input_tokens_seen": 144200256, "step": 689, "train_runtime": 227165.1131, "train_tokens_per_second": 634.782 }, { "epoch": 2.285595682855957, "grad_norm": 0.3716353931335373, "learning_rate": 6.749694884966454e-06, "loss": 0.12200905382633209, "num_input_tokens_seen": 144413704, "step": 690, "train_runtime": 227511.8879, "train_tokens_per_second": 634.752 }, { "epoch": 2.2889165628891655, "grad_norm": 0.3816511115450468, "learning_rate": 6.690558818595943e-06, "loss": 0.12561368942260742, "num_input_tokens_seen": 144616560, "step": 691, "train_runtime": 227861.2424, "train_tokens_per_second": 634.669 }, { "epoch": 2.2922374429223744, "grad_norm": 0.38455477712820885, "learning_rate": 6.6316429018779825e-06, "loss": 0.1299750804901123, "num_input_tokens_seen": 144824744, "step": 692, "train_runtime": 228174.0823, "train_tokens_per_second": 634.712 }, { "epoch": 2.2955583229555834, "grad_norm": 0.3782526029260286, "learning_rate": 6.5729478432076775e-06, "loss": 0.12165538966655731, "num_input_tokens_seen": 145030952, "step": 693, "train_runtime": 228507.5071, "train_tokens_per_second": 634.688 }, { "epoch": 2.298879202988792, "grad_norm": 0.3760763187392993, "learning_rate": 6.514474348324581e-06, "loss": 0.12155309319496155, "num_input_tokens_seen": 145240848, "step": 694, "train_runtime": 228846.8244, "train_tokens_per_second": 634.664 }, { "epoch": 2.302200083022001, "grad_norm": 0.3762814861833192, "learning_rate": 6.456223120304192e-06, "loss": 0.12483226507902145, "num_input_tokens_seen": 145450912, "step": 695, "train_runtime": 229214.4375, "train_tokens_per_second": 634.563 }, { "epoch": 2.30552096305521, "grad_norm": 0.38500731079413186, "learning_rate": 6.398194859549525e-06, "loss": 0.1289762258529663, "num_input_tokens_seen": 145663512, "step": 696, "train_runtime": 229539.3848, "train_tokens_per_second": 634.59 }, { "epoch": 2.3088418430884183, "grad_norm": 0.3819002071684287, "learning_rate": 6.340390263782655e-06, "loss": 0.12747865915298462, "num_input_tokens_seen": 145877688, "step": 697, "train_runtime": 229865.2218, "train_tokens_per_second": 634.623 }, { "epoch": 2.3121627231216273, "grad_norm": 0.38375765201170237, "learning_rate": 6.2828100280363595e-06, "loss": 0.12134255468845367, "num_input_tokens_seen": 146083768, "step": 698, "train_runtime": 230164.6302, "train_tokens_per_second": 634.693 }, { "epoch": 2.315483603154836, "grad_norm": 0.3723931015972208, "learning_rate": 6.22545484464574e-06, "loss": 0.12224350869655609, "num_input_tokens_seen": 146294792, "step": 699, "train_runtime": 230464.2979, "train_tokens_per_second": 634.783 }, { "epoch": 2.3188044831880448, "grad_norm": 0.38020552868402735, "learning_rate": 6.168325403239913e-06, "loss": 0.12283925712108612, "num_input_tokens_seen": 146500808, "step": 700, "train_runtime": 230788.811, "train_tokens_per_second": 634.783 }, { "epoch": 2.3221253632212537, "grad_norm": 0.3814863870810348, "learning_rate": 6.111422390733715e-06, "loss": 0.12879249453544617, "num_input_tokens_seen": 146716448, "step": 701, "train_runtime": 231128.0777, "train_tokens_per_second": 634.784 }, { "epoch": 2.3254462432544623, "grad_norm": 0.3835473772645287, "learning_rate": 6.054746491319419e-06, "loss": 0.12382742762565613, "num_input_tokens_seen": 146924192, "step": 702, "train_runtime": 231460.0429, "train_tokens_per_second": 634.771 }, { "epoch": 2.328767123287671, "grad_norm": 0.37647558251406066, "learning_rate": 5.998298386458545e-06, "loss": 0.12401469051837921, "num_input_tokens_seen": 147135496, "step": 703, "train_runtime": 231780.0754, "train_tokens_per_second": 634.806 }, { "epoch": 2.33208800332088, "grad_norm": 0.3968390385810723, "learning_rate": 5.9420787548736535e-06, "loss": 0.12756630778312683, "num_input_tokens_seen": 147347768, "step": 704, "train_runtime": 232087.6551, "train_tokens_per_second": 634.88 }, { "epoch": 2.3354088833540887, "grad_norm": 0.3796473062984941, "learning_rate": 5.8860882725401665e-06, "loss": 0.12014050781726837, "num_input_tokens_seen": 147552896, "step": 705, "train_runtime": 232392.114, "train_tokens_per_second": 634.931 }, { "epoch": 2.3387297633872977, "grad_norm": 0.37195098759761314, "learning_rate": 5.8303276126782656e-06, "loss": 0.11618681252002716, "num_input_tokens_seen": 147755848, "step": 706, "train_runtime": 232713.9505, "train_tokens_per_second": 634.925 }, { "epoch": 2.3420506434205066, "grad_norm": 0.3881730182348608, "learning_rate": 5.774797445744781e-06, "loss": 0.1188001036643982, "num_input_tokens_seen": 147960896, "step": 707, "train_runtime": 233023.487, "train_tokens_per_second": 634.961 }, { "epoch": 2.345371523453715, "grad_norm": 0.37962667539403167, "learning_rate": 5.7194984394251335e-06, "loss": 0.11904694885015488, "num_input_tokens_seen": 148164464, "step": 708, "train_runtime": 233349.5759, "train_tokens_per_second": 634.946 }, { "epoch": 2.348692403486924, "grad_norm": 0.37715502729209555, "learning_rate": 5.664431258625305e-06, "loss": 0.12943054735660553, "num_input_tokens_seen": 148382640, "step": 709, "train_runtime": 233674.8168, "train_tokens_per_second": 634.996 }, { "epoch": 2.352013283520133, "grad_norm": 0.3864009312889271, "learning_rate": 5.609596565463854e-06, "loss": 0.12141716480255127, "num_input_tokens_seen": 148590856, "step": 710, "train_runtime": 233967.2424, "train_tokens_per_second": 635.093 }, { "epoch": 2.3553341635533416, "grad_norm": 0.3841684328085787, "learning_rate": 5.55499501926394e-06, "loss": 0.12018579244613647, "num_input_tokens_seen": 148799848, "step": 711, "train_runtime": 234306.6775, "train_tokens_per_second": 635.064 }, { "epoch": 2.3586550435865505, "grad_norm": 0.37967330978857217, "learning_rate": 5.500627276545406e-06, "loss": 0.12448924034833908, "num_input_tokens_seen": 149013464, "step": 712, "train_runtime": 234625.0443, "train_tokens_per_second": 635.113 }, { "epoch": 2.361975923619759, "grad_norm": 0.38230811653376046, "learning_rate": 5.446493991016879e-06, "loss": 0.1262572705745697, "num_input_tokens_seen": 149222856, "step": 713, "train_runtime": 234939.2602, "train_tokens_per_second": 635.155 }, { "epoch": 2.365296803652968, "grad_norm": 0.37419865820004283, "learning_rate": 5.392595813567911e-06, "loss": 0.12216039001941681, "num_input_tokens_seen": 149428648, "step": 714, "train_runtime": 235195.7226, "train_tokens_per_second": 635.337 }, { "epoch": 2.368617683686177, "grad_norm": 0.37598405353179193, "learning_rate": 5.338933392261159e-06, "loss": 0.12278061360120773, "num_input_tokens_seen": 149642104, "step": 715, "train_runtime": 235471.9263, "train_tokens_per_second": 635.499 }, { "epoch": 2.3719385637193855, "grad_norm": 0.3804834810779022, "learning_rate": 5.28550737232458e-06, "loss": 0.12263549864292145, "num_input_tokens_seen": 149850928, "step": 716, "train_runtime": 235739.1046, "train_tokens_per_second": 635.664 }, { "epoch": 2.3752594437525945, "grad_norm": 0.3764254437854335, "learning_rate": 5.23231839614369e-06, "loss": 0.1202726885676384, "num_input_tokens_seen": 150062656, "step": 717, "train_runtime": 236033.7817, "train_tokens_per_second": 635.768 }, { "epoch": 2.3785803237858034, "grad_norm": 0.3779017046829003, "learning_rate": 5.179367103253821e-06, "loss": 0.12209299206733704, "num_input_tokens_seen": 150272216, "step": 718, "train_runtime": 236304.7287, "train_tokens_per_second": 635.926 }, { "epoch": 2.381901203819012, "grad_norm": 0.37673211547518387, "learning_rate": 5.126654130332451e-06, "loss": 0.1252414733171463, "num_input_tokens_seen": 150486720, "step": 719, "train_runtime": 236591.9095, "train_tokens_per_second": 636.06 }, { "epoch": 2.385222083852221, "grad_norm": 0.3856472865299117, "learning_rate": 5.0741801111915235e-06, "loss": 0.12755653262138367, "num_input_tokens_seen": 150698968, "step": 720, "train_runtime": 236878.3572, "train_tokens_per_second": 636.187 }, { "epoch": 2.3885429638854294, "grad_norm": 0.3775058852372863, "learning_rate": 5.02194567676986e-06, "loss": 0.11924375593662262, "num_input_tokens_seen": 150907064, "step": 721, "train_runtime": 237144.5014, "train_tokens_per_second": 636.351 }, { "epoch": 2.3918638439186384, "grad_norm": 0.3771957183658441, "learning_rate": 4.9699514551255435e-06, "loss": 0.12659116089344025, "num_input_tokens_seen": 151119776, "step": 722, "train_runtime": 237405.4262, "train_tokens_per_second": 636.547 }, { "epoch": 2.3951847239518473, "grad_norm": 0.3782801978522151, "learning_rate": 4.918198071428382e-06, "loss": 0.11909331381320953, "num_input_tokens_seen": 151332480, "step": 723, "train_runtime": 237679.9769, "train_tokens_per_second": 636.707 }, { "epoch": 2.398505603985056, "grad_norm": 0.3841621533156874, "learning_rate": 4.866686147952387e-06, "loss": 0.12157981842756271, "num_input_tokens_seen": 151542384, "step": 724, "train_runtime": 237983.2199, "train_tokens_per_second": 636.778 }, { "epoch": 2.401826484018265, "grad_norm": 0.376207238727056, "learning_rate": 4.815416304068298e-06, "loss": 0.12464383989572525, "num_input_tokens_seen": 151756640, "step": 725, "train_runtime": 238260.3362, "train_tokens_per_second": 636.936 }, { "epoch": 2.405147364051474, "grad_norm": 0.37284145629622617, "learning_rate": 4.764389156236126e-06, "loss": 0.1196499764919281, "num_input_tokens_seen": 151966616, "step": 726, "train_runtime": 238541.4573, "train_tokens_per_second": 637.066 }, { "epoch": 2.4084682440846823, "grad_norm": 0.3774486232030363, "learning_rate": 4.713605317997741e-06, "loss": 0.12495848536491394, "num_input_tokens_seen": 152174208, "step": 727, "train_runtime": 238818.6849, "train_tokens_per_second": 637.196 }, { "epoch": 2.4117891241178913, "grad_norm": 0.38254492582394684, "learning_rate": 4.663065399969507e-06, "loss": 0.12677881121635437, "num_input_tokens_seen": 152387080, "step": 728, "train_runtime": 239127.3875, "train_tokens_per_second": 637.263 }, { "epoch": 2.4151100041511, "grad_norm": 0.3739579431874706, "learning_rate": 4.61277000983493e-06, "loss": 0.12012401223182678, "num_input_tokens_seen": 152594736, "step": 729, "train_runtime": 239444.1414, "train_tokens_per_second": 637.287 }, { "epoch": 2.4184308841843087, "grad_norm": 0.3813477058731373, "learning_rate": 4.5627197523373495e-06, "loss": 0.1194409653544426, "num_input_tokens_seen": 152799848, "step": 730, "train_runtime": 239727.1671, "train_tokens_per_second": 637.391 }, { "epoch": 2.4217517642175177, "grad_norm": 0.3749142524525424, "learning_rate": 4.51291522927268e-06, "loss": 0.12269122153520584, "num_input_tokens_seen": 153009880, "step": 731, "train_runtime": 239995.3157, "train_tokens_per_second": 637.554 }, { "epoch": 2.4250726442507267, "grad_norm": 0.38549035005080223, "learning_rate": 4.463357039482155e-06, "loss": 0.12360180914402008, "num_input_tokens_seen": 153220224, "step": 732, "train_runtime": 240281.4758, "train_tokens_per_second": 637.67 }, { "epoch": 2.428393524283935, "grad_norm": 0.3866879187410873, "learning_rate": 4.414045778845144e-06, "loss": 0.12754693627357483, "num_input_tokens_seen": 153431992, "step": 733, "train_runtime": 240543.8481, "train_tokens_per_second": 637.855 }, { "epoch": 2.431714404317144, "grad_norm": 0.36715085629177074, "learning_rate": 4.364982040271986e-06, "loss": 0.11305053532123566, "num_input_tokens_seen": 153635224, "step": 734, "train_runtime": 240817.5291, "train_tokens_per_second": 637.974 }, { "epoch": 2.4350352843503527, "grad_norm": 0.388827296996718, "learning_rate": 4.316166413696851e-06, "loss": 0.12115851789712906, "num_input_tokens_seen": 153842856, "step": 735, "train_runtime": 241074.8372, "train_tokens_per_second": 638.154 }, { "epoch": 2.4383561643835616, "grad_norm": 0.38113702852485387, "learning_rate": 4.267599486070647e-06, "loss": 0.13038942217826843, "num_input_tokens_seen": 154060056, "step": 736, "train_runtime": 241359.5714, "train_tokens_per_second": 638.301 }, { "epoch": 2.4416770444167706, "grad_norm": 0.38452926336521476, "learning_rate": 4.219281841353981e-06, "loss": 0.12545159459114075, "num_input_tokens_seen": 154271184, "step": 737, "train_runtime": 241630.8794, "train_tokens_per_second": 638.458 }, { "epoch": 2.444997924449979, "grad_norm": 0.3874233500907916, "learning_rate": 4.171214060510109e-06, "loss": 0.12204748392105103, "num_input_tokens_seen": 154479520, "step": 738, "train_runtime": 241906.4975, "train_tokens_per_second": 638.592 }, { "epoch": 2.448318804483188, "grad_norm": 0.37373513080889265, "learning_rate": 4.123396721497977e-06, "loss": 0.12843818962574005, "num_input_tokens_seen": 154697384, "step": 739, "train_runtime": 242185.2845, "train_tokens_per_second": 638.756 }, { "epoch": 2.451639684516397, "grad_norm": 0.3860716664674423, "learning_rate": 4.075830399265263e-06, "loss": 0.11923454701900482, "num_input_tokens_seen": 154904408, "step": 740, "train_runtime": 242460.0191, "train_tokens_per_second": 638.886 }, { "epoch": 2.4549605645496055, "grad_norm": 0.384814392006591, "learning_rate": 4.028515665741439e-06, "loss": 0.11705135554075241, "num_input_tokens_seen": 155112432, "step": 741, "train_runtime": 242722.0606, "train_tokens_per_second": 639.054 }, { "epoch": 2.4582814445828145, "grad_norm": 0.3749085550178212, "learning_rate": 3.981453089830936e-06, "loss": 0.1258704960346222, "num_input_tokens_seen": 155327680, "step": 742, "train_runtime": 243005.8203, "train_tokens_per_second": 639.193 }, { "epoch": 2.461602324616023, "grad_norm": 0.37678268493919825, "learning_rate": 3.934643237406291e-06, "loss": 0.12522853910923004, "num_input_tokens_seen": 155541672, "step": 743, "train_runtime": 243262.1324, "train_tokens_per_second": 639.399 }, { "epoch": 2.464923204649232, "grad_norm": 0.37343715745973255, "learning_rate": 3.8880866713013164e-06, "loss": 0.11793011426925659, "num_input_tokens_seen": 155748912, "step": 744, "train_runtime": 243505.9102, "train_tokens_per_second": 639.61 }, { "epoch": 2.468244084682441, "grad_norm": 0.38427828557189225, "learning_rate": 3.8417839513043645e-06, "loss": 0.1318972110748291, "num_input_tokens_seen": 155962840, "step": 745, "train_runtime": 243750.1513, "train_tokens_per_second": 639.847 }, { "epoch": 2.4715649647156495, "grad_norm": 0.3829305619718617, "learning_rate": 3.795735634151579e-06, "loss": 0.1243850439786911, "num_input_tokens_seen": 156173448, "step": 746, "train_runtime": 244019.3981, "train_tokens_per_second": 640.004 }, { "epoch": 2.4748858447488584, "grad_norm": 0.378411673102262, "learning_rate": 3.7499422735202127e-06, "loss": 0.11858029663562775, "num_input_tokens_seen": 156377712, "step": 747, "train_runtime": 244272.9566, "train_tokens_per_second": 640.176 }, { "epoch": 2.4782067247820674, "grad_norm": 0.3862465361494612, "learning_rate": 3.7044044200219564e-06, "loss": 0.12187186628580093, "num_input_tokens_seen": 156586872, "step": 748, "train_runtime": 244530.2862, "train_tokens_per_second": 640.358 }, { "epoch": 2.481527604815276, "grad_norm": 0.3784759650606849, "learning_rate": 3.6591226211963287e-06, "loss": 0.12624505162239075, "num_input_tokens_seen": 156803232, "step": 749, "train_runtime": 244795.9155, "train_tokens_per_second": 640.547 }, { "epoch": 2.484848484848485, "grad_norm": 0.38762454899250415, "learning_rate": 3.6140974215040953e-06, "loss": 0.12942159175872803, "num_input_tokens_seen": 157018288, "step": 750, "train_runtime": 245063.3382, "train_tokens_per_second": 640.725 }, { "epoch": 2.488169364881694, "grad_norm": 0.3783198316557568, "learning_rate": 3.5693293623207086e-06, "loss": 0.1220598816871643, "num_input_tokens_seen": 157228840, "step": 751, "train_runtime": 245339.0748, "train_tokens_per_second": 640.863 }, { "epoch": 2.4914902449149023, "grad_norm": 0.376634798875709, "learning_rate": 3.5248189819298086e-06, "loss": 0.12424576282501221, "num_input_tokens_seen": 157443528, "step": 752, "train_runtime": 245615.5461, "train_tokens_per_second": 641.016 }, { "epoch": 2.4948111249481113, "grad_norm": 0.3841368363540372, "learning_rate": 3.480566815516756e-06, "loss": 0.12430645525455475, "num_input_tokens_seen": 157654584, "step": 753, "train_runtime": 245875.7587, "train_tokens_per_second": 641.196 }, { "epoch": 2.4981320049813203, "grad_norm": 0.37734872871631897, "learning_rate": 3.4365733951621793e-06, "loss": 0.12024077773094177, "num_input_tokens_seen": 157865464, "step": 754, "train_runtime": 246133.9645, "train_tokens_per_second": 641.38 }, { "epoch": 2.501452885014529, "grad_norm": 0.3698194526439224, "learning_rate": 3.3928392498355916e-06, "loss": 0.11887156218290329, "num_input_tokens_seen": 158074976, "step": 755, "train_runtime": 246398.7028, "train_tokens_per_second": 641.541 }, { "epoch": 2.5047737650477377, "grad_norm": 0.38734769319141993, "learning_rate": 3.3493649053890326e-06, "loss": 0.11605098843574524, "num_input_tokens_seen": 158283656, "step": 756, "train_runtime": 246692.3184, "train_tokens_per_second": 641.624 }, { "epoch": 2.5080946450809467, "grad_norm": 0.3826152763897452, "learning_rate": 3.3061508845507323e-06, "loss": 0.12943193316459656, "num_input_tokens_seen": 158499472, "step": 757, "train_runtime": 246969.0766, "train_tokens_per_second": 641.779 }, { "epoch": 2.5114155251141552, "grad_norm": 0.3720357531762343, "learning_rate": 3.263197706918836e-06, "loss": 0.1205691397190094, "num_input_tokens_seen": 158712288, "step": 758, "train_runtime": 247221.0566, "train_tokens_per_second": 641.985 }, { "epoch": 2.514736405147364, "grad_norm": 0.36721399109513764, "learning_rate": 3.22050588895515e-06, "loss": 0.11933501064777374, "num_input_tokens_seen": 158924464, "step": 759, "train_runtime": 247462.0635, "train_tokens_per_second": 642.217 }, { "epoch": 2.5180572851805727, "grad_norm": 0.3795397365207489, "learning_rate": 3.1780759439789505e-06, "loss": 0.12427882850170135, "num_input_tokens_seen": 159138744, "step": 760, "train_runtime": 247740.7746, "train_tokens_per_second": 642.36 }, { "epoch": 2.5213781652137817, "grad_norm": 0.36873736499287635, "learning_rate": 3.135908382160771e-06, "loss": 0.12325683981180191, "num_input_tokens_seen": 159350696, "step": 761, "train_runtime": 248023.37, "train_tokens_per_second": 642.483 }, { "epoch": 2.52469904524699, "grad_norm": 0.38368658471610134, "learning_rate": 3.094003710516316e-06, "loss": 0.11813477426767349, "num_input_tokens_seen": 159556232, "step": 762, "train_runtime": 248321.2353, "train_tokens_per_second": 642.54 }, { "epoch": 2.528019925280199, "grad_norm": 0.3831006083561701, "learning_rate": 3.0523624329003324e-06, "loss": 0.12102234363555908, "num_input_tokens_seen": 159766984, "step": 763, "train_runtime": 248623.8932, "train_tokens_per_second": 642.605 }, { "epoch": 2.531340805313408, "grad_norm": 0.38095243187593353, "learning_rate": 3.010985050000567e-06, "loss": 0.12141455709934235, "num_input_tokens_seen": 159976928, "step": 764, "train_runtime": 248916.1976, "train_tokens_per_second": 642.694 }, { "epoch": 2.5346616853466166, "grad_norm": 0.3820106204793304, "learning_rate": 2.969872059331738e-06, "loss": 0.11940959841012955, "num_input_tokens_seen": 160188856, "step": 765, "train_runtime": 249222.535, "train_tokens_per_second": 642.754 }, { "epoch": 2.5379825653798256, "grad_norm": 0.3809055314877438, "learning_rate": 2.929023955229554e-06, "loss": 0.12051350623369217, "num_input_tokens_seen": 160402176, "step": 766, "train_runtime": 249549.002, "train_tokens_per_second": 642.768 }, { "epoch": 2.5413034454130345, "grad_norm": 0.3818534979520062, "learning_rate": 2.8884412288447737e-06, "loss": 0.12026175111532211, "num_input_tokens_seen": 160606848, "step": 767, "train_runtime": 249853.8602, "train_tokens_per_second": 642.803 }, { "epoch": 2.544624325446243, "grad_norm": 0.37738386770721005, "learning_rate": 2.8481243681373004e-06, "loss": 0.11633945256471634, "num_input_tokens_seen": 160814128, "step": 768, "train_runtime": 250147.9208, "train_tokens_per_second": 642.876 }, { "epoch": 2.547945205479452, "grad_norm": 0.38124015117912263, "learning_rate": 2.8080738578703054e-06, "loss": 0.12401984632015228, "num_input_tokens_seen": 161025112, "step": 769, "train_runtime": 250446.8319, "train_tokens_per_second": 642.951 }, { "epoch": 2.551266085512661, "grad_norm": 0.3879825978291705, "learning_rate": 2.7682901796044214e-06, "loss": 0.1298520267009735, "num_input_tokens_seen": 161234864, "step": 770, "train_runtime": 250777.4953, "train_tokens_per_second": 642.94 }, { "epoch": 2.5545869655458695, "grad_norm": 0.37893263116703957, "learning_rate": 2.728773811691923e-06, "loss": 0.12286140024662018, "num_input_tokens_seen": 161445032, "step": 771, "train_runtime": 251103.9195, "train_tokens_per_second": 642.941 }, { "epoch": 2.5579078455790785, "grad_norm": 0.37775431246595376, "learning_rate": 2.689525229270998e-06, "loss": 0.12267804890871048, "num_input_tokens_seen": 161654816, "step": 772, "train_runtime": 251418.9164, "train_tokens_per_second": 642.97 }, { "epoch": 2.5612287256122874, "grad_norm": 0.3816241024847843, "learning_rate": 2.6505449042600244e-06, "loss": 0.11676201224327087, "num_input_tokens_seen": 161856240, "step": 773, "train_runtime": 251722.3584, "train_tokens_per_second": 642.995 }, { "epoch": 2.564549605645496, "grad_norm": 0.3761379893483475, "learning_rate": 2.611833305351899e-06, "loss": 0.12199907749891281, "num_input_tokens_seen": 162066904, "step": 774, "train_runtime": 252038.7623, "train_tokens_per_second": 643.024 }, { "epoch": 2.567870485678705, "grad_norm": 0.37797898685238657, "learning_rate": 2.5733908980083988e-06, "loss": 0.12086299061775208, "num_input_tokens_seen": 162276928, "step": 775, "train_runtime": 252348.9122, "train_tokens_per_second": 643.066 }, { "epoch": 2.571191365711914, "grad_norm": 0.38198264083298034, "learning_rate": 2.535218144454585e-06, "loss": 0.1251925230026245, "num_input_tokens_seen": 162490488, "step": 776, "train_runtime": 252692.1458, "train_tokens_per_second": 643.037 }, { "epoch": 2.5745122457451224, "grad_norm": 0.3837977361688662, "learning_rate": 2.4973155036732534e-06, "loss": 0.1298554539680481, "num_input_tokens_seen": 162700640, "step": 777, "train_runtime": 252997.8271, "train_tokens_per_second": 643.091 }, { "epoch": 2.5778331257783313, "grad_norm": 0.37633802786275006, "learning_rate": 2.4596834313994036e-06, "loss": 0.11935385316610336, "num_input_tokens_seen": 162915456, "step": 778, "train_runtime": 253319.3018, "train_tokens_per_second": 643.123 }, { "epoch": 2.5811540058115403, "grad_norm": 0.38068397396904047, "learning_rate": 2.422322380114772e-06, "loss": 0.12721630930900574, "num_input_tokens_seen": 163125664, "step": 779, "train_runtime": 253607.2157, "train_tokens_per_second": 643.222 }, { "epoch": 2.584474885844749, "grad_norm": 0.3883831201052826, "learning_rate": 2.3852327990423666e-06, "loss": 0.12535764276981354, "num_input_tokens_seen": 163337744, "step": 780, "train_runtime": 253922.3171, "train_tokens_per_second": 643.259 }, { "epoch": 2.587795765877958, "grad_norm": 0.37246481894221606, "learning_rate": 2.348415134141102e-06, "loss": 0.12309374660253525, "num_input_tokens_seen": 163551400, "step": 781, "train_runtime": 254245.5336, "train_tokens_per_second": 643.281 }, { "epoch": 2.5911166459111663, "grad_norm": 0.3780640808040786, "learning_rate": 2.3118698281004103e-06, "loss": 0.11573754251003265, "num_input_tokens_seen": 163759232, "step": 782, "train_runtime": 254553.7091, "train_tokens_per_second": 643.319 }, { "epoch": 2.5944375259443753, "grad_norm": 0.3780983234687358, "learning_rate": 2.2755973203349294e-06, "loss": 0.12840723991394043, "num_input_tokens_seen": 163973024, "step": 783, "train_runtime": 254864.2086, "train_tokens_per_second": 643.374 }, { "epoch": 2.597758405977584, "grad_norm": 0.398722658066234, "learning_rate": 2.2395980469792103e-06, "loss": 0.12373904883861542, "num_input_tokens_seen": 164182328, "step": 784, "train_runtime": 255154.6677, "train_tokens_per_second": 643.462 }, { "epoch": 2.6010792860107927, "grad_norm": 0.37204134570659986, "learning_rate": 2.2038724408824844e-06, "loss": 0.1140456348657608, "num_input_tokens_seen": 164388728, "step": 785, "train_runtime": 255469.3135, "train_tokens_per_second": 643.477 }, { "epoch": 2.6044001660440017, "grad_norm": 0.3846466576670569, "learning_rate": 2.168420931603457e-06, "loss": 0.12779146432876587, "num_input_tokens_seen": 164602312, "step": 786, "train_runtime": 255810.0272, "train_tokens_per_second": 643.455 }, { "epoch": 2.6077210460772102, "grad_norm": 0.3669818289085315, "learning_rate": 2.133243945405128e-06, "loss": 0.11502551287412643, "num_input_tokens_seen": 164808200, "step": 787, "train_runtime": 256136.5418, "train_tokens_per_second": 643.439 }, { "epoch": 2.611041926110419, "grad_norm": 0.374035435991226, "learning_rate": 2.09834190524969e-06, "loss": 0.12242258340120316, "num_input_tokens_seen": 165020992, "step": 788, "train_runtime": 256455.5258, "train_tokens_per_second": 643.468 }, { "epoch": 2.614362806143628, "grad_norm": 0.37506719885961093, "learning_rate": 2.0637152307934228e-06, "loss": 0.12721297144889832, "num_input_tokens_seen": 165236832, "step": 789, "train_runtime": 256764.1185, "train_tokens_per_second": 643.536 }, { "epoch": 2.6176836861768367, "grad_norm": 0.37848057234830684, "learning_rate": 2.029364338381656e-06, "loss": 0.11729756742715836, "num_input_tokens_seen": 165444256, "step": 790, "train_runtime": 257094.4823, "train_tokens_per_second": 643.515 }, { "epoch": 2.6210045662100456, "grad_norm": 0.3974053960179861, "learning_rate": 1.995289641043768e-06, "loss": 0.12569686770439148, "num_input_tokens_seen": 165651408, "step": 791, "train_runtime": 257418.6324, "train_tokens_per_second": 643.51 }, { "epoch": 2.6243254462432546, "grad_norm": 0.3910512583691644, "learning_rate": 1.961491548488206e-06, "loss": 0.12304438650608063, "num_input_tokens_seen": 165858696, "step": 792, "train_runtime": 257732.4806, "train_tokens_per_second": 643.53 }, { "epoch": 2.627646326276463, "grad_norm": 0.39099842405310353, "learning_rate": 1.927970467097573e-06, "loss": 0.11656749248504639, "num_input_tokens_seen": 166067192, "step": 793, "train_runtime": 258044.6094, "train_tokens_per_second": 643.56 }, { "epoch": 2.630967206309672, "grad_norm": 0.3678961066797767, "learning_rate": 1.8947267999237322e-06, "loss": 0.11832045018672943, "num_input_tokens_seen": 166279640, "step": 794, "train_runtime": 258382.3925, "train_tokens_per_second": 643.541 }, { "epoch": 2.634288086342881, "grad_norm": 0.37998381028538614, "learning_rate": 1.8617609466829666e-06, "loss": 0.12332157790660858, "num_input_tokens_seen": 166490368, "step": 795, "train_runtime": 258720.6376, "train_tokens_per_second": 643.514 }, { "epoch": 2.6376089663760895, "grad_norm": 0.4022119994629186, "learning_rate": 1.8290733037511721e-06, "loss": 0.12114399671554565, "num_input_tokens_seen": 166702400, "step": 796, "train_runtime": 259073.7591, "train_tokens_per_second": 643.455 }, { "epoch": 2.6409298464092985, "grad_norm": 0.37503957769032703, "learning_rate": 1.7966642641590925e-06, "loss": 0.12398452311754227, "num_input_tokens_seen": 166915872, "step": 797, "train_runtime": 259419.9217, "train_tokens_per_second": 643.42 }, { "epoch": 2.6442507264425075, "grad_norm": 0.3832700653185944, "learning_rate": 1.764534217587585e-06, "loss": 0.11987572908401489, "num_input_tokens_seen": 167125712, "step": 798, "train_runtime": 259745.7117, "train_tokens_per_second": 643.42 }, { "epoch": 2.647571606475716, "grad_norm": 0.38117271003727243, "learning_rate": 1.732683550362954e-06, "loss": 0.12342026829719543, "num_input_tokens_seen": 167336384, "step": 799, "train_runtime": 260074.488, "train_tokens_per_second": 643.417 }, { "epoch": 2.650892486508925, "grad_norm": 0.3887406817589898, "learning_rate": 1.7011126454522713e-06, "loss": 0.12326280027627945, "num_input_tokens_seen": 167544792, "step": 800, "train_runtime": 260436.2491, "train_tokens_per_second": 643.324 }, { "epoch": 2.654213366542134, "grad_norm": 0.3749105634820341, "learning_rate": 1.6698218824588164e-06, "loss": 0.12250946462154388, "num_input_tokens_seen": 167758024, "step": 801, "train_runtime": 260770.086, "train_tokens_per_second": 643.318 }, { "epoch": 2.6575342465753424, "grad_norm": 0.39317783331974365, "learning_rate": 1.6388116376174767e-06, "loss": 0.1269228309392929, "num_input_tokens_seen": 167964408, "step": 802, "train_runtime": 261105.172, "train_tokens_per_second": 643.283 }, { "epoch": 2.6608551266085514, "grad_norm": 0.38895355749016014, "learning_rate": 1.6080822837902382e-06, "loss": 0.12384317815303802, "num_input_tokens_seen": 168177256, "step": 803, "train_runtime": 261440.3109, "train_tokens_per_second": 643.272 }, { "epoch": 2.66417600664176, "grad_norm": 0.3762199653431181, "learning_rate": 1.5776341904617048e-06, "loss": 0.12542122602462769, "num_input_tokens_seen": 168391888, "step": 804, "train_runtime": 261782.4611, "train_tokens_per_second": 643.251 }, { "epoch": 2.667496886674969, "grad_norm": 0.37616228100926163, "learning_rate": 1.5474677237346468e-06, "loss": 0.11933274567127228, "num_input_tokens_seen": 168596424, "step": 805, "train_runtime": 262087.3914, "train_tokens_per_second": 643.283 }, { "epoch": 2.670817766708178, "grad_norm": 0.3848907404944112, "learning_rate": 1.5175832463256046e-06, "loss": 0.11670228838920593, "num_input_tokens_seen": 168798304, "step": 806, "train_runtime": 262413.5078, "train_tokens_per_second": 643.253 }, { "epoch": 2.6741386467413863, "grad_norm": 0.37384846458458704, "learning_rate": 1.4879811175605302e-06, "loss": 0.11482670903205872, "num_input_tokens_seen": 169005944, "step": 807, "train_runtime": 262742.3693, "train_tokens_per_second": 643.238 }, { "epoch": 2.6774595267745953, "grad_norm": 0.37922673410415875, "learning_rate": 1.4586616933704528e-06, "loss": 0.12115344405174255, "num_input_tokens_seen": 169216600, "step": 808, "train_runtime": 263057.2049, "train_tokens_per_second": 643.269 }, { "epoch": 2.680780406807804, "grad_norm": 0.36829129070866046, "learning_rate": 1.4296253262872234e-06, "loss": 0.11893177777528763, "num_input_tokens_seen": 169428032, "step": 809, "train_runtime": 263369.4454, "train_tokens_per_second": 643.309 }, { "epoch": 2.684101286841013, "grad_norm": 0.38937987566784266, "learning_rate": 1.400872365439246e-06, "loss": 0.13020320236682892, "num_input_tokens_seen": 169641760, "step": 810, "train_runtime": 263709.8084, "train_tokens_per_second": 643.29 }, { "epoch": 2.6874221668742218, "grad_norm": 0.3778192104859468, "learning_rate": 1.3724031565473112e-06, "loss": 0.11801673471927643, "num_input_tokens_seen": 169845856, "step": 811, "train_runtime": 264013.551, "train_tokens_per_second": 643.322 }, { "epoch": 2.6907430469074303, "grad_norm": 0.38303293665989685, "learning_rate": 1.3442180419204088e-06, "loss": 0.11800455302000046, "num_input_tokens_seen": 170055608, "step": 812, "train_runtime": 264305.8298, "train_tokens_per_second": 643.405 }, { "epoch": 2.6940639269406392, "grad_norm": 0.3806235292546312, "learning_rate": 1.316317360451641e-06, "loss": 0.116279236972332, "num_input_tokens_seen": 170258376, "step": 813, "train_runtime": 264628.013, "train_tokens_per_second": 643.388 }, { "epoch": 2.697384806973848, "grad_norm": 0.3859505179240658, "learning_rate": 1.2887014476141212e-06, "loss": 0.12136112153530121, "num_input_tokens_seen": 170470528, "step": 814, "train_runtime": 264948.9992, "train_tokens_per_second": 643.409 }, { "epoch": 2.7007056870070567, "grad_norm": 0.3690322253571553, "learning_rate": 1.2613706354569571e-06, "loss": 0.1223984807729721, "num_input_tokens_seen": 170684152, "step": 815, "train_runtime": 265316.015, "train_tokens_per_second": 643.324 }, { "epoch": 2.7040265670402657, "grad_norm": 0.3737250692364351, "learning_rate": 1.2343252526012595e-06, "loss": 0.11964330077171326, "num_input_tokens_seen": 170899184, "step": 816, "train_runtime": 265659.6826, "train_tokens_per_second": 643.301 }, { "epoch": 2.7073474470734746, "grad_norm": 0.37397915484906136, "learning_rate": 1.2075656242361732e-06, "loss": 0.12065157294273376, "num_input_tokens_seen": 171112184, "step": 817, "train_runtime": 266015.948, "train_tokens_per_second": 643.24 }, { "epoch": 2.710668327106683, "grad_norm": 0.3768272052276891, "learning_rate": 1.1810920721149892e-06, "loss": 0.12584027647972107, "num_input_tokens_seen": 171323576, "step": 818, "train_runtime": 266334.6943, "train_tokens_per_second": 643.264 }, { "epoch": 2.713989207139892, "grad_norm": 0.37788381138262384, "learning_rate": 1.1549049145512636e-06, "loss": 0.11420264095067978, "num_input_tokens_seen": 171533128, "step": 819, "train_runtime": 266618.9264, "train_tokens_per_second": 643.364 }, { "epoch": 2.717310087173101, "grad_norm": 0.3768263228275564, "learning_rate": 1.1290044664149873e-06, "loss": 0.13194763660430908, "num_input_tokens_seen": 171750904, "step": 820, "train_runtime": 266946.5296, "train_tokens_per_second": 643.391 }, { "epoch": 2.7206309672063096, "grad_norm": 0.3838344698675426, "learning_rate": 1.1033910391288065e-06, "loss": 0.12018980830907822, "num_input_tokens_seen": 171964648, "step": 821, "train_runtime": 267264.0903, "train_tokens_per_second": 643.426 }, { "epoch": 2.7239518472395186, "grad_norm": 0.3715297687539512, "learning_rate": 1.0780649406642862e-06, "loss": 0.12106224149465561, "num_input_tokens_seen": 172174992, "step": 822, "train_runtime": 267628.9299, "train_tokens_per_second": 643.335 }, { "epoch": 2.7272727272727275, "grad_norm": 0.4817970575133575, "learning_rate": 1.0530264755381824e-06, "loss": 0.11653967201709747, "num_input_tokens_seen": 172383840, "step": 823, "train_runtime": 267946.1176, "train_tokens_per_second": 643.353 }, { "epoch": 2.730593607305936, "grad_norm": 0.3851252129344082, "learning_rate": 1.028275944808807e-06, "loss": 0.1209145188331604, "num_input_tokens_seen": 172590416, "step": 824, "train_runtime": 268280.1539, "train_tokens_per_second": 643.322 }, { "epoch": 2.733914487339145, "grad_norm": 0.3786466783516994, "learning_rate": 1.0038136460723963e-06, "loss": 0.12629863619804382, "num_input_tokens_seen": 172802240, "step": 825, "train_runtime": 268619.8662, "train_tokens_per_second": 643.297 }, { "epoch": 2.7372353673723535, "grad_norm": 0.37888122405205993, "learning_rate": 9.796398734595285e-07, "loss": 0.12389196455478668, "num_input_tokens_seen": 173013584, "step": 826, "train_runtime": 268959.9941, "train_tokens_per_second": 643.269 }, { "epoch": 2.7405562474055625, "grad_norm": 0.3662854153805343, "learning_rate": 9.557549176315934e-07, "loss": 0.12121491134166718, "num_input_tokens_seen": 173223384, "step": 827, "train_runtime": 269292.9472, "train_tokens_per_second": 643.253 }, { "epoch": 2.7438771274387714, "grad_norm": 0.37716364843882577, "learning_rate": 9.321590657772994e-07, "loss": 0.12518440186977386, "num_input_tokens_seen": 173439368, "step": 828, "train_runtime": 269690.958, "train_tokens_per_second": 643.104 }, { "epoch": 2.74719800747198, "grad_norm": 0.3739207691575995, "learning_rate": 9.088526016092142e-07, "loss": 0.11913235485553741, "num_input_tokens_seen": 173648176, "step": 829, "train_runtime": 270041.6301, "train_tokens_per_second": 643.042 }, { "epoch": 2.750518887505189, "grad_norm": 0.3780806166977324, "learning_rate": 8.858358053603577e-07, "loss": 0.12754884362220764, "num_input_tokens_seen": 173859512, "step": 830, "train_runtime": 270368.9765, "train_tokens_per_second": 643.045 }, { "epoch": 2.7538397675383974, "grad_norm": 0.3761673445534249, "learning_rate": 8.631089537808307e-07, "loss": 0.12187932431697845, "num_input_tokens_seen": 174070168, "step": 831, "train_runtime": 270714.4492, "train_tokens_per_second": 643.003 }, { "epoch": 2.7571606475716064, "grad_norm": 0.36816872817617813, "learning_rate": 8.406723201344891e-07, "loss": 0.11619506031274796, "num_input_tokens_seen": 174277352, "step": 832, "train_runtime": 271052.0905, "train_tokens_per_second": 642.966 }, { "epoch": 2.7604815276048154, "grad_norm": 0.37723725172863876, "learning_rate": 8.185261741956551e-07, "loss": 0.12683464586734772, "num_input_tokens_seen": 174490400, "step": 833, "train_runtime": 271415.6896, "train_tokens_per_second": 642.89 }, { "epoch": 2.763802407638024, "grad_norm": 0.3584805657679756, "learning_rate": 7.966707822458758e-07, "loss": 0.11898402124643326, "num_input_tokens_seen": 174700432, "step": 834, "train_runtime": 271739.2046, "train_tokens_per_second": 642.897 }, { "epoch": 2.767123287671233, "grad_norm": 0.36969010980847744, "learning_rate": 7.751064070707248e-07, "loss": 0.11298386752605438, "num_input_tokens_seen": 174903336, "step": 835, "train_runtime": 272050.3906, "train_tokens_per_second": 642.908 }, { "epoch": 2.770444167704442, "grad_norm": 0.3738476878581298, "learning_rate": 7.538333079566306e-07, "loss": 0.11921288073062897, "num_input_tokens_seen": 175117688, "step": 836, "train_runtime": 272412.3647, "train_tokens_per_second": 642.84 }, { "epoch": 2.7737650477376503, "grad_norm": 0.3708503877713552, "learning_rate": 7.328517406877761e-07, "loss": 0.12302444875240326, "num_input_tokens_seen": 175328400, "step": 837, "train_runtime": 272760.4588, "train_tokens_per_second": 642.793 }, { "epoch": 2.7770859277708593, "grad_norm": 0.37931973420766374, "learning_rate": 7.12161957543006e-07, "loss": 0.11951660364866257, "num_input_tokens_seen": 175535848, "step": 838, "train_runtime": 273100.1278, "train_tokens_per_second": 642.753 }, { "epoch": 2.7804068078040682, "grad_norm": 0.3732659798735363, "learning_rate": 6.917642072928137e-07, "loss": 0.12080246210098267, "num_input_tokens_seen": 175743512, "step": 839, "train_runtime": 273456.9757, "train_tokens_per_second": 642.673 }, { "epoch": 2.7837276878372768, "grad_norm": 0.37811079266235353, "learning_rate": 6.716587351963205e-07, "loss": 0.12245343625545502, "num_input_tokens_seen": 175952928, "step": 840, "train_runtime": 273770.6463, "train_tokens_per_second": 642.702 }, { "epoch": 2.7870485678704857, "grad_norm": 0.37450919497986357, "learning_rate": 6.51845782998356e-07, "loss": 0.11698979139328003, "num_input_tokens_seen": 176162864, "step": 841, "train_runtime": 274117.2359, "train_tokens_per_second": 642.655 }, { "epoch": 2.7903694479036947, "grad_norm": 0.3805917771521276, "learning_rate": 6.323255889265411e-07, "loss": 0.12792760133743286, "num_input_tokens_seen": 176382440, "step": 842, "train_runtime": 274465.1685, "train_tokens_per_second": 642.641 }, { "epoch": 2.793690327936903, "grad_norm": 0.36951463553404074, "learning_rate": 6.130983876884155e-07, "loss": 0.11449296772480011, "num_input_tokens_seen": 176590168, "step": 843, "train_runtime": 274812.1285, "train_tokens_per_second": 642.585 }, { "epoch": 2.797011207970112, "grad_norm": 0.38727902122020164, "learning_rate": 5.941644104686256e-07, "loss": 0.12309432774782181, "num_input_tokens_seen": 176798448, "step": 844, "train_runtime": 275147.7901, "train_tokens_per_second": 642.558 }, { "epoch": 2.800332088003321, "grad_norm": 0.37942923215694935, "learning_rate": 5.755238849261407e-07, "loss": 0.11892662942409515, "num_input_tokens_seen": 177008096, "step": 845, "train_runtime": 275481.6651, "train_tokens_per_second": 642.54 }, { "epoch": 2.8036529680365296, "grad_norm": 0.3674843004191288, "learning_rate": 5.571770351915168e-07, "loss": 0.11739193648099899, "num_input_tokens_seen": 177215368, "step": 846, "train_runtime": 275837.1482, "train_tokens_per_second": 642.464 }, { "epoch": 2.8069738480697386, "grad_norm": 0.38177015302178785, "learning_rate": 5.391240818642007e-07, "loss": 0.1246020719408989, "num_input_tokens_seen": 177425480, "step": 847, "train_runtime": 276190.5783, "train_tokens_per_second": 642.402 }, { "epoch": 2.810294728102947, "grad_norm": 0.3858682778862552, "learning_rate": 5.213652420098747e-07, "loss": 0.12035591900348663, "num_input_tokens_seen": 177638424, "step": 848, "train_runtime": 276533.366, "train_tokens_per_second": 642.376 }, { "epoch": 2.813615608136156, "grad_norm": 0.38543662263093287, "learning_rate": 5.039007291578579e-07, "loss": 0.1129733994603157, "num_input_tokens_seen": 177841296, "step": 849, "train_runtime": 276867.0795, "train_tokens_per_second": 642.335 }, { "epoch": 2.816936488169365, "grad_norm": 0.38652822765625927, "learning_rate": 4.867307532985227e-07, "loss": 0.11389519274234772, "num_input_tokens_seen": 178040808, "step": 850, "train_runtime": 277204.7315, "train_tokens_per_second": 642.272 }, { "epoch": 2.8202573682025736, "grad_norm": 0.37985650756501366, "learning_rate": 4.698555208807853e-07, "loss": 0.12554343044757843, "num_input_tokens_seen": 178250240, "step": 851, "train_runtime": 277515.4462, "train_tokens_per_second": 642.307 }, { "epoch": 2.8235782482357825, "grad_norm": 0.3931191923614686, "learning_rate": 4.532752348096081e-07, "loss": 0.12580470740795135, "num_input_tokens_seen": 178465496, "step": 852, "train_runtime": 277877.5496, "train_tokens_per_second": 642.245 }, { "epoch": 2.826899128268991, "grad_norm": 0.389269340602751, "learning_rate": 4.3699009444357344e-07, "loss": 0.1148395836353302, "num_input_tokens_seen": 178673616, "step": 853, "train_runtime": 278212.0255, "train_tokens_per_second": 642.221 }, { "epoch": 2.8302200083022, "grad_norm": 0.3780122336985213, "learning_rate": 4.21000295592483e-07, "loss": 0.12070584297180176, "num_input_tokens_seen": 178882256, "step": 854, "train_runtime": 278513.2741, "train_tokens_per_second": 642.276 }, { "epoch": 2.833540888335409, "grad_norm": 0.37458087834835657, "learning_rate": 4.0530603051499584e-07, "loss": 0.1180640459060669, "num_input_tokens_seen": 179086976, "step": 855, "train_runtime": 278807.2705, "train_tokens_per_second": 642.333 }, { "epoch": 2.8368617683686175, "grad_norm": 0.3811454228805412, "learning_rate": 3.899074879163245e-07, "loss": 0.12119519710540771, "num_input_tokens_seen": 179295696, "step": 856, "train_runtime": 279128.4813, "train_tokens_per_second": 642.341 }, { "epoch": 2.8401826484018264, "grad_norm": 0.3817262211789762, "learning_rate": 3.7480485294596747e-07, "loss": 0.11769559979438782, "num_input_tokens_seen": 179504888, "step": 857, "train_runtime": 279476.2018, "train_tokens_per_second": 642.29 }, { "epoch": 2.8435035284350354, "grad_norm": 0.3773108218391704, "learning_rate": 3.599983071954777e-07, "loss": 0.11247526109218597, "num_input_tokens_seen": 179711696, "step": 858, "train_runtime": 279796.0726, "train_tokens_per_second": 642.295 }, { "epoch": 2.846824408468244, "grad_norm": 0.37573996315361424, "learning_rate": 3.454880286962781e-07, "loss": 0.12671740353107452, "num_input_tokens_seen": 179923648, "step": 859, "train_runtime": 280105.4842, "train_tokens_per_second": 642.342 }, { "epoch": 2.850145288501453, "grad_norm": 0.37564838490896857, "learning_rate": 3.312741919175244e-07, "loss": 0.11756633222103119, "num_input_tokens_seen": 180133392, "step": 860, "train_runtime": 280414.6601, "train_tokens_per_second": 642.382 }, { "epoch": 2.853466168534662, "grad_norm": 0.380203256131493, "learning_rate": 3.1735696776400703e-07, "loss": 0.11647183448076248, "num_input_tokens_seen": 180341328, "step": 861, "train_runtime": 280728.457, "train_tokens_per_second": 642.405 }, { "epoch": 2.8567870485678704, "grad_norm": 0.36478331921990703, "learning_rate": 3.0373652357410245e-07, "loss": 0.11633309721946716, "num_input_tokens_seen": 180550952, "step": 862, "train_runtime": 281022.4429, "train_tokens_per_second": 642.479 }, { "epoch": 2.8601079286010793, "grad_norm": 0.3715153159050038, "learning_rate": 2.9041302311774433e-07, "loss": 0.1253267228603363, "num_input_tokens_seen": 180764624, "step": 863, "train_runtime": 281334.5424, "train_tokens_per_second": 642.526 }, { "epoch": 2.8634288086342883, "grad_norm": 0.4083404487490612, "learning_rate": 2.7738662659447246e-07, "loss": 0.12318548560142517, "num_input_tokens_seen": 180971592, "step": 864, "train_runtime": 281637.4136, "train_tokens_per_second": 642.569 }, { "epoch": 2.866749688667497, "grad_norm": 0.3820803506083772, "learning_rate": 2.646574906314925e-07, "loss": 0.12904170155525208, "num_input_tokens_seen": 181183608, "step": 865, "train_runtime": 281951.3454, "train_tokens_per_second": 642.606 }, { "epoch": 2.8700705687007058, "grad_norm": 0.3670874614999289, "learning_rate": 2.52225768281808e-07, "loss": 0.11389099806547165, "num_input_tokens_seen": 181392224, "step": 866, "train_runtime": 282281.1335, "train_tokens_per_second": 642.594 }, { "epoch": 2.8733914487339147, "grad_norm": 0.3776067986939545, "learning_rate": 2.4009160902235816e-07, "loss": 0.11891908943653107, "num_input_tokens_seen": 181604784, "step": 867, "train_runtime": 282606.1357, "train_tokens_per_second": 642.607 }, { "epoch": 2.8767123287671232, "grad_norm": 0.3792674492414469, "learning_rate": 2.2825515875224413e-07, "loss": 0.12021469324827194, "num_input_tokens_seen": 181809728, "step": 868, "train_runtime": 282942.8893, "train_tokens_per_second": 642.567 }, { "epoch": 2.880033208800332, "grad_norm": 0.3899172186074756, "learning_rate": 2.1671655979096094e-07, "loss": 0.12051525712013245, "num_input_tokens_seen": 182019896, "step": 869, "train_runtime": 283265.0827, "train_tokens_per_second": 642.578 }, { "epoch": 2.883354088833541, "grad_norm": 0.3650592049957568, "learning_rate": 2.0547595087668793e-07, "loss": 0.11526565253734589, "num_input_tokens_seen": 182227944, "step": 870, "train_runtime": 283553.2841, "train_tokens_per_second": 642.659 }, { "epoch": 2.8866749688667497, "grad_norm": 0.380677134312214, "learning_rate": 1.9453346716462317e-07, "loss": 0.1231502816081047, "num_input_tokens_seen": 182438944, "step": 871, "train_runtime": 283840.0573, "train_tokens_per_second": 642.753 }, { "epoch": 2.8899958488999586, "grad_norm": 0.37686009927356995, "learning_rate": 1.8388924022535725e-07, "loss": 0.12128575146198273, "num_input_tokens_seen": 182648344, "step": 872, "train_runtime": 284155.3849, "train_tokens_per_second": 642.776 }, { "epoch": 2.893316728933167, "grad_norm": 0.3832841605659196, "learning_rate": 1.73543398043291e-07, "loss": 0.1284339725971222, "num_input_tokens_seen": 182861192, "step": 873, "train_runtime": 284449.2366, "train_tokens_per_second": 642.861 }, { "epoch": 2.896637608966376, "grad_norm": 0.37979851344229926, "learning_rate": 1.6349606501509796e-07, "loss": 0.12369494140148163, "num_input_tokens_seen": 183070160, "step": 874, "train_runtime": 284745.4663, "train_tokens_per_second": 642.926 }, { "epoch": 2.8999584889995846, "grad_norm": 0.3744464470603893, "learning_rate": 1.537473619482227e-07, "loss": 0.12231475114822388, "num_input_tokens_seen": 183285888, "step": 875, "train_runtime": 285055.7824, "train_tokens_per_second": 642.983 }, { "epoch": 2.9032793690327936, "grad_norm": 0.36576186322728294, "learning_rate": 1.4429740605944319e-07, "loss": 0.11586017906665802, "num_input_tokens_seen": 183501752, "step": 876, "train_runtime": 285389.2525, "train_tokens_per_second": 642.988 }, { "epoch": 2.9066002490660026, "grad_norm": 0.3982862887281528, "learning_rate": 1.351463109734441e-07, "loss": 0.12425873428583145, "num_input_tokens_seen": 183713168, "step": 877, "train_runtime": 285692.9987, "train_tokens_per_second": 643.044 }, { "epoch": 2.909921129099211, "grad_norm": 0.380411294532791, "learning_rate": 1.2629418672145676e-07, "loss": 0.11696077883243561, "num_input_tokens_seen": 183921672, "step": 878, "train_runtime": 285966.8476, "train_tokens_per_second": 643.157 }, { "epoch": 2.91324200913242, "grad_norm": 0.37164689352767816, "learning_rate": 1.1774113973994083e-07, "loss": 0.11529619246721268, "num_input_tokens_seen": 184126072, "step": 879, "train_runtime": 286287.6169, "train_tokens_per_second": 643.151 }, { "epoch": 2.916562889165629, "grad_norm": 0.3812826561028813, "learning_rate": 1.0948727286930193e-07, "loss": 0.1224733367562294, "num_input_tokens_seen": 184334184, "step": 880, "train_runtime": 286617.6832, "train_tokens_per_second": 643.136 }, { "epoch": 2.9198837691988375, "grad_norm": 0.38640902985482734, "learning_rate": 1.0153268535264827e-07, "loss": 0.12352342903614044, "num_input_tokens_seen": 184543808, "step": 881, "train_runtime": 286917.2494, "train_tokens_per_second": 643.195 }, { "epoch": 2.9232046492320465, "grad_norm": 0.37702629173439284, "learning_rate": 9.387747283460813e-08, "loss": 0.12213212251663208, "num_input_tokens_seen": 184752816, "step": 882, "train_runtime": 287230.8336, "train_tokens_per_second": 643.221 }, { "epoch": 2.9265255292652554, "grad_norm": 0.37222373733875147, "learning_rate": 8.652172736017816e-08, "loss": 0.11964373290538788, "num_input_tokens_seen": 184962096, "step": 883, "train_runtime": 287553.6374, "train_tokens_per_second": 643.226 }, { "epoch": 2.929846409298464, "grad_norm": 0.3832022203789427, "learning_rate": 7.946553737360473e-08, "loss": 0.11658230423927307, "num_input_tokens_seen": 185175696, "step": 884, "train_runtime": 287869.0282, "train_tokens_per_second": 643.264 }, { "epoch": 2.933167289331673, "grad_norm": 0.3748779576654735, "learning_rate": 7.270898771733481e-08, "loss": 0.11535081267356873, "num_input_tokens_seen": 185374744, "step": 885, "train_runtime": 288159.9146, "train_tokens_per_second": 643.305 }, { "epoch": 2.936488169364882, "grad_norm": 0.38401853062747954, "learning_rate": 6.625215963098896e-08, "loss": 0.1184299886226654, "num_input_tokens_seen": 185582624, "step": 886, "train_runtime": 288447.0187, "train_tokens_per_second": 643.385 }, { "epoch": 2.9398090493980904, "grad_norm": 0.37872747796048184, "learning_rate": 6.00951307503872e-08, "loss": 0.11564740538597107, "num_input_tokens_seen": 185788368, "step": 887, "train_runtime": 288761.2049, "train_tokens_per_second": 643.398 }, { "epoch": 2.9431299294312994, "grad_norm": 0.37703441267775073, "learning_rate": 5.423797510661355e-08, "loss": 0.11729012429714203, "num_input_tokens_seen": 185995472, "step": 888, "train_runtime": 289090.5336, "train_tokens_per_second": 643.381 }, { "epoch": 2.9464508094645083, "grad_norm": 0.3832991078113174, "learning_rate": 4.868076312512515e-08, "loss": 0.11568398028612137, "num_input_tokens_seen": 186199944, "step": 889, "train_runtime": 289377.6641, "train_tokens_per_second": 643.45 }, { "epoch": 2.949771689497717, "grad_norm": 0.3843104989739763, "learning_rate": 4.3423561624905685e-08, "loss": 0.12462468445301056, "num_input_tokens_seen": 186413312, "step": 890, "train_runtime": 289689.6951, "train_tokens_per_second": 643.493 }, { "epoch": 2.953092569530926, "grad_norm": 0.37778929291230706, "learning_rate": 3.846643381766879e-08, "loss": 0.13031190633773804, "num_input_tokens_seen": 186635016, "step": 891, "train_runtime": 289992.6586, "train_tokens_per_second": 643.585 }, { "epoch": 2.9564134495641348, "grad_norm": 0.37896035668084116, "learning_rate": 3.380943930708647e-08, "loss": 0.12338971346616745, "num_input_tokens_seen": 186845512, "step": 892, "train_runtime": 290284.9789, "train_tokens_per_second": 643.662 }, { "epoch": 2.9597343295973433, "grad_norm": 0.3705851780484622, "learning_rate": 2.94526340880813e-08, "loss": 0.11317215859889984, "num_input_tokens_seen": 187051624, "step": 893, "train_runtime": 290594.6503, "train_tokens_per_second": 643.686 }, { "epoch": 2.9630552096305522, "grad_norm": 0.37906235095118035, "learning_rate": 2.5396070546146456e-08, "loss": 0.11464998126029968, "num_input_tokens_seen": 187255784, "step": 894, "train_runtime": 290904.0501, "train_tokens_per_second": 643.703 }, { "epoch": 2.9663760896637608, "grad_norm": 0.37172240687689206, "learning_rate": 2.1639797456723952e-08, "loss": 0.12109000980854034, "num_input_tokens_seen": 187466024, "step": 895, "train_runtime": 291220.0417, "train_tokens_per_second": 643.726 }, { "epoch": 2.9696969696969697, "grad_norm": 0.3768243904390163, "learning_rate": 1.8183859984613472e-08, "loss": 0.12020470201969147, "num_input_tokens_seen": 187677224, "step": 896, "train_runtime": 291549.0344, "train_tokens_per_second": 643.724 }, { "epoch": 2.9730178497301782, "grad_norm": 0.3711767343293872, "learning_rate": 1.5028299683425562e-08, "loss": 0.11873486638069153, "num_input_tokens_seen": 187886168, "step": 897, "train_runtime": 291878.9683, "train_tokens_per_second": 643.713 }, { "epoch": 2.976338729763387, "grad_norm": 0.3720897537325746, "learning_rate": 1.2173154495087601e-08, "loss": 0.12158137559890747, "num_input_tokens_seen": 188096304, "step": 898, "train_runtime": 292221.8252, "train_tokens_per_second": 643.676 }, { "epoch": 2.979659609796596, "grad_norm": 0.37190934057970604, "learning_rate": 9.618458749391379e-09, "loss": 0.11632666736841202, "num_input_tokens_seen": 188303944, "step": 899, "train_runtime": 292558.7274, "train_tokens_per_second": 643.645 }, { "epoch": 2.9829804898298047, "grad_norm": 0.3635397987099627, "learning_rate": 7.364243163568429e-09, "loss": 0.11789841949939728, "num_input_tokens_seen": 188514200, "step": 900, "train_runtime": 292889.7382, "train_tokens_per_second": 643.635 }, { "epoch": 2.9863013698630136, "grad_norm": 0.3811428814864964, "learning_rate": 5.41053484192644e-09, "loss": 0.12132750451564789, "num_input_tokens_seen": 188726056, "step": 901, "train_runtime": 293202.5884, "train_tokens_per_second": 643.671 }, { "epoch": 2.9896222498962226, "grad_norm": 0.3741259552370783, "learning_rate": 3.75735727553006e-09, "loss": 0.12210451066493988, "num_input_tokens_seen": 188936616, "step": 902, "train_runtime": 293530.0096, "train_tokens_per_second": 643.671 }, { "epoch": 2.992943129929431, "grad_norm": 0.3792021175055764, "learning_rate": 2.4047303419094667e-09, "loss": 0.11642736196517944, "num_input_tokens_seen": 189139152, "step": 903, "train_runtime": 293880.7386, "train_tokens_per_second": 643.592 }, { "epoch": 2.99626400996264, "grad_norm": 0.38600684674504493, "learning_rate": 1.3526703048216682e-09, "loss": 0.11877848953008652, "num_input_tokens_seen": 189350544, "step": 904, "train_runtime": 294226.5957, "train_tokens_per_second": 643.553 }, { "epoch": 2.999584889995849, "grad_norm": 0.3792222507738102, "learning_rate": 6.011898140617645e-10, "loss": 0.12198566645383835, "num_input_tokens_seen": 189559288, "step": 905, "train_runtime": 294549.3656, "train_tokens_per_second": 643.557 }, { "epoch": 3.0, "grad_norm": 0.8756700717055261, "learning_rate": 1.5029790530474152e-10, "loss": 0.07718770951032639, "num_input_tokens_seen": 189583256, "step": 906, "train_runtime": 294593.9715, "train_tokens_per_second": 643.541 }, { "epoch": 3.0, "num_input_tokens_seen": 189583256, "step": 906, "total_flos": 573622955442176.0, "train_loss": 0.2829558104687837, "train_runtime": 294627.2072, "train_samples_per_second": 0.392, "train_steps_per_second": 0.003 } ], "logging_steps": 1, "max_steps": 906, "num_input_tokens_seen": 189583256, "num_train_epochs": 3, "save_steps": 2500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 573622955442176.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }