{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 4642, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004309416074121957, "grad_norm": 1.1464484255552978, "learning_rate": 1.2857142857142856e-06, "loss": 0.8585, "mean_token_accuracy": 0.7816174656152726, "num_tokens": 5200953.0, "step": 10 }, { "epoch": 0.008618832148243914, "grad_norm": 0.6835364346922225, "learning_rate": 2.7142857142857144e-06, "loss": 0.8632, "mean_token_accuracy": 0.7805173248052597, "num_tokens": 10439080.0, "step": 20 }, { "epoch": 0.012928248222365869, "grad_norm": 0.6927535106902879, "learning_rate": 4.1428571428571435e-06, "loss": 0.8458, "mean_token_accuracy": 0.7842860609292984, "num_tokens": 15660494.0, "step": 30 }, { "epoch": 0.017237664296487827, "grad_norm": 1.07396885718481, "learning_rate": 5.571428571428572e-06, "loss": 0.8126, "mean_token_accuracy": 0.7898437529802322, "num_tokens": 20874692.0, "step": 40 }, { "epoch": 0.021547080370609782, "grad_norm": 3.2378068257226476, "learning_rate": 7e-06, "loss": 0.7916, "mean_token_accuracy": 0.7941934525966644, "num_tokens": 26091519.0, "step": 50 }, { "epoch": 0.025856496444731737, "grad_norm": 0.5246594733394452, "learning_rate": 8.428571428571429e-06, "loss": 0.8032, "mean_token_accuracy": 0.7924618661403656, "num_tokens": 31324849.0, "step": 60 }, { "epoch": 0.030165912518853696, "grad_norm": 0.4335534361364414, "learning_rate": 9.857142857142859e-06, "loss": 0.7803, "mean_token_accuracy": 0.7961702078580857, "num_tokens": 36551348.0, "step": 70 }, { "epoch": 0.034475328592975654, "grad_norm": 0.3313039676876513, "learning_rate": 1.1285714285714287e-05, "loss": 0.768, "mean_token_accuracy": 0.7988312512636184, "num_tokens": 41767061.0, "step": 80 }, { "epoch": 0.03878474466709761, "grad_norm": 0.3934908942577619, "learning_rate": 1.2714285714285715e-05, "loss": 0.7808, "mean_token_accuracy": 0.7966956377029419, "num_tokens": 47007095.0, "step": 90 }, { "epoch": 0.043094160741219564, "grad_norm": 0.3188889434865405, "learning_rate": 1.4142857142857145e-05, "loss": 0.7473, "mean_token_accuracy": 0.8021660268306732, "num_tokens": 52210187.0, "step": 100 }, { "epoch": 0.04740357681534152, "grad_norm": 0.3287491812550416, "learning_rate": 1.5571428571428573e-05, "loss": 0.7595, "mean_token_accuracy": 0.800231420993805, "num_tokens": 57429943.0, "step": 110 }, { "epoch": 0.051712992889463474, "grad_norm": 5.119765595417836, "learning_rate": 1.7e-05, "loss": 0.7455, "mean_token_accuracy": 0.8027559369802475, "num_tokens": 62629615.0, "step": 120 }, { "epoch": 0.056022408963585436, "grad_norm": 11.611470819412371, "learning_rate": 1.842857142857143e-05, "loss": 0.7489, "mean_token_accuracy": 0.8018222838640213, "num_tokens": 67857601.0, "step": 130 }, { "epoch": 0.06033182503770739, "grad_norm": 0.32677082915429995, "learning_rate": 1.985714285714286e-05, "loss": 0.7328, "mean_token_accuracy": 0.8056721836328506, "num_tokens": 73087352.0, "step": 140 }, { "epoch": 0.06464124111182935, "grad_norm": 0.41188760914876965, "learning_rate": 1.9999802783902948e-05, "loss": 0.7299, "mean_token_accuracy": 0.8061253696680069, "num_tokens": 78304455.0, "step": 150 }, { "epoch": 0.06895065718595131, "grad_norm": 0.33886087647146035, "learning_rate": 1.9999121059233323e-05, "loss": 0.7312, "mean_token_accuracy": 0.8061156570911407, "num_tokens": 83521802.0, "step": 160 }, { "epoch": 0.07326007326007326, "grad_norm": 0.30676241790968656, "learning_rate": 1.9997952424413366e-05, "loss": 0.7413, "mean_token_accuracy": 0.8038029730319977, "num_tokens": 88732494.0, "step": 170 }, { "epoch": 0.07756948933419522, "grad_norm": 0.4527131025852802, "learning_rate": 1.99962969363501e-05, "loss": 0.731, "mean_token_accuracy": 0.8056236773729324, "num_tokens": 93968544.0, "step": 180 }, { "epoch": 0.08187890540831717, "grad_norm": 0.582789106883916, "learning_rate": 1.9994154675658006e-05, "loss": 0.7328, "mean_token_accuracy": 0.805256187915802, "num_tokens": 99183147.0, "step": 190 }, { "epoch": 0.08618832148243913, "grad_norm": 0.4886805683801227, "learning_rate": 1.9991525746655096e-05, "loss": 0.7186, "mean_token_accuracy": 0.8080977678298951, "num_tokens": 104388835.0, "step": 200 }, { "epoch": 0.09049773755656108, "grad_norm": 0.419799532770956, "learning_rate": 1.9988410277357846e-05, "loss": 0.7123, "mean_token_accuracy": 0.8100159347057343, "num_tokens": 109606345.0, "step": 210 }, { "epoch": 0.09480715363068304, "grad_norm": 0.2798729008478555, "learning_rate": 1.998480841947495e-05, "loss": 0.7179, "mean_token_accuracy": 0.8085818499326706, "num_tokens": 114819755.0, "step": 220 }, { "epoch": 0.099116569704805, "grad_norm": 0.4902760404616901, "learning_rate": 1.998072034839995e-05, "loss": 0.7197, "mean_token_accuracy": 0.8085496753454209, "num_tokens": 120053241.0, "step": 230 }, { "epoch": 0.10342598577892695, "grad_norm": 0.351949184240428, "learning_rate": 1.9976146263202663e-05, "loss": 0.7112, "mean_token_accuracy": 0.8095757305622101, "num_tokens": 125254035.0, "step": 240 }, { "epoch": 0.10773540185304892, "grad_norm": 0.30305486417884486, "learning_rate": 1.997108638661952e-05, "loss": 0.702, "mean_token_accuracy": 0.8118896305561065, "num_tokens": 130478135.0, "step": 250 }, { "epoch": 0.11204481792717087, "grad_norm": 0.869413930009905, "learning_rate": 1.9965540965042706e-05, "loss": 0.7151, "mean_token_accuracy": 0.8093383610248566, "num_tokens": 135691541.0, "step": 260 }, { "epoch": 0.11635423400129283, "grad_norm": 0.2787417299701044, "learning_rate": 1.995951026850816e-05, "loss": 0.7208, "mean_token_accuracy": 0.8084886729717254, "num_tokens": 140908939.0, "step": 270 }, { "epoch": 0.12066365007541478, "grad_norm": 0.32036856821045795, "learning_rate": 1.9952994590682424e-05, "loss": 0.7194, "mean_token_accuracy": 0.8077664315700531, "num_tokens": 146120306.0, "step": 280 }, { "epoch": 0.12497306614953674, "grad_norm": 0.3742216469639607, "learning_rate": 1.9945994248848358e-05, "loss": 0.7084, "mean_token_accuracy": 0.8110000669956208, "num_tokens": 151324609.0, "step": 290 }, { "epoch": 0.1292824822236587, "grad_norm": 0.37073709635585644, "learning_rate": 1.9938509583889677e-05, "loss": 0.6947, "mean_token_accuracy": 0.814004185795784, "num_tokens": 156536289.0, "step": 300 }, { "epoch": 0.13359189829778065, "grad_norm": 0.5269986269195418, "learning_rate": 1.993054096027434e-05, "loss": 0.7121, "mean_token_accuracy": 0.8103572875261307, "num_tokens": 161769187.0, "step": 310 }, { "epoch": 0.13790131437190262, "grad_norm": 0.35465730422787783, "learning_rate": 1.9922088766036832e-05, "loss": 0.7154, "mean_token_accuracy": 0.8093467622995376, "num_tokens": 166989104.0, "step": 320 }, { "epoch": 0.14221073044602456, "grad_norm": 0.5056889341262342, "learning_rate": 1.9913153412759246e-05, "loss": 0.688, "mean_token_accuracy": 0.8142561435699462, "num_tokens": 172218332.0, "step": 330 }, { "epoch": 0.14652014652014653, "grad_norm": 0.478443926035086, "learning_rate": 1.990373533555124e-05, "loss": 0.7049, "mean_token_accuracy": 0.8115419298410416, "num_tokens": 177446420.0, "step": 340 }, { "epoch": 0.15082956259426847, "grad_norm": 0.5344518491471403, "learning_rate": 1.989383499302887e-05, "loss": 0.6985, "mean_token_accuracy": 0.8124533802270889, "num_tokens": 182668496.0, "step": 350 }, { "epoch": 0.15513897866839044, "grad_norm": 0.3707515443919568, "learning_rate": 1.988345286729223e-05, "loss": 0.6862, "mean_token_accuracy": 0.8153943628072738, "num_tokens": 187864615.0, "step": 360 }, { "epoch": 0.15944839474251238, "grad_norm": 0.35006650641790044, "learning_rate": 1.9872589463901998e-05, "loss": 0.6772, "mean_token_accuracy": 0.817377695441246, "num_tokens": 193091959.0, "step": 370 }, { "epoch": 0.16375781081663435, "grad_norm": 0.3147012935658028, "learning_rate": 1.9861245311854806e-05, "loss": 0.7091, "mean_token_accuracy": 0.8105926007032395, "num_tokens": 198307133.0, "step": 380 }, { "epoch": 0.16806722689075632, "grad_norm": 0.2974493096314107, "learning_rate": 1.9849420963557493e-05, "loss": 0.7014, "mean_token_accuracy": 0.8125709146261215, "num_tokens": 203524207.0, "step": 390 }, { "epoch": 0.17237664296487826, "grad_norm": 0.3510593509232527, "learning_rate": 1.9837116994800177e-05, "loss": 0.6844, "mean_token_accuracy": 0.8155967563390731, "num_tokens": 208742608.0, "step": 400 }, { "epoch": 0.17668605903900023, "grad_norm": 0.4220129354189381, "learning_rate": 1.9824334004728252e-05, "loss": 0.686, "mean_token_accuracy": 0.8149468660354614, "num_tokens": 5223286.0, "step": 410 }, { "epoch": 0.18099547511312217, "grad_norm": 0.40069834101187557, "learning_rate": 1.9811072615813198e-05, "loss": 0.7141, "mean_token_accuracy": 0.8113093852996827, "num_tokens": 10454233.0, "step": 420 }, { "epoch": 0.18530489118724414, "grad_norm": 0.3749245155044495, "learning_rate": 1.979733347382225e-05, "loss": 0.7108, "mean_token_accuracy": 0.8109797239303589, "num_tokens": 15660319.0, "step": 430 }, { "epoch": 0.18961430726136608, "grad_norm": 1.1822929210624606, "learning_rate": 1.9783117247786983e-05, "loss": 0.6906, "mean_token_accuracy": 0.8153148740530014, "num_tokens": 20893849.0, "step": 440 }, { "epoch": 0.19392372333548805, "grad_norm": 0.2730946326930461, "learning_rate": 1.9768424629970727e-05, "loss": 0.688, "mean_token_accuracy": 0.8151433259248734, "num_tokens": 26099916.0, "step": 450 }, { "epoch": 0.19823313940961, "grad_norm": 0.3330872835964631, "learning_rate": 1.9753256335834834e-05, "loss": 0.6853, "mean_token_accuracy": 0.8160628467798233, "num_tokens": 31338694.0, "step": 460 }, { "epoch": 0.20254255548373196, "grad_norm": 0.3717603470983316, "learning_rate": 1.9737613104003867e-05, "loss": 0.6855, "mean_token_accuracy": 0.8164401084184647, "num_tokens": 36574908.0, "step": 470 }, { "epoch": 0.2068519715578539, "grad_norm": 1.454687531583039, "learning_rate": 1.972149569622961e-05, "loss": 0.6957, "mean_token_accuracy": 0.8127301871776581, "num_tokens": 41790153.0, "step": 480 }, { "epoch": 0.21116138763197587, "grad_norm": 0.6632416612414811, "learning_rate": 1.9704904897353998e-05, "loss": 0.6779, "mean_token_accuracy": 0.8172294646501541, "num_tokens": 47005011.0, "step": 490 }, { "epoch": 0.21547080370609784, "grad_norm": 0.34169723872789115, "learning_rate": 1.968784151527087e-05, "loss": 0.6821, "mean_token_accuracy": 0.8158999979496002, "num_tokens": 52225578.0, "step": 500 }, { "epoch": 0.21978021978021978, "grad_norm": 0.3658448644523066, "learning_rate": 1.9670306380886654e-05, "loss": 0.6792, "mean_token_accuracy": 0.8161427766084671, "num_tokens": 57414538.0, "step": 510 }, { "epoch": 0.22408963585434175, "grad_norm": 0.40049880175112507, "learning_rate": 1.9652300348079897e-05, "loss": 0.6726, "mean_token_accuracy": 0.8179007738828659, "num_tokens": 62633144.0, "step": 520 }, { "epoch": 0.2283990519284637, "grad_norm": 3.9239025327425807, "learning_rate": 1.9633824293659673e-05, "loss": 0.6642, "mean_token_accuracy": 0.8193872332572937, "num_tokens": 67843000.0, "step": 530 }, { "epoch": 0.23270846800258566, "grad_norm": 0.2899424656068847, "learning_rate": 1.9614879117322906e-05, "loss": 0.6968, "mean_token_accuracy": 0.8127734661102295, "num_tokens": 73065124.0, "step": 540 }, { "epoch": 0.2370178840767076, "grad_norm": 0.38718734224119095, "learning_rate": 1.9595465741610546e-05, "loss": 0.6683, "mean_token_accuracy": 0.8193780601024627, "num_tokens": 78289969.0, "step": 550 }, { "epoch": 0.24132730015082957, "grad_norm": 0.23434400432441202, "learning_rate": 1.9575585111862656e-05, "loss": 0.6546, "mean_token_accuracy": 0.8224500000476838, "num_tokens": 83503751.0, "step": 560 }, { "epoch": 0.2456367162249515, "grad_norm": 0.29706488489971505, "learning_rate": 1.9555238196172362e-05, "loss": 0.6841, "mean_token_accuracy": 0.8156752318143845, "num_tokens": 88729088.0, "step": 570 }, { "epoch": 0.24994613229907348, "grad_norm": 4.04016467477839, "learning_rate": 1.9534425985338735e-05, "loss": 0.673, "mean_token_accuracy": 0.818093541264534, "num_tokens": 93935439.0, "step": 580 }, { "epoch": 0.2542555483731954, "grad_norm": 0.3326858427770414, "learning_rate": 1.9513149492818516e-05, "loss": 0.6616, "mean_token_accuracy": 0.8211809307336807, "num_tokens": 99156578.0, "step": 590 }, { "epoch": 0.2585649644473174, "grad_norm": 0.4177129584403783, "learning_rate": 1.9491409754676787e-05, "loss": 0.685, "mean_token_accuracy": 0.8164306998252868, "num_tokens": 104361853.0, "step": 600 }, { "epoch": 0.26287438052143935, "grad_norm": 0.7196604022336679, "learning_rate": 1.946920782953651e-05, "loss": 0.6582, "mean_token_accuracy": 0.821002870798111, "num_tokens": 109582327.0, "step": 610 }, { "epoch": 0.2671837965955613, "grad_norm": 0.23304129626634315, "learning_rate": 1.9446544798526983e-05, "loss": 0.6447, "mean_token_accuracy": 0.8240837663412094, "num_tokens": 114814989.0, "step": 620 }, { "epoch": 0.27149321266968324, "grad_norm": 0.41729937689951657, "learning_rate": 1.942342176523119e-05, "loss": 0.6537, "mean_token_accuracy": 0.8221835136413574, "num_tokens": 120017198.0, "step": 630 }, { "epoch": 0.27580262874380523, "grad_norm": 0.3623401820802482, "learning_rate": 1.9399839855632053e-05, "loss": 0.6742, "mean_token_accuracy": 0.8182963550090789, "num_tokens": 125242239.0, "step": 640 }, { "epoch": 0.2801120448179272, "grad_norm": 0.504742353545197, "learning_rate": 1.9375800218057626e-05, "loss": 0.6416, "mean_token_accuracy": 0.8245927482843399, "num_tokens": 130467934.0, "step": 650 }, { "epoch": 0.2844214608920491, "grad_norm": 0.3510667403280948, "learning_rate": 1.935130402312515e-05, "loss": 0.6605, "mean_token_accuracy": 0.8205715626478195, "num_tokens": 135673923.0, "step": 660 }, { "epoch": 0.28873087696617106, "grad_norm": 0.283413159342489, "learning_rate": 1.9326352463684067e-05, "loss": 0.6564, "mean_token_accuracy": 0.8217781811952591, "num_tokens": 140894511.0, "step": 670 }, { "epoch": 0.29304029304029305, "grad_norm": 0.26341263829984624, "learning_rate": 1.9300946754757923e-05, "loss": 0.6772, "mean_token_accuracy": 0.8176112204790116, "num_tokens": 146096683.0, "step": 680 }, { "epoch": 0.297349709114415, "grad_norm": 0.4188246721980729, "learning_rate": 1.927508813348521e-05, "loss": 0.6522, "mean_token_accuracy": 0.8233400017023087, "num_tokens": 151325331.0, "step": 690 }, { "epoch": 0.30165912518853694, "grad_norm": 0.30574069463425346, "learning_rate": 1.9248777859059126e-05, "loss": 0.6524, "mean_token_accuracy": 0.8224280178546906, "num_tokens": 156559195.0, "step": 700 }, { "epoch": 0.30596854126265893, "grad_norm": 1.5385154011612097, "learning_rate": 1.9222017212666242e-05, "loss": 0.6422, "mean_token_accuracy": 0.8251061916351319, "num_tokens": 161792902.0, "step": 710 }, { "epoch": 0.3102779573367809, "grad_norm": 0.3771852444622771, "learning_rate": 1.9194807497424134e-05, "loss": 0.6361, "mean_token_accuracy": 0.8266013979911804, "num_tokens": 167012875.0, "step": 720 }, { "epoch": 0.3145873734109028, "grad_norm": 0.3615154523098288, "learning_rate": 1.916715003831791e-05, "loss": 0.6516, "mean_token_accuracy": 0.8227958798408508, "num_tokens": 172227020.0, "step": 730 }, { "epoch": 0.31889678948502476, "grad_norm": 0.3030792641132892, "learning_rate": 1.913904618213571e-05, "loss": 0.6556, "mean_token_accuracy": 0.8225675851106644, "num_tokens": 177436285.0, "step": 740 }, { "epoch": 0.32320620555914675, "grad_norm": 0.26408534425475577, "learning_rate": 1.911049729740309e-05, "loss": 0.6542, "mean_token_accuracy": 0.8222842186689376, "num_tokens": 182651235.0, "step": 750 }, { "epoch": 0.3275156216332687, "grad_norm": 0.3673535940216057, "learning_rate": 1.9081504774316427e-05, "loss": 0.6719, "mean_token_accuracy": 0.818186005949974, "num_tokens": 187861982.0, "step": 760 }, { "epoch": 0.33182503770739064, "grad_norm": 0.3089255506875894, "learning_rate": 1.9052070024675182e-05, "loss": 0.643, "mean_token_accuracy": 0.8245363593101501, "num_tokens": 193085969.0, "step": 770 }, { "epoch": 0.33613445378151263, "grad_norm": 0.289157769943294, "learning_rate": 1.9022194481813177e-05, "loss": 0.6648, "mean_token_accuracy": 0.819673228263855, "num_tokens": 198317276.0, "step": 780 }, { "epoch": 0.3404438698556346, "grad_norm": 0.27055209439815014, "learning_rate": 1.899187960052878e-05, "loss": 0.6516, "mean_token_accuracy": 0.8229505121707916, "num_tokens": 203541658.0, "step": 790 }, { "epoch": 0.3447532859297565, "grad_norm": 1.058251547916245, "learning_rate": 1.896112685701409e-05, "loss": 0.6568, "mean_token_accuracy": 0.8218391090631485, "num_tokens": 208762308.0, "step": 800 }, { "epoch": 0.34906270200387846, "grad_norm": 0.35550083633769297, "learning_rate": 1.8929937748783022e-05, "loss": 0.6603, "mean_token_accuracy": 0.8214082747697831, "num_tokens": 213991748.0, "step": 810 }, { "epoch": 0.35337211807800045, "grad_norm": 0.31448911220541675, "learning_rate": 1.8898313794598403e-05, "loss": 0.6445, "mean_token_accuracy": 0.8243899047374725, "num_tokens": 219212716.0, "step": 820 }, { "epoch": 0.3576815341521224, "grad_norm": 0.3133728579715237, "learning_rate": 1.886625653439801e-05, "loss": 0.6715, "mean_token_accuracy": 0.8189995467662812, "num_tokens": 224430641.0, "step": 830 }, { "epoch": 0.36199095022624433, "grad_norm": 0.39805457965830404, "learning_rate": 1.8833767529219594e-05, "loss": 0.6497, "mean_token_accuracy": 0.8232468694448472, "num_tokens": 229658308.0, "step": 840 }, { "epoch": 0.3663003663003663, "grad_norm": 0.29056620679107725, "learning_rate": 1.8800848361124843e-05, "loss": 0.6578, "mean_token_accuracy": 0.8218301862478257, "num_tokens": 234864010.0, "step": 850 }, { "epoch": 0.3706097823744883, "grad_norm": 0.32356517478740604, "learning_rate": 1.876750063312236e-05, "loss": 0.6546, "mean_token_accuracy": 0.8229421854019165, "num_tokens": 240089625.0, "step": 860 }, { "epoch": 0.3749191984486102, "grad_norm": 0.8284496664800146, "learning_rate": 1.8733725969089604e-05, "loss": 0.6311, "mean_token_accuracy": 0.8281064361333847, "num_tokens": 245325855.0, "step": 870 }, { "epoch": 0.37922861452273215, "grad_norm": 0.34274903428803194, "learning_rate": 1.8699526013693806e-05, "loss": 0.6696, "mean_token_accuracy": 0.819324541091919, "num_tokens": 250526050.0, "step": 880 }, { "epoch": 0.38353803059685415, "grad_norm": 0.3105106617654166, "learning_rate": 1.866490243231188e-05, "loss": 0.6349, "mean_token_accuracy": 0.826661130785942, "num_tokens": 255731963.0, "step": 890 }, { "epoch": 0.3878474466709761, "grad_norm": 0.3396389168122368, "learning_rate": 1.862985691094934e-05, "loss": 0.6432, "mean_token_accuracy": 0.8247035890817642, "num_tokens": 260918728.0, "step": 900 }, { "epoch": 0.39215686274509803, "grad_norm": 0.38414079800836304, "learning_rate": 1.8594391156158193e-05, "loss": 0.645, "mean_token_accuracy": 0.82429179251194, "num_tokens": 266141637.0, "step": 910 }, { "epoch": 0.39646627881922, "grad_norm": 0.6505894636324675, "learning_rate": 1.8558506894953828e-05, "loss": 0.6501, "mean_token_accuracy": 0.8227986752986908, "num_tokens": 271363402.0, "step": 920 }, { "epoch": 0.40077569489334197, "grad_norm": 0.3821220366195235, "learning_rate": 1.8522205874730935e-05, "loss": 0.6572, "mean_token_accuracy": 0.8223873049020767, "num_tokens": 276580434.0, "step": 930 }, { "epoch": 0.4050851109674639, "grad_norm": 0.4249843772798963, "learning_rate": 1.84854898631784e-05, "loss": 0.6347, "mean_token_accuracy": 0.8271821349859237, "num_tokens": 281793261.0, "step": 940 }, { "epoch": 0.40939452704158585, "grad_norm": 0.2552229074890783, "learning_rate": 1.8448360648193245e-05, "loss": 0.6476, "mean_token_accuracy": 0.824235337972641, "num_tokens": 287017591.0, "step": 950 }, { "epoch": 0.4137039431157078, "grad_norm": 0.35351356316395655, "learning_rate": 1.841082003779355e-05, "loss": 0.6237, "mean_token_accuracy": 0.8287246704101563, "num_tokens": 292228316.0, "step": 960 }, { "epoch": 0.4180133591898298, "grad_norm": 0.4954478542376962, "learning_rate": 1.837286986003041e-05, "loss": 0.634, "mean_token_accuracy": 0.8264662533998489, "num_tokens": 297459098.0, "step": 970 }, { "epoch": 0.42232277526395173, "grad_norm": 0.27141795286182535, "learning_rate": 1.8334511962898932e-05, "loss": 0.6172, "mean_token_accuracy": 0.8303945779800415, "num_tokens": 302685825.0, "step": 980 }, { "epoch": 0.4266321913380737, "grad_norm": 0.33079572012780467, "learning_rate": 1.829574821424823e-05, "loss": 0.6325, "mean_token_accuracy": 0.8272376924753189, "num_tokens": 307903966.0, "step": 990 }, { "epoch": 0.43094160741219567, "grad_norm": 0.6283817507547165, "learning_rate": 1.825658050169049e-05, "loss": 0.6366, "mean_token_accuracy": 0.82571841776371, "num_tokens": 313139372.0, "step": 1000 }, { "epoch": 0.4352510234863176, "grad_norm": 0.2702389127181652, "learning_rate": 1.8217010732509023e-05, "loss": 0.6289, "mean_token_accuracy": 0.827723604440689, "num_tokens": 318361841.0, "step": 1010 }, { "epoch": 0.43956043956043955, "grad_norm": 0.37539030069797497, "learning_rate": 1.8177040833565423e-05, "loss": 0.6178, "mean_token_accuracy": 0.8301007211208343, "num_tokens": 323581432.0, "step": 1020 }, { "epoch": 0.4438698556345615, "grad_norm": 0.6624186215564173, "learning_rate": 1.8136672751205706e-05, "loss": 0.6451, "mean_token_accuracy": 0.8243307292461395, "num_tokens": 328807478.0, "step": 1030 }, { "epoch": 0.4481792717086835, "grad_norm": 0.27945099240122423, "learning_rate": 1.809590845116556e-05, "loss": 0.6135, "mean_token_accuracy": 0.8320200711488723, "num_tokens": 334026406.0, "step": 1040 }, { "epoch": 0.45248868778280543, "grad_norm": 0.47857134038431015, "learning_rate": 1.80547499184746e-05, "loss": 0.6362, "mean_token_accuracy": 0.8266705930233001, "num_tokens": 339241930.0, "step": 1050 }, { "epoch": 0.4567981038569274, "grad_norm": 0.2623229308880382, "learning_rate": 1.8013199157359717e-05, "loss": 0.6451, "mean_token_accuracy": 0.8245700865983963, "num_tokens": 344465045.0, "step": 1060 }, { "epoch": 0.46110751993104937, "grad_norm": 0.31622238717186407, "learning_rate": 1.7971258191147486e-05, "loss": 0.6216, "mean_token_accuracy": 0.8300080358982086, "num_tokens": 349688810.0, "step": 1070 }, { "epoch": 0.4654169360051713, "grad_norm": 0.4301175371484005, "learning_rate": 1.7928929062165635e-05, "loss": 0.6288, "mean_token_accuracy": 0.8276701956987381, "num_tokens": 354918788.0, "step": 1080 }, { "epoch": 0.46972635207929325, "grad_norm": 0.35317746105918973, "learning_rate": 1.7886213831643586e-05, "loss": 0.6499, "mean_token_accuracy": 0.8238545447587967, "num_tokens": 360139990.0, "step": 1090 }, { "epoch": 0.4740357681534152, "grad_norm": 0.33039942503862296, "learning_rate": 1.784311457961209e-05, "loss": 0.6344, "mean_token_accuracy": 0.8267721503973007, "num_tokens": 365349120.0, "step": 1100 }, { "epoch": 0.4783451842275372, "grad_norm": 0.33919186836952386, "learning_rate": 1.7799633404801947e-05, "loss": 0.6383, "mean_token_accuracy": 0.8259551167488098, "num_tokens": 370564880.0, "step": 1110 }, { "epoch": 0.48265460030165913, "grad_norm": 0.3290614465363003, "learning_rate": 1.7755772424541794e-05, "loss": 0.6381, "mean_token_accuracy": 0.8256288439035415, "num_tokens": 375783349.0, "step": 1120 }, { "epoch": 0.4869640163757811, "grad_norm": 0.27162147150659194, "learning_rate": 1.7711533774655e-05, "loss": 0.6278, "mean_token_accuracy": 0.8281829565763473, "num_tokens": 381010949.0, "step": 1130 }, { "epoch": 0.491273432449903, "grad_norm": 0.3103187983402091, "learning_rate": 1.766691960935568e-05, "loss": 0.6408, "mean_token_accuracy": 0.8251193135976791, "num_tokens": 386225998.0, "step": 1140 }, { "epoch": 0.495582848524025, "grad_norm": 0.22065635543976805, "learning_rate": 1.7621932101143776e-05, "loss": 0.6315, "mean_token_accuracy": 0.8274531990289689, "num_tokens": 391413296.0, "step": 1150 }, { "epoch": 0.49989226459814695, "grad_norm": 0.26116772582019, "learning_rate": 1.7576573440699275e-05, "loss": 0.6239, "mean_token_accuracy": 0.8292332559823989, "num_tokens": 396641695.0, "step": 1160 }, { "epoch": 0.5042016806722689, "grad_norm": 0.24416450204094683, "learning_rate": 1.753084583677553e-05, "loss": 0.617, "mean_token_accuracy": 0.8304607063531876, "num_tokens": 401827066.0, "step": 1170 }, { "epoch": 0.5085110967463908, "grad_norm": 0.3316143390169774, "learning_rate": 1.74847515160917e-05, "loss": 0.6457, "mean_token_accuracy": 0.824263596534729, "num_tokens": 407035291.0, "step": 1180 }, { "epoch": 0.5128205128205128, "grad_norm": 0.3212440538324799, "learning_rate": 1.7438292723224344e-05, "loss": 0.6174, "mean_token_accuracy": 0.830016416311264, "num_tokens": 412260944.0, "step": 1190 }, { "epoch": 0.5171299288946348, "grad_norm": 0.5606024977520799, "learning_rate": 1.7391471720498082e-05, "loss": 0.6325, "mean_token_accuracy": 0.8276671200990677, "num_tokens": 417483160.0, "step": 1200 }, { "epoch": 0.5214393449687568, "grad_norm": 0.24152696462546605, "learning_rate": 1.734429078787546e-05, "loss": 0.6287, "mean_token_accuracy": 0.827872833609581, "num_tokens": 422714872.0, "step": 1210 }, { "epoch": 0.5257487610428787, "grad_norm": 5.442803084142085, "learning_rate": 1.7296752222845907e-05, "loss": 0.6099, "mean_token_accuracy": 0.832091435790062, "num_tokens": 427908545.0, "step": 1220 }, { "epoch": 0.5300581771170007, "grad_norm": 0.3855554421579688, "learning_rate": 1.724885834031388e-05, "loss": 0.6328, "mean_token_accuracy": 0.8270717740058899, "num_tokens": 433134227.0, "step": 1230 }, { "epoch": 0.5343675931911226, "grad_norm": 0.21868851096742647, "learning_rate": 1.7200611472486114e-05, "loss": 0.6161, "mean_token_accuracy": 0.8309131264686584, "num_tokens": 438359301.0, "step": 1240 }, { "epoch": 0.5386770092652445, "grad_norm": 0.7326827218963277, "learning_rate": 1.7152013968758084e-05, "loss": 0.6241, "mean_token_accuracy": 0.8287705957889557, "num_tokens": 443573251.0, "step": 1250 }, { "epoch": 0.5429864253393665, "grad_norm": 0.600543416706433, "learning_rate": 1.710306819559956e-05, "loss": 0.6337, "mean_token_accuracy": 0.8268699079751969, "num_tokens": 448780974.0, "step": 1260 }, { "epoch": 0.5472958414134885, "grad_norm": 0.360147829993976, "learning_rate": 1.705377653643942e-05, "loss": 0.6166, "mean_token_accuracy": 0.8303727567195892, "num_tokens": 453999576.0, "step": 1270 }, { "epoch": 0.5516052574876105, "grad_norm": 0.45060003397000864, "learning_rate": 1.7004141391549543e-05, "loss": 0.6145, "mean_token_accuracy": 0.831406319141388, "num_tokens": 459228021.0, "step": 1280 }, { "epoch": 0.5559146735617324, "grad_norm": 0.5709726992892779, "learning_rate": 1.695416517792796e-05, "loss": 0.6468, "mean_token_accuracy": 0.8252277344465255, "num_tokens": 464447390.0, "step": 1290 }, { "epoch": 0.5602240896358543, "grad_norm": 0.27571937089392234, "learning_rate": 1.6903850329181136e-05, "loss": 0.6363, "mean_token_accuracy": 0.8263048976659775, "num_tokens": 469651628.0, "step": 1300 }, { "epoch": 0.5645335057099763, "grad_norm": 0.2939249957951198, "learning_rate": 1.6853199295405475e-05, "loss": 0.6155, "mean_token_accuracy": 0.8310225218534469, "num_tokens": 474879513.0, "step": 1310 }, { "epoch": 0.5688429217840982, "grad_norm": 0.3368399293128784, "learning_rate": 1.680221454306802e-05, "loss": 0.6141, "mean_token_accuracy": 0.8309666365385056, "num_tokens": 480098167.0, "step": 1320 }, { "epoch": 0.5731523378582202, "grad_norm": 0.2791407090464392, "learning_rate": 1.675089855488632e-05, "loss": 0.6264, "mean_token_accuracy": 0.82850883603096, "num_tokens": 485315568.0, "step": 1330 }, { "epoch": 0.5774617539323421, "grad_norm": 0.5414602413843846, "learning_rate": 1.6699253829707562e-05, "loss": 0.6087, "mean_token_accuracy": 0.8328062802553177, "num_tokens": 490517068.0, "step": 1340 }, { "epoch": 0.5817711700064642, "grad_norm": 0.2675353738861536, "learning_rate": 1.6647282882386883e-05, "loss": 0.6374, "mean_token_accuracy": 0.8267453759908676, "num_tokens": 495718080.0, "step": 1350 }, { "epoch": 0.5860805860805861, "grad_norm": 0.3489001608723506, "learning_rate": 1.6594988243664897e-05, "loss": 0.6092, "mean_token_accuracy": 0.8328727900981903, "num_tokens": 500934901.0, "step": 1360 }, { "epoch": 0.590390002154708, "grad_norm": 0.40229690700802545, "learning_rate": 1.654237246004446e-05, "loss": 0.6391, "mean_token_accuracy": 0.82609423995018, "num_tokens": 506169219.0, "step": 1370 }, { "epoch": 0.59469941822883, "grad_norm": 0.3050782144736273, "learning_rate": 1.6489438093666684e-05, "loss": 0.6286, "mean_token_accuracy": 0.8276964217424393, "num_tokens": 511402169.0, "step": 1380 }, { "epoch": 0.5990088343029519, "grad_norm": 0.33842933888327564, "learning_rate": 1.6436187722186164e-05, "loss": 0.6016, "mean_token_accuracy": 0.8356621891260148, "num_tokens": 516605996.0, "step": 1390 }, { "epoch": 0.6033182503770739, "grad_norm": 0.38387708618673627, "learning_rate": 1.638262393864544e-05, "loss": 0.6154, "mean_token_accuracy": 0.8312870115041733, "num_tokens": 521817729.0, "step": 1400 }, { "epoch": 0.6076276664511958, "grad_norm": 0.2281992499688358, "learning_rate": 1.6328749351348764e-05, "loss": 0.6268, "mean_token_accuracy": 0.8288481831550598, "num_tokens": 527052495.0, "step": 1410 }, { "epoch": 0.6119370825253179, "grad_norm": 0.45992335983414295, "learning_rate": 1.6274566583735055e-05, "loss": 0.6207, "mean_token_accuracy": 0.8294959485530853, "num_tokens": 532271809.0, "step": 1420 }, { "epoch": 0.6162464985994398, "grad_norm": 0.3356687489538667, "learning_rate": 1.622007827425018e-05, "loss": 0.6129, "mean_token_accuracy": 0.8310313284397125, "num_tokens": 537501852.0, "step": 1430 }, { "epoch": 0.6205559146735617, "grad_norm": 0.3616377511697279, "learning_rate": 1.6165287076218434e-05, "loss": 0.6156, "mean_token_accuracy": 0.8307232618331909, "num_tokens": 542715563.0, "step": 1440 }, { "epoch": 0.6248653307476837, "grad_norm": 0.26001643054309265, "learning_rate": 1.6110195657713382e-05, "loss": 0.625, "mean_token_accuracy": 0.829586324095726, "num_tokens": 547918234.0, "step": 1450 }, { "epoch": 0.6291747468218056, "grad_norm": 1.5954870703086974, "learning_rate": 1.6054806701427896e-05, "loss": 0.6184, "mean_token_accuracy": 0.8309715032577515, "num_tokens": 553134438.0, "step": 1460 }, { "epoch": 0.6334841628959276, "grad_norm": 0.28183497823174325, "learning_rate": 1.599912290454355e-05, "loss": 0.6164, "mean_token_accuracy": 0.8316257029771805, "num_tokens": 558338647.0, "step": 1470 }, { "epoch": 0.6377935789700495, "grad_norm": 0.417418075656745, "learning_rate": 1.594314697859926e-05, "loss": 0.6061, "mean_token_accuracy": 0.8334461331367493, "num_tokens": 563534076.0, "step": 1480 }, { "epoch": 0.6421029950441716, "grad_norm": 0.4861689306910279, "learning_rate": 1.588688164935926e-05, "loss": 0.6155, "mean_token_accuracy": 0.8312201201915741, "num_tokens": 568764738.0, "step": 1490 }, { "epoch": 0.6464124111182935, "grad_norm": 0.28236521198401204, "learning_rate": 1.5830329656680357e-05, "loss": 0.6208, "mean_token_accuracy": 0.829620686173439, "num_tokens": 573992483.0, "step": 1500 }, { "epoch": 0.6507218271924154, "grad_norm": 0.23104837644035964, "learning_rate": 1.577349375437852e-05, "loss": 0.6224, "mean_token_accuracy": 0.8296060234308242, "num_tokens": 579218564.0, "step": 1510 }, { "epoch": 0.6550312432665374, "grad_norm": 0.2611222405755837, "learning_rate": 1.571637671009478e-05, "loss": 0.5937, "mean_token_accuracy": 0.8352221310138702, "num_tokens": 584442451.0, "step": 1520 }, { "epoch": 0.6593406593406593, "grad_norm": 0.327050425044639, "learning_rate": 1.5658981305160467e-05, "loss": 0.6351, "mean_token_accuracy": 0.8266708552837372, "num_tokens": 589652757.0, "step": 1530 }, { "epoch": 0.6636500754147813, "grad_norm": 0.23823355143151762, "learning_rate": 1.5601310334461754e-05, "loss": 0.6213, "mean_token_accuracy": 0.829871678352356, "num_tokens": 594860137.0, "step": 1540 }, { "epoch": 0.6679594914889032, "grad_norm": 0.3917352438105509, "learning_rate": 1.554336660630358e-05, "loss": 0.615, "mean_token_accuracy": 0.831358191370964, "num_tokens": 600085898.0, "step": 1550 }, { "epoch": 0.6722689075630253, "grad_norm": 0.30659913530099364, "learning_rate": 1.548515294227288e-05, "loss": 0.6152, "mean_token_accuracy": 0.8307610124349594, "num_tokens": 605312242.0, "step": 1560 }, { "epoch": 0.6765783236371472, "grad_norm": 0.30902596508045915, "learning_rate": 1.5426672177101202e-05, "loss": 0.6213, "mean_token_accuracy": 0.8297601848840713, "num_tokens": 610527591.0, "step": 1570 }, { "epoch": 0.6808877397112691, "grad_norm": 0.43742820569310464, "learning_rate": 1.5367927158526664e-05, "loss": 0.6231, "mean_token_accuracy": 0.8295327842235565, "num_tokens": 615766545.0, "step": 1580 }, { "epoch": 0.6851971557853911, "grad_norm": 0.37166037007089475, "learning_rate": 1.5308920747155277e-05, "loss": 0.6074, "mean_token_accuracy": 0.8325976997613906, "num_tokens": 620969103.0, "step": 1590 }, { "epoch": 0.689506571859513, "grad_norm": 0.4198181584068834, "learning_rate": 1.5249655816321656e-05, "loss": 0.618, "mean_token_accuracy": 0.8300315082073212, "num_tokens": 626167025.0, "step": 1600 }, { "epoch": 0.693815987933635, "grad_norm": 0.32327323774675676, "learning_rate": 1.5190135251949087e-05, "loss": 0.6177, "mean_token_accuracy": 0.8305284380912781, "num_tokens": 631384911.0, "step": 1610 }, { "epoch": 0.6981254040077569, "grad_norm": 0.40804438370624796, "learning_rate": 1.5130361952409023e-05, "loss": 0.6078, "mean_token_accuracy": 0.832860580086708, "num_tokens": 636615521.0, "step": 1620 }, { "epoch": 0.7024348200818789, "grad_norm": 0.2761820281021129, "learning_rate": 1.5070338828379917e-05, "loss": 0.6114, "mean_token_accuracy": 0.8320600628852844, "num_tokens": 641848328.0, "step": 1630 }, { "epoch": 0.7067442361560009, "grad_norm": 0.2641563276240397, "learning_rate": 1.5010068802705507e-05, "loss": 0.6201, "mean_token_accuracy": 0.8302027434110641, "num_tokens": 647091208.0, "step": 1640 }, { "epoch": 0.7110536522301228, "grad_norm": 0.2826512343210898, "learning_rate": 1.4949554810252472e-05, "loss": 0.6211, "mean_token_accuracy": 0.8298398047685623, "num_tokens": 652320720.0, "step": 1650 }, { "epoch": 0.7153630683042448, "grad_norm": 0.23321678467568555, "learning_rate": 1.4888799797767535e-05, "loss": 0.6037, "mean_token_accuracy": 0.8337768703699112, "num_tokens": 657511670.0, "step": 1660 }, { "epoch": 0.7196724843783667, "grad_norm": 0.28489550439911787, "learning_rate": 1.4827806723733953e-05, "loss": 0.6082, "mean_token_accuracy": 0.8323323041200638, "num_tokens": 662745397.0, "step": 1670 }, { "epoch": 0.7239819004524887, "grad_norm": 5.085802217997405, "learning_rate": 1.476657855822746e-05, "loss": 0.6029, "mean_token_accuracy": 0.8340036392211914, "num_tokens": 667941049.0, "step": 1680 }, { "epoch": 0.7282913165266106, "grad_norm": 0.3175247398544697, "learning_rate": 1.4705118282771646e-05, "loss": 0.6132, "mean_token_accuracy": 0.8315111696720123, "num_tokens": 673170681.0, "step": 1690 }, { "epoch": 0.7326007326007326, "grad_norm": 0.36387906097422973, "learning_rate": 1.4643428890192755e-05, "loss": 0.6078, "mean_token_accuracy": 0.8320950508117676, "num_tokens": 678398951.0, "step": 1700 }, { "epoch": 0.7369101486748546, "grad_norm": 0.37518202342418067, "learning_rate": 1.4581513384473964e-05, "loss": 0.621, "mean_token_accuracy": 0.829478406906128, "num_tokens": 683628898.0, "step": 1710 }, { "epoch": 0.7412195647489765, "grad_norm": 1.163712573734281, "learning_rate": 1.4519374780609092e-05, "loss": 0.6014, "mean_token_accuracy": 0.8339979231357575, "num_tokens": 688857667.0, "step": 1720 }, { "epoch": 0.7455289808230985, "grad_norm": 0.2994206090289646, "learning_rate": 1.445701610445579e-05, "loss": 0.5923, "mean_token_accuracy": 0.8361737430095673, "num_tokens": 694084034.0, "step": 1730 }, { "epoch": 0.7498383968972204, "grad_norm": 0.26792677558725797, "learning_rate": 1.4394440392588189e-05, "loss": 0.62, "mean_token_accuracy": 0.8296584337949753, "num_tokens": 699312351.0, "step": 1740 }, { "epoch": 0.7541478129713424, "grad_norm": 5.492160529790561, "learning_rate": 1.4331650692149041e-05, "loss": 0.5996, "mean_token_accuracy": 0.8339706599712372, "num_tokens": 704526597.0, "step": 1750 }, { "epoch": 0.7584572290454643, "grad_norm": 0.3133234783081913, "learning_rate": 1.4268650060701343e-05, "loss": 0.6049, "mean_token_accuracy": 0.832900133728981, "num_tokens": 709756909.0, "step": 1760 }, { "epoch": 0.7627666451195863, "grad_norm": 0.22434150031880265, "learning_rate": 1.4205441566079427e-05, "loss": 0.5971, "mean_token_accuracy": 0.8346771419048309, "num_tokens": 714963472.0, "step": 1770 }, { "epoch": 0.7670760611937083, "grad_norm": 0.32122589816082414, "learning_rate": 1.4142028286239592e-05, "loss": 0.6003, "mean_token_accuracy": 0.834486848115921, "num_tokens": 720173784.0, "step": 1780 }, { "epoch": 0.7713854772678302, "grad_norm": 0.3678277322222113, "learning_rate": 1.4078413309110206e-05, "loss": 0.5995, "mean_token_accuracy": 0.834784933924675, "num_tokens": 725408259.0, "step": 1790 }, { "epoch": 0.7756948933419522, "grad_norm": 0.5285608236478367, "learning_rate": 1.4014599732441355e-05, "loss": 0.6056, "mean_token_accuracy": 0.8330759555101395, "num_tokens": 730612147.0, "step": 1800 }, { "epoch": 0.7800043094160741, "grad_norm": 0.2715487371829005, "learning_rate": 1.395059066365398e-05, "loss": 0.593, "mean_token_accuracy": 0.8356486409902573, "num_tokens": 735820850.0, "step": 1810 }, { "epoch": 0.7843137254901961, "grad_norm": 0.4482485867661981, "learning_rate": 1.3886389219688576e-05, "loss": 0.6015, "mean_token_accuracy": 0.8340961128473282, "num_tokens": 741030542.0, "step": 1820 }, { "epoch": 0.788623141564318, "grad_norm": 0.570323041246378, "learning_rate": 1.38219985268534e-05, "loss": 0.6143, "mean_token_accuracy": 0.8316532343626022, "num_tokens": 746243507.0, "step": 1830 }, { "epoch": 0.79293255763844, "grad_norm": 0.26778897690583175, "learning_rate": 1.3757421720672236e-05, "loss": 0.6013, "mean_token_accuracy": 0.8344661980867386, "num_tokens": 751432663.0, "step": 1840 }, { "epoch": 0.797241973712562, "grad_norm": 0.350740803847862, "learning_rate": 1.3692661945731713e-05, "loss": 0.6109, "mean_token_accuracy": 0.8322643369436264, "num_tokens": 756626363.0, "step": 1850 }, { "epoch": 0.8015513897866839, "grad_norm": 2.483393611208882, "learning_rate": 1.3627722355528188e-05, "loss": 0.5979, "mean_token_accuracy": 0.8347870618104934, "num_tokens": 761856592.0, "step": 1860 }, { "epoch": 0.8058608058608059, "grad_norm": 0.24728496902375163, "learning_rate": 1.356260611231416e-05, "loss": 0.5931, "mean_token_accuracy": 0.8357406079769134, "num_tokens": 767085519.0, "step": 1870 }, { "epoch": 0.8101702219349278, "grad_norm": 0.3625051373461605, "learning_rate": 1.349731638694431e-05, "loss": 0.62, "mean_token_accuracy": 0.8306344985961914, "num_tokens": 772293028.0, "step": 1880 }, { "epoch": 0.8144796380090498, "grad_norm": 0.4227238518836785, "learning_rate": 1.3431856358721076e-05, "loss": 0.5926, "mean_token_accuracy": 0.8355791062116623, "num_tokens": 777509317.0, "step": 1890 }, { "epoch": 0.8187890540831717, "grad_norm": 0.3460631298675841, "learning_rate": 1.3366229215239846e-05, "loss": 0.6021, "mean_token_accuracy": 0.8335570633411408, "num_tokens": 782743633.0, "step": 1900 }, { "epoch": 0.8230984701572936, "grad_norm": 0.3269698052060139, "learning_rate": 1.3300438152233736e-05, "loss": 0.5899, "mean_token_accuracy": 0.8363129019737243, "num_tokens": 787963690.0, "step": 1910 }, { "epoch": 0.8274078862314156, "grad_norm": 0.3364228270825042, "learning_rate": 1.3234486373417978e-05, "loss": 0.6059, "mean_token_accuracy": 0.8325729131698608, "num_tokens": 793182299.0, "step": 1920 }, { "epoch": 0.8317173023055376, "grad_norm": 0.4245375541655158, "learning_rate": 1.3168377090333897e-05, "loss": 0.6082, "mean_token_accuracy": 0.8329648286104202, "num_tokens": 798406907.0, "step": 1930 }, { "epoch": 0.8360267183796596, "grad_norm": 0.3580146214044832, "learning_rate": 1.3102113522192537e-05, "loss": 0.6032, "mean_token_accuracy": 0.8337147742509842, "num_tokens": 803626804.0, "step": 1940 }, { "epoch": 0.8403361344537815, "grad_norm": 0.28463992939533567, "learning_rate": 1.3035698895717901e-05, "loss": 0.5964, "mean_token_accuracy": 0.8356025218963623, "num_tokens": 808846189.0, "step": 1950 }, { "epoch": 0.8446455505279035, "grad_norm": 0.5191362092711163, "learning_rate": 1.2969136444989822e-05, "loss": 0.6132, "mean_token_accuracy": 0.8314791351556778, "num_tokens": 814063867.0, "step": 1960 }, { "epoch": 0.8489549666020254, "grad_norm": 0.29566050433579005, "learning_rate": 1.2902429411286481e-05, "loss": 0.6012, "mean_token_accuracy": 0.8342074573040008, "num_tokens": 819281440.0, "step": 1970 }, { "epoch": 0.8532643826761473, "grad_norm": 0.23714625386447136, "learning_rate": 1.2835581042926563e-05, "loss": 0.6079, "mean_token_accuracy": 0.8329275637865067, "num_tokens": 824496577.0, "step": 1980 }, { "epoch": 0.8575737987502693, "grad_norm": 27.3215881029641, "learning_rate": 1.2768594595111092e-05, "loss": 0.6024, "mean_token_accuracy": 0.8341468870639801, "num_tokens": 829718024.0, "step": 1990 }, { "epoch": 0.8618832148243913, "grad_norm": 0.3490935454501864, "learning_rate": 1.2701473329764908e-05, "loss": 0.6027, "mean_token_accuracy": 0.834116992354393, "num_tokens": 834950511.0, "step": 2000 }, { "epoch": 0.8661926308985133, "grad_norm": 0.28282734858143393, "learning_rate": 1.2634220515377834e-05, "loss": 0.6185, "mean_token_accuracy": 0.8302592813968659, "num_tokens": 840132980.0, "step": 2010 }, { "epoch": 0.8705020469726352, "grad_norm": 0.2047238215406244, "learning_rate": 1.2566839426845508e-05, "loss": 0.5909, "mean_token_accuracy": 0.8370138019323349, "num_tokens": 845345475.0, "step": 2020 }, { "epoch": 0.8748114630467572, "grad_norm": 8.925062440586133, "learning_rate": 1.2499333345309916e-05, "loss": 0.6094, "mean_token_accuracy": 0.8328583806753158, "num_tokens": 850549218.0, "step": 2030 }, { "epoch": 0.8791208791208791, "grad_norm": 0.3163353508882041, "learning_rate": 1.243170555799962e-05, "loss": 0.5988, "mean_token_accuracy": 0.8351319640874862, "num_tokens": 855765039.0, "step": 2040 }, { "epoch": 0.883430295195001, "grad_norm": 0.27029081649825276, "learning_rate": 1.2363959358069675e-05, "loss": 0.5937, "mean_token_accuracy": 0.8353270292282104, "num_tokens": 861002795.0, "step": 2050 }, { "epoch": 0.887739711269123, "grad_norm": 2.9236167746758563, "learning_rate": 1.2296098044441283e-05, "loss": 0.5992, "mean_token_accuracy": 0.8359830021858216, "num_tokens": 866205919.0, "step": 2060 }, { "epoch": 0.892049127343245, "grad_norm": 0.3009824992016643, "learning_rate": 1.2228124921641125e-05, "loss": 0.59, "mean_token_accuracy": 0.8369195371866226, "num_tokens": 871413492.0, "step": 2070 }, { "epoch": 0.896358543417367, "grad_norm": 0.7588855946976795, "learning_rate": 1.2160043299640476e-05, "loss": 0.5842, "mean_token_accuracy": 0.837891036272049, "num_tokens": 876629442.0, "step": 2080 }, { "epoch": 0.9006679594914889, "grad_norm": 0.291362959122127, "learning_rate": 1.2091856493694005e-05, "loss": 0.5938, "mean_token_accuracy": 0.8360926955938339, "num_tokens": 881842841.0, "step": 2090 }, { "epoch": 0.9049773755656109, "grad_norm": 0.2790872062681025, "learning_rate": 1.2023567824178348e-05, "loss": 0.6058, "mean_token_accuracy": 0.8325457513332367, "num_tokens": 887048949.0, "step": 2100 }, { "epoch": 0.9092867916397328, "grad_norm": 0.24859046686251604, "learning_rate": 1.1955180616430422e-05, "loss": 0.5842, "mean_token_accuracy": 0.8386687934398651, "num_tokens": 892276407.0, "step": 2110 }, { "epoch": 0.9135962077138547, "grad_norm": 0.2638688739083249, "learning_rate": 1.1886698200585478e-05, "loss": 0.597, "mean_token_accuracy": 0.8348753720521926, "num_tokens": 897498349.0, "step": 2120 }, { "epoch": 0.9179056237879767, "grad_norm": 0.33676642373079085, "learning_rate": 1.1818123911414972e-05, "loss": 0.6132, "mean_token_accuracy": 0.8317342340946198, "num_tokens": 902711846.0, "step": 2130 }, { "epoch": 0.9222150398620987, "grad_norm": 0.4281010501759308, "learning_rate": 1.1749461088164137e-05, "loss": 0.6044, "mean_token_accuracy": 0.8337533384561538, "num_tokens": 907947131.0, "step": 2140 }, { "epoch": 0.9265244559362207, "grad_norm": 0.31969799353210465, "learning_rate": 1.1680713074389415e-05, "loss": 0.5902, "mean_token_accuracy": 0.8363572478294372, "num_tokens": 913150962.0, "step": 2150 }, { "epoch": 0.9308338720103426, "grad_norm": 0.2915520517934571, "learning_rate": 1.1611883217795628e-05, "loss": 0.5919, "mean_token_accuracy": 0.8363821595907212, "num_tokens": 918378245.0, "step": 2160 }, { "epoch": 0.9351432880844646, "grad_norm": 0.5434576653638358, "learning_rate": 1.1542974870072945e-05, "loss": 0.6057, "mean_token_accuracy": 0.8328346371650696, "num_tokens": 923602633.0, "step": 2170 }, { "epoch": 0.9394527041585865, "grad_norm": 0.4007884187532878, "learning_rate": 1.1473991386733694e-05, "loss": 0.6173, "mean_token_accuracy": 0.8311869651079178, "num_tokens": 928826636.0, "step": 2180 }, { "epoch": 0.9437621202327084, "grad_norm": 0.2902391495668746, "learning_rate": 1.1404936126948946e-05, "loss": 0.6035, "mean_token_accuracy": 0.8333300113677978, "num_tokens": 934032103.0, "step": 2190 }, { "epoch": 0.9480715363068304, "grad_norm": 0.2506931488871762, "learning_rate": 1.1335812453384958e-05, "loss": 0.6131, "mean_token_accuracy": 0.8317271679639816, "num_tokens": 939252503.0, "step": 2200 }, { "epoch": 0.9523809523809523, "grad_norm": 0.39882188027299165, "learning_rate": 1.1266623732039406e-05, "loss": 0.5883, "mean_token_accuracy": 0.8380580514669418, "num_tokens": 944473488.0, "step": 2210 }, { "epoch": 0.9566903684550744, "grad_norm": 0.24755335627805714, "learning_rate": 1.119737333207749e-05, "loss": 0.5883, "mean_token_accuracy": 0.8374327510595322, "num_tokens": 949683058.0, "step": 2220 }, { "epoch": 0.9609997845291963, "grad_norm": 0.27006676949246144, "learning_rate": 1.1128064625667872e-05, "loss": 0.5954, "mean_token_accuracy": 0.8358117341995239, "num_tokens": 954859245.0, "step": 2230 }, { "epoch": 0.9653092006033183, "grad_norm": 0.36367309225873623, "learning_rate": 1.1058700987818457e-05, "loss": 0.5874, "mean_token_accuracy": 0.8368551760911942, "num_tokens": 960065577.0, "step": 2240 }, { "epoch": 0.9696186166774402, "grad_norm": 1.634720067000637, "learning_rate": 1.0989285796212059e-05, "loss": 0.607, "mean_token_accuracy": 0.8328171730041504, "num_tokens": 965253736.0, "step": 2250 }, { "epoch": 0.9739280327515621, "grad_norm": 0.301424855027887, "learning_rate": 1.0919822431041918e-05, "loss": 0.6021, "mean_token_accuracy": 0.8351370215415954, "num_tokens": 970487039.0, "step": 2260 }, { "epoch": 0.9782374488256841, "grad_norm": 0.237716237953484, "learning_rate": 1.0850314274847096e-05, "loss": 0.6066, "mean_token_accuracy": 0.8333537936210632, "num_tokens": 975707254.0, "step": 2270 }, { "epoch": 0.982546864899806, "grad_norm": 0.29105824503375366, "learning_rate": 1.0780764712347761e-05, "loss": 0.5885, "mean_token_accuracy": 0.8366367071866989, "num_tokens": 980920402.0, "step": 2280 }, { "epoch": 0.9868562809739281, "grad_norm": 0.39872758795200536, "learning_rate": 1.071117713028039e-05, "loss": 0.5941, "mean_token_accuracy": 0.836166164278984, "num_tokens": 986131621.0, "step": 2290 }, { "epoch": 0.99116569704805, "grad_norm": 0.3940692636386731, "learning_rate": 1.0641554917232814e-05, "loss": 0.5896, "mean_token_accuracy": 0.8376127749681472, "num_tokens": 991332338.0, "step": 2300 }, { "epoch": 0.995475113122172, "grad_norm": 0.2791209048164772, "learning_rate": 1.0571901463479248e-05, "loss": 0.5935, "mean_token_accuracy": 0.8365125238895417, "num_tokens": 996561488.0, "step": 2310 }, { "epoch": 0.9997845291962939, "grad_norm": 0.276141377389003, "learning_rate": 1.0502220160815174e-05, "loss": 0.6035, "mean_token_accuracy": 0.8333020329475402, "num_tokens": 1001770297.0, "step": 2320 }, { "epoch": 1.0038784744667097, "grad_norm": 0.28215946220293636, "learning_rate": 1.0432514402392182e-05, "loss": 0.5869, "mean_token_accuracy": 0.8381417926989103, "num_tokens": 1006726501.0, "step": 2330 }, { "epoch": 1.0081878905408317, "grad_norm": 0.3276779065736126, "learning_rate": 1.0362787582552748e-05, "loss": 0.5543, "mean_token_accuracy": 0.8447976678609848, "num_tokens": 1011945403.0, "step": 2340 }, { "epoch": 1.0124973066149536, "grad_norm": 0.494515195211221, "learning_rate": 1.0293043096664937e-05, "loss": 0.5862, "mean_token_accuracy": 0.8380799055099487, "num_tokens": 1017163557.0, "step": 2350 }, { "epoch": 1.0168067226890756, "grad_norm": 0.21828062966309372, "learning_rate": 1.0223284340957069e-05, "loss": 0.574, "mean_token_accuracy": 0.8402041047811508, "num_tokens": 1022397000.0, "step": 2360 }, { "epoch": 1.0211161387631975, "grad_norm": 0.7725690165099922, "learning_rate": 1.0153514712352335e-05, "loss": 0.5753, "mean_token_accuracy": 0.83934685587883, "num_tokens": 1027611118.0, "step": 2370 }, { "epoch": 1.0254255548373195, "grad_norm": 0.3523803522094916, "learning_rate": 1.0083737608303388e-05, "loss": 0.5657, "mean_token_accuracy": 0.8430098563432693, "num_tokens": 1032838911.0, "step": 2380 }, { "epoch": 1.0297349709114414, "grad_norm": 0.3244131446052607, "learning_rate": 1.0013956426626896e-05, "loss": 0.5808, "mean_token_accuracy": 0.8392600357532501, "num_tokens": 1038059451.0, "step": 2390 }, { "epoch": 1.0340443869855636, "grad_norm": 0.2226046549482435, "learning_rate": 9.944174565338091e-06, "loss": 0.5697, "mean_token_accuracy": 0.8411895781755447, "num_tokens": 1043292576.0, "step": 2400 }, { "epoch": 1.0383538030596855, "grad_norm": 0.4362829378489027, "learning_rate": 9.874395422485297e-06, "loss": 0.5716, "mean_token_accuracy": 0.8409066915512085, "num_tokens": 1048512588.0, "step": 2410 }, { "epoch": 1.0426632191338074, "grad_norm": 0.3325499171454429, "learning_rate": 9.804622395984468e-06, "loss": 0.5636, "mean_token_accuracy": 0.8427377492189407, "num_tokens": 1053725540.0, "step": 2420 }, { "epoch": 1.0469726352079294, "grad_norm": 0.3073832055610142, "learning_rate": 9.734858883453716e-06, "loss": 0.5578, "mean_token_accuracy": 0.8437600404024124, "num_tokens": 1058947715.0, "step": 2430 }, { "epoch": 1.0512820512820513, "grad_norm": 0.32513895964255995, "learning_rate": 9.665108282047869e-06, "loss": 0.5688, "mean_token_accuracy": 0.8418644011020661, "num_tokens": 1064171123.0, "step": 2440 }, { "epoch": 1.0555914673561733, "grad_norm": 0.28290475235002815, "learning_rate": 9.595373988293038e-06, "loss": 0.5641, "mean_token_accuracy": 0.8424109160900116, "num_tokens": 1069398870.0, "step": 2450 }, { "epoch": 1.0599008834302952, "grad_norm": 0.327664657954262, "learning_rate": 9.525659397921236e-06, "loss": 0.5685, "mean_token_accuracy": 0.8416989505290985, "num_tokens": 1074611842.0, "step": 2460 }, { "epoch": 1.0642102995044171, "grad_norm": 0.2842484452855432, "learning_rate": 9.45596790570501e-06, "loss": 0.5786, "mean_token_accuracy": 0.8395838111639022, "num_tokens": 1079830864.0, "step": 2470 }, { "epoch": 1.068519715578539, "grad_norm": 0.4432086983274004, "learning_rate": 9.386302905292142e-06, "loss": 0.5872, "mean_token_accuracy": 0.8374298244714737, "num_tokens": 1085049203.0, "step": 2480 }, { "epoch": 1.072829131652661, "grad_norm": 0.40906882312004494, "learning_rate": 9.316667789040377e-06, "loss": 0.5796, "mean_token_accuracy": 0.8396118551492691, "num_tokens": 1090260222.0, "step": 2490 }, { "epoch": 1.077138547726783, "grad_norm": 0.3708559657015165, "learning_rate": 9.247065947852247e-06, "loss": 0.5857, "mean_token_accuracy": 0.8381546288728714, "num_tokens": 1095475477.0, "step": 2500 }, { "epoch": 1.081447963800905, "grad_norm": 0.3524858496239274, "learning_rate": 9.17750077100995e-06, "loss": 0.5639, "mean_token_accuracy": 0.8427853226661682, "num_tokens": 1100690662.0, "step": 2510 }, { "epoch": 1.0857573798750269, "grad_norm": 0.5630648308564895, "learning_rate": 9.107975646010296e-06, "loss": 0.5617, "mean_token_accuracy": 0.8436825752258301, "num_tokens": 1105906657.0, "step": 2520 }, { "epoch": 1.0900667959491488, "grad_norm": 0.3973447616226555, "learning_rate": 9.03849395839976e-06, "loss": 0.5747, "mean_token_accuracy": 0.8393137693405152, "num_tokens": 1111122000.0, "step": 2530 }, { "epoch": 1.094376212023271, "grad_norm": 0.25420790714332686, "learning_rate": 8.969059091609622e-06, "loss": 0.5726, "mean_token_accuracy": 0.8409917622804641, "num_tokens": 1116357647.0, "step": 2540 }, { "epoch": 1.098685628097393, "grad_norm": 0.3442760415923012, "learning_rate": 8.899674426791209e-06, "loss": 0.5594, "mean_token_accuracy": 0.8441164135932923, "num_tokens": 1121581898.0, "step": 2550 }, { "epoch": 1.1029950441715148, "grad_norm": 0.24239755553638928, "learning_rate": 8.830343342651245e-06, "loss": 0.5764, "mean_token_accuracy": 0.8401670336723328, "num_tokens": 1126771020.0, "step": 2560 }, { "epoch": 1.1073044602456368, "grad_norm": 0.2840421738372488, "learning_rate": 8.761069215287335e-06, "loss": 0.5665, "mean_token_accuracy": 0.8417718440294266, "num_tokens": 1131965919.0, "step": 2570 }, { "epoch": 1.1116138763197587, "grad_norm": 0.35593570720930956, "learning_rate": 8.691855418023542e-06, "loss": 0.5636, "mean_token_accuracy": 0.8425986349582673, "num_tokens": 1137158617.0, "step": 2580 }, { "epoch": 1.1159232923938807, "grad_norm": 0.4004595320458362, "learning_rate": 8.622705321246153e-06, "loss": 0.5702, "mean_token_accuracy": 0.8415834605693817, "num_tokens": 1142358589.0, "step": 2590 }, { "epoch": 1.1202327084680026, "grad_norm": 0.2711328574617825, "learning_rate": 8.553622292239527e-06, "loss": 0.5619, "mean_token_accuracy": 0.8429356276988983, "num_tokens": 1147589623.0, "step": 2600 }, { "epoch": 1.1245421245421245, "grad_norm": 0.32756410537154684, "learning_rate": 8.484609695022153e-06, "loss": 0.5726, "mean_token_accuracy": 0.8407656341791153, "num_tokens": 1152828590.0, "step": 2610 }, { "epoch": 1.1288515406162465, "grad_norm": 1.0735528920560464, "learning_rate": 8.41567089018281e-06, "loss": 0.5604, "mean_token_accuracy": 0.8441340237855911, "num_tokens": 1158062498.0, "step": 2620 }, { "epoch": 1.1331609566903684, "grad_norm": 0.28856280238764576, "learning_rate": 8.346809234716945e-06, "loss": 0.5779, "mean_token_accuracy": 0.8393817722797394, "num_tokens": 1163270361.0, "step": 2630 }, { "epoch": 1.1374703727644904, "grad_norm": 0.30596164368391887, "learning_rate": 8.278028081863187e-06, "loss": 0.5546, "mean_token_accuracy": 0.8441006124019623, "num_tokens": 1168489340.0, "step": 2640 }, { "epoch": 1.1417797888386123, "grad_norm": 0.3271708191446449, "learning_rate": 8.209330780940067e-06, "loss": 0.5668, "mean_token_accuracy": 0.8420156449079513, "num_tokens": 1173698335.0, "step": 2650 }, { "epoch": 1.1460892049127342, "grad_norm": 3.0374513000382675, "learning_rate": 8.140720677182925e-06, "loss": 0.5723, "mean_token_accuracy": 0.8411957144737243, "num_tokens": 1178934261.0, "step": 2660 }, { "epoch": 1.1503986209868562, "grad_norm": 0.3235386346478464, "learning_rate": 8.07220111158101e-06, "loss": 0.559, "mean_token_accuracy": 0.8443704307079315, "num_tokens": 1184134656.0, "step": 2670 }, { "epoch": 1.1547080370609781, "grad_norm": 0.21958501332598337, "learning_rate": 8.003775420714785e-06, "loss": 0.5786, "mean_token_accuracy": 0.8393762737512589, "num_tokens": 1189279295.0, "step": 2680 }, { "epoch": 1.1590174531351, "grad_norm": 0.7751397952458493, "learning_rate": 7.935446936593454e-06, "loss": 0.5721, "mean_token_accuracy": 0.8408875465393066, "num_tokens": 1194501046.0, "step": 2690 }, { "epoch": 1.1633268692092222, "grad_norm": 1.260027843581316, "learning_rate": 7.867218986492715e-06, "loss": 0.5621, "mean_token_accuracy": 0.8434571206569672, "num_tokens": 1199713296.0, "step": 2700 }, { "epoch": 1.1676362852833442, "grad_norm": 0.30022806982662453, "learning_rate": 7.79909489279273e-06, "loss": 0.558, "mean_token_accuracy": 0.8439692467451095, "num_tokens": 1204933492.0, "step": 2710 }, { "epoch": 1.1719457013574661, "grad_norm": 0.2298481506407006, "learning_rate": 7.731077972816339e-06, "loss": 0.5582, "mean_token_accuracy": 0.8441542565822602, "num_tokens": 1210147728.0, "step": 2720 }, { "epoch": 1.176255117431588, "grad_norm": 4.2732639282248295, "learning_rate": 7.663171538667532e-06, "loss": 0.5714, "mean_token_accuracy": 0.8414576411247253, "num_tokens": 1215358842.0, "step": 2730 }, { "epoch": 1.18056453350571, "grad_norm": 0.49682201486052485, "learning_rate": 7.595378897070156e-06, "loss": 0.5667, "mean_token_accuracy": 0.8425624758005142, "num_tokens": 1220559982.0, "step": 2740 }, { "epoch": 1.184873949579832, "grad_norm": 0.24475242758020577, "learning_rate": 7.527703349206893e-06, "loss": 0.5741, "mean_token_accuracy": 0.8402534753084183, "num_tokens": 1225777045.0, "step": 2750 }, { "epoch": 1.1891833656539539, "grad_norm": 0.2338984514545482, "learning_rate": 7.4601481905585205e-06, "loss": 0.5804, "mean_token_accuracy": 0.8392770498991012, "num_tokens": 1230999796.0, "step": 2760 }, { "epoch": 1.1934927817280758, "grad_norm": 0.26753310993577245, "learning_rate": 7.392716710743421e-06, "loss": 0.5694, "mean_token_accuracy": 0.8410152703523636, "num_tokens": 1236211975.0, "step": 2770 }, { "epoch": 1.1978021978021978, "grad_norm": 0.22680996004503653, "learning_rate": 7.325412193357398e-06, "loss": 0.5531, "mean_token_accuracy": 0.8453241288661957, "num_tokens": 1241418636.0, "step": 2780 }, { "epoch": 1.2021116138763197, "grad_norm": 0.269685862596858, "learning_rate": 7.258237915813784e-06, "loss": 0.5381, "mean_token_accuracy": 0.8482133537530899, "num_tokens": 1246636380.0, "step": 2790 }, { "epoch": 1.2064210299504416, "grad_norm": 0.27013889713829276, "learning_rate": 7.191197149183846e-06, "loss": 0.5724, "mean_token_accuracy": 0.8410374283790588, "num_tokens": 1251865287.0, "step": 2800 }, { "epoch": 1.2107304460245636, "grad_norm": 0.23812551159611356, "learning_rate": 7.124293158037494e-06, "loss": 0.5552, "mean_token_accuracy": 0.8453818470239639, "num_tokens": 1257092536.0, "step": 2810 }, { "epoch": 1.2150398620986858, "grad_norm": 0.3631304833540362, "learning_rate": 7.057529200284321e-06, "loss": 0.5562, "mean_token_accuracy": 0.8446999907493591, "num_tokens": 1262325150.0, "step": 2820 }, { "epoch": 1.2193492781728077, "grad_norm": 0.22393578375803347, "learning_rate": 6.990908527014949e-06, "loss": 0.5655, "mean_token_accuracy": 0.8415402233600616, "num_tokens": 1267551182.0, "step": 2830 }, { "epoch": 1.2236586942469296, "grad_norm": 0.24140164685278628, "learning_rate": 6.924434382342719e-06, "loss": 0.5717, "mean_token_accuracy": 0.8408702909946442, "num_tokens": 1272766727.0, "step": 2840 }, { "epoch": 1.2279681103210516, "grad_norm": 0.6748600089769614, "learning_rate": 6.85811000324572e-06, "loss": 0.5657, "mean_token_accuracy": 0.8425550520420074, "num_tokens": 1277988314.0, "step": 2850 }, { "epoch": 1.2322775263951735, "grad_norm": 4.994642762705703, "learning_rate": 6.79193861940916e-06, "loss": 0.5741, "mean_token_accuracy": 0.8406617641448975, "num_tokens": 1283213460.0, "step": 2860 }, { "epoch": 1.2365869424692955, "grad_norm": 0.307209879657027, "learning_rate": 6.7259234530680994e-06, "loss": 0.5504, "mean_token_accuracy": 0.8458899676799774, "num_tokens": 1288426909.0, "step": 2870 }, { "epoch": 1.2408963585434174, "grad_norm": 0.2253299307552605, "learning_rate": 6.660067718850545e-06, "loss": 0.5773, "mean_token_accuracy": 0.8392666220664978, "num_tokens": 1293636304.0, "step": 2880 }, { "epoch": 1.2452057746175393, "grad_norm": 0.3214126013580455, "learning_rate": 6.594374623620903e-06, "loss": 0.5542, "mean_token_accuracy": 0.8447525471448898, "num_tokens": 1298845047.0, "step": 2890 }, { "epoch": 1.2495151906916613, "grad_norm": 2.142649948553335, "learning_rate": 6.528847366323828e-06, "loss": 0.5607, "mean_token_accuracy": 0.8440374076366425, "num_tokens": 1304062536.0, "step": 2900 }, { "epoch": 1.2538246067657832, "grad_norm": 0.2625110338179464, "learning_rate": 6.463489137828452e-06, "loss": 0.5842, "mean_token_accuracy": 0.8384378522634506, "num_tokens": 1309261235.0, "step": 2910 }, { "epoch": 1.2581340228399052, "grad_norm": 0.23140830702100126, "learning_rate": 6.398303120772996e-06, "loss": 0.5764, "mean_token_accuracy": 0.8402356833219529, "num_tokens": 1314481465.0, "step": 2920 }, { "epoch": 1.262443438914027, "grad_norm": 0.6555436674983836, "learning_rate": 6.333292489409792e-06, "loss": 0.5704, "mean_token_accuracy": 0.8413709014654159, "num_tokens": 1319698365.0, "step": 2930 }, { "epoch": 1.266752854988149, "grad_norm": 0.3022091831251182, "learning_rate": 6.26846040945072e-06, "loss": 0.5697, "mean_token_accuracy": 0.8426906526088714, "num_tokens": 1324907752.0, "step": 2940 }, { "epoch": 1.271062271062271, "grad_norm": 0.27572402342474295, "learning_rate": 6.2038100379130385e-06, "loss": 0.557, "mean_token_accuracy": 0.8445657461881637, "num_tokens": 1330132045.0, "step": 2950 }, { "epoch": 1.275371687136393, "grad_norm": 0.19382910794802066, "learning_rate": 6.139344522965664e-06, "loss": 0.5602, "mean_token_accuracy": 0.8433891266584397, "num_tokens": 1335340639.0, "step": 2960 }, { "epoch": 1.2796811032105149, "grad_norm": 0.2510660767674733, "learning_rate": 6.075067003775877e-06, "loss": 0.5713, "mean_token_accuracy": 0.8408387333154679, "num_tokens": 1340565391.0, "step": 2970 }, { "epoch": 1.2839905192846368, "grad_norm": 11.751573059002698, "learning_rate": 6.010980610356436e-06, "loss": 0.569, "mean_token_accuracy": 0.8420016825199127, "num_tokens": 1345794209.0, "step": 2980 }, { "epoch": 1.2882999353587588, "grad_norm": 0.281596201905732, "learning_rate": 5.947088463413173e-06, "loss": 0.5528, "mean_token_accuracy": 0.8453968316316605, "num_tokens": 1350997613.0, "step": 2990 }, { "epoch": 1.292609351432881, "grad_norm": 2.2389283563191746, "learning_rate": 5.883393674193039e-06, "loss": 0.5603, "mean_token_accuracy": 0.8436051934957505, "num_tokens": 1356219446.0, "step": 3000 }, { "epoch": 1.2969187675070029, "grad_norm": 0.3024412733308921, "learning_rate": 5.819899344332589e-06, "loss": 0.592, "mean_token_accuracy": 0.8366115003824234, "num_tokens": 1361443793.0, "step": 3010 }, { "epoch": 1.3012281835811248, "grad_norm": 0.34515700366692964, "learning_rate": 5.756608565706951e-06, "loss": 0.5661, "mean_token_accuracy": 0.8422780960798264, "num_tokens": 1366681782.0, "step": 3020 }, { "epoch": 1.3055375996552467, "grad_norm": 0.26917437230199653, "learning_rate": 5.693524420279262e-06, "loss": 0.569, "mean_token_accuracy": 0.8419100999832153, "num_tokens": 1371888024.0, "step": 3030 }, { "epoch": 1.3098470157293687, "grad_norm": 0.1948647589600037, "learning_rate": 5.6306499799506e-06, "loss": 0.5612, "mean_token_accuracy": 0.8434827029705048, "num_tokens": 1377102855.0, "step": 3040 }, { "epoch": 1.3141564318034906, "grad_norm": 0.5592756010519621, "learning_rate": 5.5679883064103905e-06, "loss": 0.5698, "mean_token_accuracy": 0.8424432039260864, "num_tokens": 1382313760.0, "step": 3050 }, { "epoch": 1.3184658478776126, "grad_norm": 0.31693312013556946, "learning_rate": 5.505542450987309e-06, "loss": 0.5583, "mean_token_accuracy": 0.8442396104335785, "num_tokens": 1387542835.0, "step": 3060 }, { "epoch": 1.3227752639517345, "grad_norm": 0.6441120166050115, "learning_rate": 5.443315454500718e-06, "loss": 0.5561, "mean_token_accuracy": 0.8438980937004089, "num_tokens": 1392767332.0, "step": 3070 }, { "epoch": 1.3270846800258564, "grad_norm": 0.23362471882404307, "learning_rate": 5.381310347112565e-06, "loss": 0.5633, "mean_token_accuracy": 0.8435305774211883, "num_tokens": 1397980376.0, "step": 3080 }, { "epoch": 1.3313940960999784, "grad_norm": 0.4400359995607808, "learning_rate": 5.319530148179854e-06, "loss": 0.5684, "mean_token_accuracy": 0.8415235757827759, "num_tokens": 1403197333.0, "step": 3090 }, { "epoch": 1.3357035121741003, "grad_norm": 0.3580258657289147, "learning_rate": 5.257977866107604e-06, "loss": 0.5539, "mean_token_accuracy": 0.8451881408691406, "num_tokens": 1408422261.0, "step": 3100 }, { "epoch": 1.3400129282482225, "grad_norm": 0.26561594924537685, "learning_rate": 5.1966564982023436e-06, "loss": 0.5715, "mean_token_accuracy": 0.8410105794668198, "num_tokens": 1413647145.0, "step": 3110 }, { "epoch": 1.3443223443223444, "grad_norm": 0.35826551058122147, "learning_rate": 5.135569030526186e-06, "loss": 0.5687, "mean_token_accuracy": 0.8422814160585403, "num_tokens": 1418871153.0, "step": 3120 }, { "epoch": 1.3486317603964664, "grad_norm": 0.26039982856706767, "learning_rate": 5.074718437751389e-06, "loss": 0.5495, "mean_token_accuracy": 0.8461204409599304, "num_tokens": 1424092256.0, "step": 3130 }, { "epoch": 1.3529411764705883, "grad_norm": 0.2677204961428965, "learning_rate": 5.0141076830155185e-06, "loss": 0.5542, "mean_token_accuracy": 0.8452610582113266, "num_tokens": 1429327903.0, "step": 3140 }, { "epoch": 1.3572505925447103, "grad_norm": 0.27601518214191345, "learning_rate": 4.9537397177771685e-06, "loss": 0.563, "mean_token_accuracy": 0.8430817395448684, "num_tokens": 1434544033.0, "step": 3150 }, { "epoch": 1.3615600086188322, "grad_norm": 0.32747189691633827, "learning_rate": 4.893617481672211e-06, "loss": 0.5472, "mean_token_accuracy": 0.8467270463705063, "num_tokens": 1439775218.0, "step": 3160 }, { "epoch": 1.3658694246929541, "grad_norm": 0.21251712836576017, "learning_rate": 4.83374390237068e-06, "loss": 0.5647, "mean_token_accuracy": 0.8419240593910218, "num_tokens": 1445003924.0, "step": 3170 }, { "epoch": 1.370178840767076, "grad_norm": 0.24260922737250815, "learning_rate": 4.774121895434178e-06, "loss": 0.5676, "mean_token_accuracy": 0.8418655335903168, "num_tokens": 1450228319.0, "step": 3180 }, { "epoch": 1.374488256841198, "grad_norm": 0.3612708287818831, "learning_rate": 4.714754364173929e-06, "loss": 0.5598, "mean_token_accuracy": 0.8432195276021958, "num_tokens": 1455437249.0, "step": 3190 }, { "epoch": 1.37879767291532, "grad_norm": 0.3160134458217089, "learning_rate": 4.655644199509389e-06, "loss": 0.5736, "mean_token_accuracy": 0.8411709427833557, "num_tokens": 1460645942.0, "step": 3200 }, { "epoch": 1.383107088989442, "grad_norm": 0.29592374057704923, "learning_rate": 4.5967942798274604e-06, "loss": 0.5726, "mean_token_accuracy": 0.8409826904535294, "num_tokens": 1465870175.0, "step": 3210 }, { "epoch": 1.3874165050635638, "grad_norm": 0.35298825245730686, "learning_rate": 4.538207470842353e-06, "loss": 0.5719, "mean_token_accuracy": 0.8409198552370072, "num_tokens": 1471089522.0, "step": 3220 }, { "epoch": 1.3917259211376858, "grad_norm": 0.2138912046630368, "learning_rate": 4.479886625456008e-06, "loss": 0.5562, "mean_token_accuracy": 0.8446432083845139, "num_tokens": 1476290708.0, "step": 3230 }, { "epoch": 1.3960353372118077, "grad_norm": 0.3410053755611928, "learning_rate": 4.421834583619207e-06, "loss": 0.5782, "mean_token_accuracy": 0.8392639845609665, "num_tokens": 1481515748.0, "step": 3240 }, { "epoch": 1.4003447532859297, "grad_norm": 0.20881836487607122, "learning_rate": 4.36405417219326e-06, "loss": 0.579, "mean_token_accuracy": 0.8394672989845275, "num_tokens": 1486743133.0, "step": 3250 }, { "epoch": 1.4046541693600516, "grad_norm": 0.2458569709098672, "learning_rate": 4.306548204812338e-06, "loss": 0.5568, "mean_token_accuracy": 0.8449651092290879, "num_tokens": 1491956756.0, "step": 3260 }, { "epoch": 1.4089635854341735, "grad_norm": 0.35531625346518336, "learning_rate": 4.2493194817464986e-06, "loss": 0.5686, "mean_token_accuracy": 0.841595783829689, "num_tokens": 1497177117.0, "step": 3270 }, { "epoch": 1.4132730015082955, "grad_norm": 0.35633294723453846, "learning_rate": 4.192370789765293e-06, "loss": 0.5521, "mean_token_accuracy": 0.845710837841034, "num_tokens": 1502383070.0, "step": 3280 }, { "epoch": 1.4175824175824177, "grad_norm": 0.21261662633178036, "learning_rate": 4.135704902002083e-06, "loss": 0.5678, "mean_token_accuracy": 0.841626325249672, "num_tokens": 1507606935.0, "step": 3290 }, { "epoch": 1.4218918336565396, "grad_norm": 0.23296711353116084, "learning_rate": 4.079324577818997e-06, "loss": 0.562, "mean_token_accuracy": 0.8435587346553802, "num_tokens": 1512794067.0, "step": 3300 }, { "epoch": 1.4262012497306615, "grad_norm": 0.26397431890732204, "learning_rate": 4.0232325626725484e-06, "loss": 0.5685, "mean_token_accuracy": 0.8417014718055725, "num_tokens": 1518022041.0, "step": 3310 }, { "epoch": 1.4305106658047835, "grad_norm": 47.36303085479332, "learning_rate": 3.967431587979974e-06, "loss": 0.5638, "mean_token_accuracy": 0.843363779783249, "num_tokens": 1523244067.0, "step": 3320 }, { "epoch": 1.4348200818789054, "grad_norm": 0.21367477353865608, "learning_rate": 3.9119243709861935e-06, "loss": 0.5545, "mean_token_accuracy": 0.8447900623083114, "num_tokens": 1528463440.0, "step": 3330 }, { "epoch": 1.4391294979530274, "grad_norm": 1.17179698067596, "learning_rate": 3.8567136146315184e-06, "loss": 0.5691, "mean_token_accuracy": 0.8412424892187118, "num_tokens": 1533685995.0, "step": 3340 }, { "epoch": 1.4434389140271493, "grad_norm": 0.4200369819381848, "learning_rate": 3.8018020074200266e-06, "loss": 0.5656, "mean_token_accuracy": 0.842321252822876, "num_tokens": 1538902078.0, "step": 3350 }, { "epoch": 1.4477483301012712, "grad_norm": 0.3099781611612159, "learning_rate": 3.7471922232886237e-06, "loss": 0.5684, "mean_token_accuracy": 0.8419666081666947, "num_tokens": 1544136812.0, "step": 3360 }, { "epoch": 1.4520577461753932, "grad_norm": 0.2946229868151544, "learning_rate": 3.692886921476869e-06, "loss": 0.5699, "mean_token_accuracy": 0.8417997777462005, "num_tokens": 1549375659.0, "step": 3370 }, { "epoch": 1.4563671622495151, "grad_norm": 0.24767389391982952, "learning_rate": 3.6388887463974508e-06, "loss": 0.5731, "mean_token_accuracy": 0.8408624559640885, "num_tokens": 1554592853.0, "step": 3380 }, { "epoch": 1.460676578323637, "grad_norm": 0.26303946531605427, "learning_rate": 3.5852003275074443e-06, "loss": 0.5721, "mean_token_accuracy": 0.841306421160698, "num_tokens": 1559788213.0, "step": 3390 }, { "epoch": 1.4649859943977592, "grad_norm": 0.3127523137305976, "learning_rate": 3.531824279180246e-06, "loss": 0.5764, "mean_token_accuracy": 0.8410633504390717, "num_tokens": 1565017904.0, "step": 3400 }, { "epoch": 1.4692954104718812, "grad_norm": 3.4160622613032365, "learning_rate": 3.4787632005782736e-06, "loss": 0.5494, "mean_token_accuracy": 0.8454854816198349, "num_tokens": 1570251390.0, "step": 3410 }, { "epoch": 1.473604826546003, "grad_norm": 0.2399139591984762, "learning_rate": 3.426019675526413e-06, "loss": 0.5539, "mean_token_accuracy": 0.8454915046691894, "num_tokens": 1575458234.0, "step": 3420 }, { "epoch": 1.477914242620125, "grad_norm": 0.3912258828772371, "learning_rate": 3.3735962723861727e-06, "loss": 0.5594, "mean_token_accuracy": 0.843585553765297, "num_tokens": 1580671047.0, "step": 3430 }, { "epoch": 1.482223658694247, "grad_norm": 0.3145115285159292, "learning_rate": 3.3214955439306397e-06, "loss": 0.5606, "mean_token_accuracy": 0.8435824036598205, "num_tokens": 1585904909.0, "step": 3440 }, { "epoch": 1.486533074768369, "grad_norm": 0.32196444624105985, "learning_rate": 3.269720027220162e-06, "loss": 0.5404, "mean_token_accuracy": 0.848076656460762, "num_tokens": 1591113995.0, "step": 3450 }, { "epoch": 1.4908424908424909, "grad_norm": 0.24777215172690445, "learning_rate": 3.2182722434787985e-06, "loss": 0.5641, "mean_token_accuracy": 0.8435137927532196, "num_tokens": 1596338722.0, "step": 3460 }, { "epoch": 1.4951519069166128, "grad_norm": 0.19829663522869137, "learning_rate": 3.1671546979715627e-06, "loss": 0.565, "mean_token_accuracy": 0.8430743485689163, "num_tokens": 1601563723.0, "step": 3470 }, { "epoch": 1.4994613229907348, "grad_norm": 0.2854855376256903, "learning_rate": 3.1163698798824093e-06, "loss": 0.5589, "mean_token_accuracy": 0.8438194751739502, "num_tokens": 1606789218.0, "step": 3480 }, { "epoch": 1.5037707390648567, "grad_norm": 0.22328778073598957, "learning_rate": 3.065920262193045e-06, "loss": 0.5599, "mean_token_accuracy": 0.8451088607311249, "num_tokens": 1612018610.0, "step": 3490 }, { "epoch": 1.5080801551389786, "grad_norm": 0.29824952053767206, "learning_rate": 3.015808301562491e-06, "loss": 0.5698, "mean_token_accuracy": 0.8416373580694199, "num_tokens": 1617237919.0, "step": 3500 }, { "epoch": 1.5123895712131006, "grad_norm": 0.24463849081646508, "learning_rate": 2.9660364382074493e-06, "loss": 0.5642, "mean_token_accuracy": 0.8428663492202759, "num_tokens": 1622476870.0, "step": 3510 }, { "epoch": 1.5166989872872225, "grad_norm": 0.3097587068698857, "learning_rate": 2.916607095783498e-06, "loss": 0.5643, "mean_token_accuracy": 0.8427408427000046, "num_tokens": 1627691811.0, "step": 3520 }, { "epoch": 1.5210084033613445, "grad_norm": 0.2629407749711703, "learning_rate": 2.867522681267049e-06, "loss": 0.5637, "mean_token_accuracy": 0.8435877531766891, "num_tokens": 1632901913.0, "step": 3530 }, { "epoch": 1.5253178194354664, "grad_norm": 0.24795120562760364, "learning_rate": 2.818785584838146e-06, "loss": 0.5571, "mean_token_accuracy": 0.8436106145381927, "num_tokens": 1638122487.0, "step": 3540 }, { "epoch": 1.5296272355095883, "grad_norm": 0.23348913418754316, "learning_rate": 2.7703981797640877e-06, "loss": 0.5706, "mean_token_accuracy": 0.8417340725660324, "num_tokens": 1643348088.0, "step": 3550 }, { "epoch": 1.5339366515837103, "grad_norm": 0.23724744554406615, "learning_rate": 2.7223628222838327e-06, "loss": 0.5718, "mean_token_accuracy": 0.841375270485878, "num_tokens": 1648566381.0, "step": 3560 }, { "epoch": 1.5382460676578322, "grad_norm": 0.19776857844821422, "learning_rate": 2.674681851493296e-06, "loss": 0.5505, "mean_token_accuracy": 0.8457813590765, "num_tokens": 1653770770.0, "step": 3570 }, { "epoch": 1.5425554837319542, "grad_norm": 0.24106499616722987, "learning_rate": 2.627357589231411e-06, "loss": 0.5563, "mean_token_accuracy": 0.8446049898862839, "num_tokens": 1658984348.0, "step": 3580 }, { "epoch": 1.546864899806076, "grad_norm": 0.31942898845737216, "learning_rate": 2.580392339967095e-06, "loss": 0.5465, "mean_token_accuracy": 0.8469930619001389, "num_tokens": 1664196493.0, "step": 3590 }, { "epoch": 1.5511743158801983, "grad_norm": 0.43488993580509633, "learning_rate": 2.533788390687022e-06, "loss": 0.5685, "mean_token_accuracy": 0.8420387625694274, "num_tokens": 1669412283.0, "step": 3600 }, { "epoch": 1.5554837319543202, "grad_norm": 0.41431516768693566, "learning_rate": 2.4875480107842477e-06, "loss": 0.5515, "mean_token_accuracy": 0.8455045014619827, "num_tokens": 1674645928.0, "step": 3610 }, { "epoch": 1.5597931480284422, "grad_norm": 0.29449563145925123, "learning_rate": 2.4416734519477204e-06, "loss": 0.549, "mean_token_accuracy": 0.8465836763381958, "num_tokens": 1679859019.0, "step": 3620 }, { "epoch": 1.564102564102564, "grad_norm": 0.25269744708127184, "learning_rate": 2.396166948052613e-06, "loss": 0.543, "mean_token_accuracy": 0.8480212599039078, "num_tokens": 1685084346.0, "step": 3630 }, { "epoch": 1.568411980176686, "grad_norm": 0.2535932302300957, "learning_rate": 2.351030715051562e-06, "loss": 0.5496, "mean_token_accuracy": 0.84574553668499, "num_tokens": 1690313261.0, "step": 3640 }, { "epoch": 1.572721396250808, "grad_norm": 0.24511927991793517, "learning_rate": 2.3062669508667544e-06, "loss": 0.5691, "mean_token_accuracy": 0.8408963650465011, "num_tokens": 1695542184.0, "step": 3650 }, { "epoch": 1.57703081232493, "grad_norm": 0.27215824056600335, "learning_rate": 2.2618778352828942e-06, "loss": 0.581, "mean_token_accuracy": 0.8390481382608413, "num_tokens": 1700760086.0, "step": 3660 }, { "epoch": 1.581340228399052, "grad_norm": 0.25353656311985623, "learning_rate": 2.2178655298410603e-06, "loss": 0.5586, "mean_token_accuracy": 0.8443617403507233, "num_tokens": 1705980058.0, "step": 3670 }, { "epoch": 1.585649644473174, "grad_norm": 0.2675315002289476, "learning_rate": 2.1742321777334484e-06, "loss": 0.5523, "mean_token_accuracy": 0.8450907111167908, "num_tokens": 1711191015.0, "step": 3680 }, { "epoch": 1.589959060547296, "grad_norm": 2.7353061988192175, "learning_rate": 2.1309799036990208e-06, "loss": 0.5639, "mean_token_accuracy": 0.8429837226867676, "num_tokens": 1716409104.0, "step": 3690 }, { "epoch": 1.594268476621418, "grad_norm": 0.9893406170235657, "learning_rate": 2.0881108139200223e-06, "loss": 0.5509, "mean_token_accuracy": 0.8454053699970245, "num_tokens": 1721602300.0, "step": 3700 }, { "epoch": 1.5985778926955398, "grad_norm": 0.30966678298040673, "learning_rate": 2.045626995919425e-06, "loss": 0.5699, "mean_token_accuracy": 0.8422327637672424, "num_tokens": 1726812708.0, "step": 3710 }, { "epoch": 1.6028873087696618, "grad_norm": 0.3651934930823965, "learning_rate": 2.003530518459288e-06, "loss": 0.5694, "mean_token_accuracy": 0.8416446477174759, "num_tokens": 1732027685.0, "step": 3720 }, { "epoch": 1.6071967248437837, "grad_norm": 0.259585303180488, "learning_rate": 1.9618234314399963e-06, "loss": 0.5621, "mean_token_accuracy": 0.8427650958299637, "num_tokens": 1737241614.0, "step": 3730 }, { "epoch": 1.6115061409179057, "grad_norm": 0.20192857249857155, "learning_rate": 1.92050776580046e-06, "loss": 0.5821, "mean_token_accuracy": 0.8390373677015305, "num_tokens": 1742473792.0, "step": 3740 }, { "epoch": 1.6158155569920276, "grad_norm": 0.3373318589543933, "learning_rate": 1.8795855334192115e-06, "loss": 0.5691, "mean_token_accuracy": 0.8419119477272033, "num_tokens": 1747688223.0, "step": 3750 }, { "epoch": 1.6201249730661496, "grad_norm": 0.3347633057088764, "learning_rate": 1.839058727016425e-06, "loss": 0.568, "mean_token_accuracy": 0.8415918499231339, "num_tokens": 1752914853.0, "step": 3760 }, { "epoch": 1.6244343891402715, "grad_norm": 0.3109244483704517, "learning_rate": 1.7989293200569036e-06, "loss": 0.5645, "mean_token_accuracy": 0.8428639143705368, "num_tokens": 1758128638.0, "step": 3770 }, { "epoch": 1.6287438052143934, "grad_norm": 0.2893394915898089, "learning_rate": 1.7591992666539525e-06, "loss": 0.5653, "mean_token_accuracy": 0.8425607711076737, "num_tokens": 1763334939.0, "step": 3780 }, { "epoch": 1.6330532212885154, "grad_norm": 1.218351351987823, "learning_rate": 1.71987050147425e-06, "loss": 0.5553, "mean_token_accuracy": 0.8452137380838394, "num_tokens": 1768552901.0, "step": 3790 }, { "epoch": 1.6373626373626373, "grad_norm": 0.29951553958172844, "learning_rate": 1.6809449396436207e-06, "loss": 0.5725, "mean_token_accuracy": 0.8407992959022522, "num_tokens": 1773766531.0, "step": 3800 }, { "epoch": 1.6416720534367593, "grad_norm": 0.32318731973417114, "learning_rate": 1.6424244766537777e-06, "loss": 0.5692, "mean_token_accuracy": 0.8415236979722976, "num_tokens": 1778972878.0, "step": 3810 }, { "epoch": 1.6459814695108812, "grad_norm": 0.36700532144057274, "learning_rate": 1.6043109882700403e-06, "loss": 0.5457, "mean_token_accuracy": 0.8469302773475647, "num_tokens": 1784195201.0, "step": 3820 }, { "epoch": 1.6502908855850031, "grad_norm": 0.2679082597100336, "learning_rate": 1.5666063304399638e-06, "loss": 0.5475, "mean_token_accuracy": 0.8465783178806305, "num_tokens": 1789420206.0, "step": 3830 }, { "epoch": 1.654600301659125, "grad_norm": 0.23084778659284563, "learning_rate": 1.5293123392029896e-06, "loss": 0.5552, "mean_token_accuracy": 0.8453614562749863, "num_tokens": 1794650720.0, "step": 3840 }, { "epoch": 1.658909717733247, "grad_norm": 0.26466421650248506, "learning_rate": 1.4924308306010272e-06, "loss": 0.5834, "mean_token_accuracy": 0.8385993272066117, "num_tokens": 1799865004.0, "step": 3850 }, { "epoch": 1.663219133807369, "grad_norm": 0.273341674522465, "learning_rate": 1.4559636005900158e-06, "loss": 0.5684, "mean_token_accuracy": 0.8414583206176758, "num_tokens": 1805076744.0, "step": 3860 }, { "epoch": 1.667528549881491, "grad_norm": 0.3034746890448505, "learning_rate": 1.4199124249524853e-06, "loss": 0.5693, "mean_token_accuracy": 0.8421899497509002, "num_tokens": 1810302328.0, "step": 3870 }, { "epoch": 1.6718379659556128, "grad_norm": 0.27435766450421767, "learning_rate": 1.3842790592110666e-06, "loss": 0.547, "mean_token_accuracy": 0.8465248465538024, "num_tokens": 1815521383.0, "step": 3880 }, { "epoch": 1.676147382029735, "grad_norm": 0.2736954372324488, "learning_rate": 1.3490652385430214e-06, "loss": 0.5625, "mean_token_accuracy": 0.8431600540876388, "num_tokens": 1820717708.0, "step": 3890 }, { "epoch": 1.680456798103857, "grad_norm": 0.24489497161214083, "learning_rate": 1.3142726776957371e-06, "loss": 0.5718, "mean_token_accuracy": 0.8410616487264633, "num_tokens": 1825946332.0, "step": 3900 }, { "epoch": 1.684766214177979, "grad_norm": 0.29198126134462354, "learning_rate": 1.2799030709032278e-06, "loss": 0.5667, "mean_token_accuracy": 0.8430218577384949, "num_tokens": 1831168803.0, "step": 3910 }, { "epoch": 1.6890756302521008, "grad_norm": 0.20294168868650037, "learning_rate": 1.2459580918036406e-06, "loss": 0.5444, "mean_token_accuracy": 0.8467907607555389, "num_tokens": 1836379173.0, "step": 3920 }, { "epoch": 1.6933850463262228, "grad_norm": 0.2692324198316368, "learning_rate": 1.2124393933577472e-06, "loss": 0.564, "mean_token_accuracy": 0.8432279020547867, "num_tokens": 1841581865.0, "step": 3930 }, { "epoch": 1.6976944624003447, "grad_norm": 0.415868706413091, "learning_rate": 1.1793486077684557e-06, "loss": 0.5564, "mean_token_accuracy": 0.8446259886026383, "num_tokens": 1846815154.0, "step": 3940 }, { "epoch": 1.7020038784744667, "grad_norm": 0.478740786422328, "learning_rate": 1.1466873464013384e-06, "loss": 0.5593, "mean_token_accuracy": 0.8443294674158096, "num_tokens": 1852048390.0, "step": 3950 }, { "epoch": 1.7063132945485888, "grad_norm": 0.29234287965676403, "learning_rate": 1.1144571997061516e-06, "loss": 0.561, "mean_token_accuracy": 0.8432582288980484, "num_tokens": 1857256619.0, "step": 3960 }, { "epoch": 1.7106227106227108, "grad_norm": 0.23614264133177346, "learning_rate": 1.082659737139401e-06, "loss": 0.5594, "mean_token_accuracy": 0.8439650893211365, "num_tokens": 1862452622.0, "step": 3970 }, { "epoch": 1.7149321266968327, "grad_norm": 0.31355334722166733, "learning_rate": 1.0512965070879056e-06, "loss": 0.5712, "mean_token_accuracy": 0.8416002511978149, "num_tokens": 1867662004.0, "step": 3980 }, { "epoch": 1.7192415427709546, "grad_norm": 0.21072297521286784, "learning_rate": 1.0203690367934083e-06, "loss": 0.5445, "mean_token_accuracy": 0.8470638453960418, "num_tokens": 1872884858.0, "step": 3990 }, { "epoch": 1.7235509588450766, "grad_norm": 1.4533425354879537, "learning_rate": 9.898788322782026e-07, "loss": 0.5519, "mean_token_accuracy": 0.8461290508508682, "num_tokens": 1878106486.0, "step": 4000 }, { "epoch": 1.7278603749191985, "grad_norm": 0.33515477996565063, "learning_rate": 9.598273782717903e-07, "loss": 0.5591, "mean_token_accuracy": 0.8440490633249282, "num_tokens": 1883331525.0, "step": 4010 }, { "epoch": 1.7321697909933205, "grad_norm": 5.049839078891503, "learning_rate": 9.302161381385944e-07, "loss": 0.5551, "mean_token_accuracy": 0.8452037513256073, "num_tokens": 1888560341.0, "step": 4020 }, { "epoch": 1.7364792070674424, "grad_norm": 0.4377234433785602, "learning_rate": 9.01046553806687e-07, "loss": 0.565, "mean_token_accuracy": 0.8427776783704758, "num_tokens": 1893788404.0, "step": 4030 }, { "epoch": 1.7407886231415644, "grad_norm": 1.0000524619660929, "learning_rate": 8.723200456975867e-07, "loss": 0.5469, "mean_token_accuracy": 0.8464088082313538, "num_tokens": 1899009161.0, "step": 4040 }, { "epoch": 1.7450980392156863, "grad_norm": 0.3002920658959663, "learning_rate": 8.440380126570802e-07, "loss": 0.5451, "mean_token_accuracy": 0.8471545666456223, "num_tokens": 1904232747.0, "step": 4050 }, { "epoch": 1.7494074552898082, "grad_norm": 0.22004479738683702, "learning_rate": 8.162018318871135e-07, "loss": 0.562, "mean_token_accuracy": 0.8432740241289138, "num_tokens": 1909448550.0, "step": 4060 }, { "epoch": 1.7537168713639302, "grad_norm": 1.0514868620754743, "learning_rate": 7.888128588787203e-07, "loss": 0.5698, "mean_token_accuracy": 0.8418963015079498, "num_tokens": 1914664777.0, "step": 4070 }, { "epoch": 1.7580262874380521, "grad_norm": 0.20667581899290605, "learning_rate": 7.618724273460221e-07, "loss": 0.5499, "mean_token_accuracy": 0.8449696570634841, "num_tokens": 1919897408.0, "step": 4080 }, { "epoch": 1.762335703512174, "grad_norm": 0.23906161519973332, "learning_rate": 7.35381849161283e-07, "loss": 0.5716, "mean_token_accuracy": 0.8412007808685302, "num_tokens": 1925122794.0, "step": 4090 }, { "epoch": 1.766645119586296, "grad_norm": 0.2066759119944676, "learning_rate": 7.093424142910254e-07, "loss": 0.5733, "mean_token_accuracy": 0.8403735935688019, "num_tokens": 1930322110.0, "step": 4100 }, { "epoch": 1.770954535660418, "grad_norm": 0.26930336918653247, "learning_rate": 6.837553907332107e-07, "loss": 0.5628, "mean_token_accuracy": 0.843622213602066, "num_tokens": 1935550774.0, "step": 4110 }, { "epoch": 1.7752639517345399, "grad_norm": 0.2424212738530522, "learning_rate": 6.586220244555031e-07, "loss": 0.5765, "mean_token_accuracy": 0.840154591202736, "num_tokens": 1940782887.0, "step": 4120 }, { "epoch": 1.7795733678086618, "grad_norm": 0.26999894859618845, "learning_rate": 6.339435393345872e-07, "loss": 0.5791, "mean_token_accuracy": 0.8396477222442627, "num_tokens": 1945992059.0, "step": 4130 }, { "epoch": 1.7838827838827838, "grad_norm": 0.277818188449899, "learning_rate": 6.097211370965784e-07, "loss": 0.5531, "mean_token_accuracy": 0.8458486407995224, "num_tokens": 1951211737.0, "step": 4140 }, { "epoch": 1.7881921999569057, "grad_norm": 0.19714565092173875, "learning_rate": 5.859559972585027e-07, "loss": 0.5539, "mean_token_accuracy": 0.845096081495285, "num_tokens": 1956426127.0, "step": 4150 }, { "epoch": 1.7925016160310276, "grad_norm": 0.23634010456003995, "learning_rate": 5.626492770708536e-07, "loss": 0.5622, "mean_token_accuracy": 0.8438441097736359, "num_tokens": 1961638366.0, "step": 4160 }, { "epoch": 1.7968110321051496, "grad_norm": 0.21571410131486773, "learning_rate": 5.398021114612506e-07, "loss": 0.5623, "mean_token_accuracy": 0.843345332145691, "num_tokens": 1966866358.0, "step": 4170 }, { "epoch": 1.8011204481792717, "grad_norm": 0.2725480240543948, "learning_rate": 5.174156129791608e-07, "loss": 0.5582, "mean_token_accuracy": 0.8448286980390549, "num_tokens": 1972081738.0, "step": 4180 }, { "epoch": 1.8054298642533937, "grad_norm": 0.18631236174861818, "learning_rate": 4.954908717417361e-07, "loss": 0.562, "mean_token_accuracy": 0.8423676043748856, "num_tokens": 1977280871.0, "step": 4190 }, { "epoch": 1.8097392803275156, "grad_norm": 0.31474298687425234, "learning_rate": 4.740289553807198e-07, "loss": 0.5618, "mean_token_accuracy": 0.8428031504154205, "num_tokens": 1982508739.0, "step": 4200 }, { "epoch": 1.8140486964016376, "grad_norm": 42.14779931930946, "learning_rate": 4.5303090899045943e-07, "loss": 0.5505, "mean_token_accuracy": 0.8458845764398575, "num_tokens": 1987715610.0, "step": 4210 }, { "epoch": 1.8183581124757595, "grad_norm": 0.33032032961175656, "learning_rate": 4.324977550770237e-07, "loss": 0.5535, "mean_token_accuracy": 0.8454466730356216, "num_tokens": 1992931030.0, "step": 4220 }, { "epoch": 1.8226675285498815, "grad_norm": 0.39977748626736526, "learning_rate": 4.124304935083956e-07, "loss": 0.5646, "mean_token_accuracy": 0.8431243419647216, "num_tokens": 1998143328.0, "step": 4230 }, { "epoch": 1.8269769446240034, "grad_norm": 0.28416976594988835, "learning_rate": 3.9283010146580205e-07, "loss": 0.5714, "mean_token_accuracy": 0.8413020133972168, "num_tokens": 2003344443.0, "step": 4240 }, { "epoch": 1.8312863606981256, "grad_norm": 0.23958794008195122, "learning_rate": 3.736975333961168e-07, "loss": 0.5464, "mean_token_accuracy": 0.8469565808773041, "num_tokens": 2008564314.0, "step": 4250 }, { "epoch": 1.8355957767722475, "grad_norm": 0.2411399477856872, "learning_rate": 3.5503372096538644e-07, "loss": 0.5643, "mean_token_accuracy": 0.8430578768253326, "num_tokens": 2013780566.0, "step": 4260 }, { "epoch": 1.8399051928463694, "grad_norm": 0.2596543446513638, "learning_rate": 3.3683957301346436e-07, "loss": 0.5409, "mean_token_accuracy": 0.8482831686735153, "num_tokens": 2018999745.0, "step": 4270 }, { "epoch": 1.8442146089204914, "grad_norm": 0.26075365438861137, "learning_rate": 3.191159755097528e-07, "loss": 0.5715, "mean_token_accuracy": 0.8418577492237092, "num_tokens": 2024235828.0, "step": 4280 }, { "epoch": 1.8485240249946133, "grad_norm": 0.3970632730250479, "learning_rate": 3.01863791510062e-07, "loss": 0.5367, "mean_token_accuracy": 0.8491760581731796, "num_tokens": 2029444942.0, "step": 4290 }, { "epoch": 1.8528334410687353, "grad_norm": 0.6304076052101398, "learning_rate": 2.8508386111458343e-07, "loss": 0.5598, "mean_token_accuracy": 0.8436542540788651, "num_tokens": 2034653851.0, "step": 4300 }, { "epoch": 1.8571428571428572, "grad_norm": 0.2617357188137764, "learning_rate": 2.687770014269775e-07, "loss": 0.5647, "mean_token_accuracy": 0.8423745274543762, "num_tokens": 2039862945.0, "step": 4310 }, { "epoch": 1.8614522732169791, "grad_norm": 0.2885558712310501, "learning_rate": 2.529440065145894e-07, "loss": 0.5606, "mean_token_accuracy": 0.8436487764120102, "num_tokens": 2045089685.0, "step": 4320 }, { "epoch": 1.865761689291101, "grad_norm": 0.39003193591044827, "learning_rate": 2.3758564736977574e-07, "loss": 0.5672, "mean_token_accuracy": 0.8424877673387527, "num_tokens": 2050301703.0, "step": 4330 }, { "epoch": 1.870071105365223, "grad_norm": 0.5516627261572724, "learning_rate": 2.2270267187237017e-07, "loss": 0.5677, "mean_token_accuracy": 0.842144039273262, "num_tokens": 2055519505.0, "step": 4340 }, { "epoch": 1.874380521439345, "grad_norm": 0.2571568758716977, "learning_rate": 2.0829580475325351e-07, "loss": 0.5544, "mean_token_accuracy": 0.8454653769731522, "num_tokens": 2060749146.0, "step": 4350 }, { "epoch": 1.878689937513467, "grad_norm": 0.29011290672341433, "learning_rate": 1.94365747559071e-07, "loss": 0.56, "mean_token_accuracy": 0.8435531735420227, "num_tokens": 2065978519.0, "step": 4360 }, { "epoch": 1.8829993535875889, "grad_norm": 0.3381965636339726, "learning_rate": 1.8091317861806955e-07, "loss": 0.5732, "mean_token_accuracy": 0.8405157655477524, "num_tokens": 2071188188.0, "step": 4370 }, { "epoch": 1.8873087696617108, "grad_norm": 0.2990034637882605, "learning_rate": 1.6793875300706085e-07, "loss": 0.5626, "mean_token_accuracy": 0.8430294305086136, "num_tokens": 2076402157.0, "step": 4380 }, { "epoch": 1.8916181857358327, "grad_norm": 0.25251911090924195, "learning_rate": 1.554431025195302e-07, "loss": 0.5579, "mean_token_accuracy": 0.8448395818471909, "num_tokens": 2081627756.0, "step": 4390 }, { "epoch": 1.8959276018099547, "grad_norm": 0.2396453560294823, "learning_rate": 1.4342683563486447e-07, "loss": 0.5574, "mean_token_accuracy": 0.8446199864149093, "num_tokens": 2086832559.0, "step": 4400 }, { "epoch": 1.9002370178840766, "grad_norm": 0.24562063048105576, "learning_rate": 1.318905374887247e-07, "loss": 0.5545, "mean_token_accuracy": 0.8451064556837082, "num_tokens": 2092054208.0, "step": 4410 }, { "epoch": 1.9045464339581986, "grad_norm": 0.245556619074812, "learning_rate": 1.2083476984455333e-07, "loss": 0.5699, "mean_token_accuracy": 0.8422177851200103, "num_tokens": 2097278659.0, "step": 4420 }, { "epoch": 1.9088558500323205, "grad_norm": 0.23228186179281834, "learning_rate": 1.1026007106621717e-07, "loss": 0.5575, "mean_token_accuracy": 0.8440348535776139, "num_tokens": 2102490153.0, "step": 4430 }, { "epoch": 1.9131652661064424, "grad_norm": 0.32860596796316727, "learning_rate": 1.0016695609179616e-07, "loss": 0.5751, "mean_token_accuracy": 0.8408908247947693, "num_tokens": 2107704881.0, "step": 4440 }, { "epoch": 1.9174746821805644, "grad_norm": 0.32532895957474156, "learning_rate": 9.055591640849792e-08, "loss": 0.5595, "mean_token_accuracy": 0.8435615003108978, "num_tokens": 2112927779.0, "step": 4450 }, { "epoch": 1.9217840982546863, "grad_norm": 0.2333572459215879, "learning_rate": 8.142742002873904e-08, "loss": 0.5526, "mean_token_accuracy": 0.8456332743167877, "num_tokens": 2118159954.0, "step": 4460 }, { "epoch": 1.9260935143288085, "grad_norm": 0.25715921677768927, "learning_rate": 7.278191146734337e-08, "loss": 0.5521, "mean_token_accuracy": 0.846096059679985, "num_tokens": 2123378835.0, "step": 4470 }, { "epoch": 1.9304029304029304, "grad_norm": 0.2677745963512519, "learning_rate": 6.461981171989928e-08, "loss": 0.5536, "mean_token_accuracy": 0.8447523862123489, "num_tokens": 2128596770.0, "step": 4480 }, { "epoch": 1.9347123464770524, "grad_norm": 3.279138790225686, "learning_rate": 5.69415182422639e-08, "loss": 0.5819, "mean_token_accuracy": 0.838872817158699, "num_tokens": 2133806282.0, "step": 4490 }, { "epoch": 1.9390217625511743, "grad_norm": 0.38258790358029443, "learning_rate": 4.9747404931205224e-08, "loss": 0.5532, "mean_token_accuracy": 0.8448556065559387, "num_tokens": 2139002376.0, "step": 4500 }, { "epoch": 1.9433311786252963, "grad_norm": 0.21512774113911937, "learning_rate": 4.303782210619112e-08, "loss": 0.5415, "mean_token_accuracy": 0.8482328653335571, "num_tokens": 2144183612.0, "step": 4510 }, { "epoch": 1.9476405946994182, "grad_norm": 0.21434866654455895, "learning_rate": 3.681309649233744e-08, "loss": 0.57, "mean_token_accuracy": 0.8415874391794205, "num_tokens": 2149402211.0, "step": 4520 }, { "epoch": 1.9519500107735401, "grad_norm": 0.2616101437627084, "learning_rate": 3.1073531204496296e-08, "loss": 0.5463, "mean_token_accuracy": 0.8469921559095382, "num_tokens": 2154620998.0, "step": 4530 }, { "epoch": 1.9562594268476623, "grad_norm": 0.21865877362380354, "learning_rate": 2.581940573249009e-08, "loss": 0.5582, "mean_token_accuracy": 0.8442719519138336, "num_tokens": 2159825665.0, "step": 4540 }, { "epoch": 1.9605688429217842, "grad_norm": 11.428471278798616, "learning_rate": 2.1050975927512373e-08, "loss": 0.5517, "mean_token_accuracy": 0.8458136737346649, "num_tokens": 2165038901.0, "step": 4550 }, { "epoch": 1.9648782589959062, "grad_norm": 0.41917915827669827, "learning_rate": 1.6768473989656752e-08, "loss": 0.5547, "mean_token_accuracy": 0.844542047381401, "num_tokens": 2170266740.0, "step": 4560 }, { "epoch": 1.9691876750700281, "grad_norm": 0.38778384069981153, "learning_rate": 1.2972108456619226e-08, "loss": 0.564, "mean_token_accuracy": 0.843216672539711, "num_tokens": 2175478666.0, "step": 4570 }, { "epoch": 1.97349709114415, "grad_norm": 0.48338559863132274, "learning_rate": 9.662064193540766e-09, "loss": 0.576, "mean_token_accuracy": 0.8414626181125641, "num_tokens": 2180683240.0, "step": 4580 }, { "epoch": 1.977806507218272, "grad_norm": 0.28480633255052445, "learning_rate": 6.838502384002299e-09, "loss": 0.5666, "mean_token_accuracy": 0.8420202493667602, "num_tokens": 2185905908.0, "step": 4590 }, { "epoch": 1.982115923292394, "grad_norm": 0.738043733363809, "learning_rate": 4.50156052217876e-09, "loss": 0.5679, "mean_token_accuracy": 0.8421945840120315, "num_tokens": 2191132778.0, "step": 4600 }, { "epoch": 1.9864253393665159, "grad_norm": 0.24430429914168336, "learning_rate": 2.6513524061455577e-09, "loss": 0.5626, "mean_token_accuracy": 0.8445323497056961, "num_tokens": 2196351758.0, "step": 4610 }, { "epoch": 1.9907347554406378, "grad_norm": 0.23694874064234042, "learning_rate": 1.287968132331896e-09, "loss": 0.5591, "mean_token_accuracy": 0.8435301870107651, "num_tokens": 2201577869.0, "step": 4620 }, { "epoch": 1.9950441715147598, "grad_norm": 0.286273297009805, "learning_rate": 4.1147409113651e-10, "loss": 0.5696, "mean_token_accuracy": 0.8423977881669998, "num_tokens": 2206790019.0, "step": 4630 }, { "epoch": 1.9993535875888817, "grad_norm": 0.24142562700219233, "learning_rate": 2.1912963695802647e-11, "loss": 0.5722, "mean_token_accuracy": 0.8414206713438034, "num_tokens": 2212021058.0, "step": 4640 }, { "epoch": 2.0, "mean_token_accuracy": 0.8563221096992493, "num_tokens": 2212717378.0, "step": 4642, "total_flos": 3987371203756032.0, "train_loss": 0.5414124863869874, "train_runtime": 78625.9853, "train_samples_per_second": 3.777, "train_steps_per_second": 0.059 } ], "logging_steps": 10, "max_steps": 4642, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3987371203756032.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }