diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14797 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 24.0, + "eval_steps": 500, + "global_step": 21096, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01138627953316254, + "grad_norm": 2.734767198562622, + "learning_rate": 0.0001999512432959532, + "loss": 30.333529663085937, + "step": 10 + }, + { + "epoch": 0.02277255906632508, + "grad_norm": 2.7044386863708496, + "learning_rate": 0.00019989706918034563, + "loss": 25.792864990234374, + "step": 20 + }, + { + "epoch": 0.034158838599487616, + "grad_norm": 2.4714598655700684, + "learning_rate": 0.00019984289506473808, + "loss": 24.416429138183595, + "step": 30 + }, + { + "epoch": 0.04554511813265016, + "grad_norm": 2.6829721927642822, + "learning_rate": 0.00019978872094913053, + "loss": 23.82067413330078, + "step": 40 + }, + { + "epoch": 0.0569313976658127, + "grad_norm": 3.0441884994506836, + "learning_rate": 0.00019973454683352296, + "loss": 23.323060607910158, + "step": 50 + }, + { + "epoch": 0.06831767719897523, + "grad_norm": 3.367356538772583, + "learning_rate": 0.0001996803727179154, + "loss": 22.732952880859376, + "step": 60 + }, + { + "epoch": 0.07970395673213777, + "grad_norm": 3.5883660316467285, + "learning_rate": 0.00019962619860230783, + "loss": 22.522509765625, + "step": 70 + }, + { + "epoch": 0.09109023626530031, + "grad_norm": 3.8543548583984375, + "learning_rate": 0.00019957202448670026, + "loss": 22.270034790039062, + "step": 80 + }, + { + "epoch": 0.10247651579846286, + "grad_norm": 4.374216079711914, + "learning_rate": 0.0001995178503710927, + "loss": 22.013529968261718, + "step": 90 + }, + { + "epoch": 0.1138627953316254, + "grad_norm": 4.15579891204834, + "learning_rate": 0.00019946367625548513, + "loss": 22.13349304199219, + "step": 100 + }, + { + "epoch": 0.12524907486478792, + "grad_norm": 4.63075065612793, + "learning_rate": 0.00019940950213987756, + "loss": 21.13990173339844, + "step": 110 + }, + { + "epoch": 0.13663535439795046, + "grad_norm": 5.391815662384033, + "learning_rate": 0.00019935532802427, + "loss": 21.209239196777343, + "step": 120 + }, + { + "epoch": 0.148021633931113, + "grad_norm": 5.1299896240234375, + "learning_rate": 0.00019930115390866243, + "loss": 21.18792266845703, + "step": 130 + }, + { + "epoch": 0.15940791346427555, + "grad_norm": 5.028836727142334, + "learning_rate": 0.00019924697979305489, + "loss": 20.73717346191406, + "step": 140 + }, + { + "epoch": 0.1707941929974381, + "grad_norm": 5.911370754241943, + "learning_rate": 0.00019919280567744734, + "loss": 20.920809936523437, + "step": 150 + }, + { + "epoch": 0.18218047253060063, + "grad_norm": 6.1007914543151855, + "learning_rate": 0.00019913863156183976, + "loss": 20.673001098632813, + "step": 160 + }, + { + "epoch": 0.19356675206376317, + "grad_norm": 7.011501312255859, + "learning_rate": 0.0001990844574462322, + "loss": 20.6161376953125, + "step": 170 + }, + { + "epoch": 0.2049530315969257, + "grad_norm": 6.00868558883667, + "learning_rate": 0.00019903028333062464, + "loss": 20.028971862792968, + "step": 180 + }, + { + "epoch": 0.21633931113008825, + "grad_norm": 6.089074611663818, + "learning_rate": 0.0001989761092150171, + "loss": 20.252516174316405, + "step": 190 + }, + { + "epoch": 0.2277255906632508, + "grad_norm": 6.774249076843262, + "learning_rate": 0.00019892193509940951, + "loss": 20.02190399169922, + "step": 200 + }, + { + "epoch": 0.23911187019641333, + "grad_norm": 5.999351501464844, + "learning_rate": 0.00019886776098380197, + "loss": 19.793946838378908, + "step": 210 + }, + { + "epoch": 0.25049814972957585, + "grad_norm": 6.9030256271362305, + "learning_rate": 0.0001988135868681944, + "loss": 19.749664306640625, + "step": 220 + }, + { + "epoch": 0.2618844292627384, + "grad_norm": 6.463044166564941, + "learning_rate": 0.00019875941275258681, + "loss": 19.66212615966797, + "step": 230 + }, + { + "epoch": 0.27327070879590093, + "grad_norm": 6.855940818786621, + "learning_rate": 0.00019870523863697924, + "loss": 19.732081604003906, + "step": 240 + }, + { + "epoch": 0.2846569883290635, + "grad_norm": 5.966732025146484, + "learning_rate": 0.0001986510645213717, + "loss": 19.346263122558593, + "step": 250 + }, + { + "epoch": 0.296043267862226, + "grad_norm": 7.0768632888793945, + "learning_rate": 0.00019859689040576412, + "loss": 19.514622497558594, + "step": 260 + }, + { + "epoch": 0.3074295473953886, + "grad_norm": 6.5192694664001465, + "learning_rate": 0.00019854271629015657, + "loss": 19.53545379638672, + "step": 270 + }, + { + "epoch": 0.3188158269285511, + "grad_norm": 6.5199103355407715, + "learning_rate": 0.00019848854217454902, + "loss": 18.95964813232422, + "step": 280 + }, + { + "epoch": 0.33020210646171366, + "grad_norm": 6.990855693817139, + "learning_rate": 0.00019843436805894144, + "loss": 18.899111938476562, + "step": 290 + }, + { + "epoch": 0.3415883859948762, + "grad_norm": 7.77662992477417, + "learning_rate": 0.0001983801939433339, + "loss": 18.688560485839844, + "step": 300 + }, + { + "epoch": 0.3529746655280387, + "grad_norm": 7.311860084533691, + "learning_rate": 0.00019832601982772632, + "loss": 18.872056579589845, + "step": 310 + }, + { + "epoch": 0.36436094506120126, + "grad_norm": 8.168448448181152, + "learning_rate": 0.00019827184571211877, + "loss": 18.846502685546874, + "step": 320 + }, + { + "epoch": 0.37574722459436377, + "grad_norm": 7.2531046867370605, + "learning_rate": 0.0001982176715965112, + "loss": 18.281404113769533, + "step": 330 + }, + { + "epoch": 0.38713350412752634, + "grad_norm": 7.322739601135254, + "learning_rate": 0.00019816349748090365, + "loss": 18.37748565673828, + "step": 340 + }, + { + "epoch": 0.39851978366068885, + "grad_norm": 7.208354949951172, + "learning_rate": 0.00019810932336529607, + "loss": 18.53016815185547, + "step": 350 + }, + { + "epoch": 0.4099060631938514, + "grad_norm": 8.217365264892578, + "learning_rate": 0.00019805514924968852, + "loss": 18.07104949951172, + "step": 360 + }, + { + "epoch": 0.42129234272701394, + "grad_norm": 6.938902854919434, + "learning_rate": 0.00019800097513408095, + "loss": 18.317633056640624, + "step": 370 + }, + { + "epoch": 0.4326786222601765, + "grad_norm": 7.835426330566406, + "learning_rate": 0.00019794680101847337, + "loss": 18.164633178710936, + "step": 380 + }, + { + "epoch": 0.444064901793339, + "grad_norm": 6.851658821105957, + "learning_rate": 0.00019789262690286582, + "loss": 18.049453735351562, + "step": 390 + }, + { + "epoch": 0.4554511813265016, + "grad_norm": 6.942612648010254, + "learning_rate": 0.00019783845278725825, + "loss": 18.057662963867188, + "step": 400 + }, + { + "epoch": 0.4668374608596641, + "grad_norm": 7.211177349090576, + "learning_rate": 0.0001977842786716507, + "loss": 17.956227111816407, + "step": 410 + }, + { + "epoch": 0.47822374039282667, + "grad_norm": 7.398573398590088, + "learning_rate": 0.00019773010455604312, + "loss": 17.89140167236328, + "step": 420 + }, + { + "epoch": 0.4896100199259892, + "grad_norm": 7.789492607116699, + "learning_rate": 0.00019767593044043558, + "loss": 17.603182983398437, + "step": 430 + }, + { + "epoch": 0.5009962994591517, + "grad_norm": 6.86952018737793, + "learning_rate": 0.000197621756324828, + "loss": 17.51262969970703, + "step": 440 + }, + { + "epoch": 0.5123825789923142, + "grad_norm": 6.798729419708252, + "learning_rate": 0.00019756758220922045, + "loss": 17.721162414550783, + "step": 450 + }, + { + "epoch": 0.5237688585254768, + "grad_norm": 7.071723461151123, + "learning_rate": 0.00019751340809361288, + "loss": 17.830340576171874, + "step": 460 + }, + { + "epoch": 0.5351551380586393, + "grad_norm": 7.081210136413574, + "learning_rate": 0.00019745923397800533, + "loss": 17.28885498046875, + "step": 470 + }, + { + "epoch": 0.5465414175918019, + "grad_norm": 7.516231536865234, + "learning_rate": 0.00019740505986239775, + "loss": 17.6906005859375, + "step": 480 + }, + { + "epoch": 0.5579276971249644, + "grad_norm": 6.515166759490967, + "learning_rate": 0.0001973508857467902, + "loss": 17.18426971435547, + "step": 490 + }, + { + "epoch": 0.569313976658127, + "grad_norm": 7.4100189208984375, + "learning_rate": 0.00019729671163118263, + "loss": 17.34864501953125, + "step": 500 + }, + { + "epoch": 0.5807002561912895, + "grad_norm": 6.820520401000977, + "learning_rate": 0.00019724253751557505, + "loss": 17.613563537597656, + "step": 510 + }, + { + "epoch": 0.592086535724452, + "grad_norm": 7.629634857177734, + "learning_rate": 0.0001971883633999675, + "loss": 17.18858947753906, + "step": 520 + }, + { + "epoch": 0.6034728152576145, + "grad_norm": 7.522180080413818, + "learning_rate": 0.00019713418928435993, + "loss": 17.541119384765626, + "step": 530 + }, + { + "epoch": 0.6148590947907772, + "grad_norm": 7.64506196975708, + "learning_rate": 0.00019708001516875238, + "loss": 17.047702026367187, + "step": 540 + }, + { + "epoch": 0.6262453743239397, + "grad_norm": 6.857038497924805, + "learning_rate": 0.0001970258410531448, + "loss": 17.394444274902344, + "step": 550 + }, + { + "epoch": 0.6376316538571022, + "grad_norm": 6.667849063873291, + "learning_rate": 0.00019697166693753726, + "loss": 17.071144104003906, + "step": 560 + }, + { + "epoch": 0.6490179333902647, + "grad_norm": 7.24165153503418, + "learning_rate": 0.00019691749282192968, + "loss": 16.311009216308594, + "step": 570 + }, + { + "epoch": 0.6604042129234273, + "grad_norm": 7.296228408813477, + "learning_rate": 0.00019686331870632213, + "loss": 16.9224609375, + "step": 580 + }, + { + "epoch": 0.6717904924565898, + "grad_norm": 6.7600016593933105, + "learning_rate": 0.00019680914459071456, + "loss": 17.025440979003907, + "step": 590 + }, + { + "epoch": 0.6831767719897524, + "grad_norm": 7.559263229370117, + "learning_rate": 0.000196754970475107, + "loss": 16.395526123046874, + "step": 600 + }, + { + "epoch": 0.6945630515229149, + "grad_norm": 7.639865398406982, + "learning_rate": 0.00019670079635949946, + "loss": 17.01804962158203, + "step": 610 + }, + { + "epoch": 0.7059493310560774, + "grad_norm": 7.71447229385376, + "learning_rate": 0.00019664662224389188, + "loss": 16.441880798339845, + "step": 620 + }, + { + "epoch": 0.71733561058924, + "grad_norm": 6.845354080200195, + "learning_rate": 0.00019659244812828434, + "loss": 16.84131622314453, + "step": 630 + }, + { + "epoch": 0.7287218901224025, + "grad_norm": 6.601531505584717, + "learning_rate": 0.00019653827401267676, + "loss": 16.1950439453125, + "step": 640 + }, + { + "epoch": 0.740108169655565, + "grad_norm": 6.828949928283691, + "learning_rate": 0.00019648409989706918, + "loss": 16.982223510742188, + "step": 650 + }, + { + "epoch": 0.7514944491887275, + "grad_norm": 7.29005765914917, + "learning_rate": 0.0001964299257814616, + "loss": 16.249069213867188, + "step": 660 + }, + { + "epoch": 0.7628807287218902, + "grad_norm": 7.450255870819092, + "learning_rate": 0.00019637575166585406, + "loss": 16.10637664794922, + "step": 670 + }, + { + "epoch": 0.7742670082550527, + "grad_norm": 7.040278434753418, + "learning_rate": 0.00019632157755024649, + "loss": 16.148355102539064, + "step": 680 + }, + { + "epoch": 0.7856532877882152, + "grad_norm": 7.316576957702637, + "learning_rate": 0.00019626740343463894, + "loss": 16.328166198730468, + "step": 690 + }, + { + "epoch": 0.7970395673213777, + "grad_norm": 6.502153396606445, + "learning_rate": 0.00019621322931903136, + "loss": 15.988864135742187, + "step": 700 + }, + { + "epoch": 0.8084258468545403, + "grad_norm": 7.165337085723877, + "learning_rate": 0.0001961590552034238, + "loss": 16.477142333984375, + "step": 710 + }, + { + "epoch": 0.8198121263877028, + "grad_norm": 7.06484317779541, + "learning_rate": 0.00019610488108781626, + "loss": 16.042941284179687, + "step": 720 + }, + { + "epoch": 0.8311984059208654, + "grad_norm": 7.751636505126953, + "learning_rate": 0.0001960507069722087, + "loss": 16.278134155273438, + "step": 730 + }, + { + "epoch": 0.8425846854540279, + "grad_norm": 6.873840808868408, + "learning_rate": 0.00019599653285660114, + "loss": 15.732162475585938, + "step": 740 + }, + { + "epoch": 0.8539709649871904, + "grad_norm": 6.263609886169434, + "learning_rate": 0.00019594235874099356, + "loss": 16.18165283203125, + "step": 750 + }, + { + "epoch": 0.865357244520353, + "grad_norm": 6.69379186630249, + "learning_rate": 0.00019588818462538602, + "loss": 16.150958251953124, + "step": 760 + }, + { + "epoch": 0.8767435240535155, + "grad_norm": 6.563896656036377, + "learning_rate": 0.00019583401050977844, + "loss": 15.676901245117188, + "step": 770 + }, + { + "epoch": 0.888129803586678, + "grad_norm": 6.714715003967285, + "learning_rate": 0.0001957798363941709, + "loss": 15.763505554199218, + "step": 780 + }, + { + "epoch": 0.8995160831198405, + "grad_norm": 6.818701267242432, + "learning_rate": 0.00019572566227856332, + "loss": 16.16502685546875, + "step": 790 + }, + { + "epoch": 0.9109023626530032, + "grad_norm": 7.140949249267578, + "learning_rate": 0.00019567148816295574, + "loss": 15.515142822265625, + "step": 800 + }, + { + "epoch": 0.9222886421861657, + "grad_norm": 6.7389020919799805, + "learning_rate": 0.00019561731404734817, + "loss": 15.634983825683594, + "step": 810 + }, + { + "epoch": 0.9336749217193282, + "grad_norm": 6.847448825836182, + "learning_rate": 0.00019556313993174062, + "loss": 15.689366149902344, + "step": 820 + }, + { + "epoch": 0.9450612012524907, + "grad_norm": 7.26194429397583, + "learning_rate": 0.00019550896581613307, + "loss": 15.806666564941406, + "step": 830 + }, + { + "epoch": 0.9564474807856533, + "grad_norm": 6.119577407836914, + "learning_rate": 0.0001954547917005255, + "loss": 15.300843811035156, + "step": 840 + }, + { + "epoch": 0.9678337603188158, + "grad_norm": 6.605442047119141, + "learning_rate": 0.00019540061758491794, + "loss": 15.186727905273438, + "step": 850 + }, + { + "epoch": 0.9792200398519784, + "grad_norm": 6.060615062713623, + "learning_rate": 0.00019534644346931037, + "loss": 15.392152404785156, + "step": 860 + }, + { + "epoch": 0.9906063193851409, + "grad_norm": 6.676738262176514, + "learning_rate": 0.00019529226935370282, + "loss": 15.1415283203125, + "step": 870 + }, + { + "epoch": 1.0011386279533163, + "grad_norm": 6.309657096862793, + "learning_rate": 0.00019523809523809525, + "loss": 14.388438415527343, + "step": 880 + }, + { + "epoch": 1.0125249074864788, + "grad_norm": 6.854212760925293, + "learning_rate": 0.0001951839211224877, + "loss": 14.986418151855469, + "step": 890 + }, + { + "epoch": 1.0239111870196413, + "grad_norm": 6.602004051208496, + "learning_rate": 0.00019512974700688012, + "loss": 14.524969482421875, + "step": 900 + }, + { + "epoch": 1.0352974665528039, + "grad_norm": 7.170373439788818, + "learning_rate": 0.00019507557289127257, + "loss": 15.004403686523437, + "step": 910 + }, + { + "epoch": 1.0466837460859664, + "grad_norm": 6.895066738128662, + "learning_rate": 0.000195021398775665, + "loss": 14.763569641113282, + "step": 920 + }, + { + "epoch": 1.0580700256191289, + "grad_norm": 6.917144775390625, + "learning_rate": 0.00019496722466005742, + "loss": 14.740924072265624, + "step": 930 + }, + { + "epoch": 1.0694563051522914, + "grad_norm": 6.31508731842041, + "learning_rate": 0.00019491305054444987, + "loss": 14.591043090820312, + "step": 940 + }, + { + "epoch": 1.080842584685454, + "grad_norm": 6.562560558319092, + "learning_rate": 0.0001948588764288423, + "loss": 14.91121826171875, + "step": 950 + }, + { + "epoch": 1.0922288642186166, + "grad_norm": 6.635254383087158, + "learning_rate": 0.00019480470231323475, + "loss": 15.324360656738282, + "step": 960 + }, + { + "epoch": 1.1036151437517792, + "grad_norm": 6.704561233520508, + "learning_rate": 0.00019475052819762717, + "loss": 15.346974182128907, + "step": 970 + }, + { + "epoch": 1.1150014232849417, + "grad_norm": 6.2377190589904785, + "learning_rate": 0.00019469635408201963, + "loss": 15.083432006835938, + "step": 980 + }, + { + "epoch": 1.1263877028181042, + "grad_norm": 6.517440319061279, + "learning_rate": 0.00019464217996641205, + "loss": 14.828807067871093, + "step": 990 + }, + { + "epoch": 1.1377739823512667, + "grad_norm": 6.731694221496582, + "learning_rate": 0.0001945880058508045, + "loss": 14.4470703125, + "step": 1000 + }, + { + "epoch": 1.1491602618844292, + "grad_norm": 6.918997287750244, + "learning_rate": 0.00019453383173519693, + "loss": 14.706845092773438, + "step": 1010 + }, + { + "epoch": 1.1605465414175917, + "grad_norm": 6.228609561920166, + "learning_rate": 0.00019447965761958938, + "loss": 14.568064880371093, + "step": 1020 + }, + { + "epoch": 1.1719328209507545, + "grad_norm": 7.079123497009277, + "learning_rate": 0.0001944254835039818, + "loss": 14.400852966308594, + "step": 1030 + }, + { + "epoch": 1.183319100483917, + "grad_norm": 6.567544937133789, + "learning_rate": 0.00019437130938837425, + "loss": 14.280006408691406, + "step": 1040 + }, + { + "epoch": 1.1947053800170795, + "grad_norm": 7.218766689300537, + "learning_rate": 0.00019431713527276668, + "loss": 14.390548706054688, + "step": 1050 + }, + { + "epoch": 1.206091659550242, + "grad_norm": 6.728633880615234, + "learning_rate": 0.00019426296115715913, + "loss": 14.768504333496093, + "step": 1060 + }, + { + "epoch": 1.2174779390834045, + "grad_norm": 6.698533535003662, + "learning_rate": 0.00019420878704155155, + "loss": 14.518492126464844, + "step": 1070 + }, + { + "epoch": 1.228864218616567, + "grad_norm": 6.782904148101807, + "learning_rate": 0.00019415461292594398, + "loss": 14.52556915283203, + "step": 1080 + }, + { + "epoch": 1.2402504981497295, + "grad_norm": 6.522706508636475, + "learning_rate": 0.00019410043881033643, + "loss": 14.64172821044922, + "step": 1090 + }, + { + "epoch": 1.251636777682892, + "grad_norm": 6.942655563354492, + "learning_rate": 0.00019404626469472885, + "loss": 14.770219421386718, + "step": 1100 + }, + { + "epoch": 1.2630230572160546, + "grad_norm": 6.034182071685791, + "learning_rate": 0.0001939920905791213, + "loss": 14.227700805664062, + "step": 1110 + }, + { + "epoch": 1.2744093367492173, + "grad_norm": 6.776942253112793, + "learning_rate": 0.00019393791646351373, + "loss": 14.423974609375, + "step": 1120 + }, + { + "epoch": 1.2857956162823796, + "grad_norm": 6.075601100921631, + "learning_rate": 0.00019388374234790618, + "loss": 14.799586486816406, + "step": 1130 + }, + { + "epoch": 1.2971818958155423, + "grad_norm": 6.346046447753906, + "learning_rate": 0.0001938295682322986, + "loss": 14.888410949707032, + "step": 1140 + }, + { + "epoch": 1.3085681753487048, + "grad_norm": 6.099461078643799, + "learning_rate": 0.00019377539411669106, + "loss": 14.434004211425782, + "step": 1150 + }, + { + "epoch": 1.3199544548818674, + "grad_norm": 6.728332042694092, + "learning_rate": 0.00019372122000108348, + "loss": 14.108644104003906, + "step": 1160 + }, + { + "epoch": 1.3313407344150299, + "grad_norm": 6.106889247894287, + "learning_rate": 0.00019366704588547593, + "loss": 13.984442138671875, + "step": 1170 + }, + { + "epoch": 1.3427270139481924, + "grad_norm": 6.419578552246094, + "learning_rate": 0.00019361287176986839, + "loss": 14.819793701171875, + "step": 1180 + }, + { + "epoch": 1.354113293481355, + "grad_norm": 6.706035614013672, + "learning_rate": 0.0001935586976542608, + "loss": 14.234246826171875, + "step": 1190 + }, + { + "epoch": 1.3654995730145174, + "grad_norm": 5.745364665985107, + "learning_rate": 0.00019350452353865324, + "loss": 14.144187927246094, + "step": 1200 + }, + { + "epoch": 1.3768858525476801, + "grad_norm": 6.465015411376953, + "learning_rate": 0.0001934503494230457, + "loss": 14.569776916503907, + "step": 1210 + }, + { + "epoch": 1.3882721320808427, + "grad_norm": 5.789831638336182, + "learning_rate": 0.0001933961753074381, + "loss": 14.204434204101563, + "step": 1220 + }, + { + "epoch": 1.3996584116140052, + "grad_norm": 6.589231014251709, + "learning_rate": 0.00019334200119183054, + "loss": 13.60784912109375, + "step": 1230 + }, + { + "epoch": 1.4110446911471677, + "grad_norm": 6.706899166107178, + "learning_rate": 0.000193287827076223, + "loss": 14.148931884765625, + "step": 1240 + }, + { + "epoch": 1.4224309706803302, + "grad_norm": 6.473396301269531, + "learning_rate": 0.0001932336529606154, + "loss": 14.05177001953125, + "step": 1250 + }, + { + "epoch": 1.4338172502134927, + "grad_norm": 6.002316474914551, + "learning_rate": 0.00019317947884500786, + "loss": 13.77730712890625, + "step": 1260 + }, + { + "epoch": 1.4452035297466552, + "grad_norm": 5.781302452087402, + "learning_rate": 0.0001931253047294003, + "loss": 14.126388549804688, + "step": 1270 + }, + { + "epoch": 1.4565898092798177, + "grad_norm": 6.653365612030029, + "learning_rate": 0.00019307113061379274, + "loss": 14.278581237792968, + "step": 1280 + }, + { + "epoch": 1.4679760888129803, + "grad_norm": 6.22074556350708, + "learning_rate": 0.0001930169564981852, + "loss": 13.808645629882813, + "step": 1290 + }, + { + "epoch": 1.479362368346143, + "grad_norm": 6.5593647956848145, + "learning_rate": 0.00019296278238257762, + "loss": 13.794139099121093, + "step": 1300 + }, + { + "epoch": 1.4907486478793055, + "grad_norm": 5.953279972076416, + "learning_rate": 0.00019290860826697007, + "loss": 13.860592651367188, + "step": 1310 + }, + { + "epoch": 1.502134927412468, + "grad_norm": 5.616598606109619, + "learning_rate": 0.0001928544341513625, + "loss": 13.759190368652344, + "step": 1320 + }, + { + "epoch": 1.5135212069456305, + "grad_norm": 5.759921073913574, + "learning_rate": 0.00019280026003575494, + "loss": 13.61251220703125, + "step": 1330 + }, + { + "epoch": 1.524907486478793, + "grad_norm": 5.853238105773926, + "learning_rate": 0.00019274608592014737, + "loss": 13.766014099121094, + "step": 1340 + }, + { + "epoch": 1.5362937660119556, + "grad_norm": 6.38026762008667, + "learning_rate": 0.0001926919118045398, + "loss": 13.8654296875, + "step": 1350 + }, + { + "epoch": 1.547680045545118, + "grad_norm": 6.176024436950684, + "learning_rate": 0.00019263773768893222, + "loss": 13.856707763671874, + "step": 1360 + }, + { + "epoch": 1.5590663250782808, + "grad_norm": 5.767195701599121, + "learning_rate": 0.00019258356357332467, + "loss": 13.602825927734376, + "step": 1370 + }, + { + "epoch": 1.570452604611443, + "grad_norm": 6.558640480041504, + "learning_rate": 0.0001925293894577171, + "loss": 13.450588989257813, + "step": 1380 + }, + { + "epoch": 1.5818388841446058, + "grad_norm": 5.716969966888428, + "learning_rate": 0.00019247521534210954, + "loss": 13.9936279296875, + "step": 1390 + }, + { + "epoch": 1.5932251636777683, + "grad_norm": 6.217525005340576, + "learning_rate": 0.000192421041226502, + "loss": 13.740388488769531, + "step": 1400 + }, + { + "epoch": 1.6046114432109309, + "grad_norm": 6.646007061004639, + "learning_rate": 0.00019236686711089442, + "loss": 13.809278869628907, + "step": 1410 + }, + { + "epoch": 1.6159977227440934, + "grad_norm": 5.91151237487793, + "learning_rate": 0.00019231269299528687, + "loss": 13.775947570800781, + "step": 1420 + }, + { + "epoch": 1.6273840022772559, + "grad_norm": 5.8763957023620605, + "learning_rate": 0.0001922585188796793, + "loss": 13.332572937011719, + "step": 1430 + }, + { + "epoch": 1.6387702818104184, + "grad_norm": 5.781065940856934, + "learning_rate": 0.00019220434476407175, + "loss": 13.051596069335938, + "step": 1440 + }, + { + "epoch": 1.650156561343581, + "grad_norm": 5.9461445808410645, + "learning_rate": 0.00019215017064846417, + "loss": 13.740011596679688, + "step": 1450 + }, + { + "epoch": 1.6615428408767436, + "grad_norm": 5.9059224128723145, + "learning_rate": 0.00019209599653285662, + "loss": 13.289398193359375, + "step": 1460 + }, + { + "epoch": 1.672929120409906, + "grad_norm": 6.076962947845459, + "learning_rate": 0.00019204182241724905, + "loss": 13.65223388671875, + "step": 1470 + }, + { + "epoch": 1.6843153999430687, + "grad_norm": 6.304300308227539, + "learning_rate": 0.0001919876483016415, + "loss": 13.400830078125, + "step": 1480 + }, + { + "epoch": 1.6957016794762312, + "grad_norm": 6.031702041625977, + "learning_rate": 0.00019193347418603392, + "loss": 13.493438720703125, + "step": 1490 + }, + { + "epoch": 1.7070879590093937, + "grad_norm": 6.418713092803955, + "learning_rate": 0.00019187930007042635, + "loss": 13.696240234375, + "step": 1500 + }, + { + "epoch": 1.7184742385425562, + "grad_norm": 5.954122543334961, + "learning_rate": 0.0001918251259548188, + "loss": 13.905665588378906, + "step": 1510 + }, + { + "epoch": 1.7298605180757187, + "grad_norm": 6.412307262420654, + "learning_rate": 0.00019177095183921122, + "loss": 13.017045593261718, + "step": 1520 + }, + { + "epoch": 1.7412467976088815, + "grad_norm": 6.024705410003662, + "learning_rate": 0.00019171677772360368, + "loss": 13.734344482421875, + "step": 1530 + }, + { + "epoch": 1.7526330771420437, + "grad_norm": 6.235986709594727, + "learning_rate": 0.0001916626036079961, + "loss": 13.35974578857422, + "step": 1540 + }, + { + "epoch": 1.7640193566752065, + "grad_norm": 6.963014602661133, + "learning_rate": 0.00019160842949238855, + "loss": 13.176609802246094, + "step": 1550 + }, + { + "epoch": 1.7754056362083688, + "grad_norm": 5.929773807525635, + "learning_rate": 0.00019155425537678098, + "loss": 13.372627258300781, + "step": 1560 + }, + { + "epoch": 1.7867919157415315, + "grad_norm": 6.559890270233154, + "learning_rate": 0.00019150008126117343, + "loss": 13.397409057617187, + "step": 1570 + }, + { + "epoch": 1.798178195274694, + "grad_norm": 6.073424816131592, + "learning_rate": 0.00019144590714556585, + "loss": 13.51752166748047, + "step": 1580 + }, + { + "epoch": 1.8095644748078565, + "grad_norm": 6.313562393188477, + "learning_rate": 0.0001913917330299583, + "loss": 13.768846130371093, + "step": 1590 + }, + { + "epoch": 1.820950754341019, + "grad_norm": 6.578205108642578, + "learning_rate": 0.00019133755891435073, + "loss": 13.342623901367187, + "step": 1600 + }, + { + "epoch": 1.8323370338741816, + "grad_norm": 6.062254428863525, + "learning_rate": 0.00019128338479874318, + "loss": 13.149093627929688, + "step": 1610 + }, + { + "epoch": 1.8437233134073443, + "grad_norm": 6.041051864624023, + "learning_rate": 0.0001912292106831356, + "loss": 12.984222412109375, + "step": 1620 + }, + { + "epoch": 1.8551095929405066, + "grad_norm": 6.353360652923584, + "learning_rate": 0.00019117503656752803, + "loss": 13.421148681640625, + "step": 1630 + }, + { + "epoch": 1.8664958724736693, + "grad_norm": 6.004988670349121, + "learning_rate": 0.00019112086245192048, + "loss": 13.050341796875, + "step": 1640 + }, + { + "epoch": 1.8778821520068316, + "grad_norm": 5.7523932456970215, + "learning_rate": 0.0001910666883363129, + "loss": 12.99239501953125, + "step": 1650 + }, + { + "epoch": 1.8892684315399944, + "grad_norm": 6.328149795532227, + "learning_rate": 0.00019101251422070536, + "loss": 12.906375122070312, + "step": 1660 + }, + { + "epoch": 1.9006547110731569, + "grad_norm": 5.817428112030029, + "learning_rate": 0.00019095834010509778, + "loss": 12.926182556152344, + "step": 1670 + }, + { + "epoch": 1.9120409906063194, + "grad_norm": 6.4034247398376465, + "learning_rate": 0.00019090416598949023, + "loss": 12.87743682861328, + "step": 1680 + }, + { + "epoch": 1.923427270139482, + "grad_norm": 5.860933303833008, + "learning_rate": 0.00019084999187388266, + "loss": 13.04234619140625, + "step": 1690 + }, + { + "epoch": 1.9348135496726444, + "grad_norm": 5.991933345794678, + "learning_rate": 0.0001907958177582751, + "loss": 12.981626892089844, + "step": 1700 + }, + { + "epoch": 1.9461998292058071, + "grad_norm": 5.810291767120361, + "learning_rate": 0.00019074164364266753, + "loss": 13.029983520507812, + "step": 1710 + }, + { + "epoch": 1.9575861087389694, + "grad_norm": 6.07528829574585, + "learning_rate": 0.00019068746952705999, + "loss": 13.375914001464844, + "step": 1720 + }, + { + "epoch": 1.9689723882721322, + "grad_norm": 6.209125995635986, + "learning_rate": 0.0001906332954114524, + "loss": 13.055149841308594, + "step": 1730 + }, + { + "epoch": 1.9803586678052945, + "grad_norm": 5.829214572906494, + "learning_rate": 0.00019057912129584486, + "loss": 13.251846313476562, + "step": 1740 + }, + { + "epoch": 1.9917449473384572, + "grad_norm": 5.681432723999023, + "learning_rate": 0.0001905249471802373, + "loss": 13.179347229003906, + "step": 1750 + }, + { + "epoch": 2.0022772559066326, + "grad_norm": 6.714110374450684, + "learning_rate": 0.00019047077306462974, + "loss": 12.078095245361329, + "step": 1760 + }, + { + "epoch": 2.013663535439795, + "grad_norm": 5.950106620788574, + "learning_rate": 0.00019041659894902216, + "loss": 12.684372711181641, + "step": 1770 + }, + { + "epoch": 2.0250498149729577, + "grad_norm": 6.075564861297607, + "learning_rate": 0.00019036242483341459, + "loss": 12.492652893066406, + "step": 1780 + }, + { + "epoch": 2.03643609450612, + "grad_norm": 5.315488338470459, + "learning_rate": 0.00019030825071780704, + "loss": 12.558822631835938, + "step": 1790 + }, + { + "epoch": 2.0478223740392827, + "grad_norm": 6.444157600402832, + "learning_rate": 0.00019025407660219946, + "loss": 12.013819122314453, + "step": 1800 + }, + { + "epoch": 2.059208653572445, + "grad_norm": 7.443634510040283, + "learning_rate": 0.00019019990248659191, + "loss": 12.401322174072266, + "step": 1810 + }, + { + "epoch": 2.0705949331056077, + "grad_norm": 6.039581298828125, + "learning_rate": 0.00019014572837098434, + "loss": 11.99999542236328, + "step": 1820 + }, + { + "epoch": 2.0819812126387705, + "grad_norm": 6.211398601531982, + "learning_rate": 0.0001900915542553768, + "loss": 12.471622467041016, + "step": 1830 + }, + { + "epoch": 2.0933674921719327, + "grad_norm": 5.827359199523926, + "learning_rate": 0.00019003738013976921, + "loss": 12.675922393798828, + "step": 1840 + }, + { + "epoch": 2.1047537717050955, + "grad_norm": 6.171777248382568, + "learning_rate": 0.00018998320602416167, + "loss": 12.37955093383789, + "step": 1850 + }, + { + "epoch": 2.1161400512382578, + "grad_norm": 5.889030456542969, + "learning_rate": 0.00018992903190855412, + "loss": 12.620635986328125, + "step": 1860 + }, + { + "epoch": 2.1275263307714205, + "grad_norm": 5.960679531097412, + "learning_rate": 0.00018987485779294654, + "loss": 12.005726623535157, + "step": 1870 + }, + { + "epoch": 2.138912610304583, + "grad_norm": 6.680525302886963, + "learning_rate": 0.000189820683677339, + "loss": 12.5527099609375, + "step": 1880 + }, + { + "epoch": 2.1502988898377455, + "grad_norm": 6.306058883666992, + "learning_rate": 0.00018976650956173142, + "loss": 12.299282836914063, + "step": 1890 + }, + { + "epoch": 2.161685169370908, + "grad_norm": 6.001065254211426, + "learning_rate": 0.00018971233544612387, + "loss": 12.561842346191407, + "step": 1900 + }, + { + "epoch": 2.1730714489040706, + "grad_norm": 5.667927265167236, + "learning_rate": 0.0001896581613305163, + "loss": 12.342430877685548, + "step": 1910 + }, + { + "epoch": 2.1844577284372333, + "grad_norm": 5.749220848083496, + "learning_rate": 0.00018960398721490872, + "loss": 11.777164459228516, + "step": 1920 + }, + { + "epoch": 2.1958440079703956, + "grad_norm": 5.756120681762695, + "learning_rate": 0.00018954981309930114, + "loss": 12.499293518066406, + "step": 1930 + }, + { + "epoch": 2.2072302875035583, + "grad_norm": 5.476713180541992, + "learning_rate": 0.0001894956389836936, + "loss": 12.119635009765625, + "step": 1940 + }, + { + "epoch": 2.2186165670367206, + "grad_norm": 6.148690700531006, + "learning_rate": 0.00018944146486808602, + "loss": 12.262448120117188, + "step": 1950 + }, + { + "epoch": 2.2300028465698833, + "grad_norm": 6.188674449920654, + "learning_rate": 0.00018938729075247847, + "loss": 11.836429595947266, + "step": 1960 + }, + { + "epoch": 2.2413891261030456, + "grad_norm": 5.806178569793701, + "learning_rate": 0.00018933311663687092, + "loss": 12.008864593505859, + "step": 1970 + }, + { + "epoch": 2.2527754056362084, + "grad_norm": 6.355106830596924, + "learning_rate": 0.00018927894252126335, + "loss": 12.07802505493164, + "step": 1980 + }, + { + "epoch": 2.2641616851693707, + "grad_norm": 5.5258612632751465, + "learning_rate": 0.0001892247684056558, + "loss": 12.283776092529298, + "step": 1990 + }, + { + "epoch": 2.2755479647025334, + "grad_norm": 5.87457275390625, + "learning_rate": 0.00018917059429004822, + "loss": 12.212881469726563, + "step": 2000 + }, + { + "epoch": 2.286934244235696, + "grad_norm": 5.784579277038574, + "learning_rate": 0.00018911642017444067, + "loss": 12.175121307373047, + "step": 2010 + }, + { + "epoch": 2.2983205237688584, + "grad_norm": 5.964444637298584, + "learning_rate": 0.00018906766347039387, + "loss": 12.410752868652343, + "step": 2020 + }, + { + "epoch": 2.309706803302021, + "grad_norm": 5.863014221191406, + "learning_rate": 0.00018901890676634707, + "loss": 12.259366607666015, + "step": 2030 + }, + { + "epoch": 2.3210930828351835, + "grad_norm": 6.4362382888793945, + "learning_rate": 0.0001889647326507395, + "loss": 12.253427124023437, + "step": 2040 + }, + { + "epoch": 2.332479362368346, + "grad_norm": 5.864607810974121, + "learning_rate": 0.00018891055853513194, + "loss": 12.033533477783203, + "step": 2050 + }, + { + "epoch": 2.343865641901509, + "grad_norm": 6.0086822509765625, + "learning_rate": 0.00018885638441952437, + "loss": 12.2366943359375, + "step": 2060 + }, + { + "epoch": 2.355251921434671, + "grad_norm": 5.782958507537842, + "learning_rate": 0.00018880221030391682, + "loss": 12.448239135742188, + "step": 2070 + }, + { + "epoch": 2.366638200967834, + "grad_norm": 5.685044288635254, + "learning_rate": 0.00018874803618830924, + "loss": 11.95639419555664, + "step": 2080 + }, + { + "epoch": 2.3780244805009962, + "grad_norm": 5.853184700012207, + "learning_rate": 0.00018869386207270167, + "loss": 12.157659912109375, + "step": 2090 + }, + { + "epoch": 2.389410760034159, + "grad_norm": 5.814517974853516, + "learning_rate": 0.0001886396879570941, + "loss": 11.77392349243164, + "step": 2100 + }, + { + "epoch": 2.4007970395673213, + "grad_norm": 6.793750286102295, + "learning_rate": 0.00018858551384148654, + "loss": 12.14357681274414, + "step": 2110 + }, + { + "epoch": 2.412183319100484, + "grad_norm": 5.957283020019531, + "learning_rate": 0.00018853133972587897, + "loss": 11.823822021484375, + "step": 2120 + }, + { + "epoch": 2.4235695986336463, + "grad_norm": 6.891757488250732, + "learning_rate": 0.00018847716561027142, + "loss": 12.402496337890625, + "step": 2130 + }, + { + "epoch": 2.434955878166809, + "grad_norm": 5.9682135581970215, + "learning_rate": 0.00018842299149466387, + "loss": 12.376535797119141, + "step": 2140 + }, + { + "epoch": 2.4463421576999718, + "grad_norm": 6.419319152832031, + "learning_rate": 0.0001883688173790563, + "loss": 11.78542709350586, + "step": 2150 + }, + { + "epoch": 2.457728437233134, + "grad_norm": 5.599404335021973, + "learning_rate": 0.00018831464326344875, + "loss": 11.65877685546875, + "step": 2160 + }, + { + "epoch": 2.469114716766297, + "grad_norm": 6.27384090423584, + "learning_rate": 0.00018826046914784117, + "loss": 12.384040832519531, + "step": 2170 + }, + { + "epoch": 2.480500996299459, + "grad_norm": 6.210123062133789, + "learning_rate": 0.00018820629503223362, + "loss": 11.9489990234375, + "step": 2180 + }, + { + "epoch": 2.491887275832622, + "grad_norm": 6.209619998931885, + "learning_rate": 0.00018815212091662605, + "loss": 12.142713928222657, + "step": 2190 + }, + { + "epoch": 2.503273555365784, + "grad_norm": 5.776041030883789, + "learning_rate": 0.0001880979468010185, + "loss": 12.368475341796875, + "step": 2200 + }, + { + "epoch": 2.514659834898947, + "grad_norm": 5.5831217765808105, + "learning_rate": 0.00018804377268541092, + "loss": 12.049860382080078, + "step": 2210 + }, + { + "epoch": 2.526046114432109, + "grad_norm": 6.643979072570801, + "learning_rate": 0.00018798959856980335, + "loss": 12.57002944946289, + "step": 2220 + }, + { + "epoch": 2.537432393965272, + "grad_norm": 5.437014102935791, + "learning_rate": 0.00018793542445419577, + "loss": 12.165242767333984, + "step": 2230 + }, + { + "epoch": 2.5488186734984346, + "grad_norm": 6.001914024353027, + "learning_rate": 0.00018788125033858822, + "loss": 11.841060638427734, + "step": 2240 + }, + { + "epoch": 2.560204953031597, + "grad_norm": 6.4120259284973145, + "learning_rate": 0.00018782707622298065, + "loss": 11.942637634277343, + "step": 2250 + }, + { + "epoch": 2.571591232564759, + "grad_norm": 6.355876922607422, + "learning_rate": 0.0001877729021073731, + "loss": 11.278862762451173, + "step": 2260 + }, + { + "epoch": 2.582977512097922, + "grad_norm": 5.9582600593566895, + "learning_rate": 0.00018771872799176555, + "loss": 12.05511932373047, + "step": 2270 + }, + { + "epoch": 2.5943637916310847, + "grad_norm": 6.194489479064941, + "learning_rate": 0.00018766455387615798, + "loss": 11.693316650390624, + "step": 2280 + }, + { + "epoch": 2.605750071164247, + "grad_norm": 6.129605770111084, + "learning_rate": 0.00018761037976055043, + "loss": 11.912431335449218, + "step": 2290 + }, + { + "epoch": 2.6171363506974097, + "grad_norm": 5.7922444343566895, + "learning_rate": 0.00018755620564494285, + "loss": 11.996822357177734, + "step": 2300 + }, + { + "epoch": 2.628522630230572, + "grad_norm": 5.562745094299316, + "learning_rate": 0.0001875020315293353, + "loss": 12.01687240600586, + "step": 2310 + }, + { + "epoch": 2.6399089097637347, + "grad_norm": 5.735344886779785, + "learning_rate": 0.00018744785741372773, + "loss": 12.01505355834961, + "step": 2320 + }, + { + "epoch": 2.6512951892968974, + "grad_norm": 5.924304485321045, + "learning_rate": 0.00018739368329812018, + "loss": 11.80968017578125, + "step": 2330 + }, + { + "epoch": 2.6626814688300597, + "grad_norm": 5.843868732452393, + "learning_rate": 0.0001873395091825126, + "loss": 12.081328582763671, + "step": 2340 + }, + { + "epoch": 2.6740677483632225, + "grad_norm": 5.781383514404297, + "learning_rate": 0.00018728533506690506, + "loss": 12.280255889892578, + "step": 2350 + }, + { + "epoch": 2.6854540278963848, + "grad_norm": 5.876210689544678, + "learning_rate": 0.00018723116095129748, + "loss": 11.941732788085938, + "step": 2360 + }, + { + "epoch": 2.6968403074295475, + "grad_norm": 5.828925609588623, + "learning_rate": 0.0001871769868356899, + "loss": 11.539690399169922, + "step": 2370 + }, + { + "epoch": 2.70822658696271, + "grad_norm": 6.443966388702393, + "learning_rate": 0.00018712281272008236, + "loss": 11.438835144042969, + "step": 2380 + }, + { + "epoch": 2.7196128664958725, + "grad_norm": 5.669212818145752, + "learning_rate": 0.00018706863860447478, + "loss": 12.036208343505859, + "step": 2390 + }, + { + "epoch": 2.730999146029035, + "grad_norm": 5.616700649261475, + "learning_rate": 0.00018701446448886723, + "loss": 11.880667114257813, + "step": 2400 + }, + { + "epoch": 2.7423854255621976, + "grad_norm": 5.890092372894287, + "learning_rate": 0.00018696029037325966, + "loss": 11.707859802246094, + "step": 2410 + }, + { + "epoch": 2.7537717050953603, + "grad_norm": 5.797362804412842, + "learning_rate": 0.0001869061162576521, + "loss": 11.643144989013672, + "step": 2420 + }, + { + "epoch": 2.7651579846285226, + "grad_norm": 5.619494438171387, + "learning_rate": 0.00018685194214204453, + "loss": 11.876329803466797, + "step": 2430 + }, + { + "epoch": 2.7765442641616853, + "grad_norm": 5.690425395965576, + "learning_rate": 0.00018679776802643698, + "loss": 11.896712493896484, + "step": 2440 + }, + { + "epoch": 2.7879305436948476, + "grad_norm": 5.649746894836426, + "learning_rate": 0.0001867435939108294, + "loss": 11.640943908691407, + "step": 2450 + }, + { + "epoch": 2.7993168232280103, + "grad_norm": 5.736121654510498, + "learning_rate": 0.00018668941979522186, + "loss": 11.80575942993164, + "step": 2460 + }, + { + "epoch": 2.8107031027611726, + "grad_norm": 5.586820125579834, + "learning_rate": 0.00018663524567961428, + "loss": 11.54848861694336, + "step": 2470 + }, + { + "epoch": 2.8220893822943354, + "grad_norm": 5.552623271942139, + "learning_rate": 0.00018658107156400674, + "loss": 11.699449157714843, + "step": 2480 + }, + { + "epoch": 2.8334756618274977, + "grad_norm": 5.449943542480469, + "learning_rate": 0.00018652689744839916, + "loss": 12.021916961669922, + "step": 2490 + }, + { + "epoch": 2.8448619413606604, + "grad_norm": 6.037112712860107, + "learning_rate": 0.0001864727233327916, + "loss": 11.778411865234375, + "step": 2500 + }, + { + "epoch": 2.856248220893823, + "grad_norm": 5.5578813552856445, + "learning_rate": 0.00018641854921718404, + "loss": 11.259034729003906, + "step": 2510 + }, + { + "epoch": 2.8676345004269854, + "grad_norm": 5.7229719161987305, + "learning_rate": 0.00018636437510157646, + "loss": 11.734458923339844, + "step": 2520 + }, + { + "epoch": 2.879020779960148, + "grad_norm": 5.95925235748291, + "learning_rate": 0.0001863102009859689, + "loss": 11.601738739013673, + "step": 2530 + }, + { + "epoch": 2.8904070594933104, + "grad_norm": 5.8496479988098145, + "learning_rate": 0.00018625602687036134, + "loss": 11.096221923828125, + "step": 2540 + }, + { + "epoch": 2.901793339026473, + "grad_norm": 6.491213321685791, + "learning_rate": 0.0001862018527547538, + "loss": 11.531390380859374, + "step": 2550 + }, + { + "epoch": 2.9131796185596355, + "grad_norm": 5.992333889007568, + "learning_rate": 0.0001861476786391462, + "loss": 11.45031509399414, + "step": 2560 + }, + { + "epoch": 2.924565898092798, + "grad_norm": 5.646914005279541, + "learning_rate": 0.00018609350452353866, + "loss": 11.133451080322265, + "step": 2570 + }, + { + "epoch": 2.9359521776259605, + "grad_norm": 5.3638176918029785, + "learning_rate": 0.0001860393304079311, + "loss": 11.416927337646484, + "step": 2580 + }, + { + "epoch": 2.9473384571591232, + "grad_norm": 5.817018985748291, + "learning_rate": 0.00018598515629232354, + "loss": 11.489157104492188, + "step": 2590 + }, + { + "epoch": 2.958724736692286, + "grad_norm": 6.094698905944824, + "learning_rate": 0.000185930982176716, + "loss": 11.472473907470704, + "step": 2600 + }, + { + "epoch": 2.9701110162254483, + "grad_norm": 6.355233669281006, + "learning_rate": 0.00018587680806110842, + "loss": 11.384600067138672, + "step": 2610 + }, + { + "epoch": 2.981497295758611, + "grad_norm": 5.65998649597168, + "learning_rate": 0.00018582263394550087, + "loss": 11.349325561523438, + "step": 2620 + }, + { + "epoch": 2.9928835752917733, + "grad_norm": 6.039316654205322, + "learning_rate": 0.0001857684598298933, + "loss": 11.65872802734375, + "step": 2630 + }, + { + "epoch": 3.0034158838599487, + "grad_norm": 5.6023993492126465, + "learning_rate": 0.00018571428571428572, + "loss": 10.584882354736328, + "step": 2640 + }, + { + "epoch": 3.0148021633931115, + "grad_norm": 6.057605743408203, + "learning_rate": 0.00018566011159867814, + "loss": 11.056887817382812, + "step": 2650 + }, + { + "epoch": 3.0261884429262738, + "grad_norm": 5.620913028717041, + "learning_rate": 0.0001856059374830706, + "loss": 11.04276351928711, + "step": 2660 + }, + { + "epoch": 3.0375747224594365, + "grad_norm": 5.724585056304932, + "learning_rate": 0.00018555176336746302, + "loss": 10.584162139892578, + "step": 2670 + }, + { + "epoch": 3.048961001992599, + "grad_norm": 6.663942337036133, + "learning_rate": 0.00018549758925185547, + "loss": 10.474478149414063, + "step": 2680 + }, + { + "epoch": 3.0603472815257615, + "grad_norm": 5.8548688888549805, + "learning_rate": 0.0001854434151362479, + "loss": 10.782363891601562, + "step": 2690 + }, + { + "epoch": 3.071733561058924, + "grad_norm": 5.4828314781188965, + "learning_rate": 0.00018538924102064035, + "loss": 10.86609115600586, + "step": 2700 + }, + { + "epoch": 3.0831198405920865, + "grad_norm": 5.855891704559326, + "learning_rate": 0.0001853350669050328, + "loss": 10.718875885009766, + "step": 2710 + }, + { + "epoch": 3.0945061201252493, + "grad_norm": 5.583263874053955, + "learning_rate": 0.00018528089278942522, + "loss": 10.526648712158202, + "step": 2720 + }, + { + "epoch": 3.1058923996584116, + "grad_norm": 5.6130690574646, + "learning_rate": 0.00018522671867381767, + "loss": 10.729558563232422, + "step": 2730 + }, + { + "epoch": 3.1172786791915743, + "grad_norm": 6.017326354980469, + "learning_rate": 0.0001851725445582101, + "loss": 10.696547698974609, + "step": 2740 + }, + { + "epoch": 3.1286649587247366, + "grad_norm": 5.918975353240967, + "learning_rate": 0.00018511837044260255, + "loss": 10.813334655761718, + "step": 2750 + }, + { + "epoch": 3.1400512382578993, + "grad_norm": 5.893662929534912, + "learning_rate": 0.00018506419632699497, + "loss": 10.956719970703125, + "step": 2760 + }, + { + "epoch": 3.1514375177910616, + "grad_norm": 6.190542221069336, + "learning_rate": 0.00018501002221138743, + "loss": 10.913722229003906, + "step": 2770 + }, + { + "epoch": 3.1628237973242244, + "grad_norm": 6.2601237297058105, + "learning_rate": 0.00018495584809577985, + "loss": 10.506605529785157, + "step": 2780 + }, + { + "epoch": 3.1742100768573867, + "grad_norm": 5.85802698135376, + "learning_rate": 0.00018490167398017227, + "loss": 11.01781005859375, + "step": 2790 + }, + { + "epoch": 3.1855963563905494, + "grad_norm": 5.59824800491333, + "learning_rate": 0.0001848474998645647, + "loss": 10.554005432128907, + "step": 2800 + }, + { + "epoch": 3.196982635923712, + "grad_norm": 5.9762282371521, + "learning_rate": 0.00018479332574895715, + "loss": 11.067266082763672, + "step": 2810 + }, + { + "epoch": 3.2083689154568744, + "grad_norm": 5.892134189605713, + "learning_rate": 0.0001847391516333496, + "loss": 10.800299072265625, + "step": 2820 + }, + { + "epoch": 3.219755194990037, + "grad_norm": 6.411027431488037, + "learning_rate": 0.00018468497751774203, + "loss": 10.963347625732421, + "step": 2830 + }, + { + "epoch": 3.2311414745231994, + "grad_norm": 5.7685394287109375, + "learning_rate": 0.00018463080340213448, + "loss": 10.73892822265625, + "step": 2840 + }, + { + "epoch": 3.242527754056362, + "grad_norm": 5.961293697357178, + "learning_rate": 0.0001845766292865269, + "loss": 10.619090270996093, + "step": 2850 + }, + { + "epoch": 3.2539140335895245, + "grad_norm": 7.078546524047852, + "learning_rate": 0.00018452245517091935, + "loss": 10.756937408447266, + "step": 2860 + }, + { + "epoch": 3.265300313122687, + "grad_norm": 6.000650405883789, + "learning_rate": 0.00018446828105531178, + "loss": 11.32503662109375, + "step": 2870 + }, + { + "epoch": 3.2766865926558495, + "grad_norm": 5.751965045928955, + "learning_rate": 0.00018441410693970423, + "loss": 11.059173583984375, + "step": 2880 + }, + { + "epoch": 3.2880728721890122, + "grad_norm": 6.229584693908691, + "learning_rate": 0.00018435993282409665, + "loss": 10.77396469116211, + "step": 2890 + }, + { + "epoch": 3.299459151722175, + "grad_norm": 5.8026123046875, + "learning_rate": 0.0001843057587084891, + "loss": 10.68900146484375, + "step": 2900 + }, + { + "epoch": 3.3108454312553373, + "grad_norm": 5.8648529052734375, + "learning_rate": 0.00018425158459288153, + "loss": 10.619082641601562, + "step": 2910 + }, + { + "epoch": 3.3222317107885, + "grad_norm": 6.061291217803955, + "learning_rate": 0.00018419741047727395, + "loss": 11.001821899414063, + "step": 2920 + }, + { + "epoch": 3.3336179903216623, + "grad_norm": 6.154610633850098, + "learning_rate": 0.0001841432363616664, + "loss": 10.811206817626953, + "step": 2930 + }, + { + "epoch": 3.345004269854825, + "grad_norm": 5.7560133934021, + "learning_rate": 0.00018408906224605883, + "loss": 11.004631805419923, + "step": 2940 + }, + { + "epoch": 3.3563905493879873, + "grad_norm": 5.845217227935791, + "learning_rate": 0.00018403488813045128, + "loss": 10.858744049072266, + "step": 2950 + }, + { + "epoch": 3.36777682892115, + "grad_norm": 5.666683197021484, + "learning_rate": 0.0001839807140148437, + "loss": 11.276985931396485, + "step": 2960 + }, + { + "epoch": 3.3791631084543123, + "grad_norm": 6.077807426452637, + "learning_rate": 0.00018392653989923616, + "loss": 10.676035308837891, + "step": 2970 + }, + { + "epoch": 3.390549387987475, + "grad_norm": 5.791286468505859, + "learning_rate": 0.00018387236578362858, + "loss": 10.966339874267579, + "step": 2980 + }, + { + "epoch": 3.401935667520638, + "grad_norm": 6.040390968322754, + "learning_rate": 0.00018381819166802103, + "loss": 10.893710327148437, + "step": 2990 + }, + { + "epoch": 3.4133219470538, + "grad_norm": 5.881897926330566, + "learning_rate": 0.00018376401755241346, + "loss": 10.969651031494141, + "step": 3000 + }, + { + "epoch": 3.424708226586963, + "grad_norm": 6.096680641174316, + "learning_rate": 0.0001837098434368059, + "loss": 10.903501892089844, + "step": 3010 + }, + { + "epoch": 3.436094506120125, + "grad_norm": 6.454861164093018, + "learning_rate": 0.00018365566932119834, + "loss": 10.728018188476563, + "step": 3020 + }, + { + "epoch": 3.447480785653288, + "grad_norm": 5.956232070922852, + "learning_rate": 0.0001836014952055908, + "loss": 10.590874481201173, + "step": 3030 + }, + { + "epoch": 3.45886706518645, + "grad_norm": 5.892461776733398, + "learning_rate": 0.0001835473210899832, + "loss": 10.788093566894531, + "step": 3040 + }, + { + "epoch": 3.470253344719613, + "grad_norm": 5.8851423263549805, + "learning_rate": 0.00018349314697437566, + "loss": 10.701885223388672, + "step": 3050 + }, + { + "epoch": 3.481639624252775, + "grad_norm": 6.0876078605651855, + "learning_rate": 0.0001834389728587681, + "loss": 10.101341247558594, + "step": 3060 + }, + { + "epoch": 3.493025903785938, + "grad_norm": 6.961325168609619, + "learning_rate": 0.0001833847987431605, + "loss": 10.46026153564453, + "step": 3070 + }, + { + "epoch": 3.5044121833191006, + "grad_norm": 6.137760639190674, + "learning_rate": 0.00018333062462755296, + "loss": 10.805878448486329, + "step": 3080 + }, + { + "epoch": 3.515798462852263, + "grad_norm": 5.97675895690918, + "learning_rate": 0.0001832764505119454, + "loss": 11.289669799804688, + "step": 3090 + }, + { + "epoch": 3.5271847423854257, + "grad_norm": 6.564485549926758, + "learning_rate": 0.00018322227639633784, + "loss": 10.845318603515626, + "step": 3100 + }, + { + "epoch": 3.538571021918588, + "grad_norm": 6.3509979248046875, + "learning_rate": 0.00018316810228073026, + "loss": 10.126212310791015, + "step": 3110 + }, + { + "epoch": 3.5499573014517507, + "grad_norm": 5.984918117523193, + "learning_rate": 0.00018311392816512272, + "loss": 10.31814956665039, + "step": 3120 + }, + { + "epoch": 3.5613435809849134, + "grad_norm": 5.558696746826172, + "learning_rate": 0.00018305975404951514, + "loss": 10.726148223876953, + "step": 3130 + }, + { + "epoch": 3.5727298605180757, + "grad_norm": 6.394120216369629, + "learning_rate": 0.0001830055799339076, + "loss": 10.786263275146485, + "step": 3140 + }, + { + "epoch": 3.584116140051238, + "grad_norm": 5.847959041595459, + "learning_rate": 0.00018295140581830002, + "loss": 10.560782623291015, + "step": 3150 + }, + { + "epoch": 3.5955024195844008, + "grad_norm": 5.2607879638671875, + "learning_rate": 0.00018289723170269247, + "loss": 10.637435913085938, + "step": 3160 + }, + { + "epoch": 3.6068886991175635, + "grad_norm": 6.327804088592529, + "learning_rate": 0.00018284305758708492, + "loss": 11.141863250732422, + "step": 3170 + }, + { + "epoch": 3.618274978650726, + "grad_norm": 5.784237861633301, + "learning_rate": 0.00018278888347147734, + "loss": 10.513392639160156, + "step": 3180 + }, + { + "epoch": 3.6296612581838885, + "grad_norm": 5.775552272796631, + "learning_rate": 0.0001827347093558698, + "loss": 10.784326934814453, + "step": 3190 + }, + { + "epoch": 3.641047537717051, + "grad_norm": 5.3834075927734375, + "learning_rate": 0.00018268053524026222, + "loss": 11.03073959350586, + "step": 3200 + }, + { + "epoch": 3.6524338172502135, + "grad_norm": 5.9715800285339355, + "learning_rate": 0.00018262636112465464, + "loss": 11.21976318359375, + "step": 3210 + }, + { + "epoch": 3.6638200967833763, + "grad_norm": 5.637823104858398, + "learning_rate": 0.00018257218700904707, + "loss": 10.567466735839844, + "step": 3220 + }, + { + "epoch": 3.6752063763165386, + "grad_norm": 6.525228500366211, + "learning_rate": 0.00018251801289343952, + "loss": 10.669526672363281, + "step": 3230 + }, + { + "epoch": 3.686592655849701, + "grad_norm": 5.675905704498291, + "learning_rate": 0.00018246383877783194, + "loss": 10.238113403320312, + "step": 3240 + }, + { + "epoch": 3.6979789353828636, + "grad_norm": 6.053264617919922, + "learning_rate": 0.0001824096646622244, + "loss": 10.952618408203126, + "step": 3250 + }, + { + "epoch": 3.7093652149160263, + "grad_norm": 6.067233562469482, + "learning_rate": 0.00018235549054661682, + "loss": 10.320521545410156, + "step": 3260 + }, + { + "epoch": 3.7207514944491886, + "grad_norm": 6.502700328826904, + "learning_rate": 0.00018230131643100927, + "loss": 10.383914947509766, + "step": 3270 + }, + { + "epoch": 3.7321377739823514, + "grad_norm": 5.968015670776367, + "learning_rate": 0.00018224714231540172, + "loss": 10.5248779296875, + "step": 3280 + }, + { + "epoch": 3.7435240535155136, + "grad_norm": 5.4662628173828125, + "learning_rate": 0.00018219296819979415, + "loss": 10.531697082519532, + "step": 3290 + }, + { + "epoch": 3.7549103330486764, + "grad_norm": 5.9595255851745605, + "learning_rate": 0.0001821387940841866, + "loss": 10.61429443359375, + "step": 3300 + }, + { + "epoch": 3.766296612581839, + "grad_norm": 6.310218334197998, + "learning_rate": 0.00018208461996857902, + "loss": 10.1953125, + "step": 3310 + }, + { + "epoch": 3.7776828921150014, + "grad_norm": 5.952932834625244, + "learning_rate": 0.00018203044585297148, + "loss": 10.674753570556641, + "step": 3320 + }, + { + "epoch": 3.7890691716481637, + "grad_norm": 6.101319313049316, + "learning_rate": 0.0001819762717373639, + "loss": 10.45705337524414, + "step": 3330 + }, + { + "epoch": 3.8004554511813264, + "grad_norm": 5.611819744110107, + "learning_rate": 0.00018192209762175632, + "loss": 10.72674560546875, + "step": 3340 + }, + { + "epoch": 3.811841730714489, + "grad_norm": 5.927121162414551, + "learning_rate": 0.00018186792350614878, + "loss": 10.702400970458985, + "step": 3350 + }, + { + "epoch": 3.8232280102476515, + "grad_norm": 6.374614715576172, + "learning_rate": 0.0001818137493905412, + "loss": 10.934248352050782, + "step": 3360 + }, + { + "epoch": 3.834614289780814, + "grad_norm": 6.032021522521973, + "learning_rate": 0.00018175957527493363, + "loss": 10.430912017822266, + "step": 3370 + }, + { + "epoch": 3.8460005693139765, + "grad_norm": 5.958329677581787, + "learning_rate": 0.00018170540115932608, + "loss": 10.657380676269531, + "step": 3380 + }, + { + "epoch": 3.8573868488471392, + "grad_norm": 5.539936542510986, + "learning_rate": 0.00018165122704371853, + "loss": 10.864521026611328, + "step": 3390 + }, + { + "epoch": 3.868773128380302, + "grad_norm": 5.501955986022949, + "learning_rate": 0.00018159705292811095, + "loss": 10.671641540527343, + "step": 3400 + }, + { + "epoch": 3.8801594079134643, + "grad_norm": 6.021368503570557, + "learning_rate": 0.0001815428788125034, + "loss": 10.651559448242187, + "step": 3410 + }, + { + "epoch": 3.8915456874466265, + "grad_norm": 5.438447952270508, + "learning_rate": 0.00018148870469689583, + "loss": 10.541700744628907, + "step": 3420 + }, + { + "epoch": 3.9029319669797893, + "grad_norm": 5.7153215408325195, + "learning_rate": 0.00018143453058128828, + "loss": 10.635433959960938, + "step": 3430 + }, + { + "epoch": 3.914318246512952, + "grad_norm": 5.716714382171631, + "learning_rate": 0.0001813803564656807, + "loss": 10.716056060791015, + "step": 3440 + }, + { + "epoch": 3.9257045260461143, + "grad_norm": 5.737730979919434, + "learning_rate": 0.00018132618235007316, + "loss": 10.449008178710937, + "step": 3450 + }, + { + "epoch": 3.937090805579277, + "grad_norm": 5.603618621826172, + "learning_rate": 0.00018127200823446558, + "loss": 10.800203704833985, + "step": 3460 + }, + { + "epoch": 3.9484770851124393, + "grad_norm": 5.745213508605957, + "learning_rate": 0.00018121783411885803, + "loss": 10.808299255371093, + "step": 3470 + }, + { + "epoch": 3.959863364645602, + "grad_norm": 5.7340850830078125, + "learning_rate": 0.00018116366000325046, + "loss": 10.769782257080077, + "step": 3480 + }, + { + "epoch": 3.971249644178765, + "grad_norm": 6.192749500274658, + "learning_rate": 0.00018110948588764288, + "loss": 10.350373840332031, + "step": 3490 + }, + { + "epoch": 3.982635923711927, + "grad_norm": 5.680601119995117, + "learning_rate": 0.00018105531177203533, + "loss": 10.832538604736328, + "step": 3500 + }, + { + "epoch": 3.99402220324509, + "grad_norm": 5.494757652282715, + "learning_rate": 0.00018100113765642776, + "loss": 10.541521453857422, + "step": 3510 + }, + { + "epoch": 4.004554511813265, + "grad_norm": 5.503895282745361, + "learning_rate": 0.0001809469635408202, + "loss": 9.482205200195313, + "step": 3520 + }, + { + "epoch": 4.015940791346427, + "grad_norm": 5.427241802215576, + "learning_rate": 0.00018089278942521263, + "loss": 9.544134521484375, + "step": 3530 + }, + { + "epoch": 4.02732707087959, + "grad_norm": 6.108213424682617, + "learning_rate": 0.00018083861530960509, + "loss": 10.208637237548828, + "step": 3540 + }, + { + "epoch": 4.038713350412753, + "grad_norm": 5.874690055847168, + "learning_rate": 0.0001807844411939975, + "loss": 9.760848236083984, + "step": 3550 + }, + { + "epoch": 4.050099629945915, + "grad_norm": 5.867772102355957, + "learning_rate": 0.00018073026707838996, + "loss": 9.961907958984375, + "step": 3560 + }, + { + "epoch": 4.061485909479078, + "grad_norm": 5.899345874786377, + "learning_rate": 0.00018067609296278239, + "loss": 9.273954010009765, + "step": 3570 + }, + { + "epoch": 4.07287218901224, + "grad_norm": 6.068745136260986, + "learning_rate": 0.00018062191884717484, + "loss": 10.06835708618164, + "step": 3580 + }, + { + "epoch": 4.084258468545403, + "grad_norm": 6.152563095092773, + "learning_rate": 0.00018056774473156726, + "loss": 9.612554931640625, + "step": 3590 + }, + { + "epoch": 4.095644748078565, + "grad_norm": 6.043203830718994, + "learning_rate": 0.0001805135706159597, + "loss": 9.750363159179688, + "step": 3600 + }, + { + "epoch": 4.107031027611728, + "grad_norm": 5.493103504180908, + "learning_rate": 0.00018045939650035214, + "loss": 9.853363800048829, + "step": 3610 + }, + { + "epoch": 4.11841730714489, + "grad_norm": 6.1369242668151855, + "learning_rate": 0.0001804052223847446, + "loss": 10.104590606689452, + "step": 3620 + }, + { + "epoch": 4.129803586678053, + "grad_norm": 6.410492897033691, + "learning_rate": 0.00018035104826913701, + "loss": 10.213319396972656, + "step": 3630 + }, + { + "epoch": 4.141189866211215, + "grad_norm": 5.654990196228027, + "learning_rate": 0.00018029687415352944, + "loss": 9.714891052246093, + "step": 3640 + }, + { + "epoch": 4.152576145744378, + "grad_norm": 6.079676151275635, + "learning_rate": 0.0001802427000379219, + "loss": 9.392752838134765, + "step": 3650 + }, + { + "epoch": 4.163962425277541, + "grad_norm": 6.427825927734375, + "learning_rate": 0.00018018852592231431, + "loss": 9.643639373779298, + "step": 3660 + }, + { + "epoch": 4.175348704810703, + "grad_norm": 5.963044166564941, + "learning_rate": 0.00018013435180670677, + "loss": 9.924317932128906, + "step": 3670 + }, + { + "epoch": 4.1867349843438655, + "grad_norm": 5.854789733886719, + "learning_rate": 0.0001800801776910992, + "loss": 9.63187255859375, + "step": 3680 + }, + { + "epoch": 4.198121263877028, + "grad_norm": 5.785211086273193, + "learning_rate": 0.00018002600357549164, + "loss": 9.579486846923828, + "step": 3690 + }, + { + "epoch": 4.209507543410191, + "grad_norm": 5.718067169189453, + "learning_rate": 0.00017997182945988407, + "loss": 9.896450805664063, + "step": 3700 + }, + { + "epoch": 4.220893822943353, + "grad_norm": 6.051468849182129, + "learning_rate": 0.00017991765534427652, + "loss": 9.21618194580078, + "step": 3710 + }, + { + "epoch": 4.2322801024765155, + "grad_norm": 5.758811950683594, + "learning_rate": 0.00017986348122866894, + "loss": 9.783828735351562, + "step": 3720 + }, + { + "epoch": 4.243666382009678, + "grad_norm": 6.194498062133789, + "learning_rate": 0.0001798093071130614, + "loss": 9.940241241455078, + "step": 3730 + }, + { + "epoch": 4.255052661542841, + "grad_norm": 5.6434125900268555, + "learning_rate": 0.00017975513299745385, + "loss": 9.683363342285157, + "step": 3740 + }, + { + "epoch": 4.266438941076004, + "grad_norm": 5.981060028076172, + "learning_rate": 0.00017970095888184627, + "loss": 10.015727996826172, + "step": 3750 + }, + { + "epoch": 4.277825220609166, + "grad_norm": 5.95534610748291, + "learning_rate": 0.0001796467847662387, + "loss": 10.008995819091798, + "step": 3760 + }, + { + "epoch": 4.289211500142328, + "grad_norm": 6.140223503112793, + "learning_rate": 0.00017959261065063112, + "loss": 9.99697265625, + "step": 3770 + }, + { + "epoch": 4.300597779675491, + "grad_norm": 5.725556373596191, + "learning_rate": 0.00017953843653502357, + "loss": 9.58441390991211, + "step": 3780 + }, + { + "epoch": 4.311984059208654, + "grad_norm": 6.234734058380127, + "learning_rate": 0.000179484262419416, + "loss": 10.283954620361328, + "step": 3790 + }, + { + "epoch": 4.323370338741816, + "grad_norm": 5.869581699371338, + "learning_rate": 0.00017943008830380845, + "loss": 9.939014434814453, + "step": 3800 + }, + { + "epoch": 4.334756618274978, + "grad_norm": 6.069183826446533, + "learning_rate": 0.00017937591418820087, + "loss": 9.780485534667969, + "step": 3810 + }, + { + "epoch": 4.346142897808141, + "grad_norm": 6.241194725036621, + "learning_rate": 0.00017932174007259332, + "loss": 9.680929565429688, + "step": 3820 + }, + { + "epoch": 4.357529177341304, + "grad_norm": 5.660580635070801, + "learning_rate": 0.00017926756595698575, + "loss": 9.670439147949219, + "step": 3830 + }, + { + "epoch": 4.368915456874467, + "grad_norm": 6.770995140075684, + "learning_rate": 0.0001792133918413782, + "loss": 10.093508911132812, + "step": 3840 + }, + { + "epoch": 4.380301736407628, + "grad_norm": 6.052023887634277, + "learning_rate": 0.00017915921772577065, + "loss": 10.153280639648438, + "step": 3850 + }, + { + "epoch": 4.391688015940791, + "grad_norm": 5.781612396240234, + "learning_rate": 0.00017910504361016308, + "loss": 9.563631439208985, + "step": 3860 + }, + { + "epoch": 4.403074295473954, + "grad_norm": 6.453577995300293, + "learning_rate": 0.00017905086949455553, + "loss": 10.253567504882813, + "step": 3870 + }, + { + "epoch": 4.414460575007117, + "grad_norm": 5.5569539070129395, + "learning_rate": 0.00017899669537894795, + "loss": 9.466841888427734, + "step": 3880 + }, + { + "epoch": 4.425846854540279, + "grad_norm": 6.755964756011963, + "learning_rate": 0.0001789425212633404, + "loss": 10.00699234008789, + "step": 3890 + }, + { + "epoch": 4.437233134073441, + "grad_norm": 5.5527472496032715, + "learning_rate": 0.00017888834714773283, + "loss": 9.851504516601562, + "step": 3900 + }, + { + "epoch": 4.448619413606604, + "grad_norm": 6.0654096603393555, + "learning_rate": 0.00017883417303212525, + "loss": 9.837632751464843, + "step": 3910 + }, + { + "epoch": 4.460005693139767, + "grad_norm": 5.762067794799805, + "learning_rate": 0.00017877999891651768, + "loss": 9.580419158935547, + "step": 3920 + }, + { + "epoch": 4.471391972672929, + "grad_norm": 6.3049139976501465, + "learning_rate": 0.00017872582480091013, + "loss": 10.155924987792968, + "step": 3930 + }, + { + "epoch": 4.482778252206091, + "grad_norm": 6.560263633728027, + "learning_rate": 0.00017867165068530255, + "loss": 9.540550231933594, + "step": 3940 + }, + { + "epoch": 4.494164531739254, + "grad_norm": 6.04788064956665, + "learning_rate": 0.000178617476569695, + "loss": 9.617048645019532, + "step": 3950 + }, + { + "epoch": 4.505550811272417, + "grad_norm": 6.14173698425293, + "learning_rate": 0.00017856330245408746, + "loss": 9.965501403808593, + "step": 3960 + }, + { + "epoch": 4.5169370908055795, + "grad_norm": 6.081863880157471, + "learning_rate": 0.00017850912833847988, + "loss": 10.01316146850586, + "step": 3970 + }, + { + "epoch": 4.528323370338741, + "grad_norm": 5.400860786437988, + "learning_rate": 0.00017845495422287233, + "loss": 10.162345886230469, + "step": 3980 + }, + { + "epoch": 4.539709649871904, + "grad_norm": 5.537269592285156, + "learning_rate": 0.00017840078010726476, + "loss": 9.989215850830078, + "step": 3990 + }, + { + "epoch": 4.551095929405067, + "grad_norm": 5.948838710784912, + "learning_rate": 0.0001783466059916572, + "loss": 10.042298126220704, + "step": 4000 + }, + { + "epoch": 4.5624822089382295, + "grad_norm": 5.892082691192627, + "learning_rate": 0.00017829243187604963, + "loss": 10.084636688232422, + "step": 4010 + }, + { + "epoch": 4.573868488471392, + "grad_norm": 6.188459396362305, + "learning_rate": 0.00017823825776044208, + "loss": 9.930858612060547, + "step": 4020 + }, + { + "epoch": 4.585254768004555, + "grad_norm": 6.035775661468506, + "learning_rate": 0.0001781840836448345, + "loss": 9.804845428466797, + "step": 4030 + }, + { + "epoch": 4.596641047537717, + "grad_norm": 6.26588249206543, + "learning_rate": 0.00017812990952922696, + "loss": 9.714436340332032, + "step": 4040 + }, + { + "epoch": 4.60802732707088, + "grad_norm": 5.697126865386963, + "learning_rate": 0.00017807573541361938, + "loss": 9.831003570556641, + "step": 4050 + }, + { + "epoch": 4.619413606604042, + "grad_norm": 5.572251796722412, + "learning_rate": 0.0001780215612980118, + "loss": 10.07955551147461, + "step": 4060 + }, + { + "epoch": 4.630799886137204, + "grad_norm": 5.784109592437744, + "learning_rate": 0.00017796738718240426, + "loss": 9.46136016845703, + "step": 4070 + }, + { + "epoch": 4.642186165670367, + "grad_norm": 6.043442726135254, + "learning_rate": 0.00017791321306679668, + "loss": 10.091639709472656, + "step": 4080 + }, + { + "epoch": 4.65357244520353, + "grad_norm": 5.721872329711914, + "learning_rate": 0.00017785903895118914, + "loss": 10.043199157714843, + "step": 4090 + }, + { + "epoch": 4.664958724736692, + "grad_norm": 5.753310203552246, + "learning_rate": 0.00017780486483558156, + "loss": 9.644969940185547, + "step": 4100 + }, + { + "epoch": 4.676345004269855, + "grad_norm": 6.087526321411133, + "learning_rate": 0.000177750690719974, + "loss": 9.953328704833984, + "step": 4110 + }, + { + "epoch": 4.687731283803018, + "grad_norm": 5.621133327484131, + "learning_rate": 0.00017769651660436644, + "loss": 9.69726104736328, + "step": 4120 + }, + { + "epoch": 4.69911756333618, + "grad_norm": 6.229346752166748, + "learning_rate": 0.0001776423424887589, + "loss": 9.628369140625, + "step": 4130 + }, + { + "epoch": 4.710503842869342, + "grad_norm": 5.986842155456543, + "learning_rate": 0.0001775881683731513, + "loss": 9.742918395996094, + "step": 4140 + }, + { + "epoch": 4.721890122402505, + "grad_norm": 5.907449245452881, + "learning_rate": 0.00017753399425754376, + "loss": 9.967567443847656, + "step": 4150 + }, + { + "epoch": 4.733276401935668, + "grad_norm": 6.126130104064941, + "learning_rate": 0.0001774798201419362, + "loss": 9.773100280761719, + "step": 4160 + }, + { + "epoch": 4.74466268146883, + "grad_norm": 5.871231555938721, + "learning_rate": 0.00017742564602632864, + "loss": 9.900487518310547, + "step": 4170 + }, + { + "epoch": 4.7560489610019925, + "grad_norm": 5.7376275062561035, + "learning_rate": 0.00017737147191072106, + "loss": 9.520377349853515, + "step": 4180 + }, + { + "epoch": 4.767435240535155, + "grad_norm": 5.596216201782227, + "learning_rate": 0.0001773172977951135, + "loss": 10.175640869140626, + "step": 4190 + }, + { + "epoch": 4.778821520068318, + "grad_norm": 6.0666351318359375, + "learning_rate": 0.00017726312367950594, + "loss": 9.647950744628906, + "step": 4200 + }, + { + "epoch": 4.790207799601481, + "grad_norm": 6.583392143249512, + "learning_rate": 0.00017720894956389837, + "loss": 9.790153503417969, + "step": 4210 + }, + { + "epoch": 4.8015940791346425, + "grad_norm": 5.613272190093994, + "learning_rate": 0.00017715477544829082, + "loss": 9.466513061523438, + "step": 4220 + }, + { + "epoch": 4.812980358667805, + "grad_norm": 5.842050552368164, + "learning_rate": 0.00017710060133268324, + "loss": 9.622956085205079, + "step": 4230 + }, + { + "epoch": 4.824366638200968, + "grad_norm": 5.759915828704834, + "learning_rate": 0.0001770464272170757, + "loss": 9.80495834350586, + "step": 4240 + }, + { + "epoch": 4.835752917734131, + "grad_norm": 5.608836650848389, + "learning_rate": 0.00017699225310146812, + "loss": 9.477723693847656, + "step": 4250 + }, + { + "epoch": 4.847139197267293, + "grad_norm": 6.463353157043457, + "learning_rate": 0.00017693807898586057, + "loss": 9.470064544677735, + "step": 4260 + }, + { + "epoch": 4.858525476800455, + "grad_norm": 5.962118148803711, + "learning_rate": 0.000176883904870253, + "loss": 9.367919921875, + "step": 4270 + }, + { + "epoch": 4.869911756333618, + "grad_norm": 5.565269470214844, + "learning_rate": 0.00017682973075464544, + "loss": 9.758961486816407, + "step": 4280 + }, + { + "epoch": 4.881298035866781, + "grad_norm": 5.9193010330200195, + "learning_rate": 0.0001767755566390379, + "loss": 9.685740661621093, + "step": 4290 + }, + { + "epoch": 4.8926843153999435, + "grad_norm": 5.730422019958496, + "learning_rate": 0.00017672138252343032, + "loss": 9.70977554321289, + "step": 4300 + }, + { + "epoch": 4.904070594933105, + "grad_norm": 5.72115421295166, + "learning_rate": 0.00017666720840782277, + "loss": 9.60604476928711, + "step": 4310 + }, + { + "epoch": 4.915456874466268, + "grad_norm": 6.489387035369873, + "learning_rate": 0.0001766130342922152, + "loss": 9.322960662841798, + "step": 4320 + }, + { + "epoch": 4.926843153999431, + "grad_norm": 5.830432415008545, + "learning_rate": 0.00017655886017660762, + "loss": 9.564017486572265, + "step": 4330 + }, + { + "epoch": 4.938229433532594, + "grad_norm": 6.302420616149902, + "learning_rate": 0.00017650468606100005, + "loss": 10.048651123046875, + "step": 4340 + }, + { + "epoch": 4.949615713065755, + "grad_norm": 6.4496660232543945, + "learning_rate": 0.0001764505119453925, + "loss": 9.775814819335938, + "step": 4350 + }, + { + "epoch": 4.961001992598918, + "grad_norm": 6.146448612213135, + "learning_rate": 0.00017639633782978492, + "loss": 9.664933013916016, + "step": 4360 + }, + { + "epoch": 4.972388272132081, + "grad_norm": 6.442460060119629, + "learning_rate": 0.00017634216371417737, + "loss": 9.787579345703126, + "step": 4370 + }, + { + "epoch": 4.983774551665244, + "grad_norm": 5.952507019042969, + "learning_rate": 0.0001762879895985698, + "loss": 9.962568664550782, + "step": 4380 + }, + { + "epoch": 4.995160831198406, + "grad_norm": 5.775949954986572, + "learning_rate": 0.00017623381548296225, + "loss": 9.72928466796875, + "step": 4390 + }, + { + "epoch": 5.005693139766581, + "grad_norm": 6.083469390869141, + "learning_rate": 0.00017617964136735467, + "loss": 8.628085327148437, + "step": 4400 + }, + { + "epoch": 5.017079419299744, + "grad_norm": 6.120087623596191, + "learning_rate": 0.00017612546725174713, + "loss": 8.888311004638672, + "step": 4410 + }, + { + "epoch": 5.028465698832906, + "grad_norm": 6.082287311553955, + "learning_rate": 0.00017607129313613958, + "loss": 9.107136535644532, + "step": 4420 + }, + { + "epoch": 5.039851978366069, + "grad_norm": 5.743897438049316, + "learning_rate": 0.000176017119020532, + "loss": 8.61147232055664, + "step": 4430 + }, + { + "epoch": 5.051238257899231, + "grad_norm": 5.427106857299805, + "learning_rate": 0.00017596294490492445, + "loss": 8.694246673583985, + "step": 4440 + }, + { + "epoch": 5.062624537432394, + "grad_norm": 6.282980442047119, + "learning_rate": 0.00017590877078931688, + "loss": 9.144784545898437, + "step": 4450 + }, + { + "epoch": 5.074010816965557, + "grad_norm": 6.395759105682373, + "learning_rate": 0.0001758545966737093, + "loss": 8.932855224609375, + "step": 4460 + }, + { + "epoch": 5.085397096498719, + "grad_norm": 6.1782917976379395, + "learning_rate": 0.00017580042255810175, + "loss": 9.115586853027343, + "step": 4470 + }, + { + "epoch": 5.0967833760318815, + "grad_norm": 6.45112943649292, + "learning_rate": 0.00017574624844249418, + "loss": 8.773223876953125, + "step": 4480 + }, + { + "epoch": 5.108169655565044, + "grad_norm": 5.9199371337890625, + "learning_rate": 0.0001756920743268866, + "loss": 8.678252410888671, + "step": 4490 + }, + { + "epoch": 5.119555935098207, + "grad_norm": 5.901211738586426, + "learning_rate": 0.00017563790021127905, + "loss": 9.167632293701171, + "step": 4500 + }, + { + "epoch": 5.130942214631369, + "grad_norm": 6.0059661865234375, + "learning_rate": 0.00017558372609567148, + "loss": 8.844408416748047, + "step": 4510 + }, + { + "epoch": 5.1423284941645315, + "grad_norm": 6.615947246551514, + "learning_rate": 0.00017552955198006393, + "loss": 8.853307342529297, + "step": 4520 + }, + { + "epoch": 5.153714773697694, + "grad_norm": 5.707233428955078, + "learning_rate": 0.00017547537786445638, + "loss": 8.580878448486327, + "step": 4530 + }, + { + "epoch": 5.165101053230857, + "grad_norm": 6.256187915802002, + "learning_rate": 0.0001754212037488488, + "loss": 8.889945983886719, + "step": 4540 + }, + { + "epoch": 5.17648733276402, + "grad_norm": 5.724560260772705, + "learning_rate": 0.00017536702963324126, + "loss": 9.283840942382813, + "step": 4550 + }, + { + "epoch": 5.187873612297182, + "grad_norm": 6.063653469085693, + "learning_rate": 0.00017531285551763368, + "loss": 9.45126953125, + "step": 4560 + }, + { + "epoch": 5.199259891830344, + "grad_norm": 6.490650177001953, + "learning_rate": 0.00017525868140202613, + "loss": 9.213851928710938, + "step": 4570 + }, + { + "epoch": 5.210646171363507, + "grad_norm": 6.077616214752197, + "learning_rate": 0.00017520450728641856, + "loss": 8.94876937866211, + "step": 4580 + }, + { + "epoch": 5.22203245089667, + "grad_norm": 6.140579700469971, + "learning_rate": 0.000175150333170811, + "loss": 9.061854553222656, + "step": 4590 + }, + { + "epoch": 5.233418730429832, + "grad_norm": 6.086933135986328, + "learning_rate": 0.00017509615905520343, + "loss": 8.97676773071289, + "step": 4600 + }, + { + "epoch": 5.244805009962994, + "grad_norm": 6.013683319091797, + "learning_rate": 0.00017504198493959586, + "loss": 9.112580871582031, + "step": 4610 + }, + { + "epoch": 5.256191289496157, + "grad_norm": 6.05435037612915, + "learning_rate": 0.00017498781082398828, + "loss": 8.846112823486328, + "step": 4620 + }, + { + "epoch": 5.26757756902932, + "grad_norm": 6.0196919441223145, + "learning_rate": 0.00017493363670838074, + "loss": 9.190632629394532, + "step": 4630 + }, + { + "epoch": 5.278963848562483, + "grad_norm": 5.778266906738281, + "learning_rate": 0.0001748794625927732, + "loss": 9.008673858642577, + "step": 4640 + }, + { + "epoch": 5.290350128095644, + "grad_norm": 6.087759971618652, + "learning_rate": 0.0001748252884771656, + "loss": 9.25833969116211, + "step": 4650 + }, + { + "epoch": 5.301736407628807, + "grad_norm": 5.978583812713623, + "learning_rate": 0.00017477111436155806, + "loss": 8.707977294921875, + "step": 4660 + }, + { + "epoch": 5.31312268716197, + "grad_norm": 5.761394500732422, + "learning_rate": 0.0001747169402459505, + "loss": 9.27506103515625, + "step": 4670 + }, + { + "epoch": 5.324508966695133, + "grad_norm": 6.357019901275635, + "learning_rate": 0.00017466276613034294, + "loss": 9.029557037353516, + "step": 4680 + }, + { + "epoch": 5.335895246228295, + "grad_norm": 6.016201496124268, + "learning_rate": 0.00017460859201473536, + "loss": 8.777602386474609, + "step": 4690 + }, + { + "epoch": 5.347281525761457, + "grad_norm": 6.20839786529541, + "learning_rate": 0.00017455441789912781, + "loss": 8.966375732421875, + "step": 4700 + }, + { + "epoch": 5.35866780529462, + "grad_norm": 6.455466270446777, + "learning_rate": 0.00017450024378352024, + "loss": 9.284576416015625, + "step": 4710 + }, + { + "epoch": 5.370054084827783, + "grad_norm": 6.2462663650512695, + "learning_rate": 0.0001744460696679127, + "loss": 8.876305389404298, + "step": 4720 + }, + { + "epoch": 5.381440364360945, + "grad_norm": 6.3350396156311035, + "learning_rate": 0.00017439189555230512, + "loss": 9.187533569335937, + "step": 4730 + }, + { + "epoch": 5.392826643894107, + "grad_norm": 6.246124267578125, + "learning_rate": 0.00017433772143669757, + "loss": 8.913845825195313, + "step": 4740 + }, + { + "epoch": 5.40421292342727, + "grad_norm": 5.948974609375, + "learning_rate": 0.00017428354732109, + "loss": 9.092901611328125, + "step": 4750 + }, + { + "epoch": 5.415599202960433, + "grad_norm": 6.238656520843506, + "learning_rate": 0.00017422937320548242, + "loss": 9.446102905273438, + "step": 4760 + }, + { + "epoch": 5.4269854824935955, + "grad_norm": 6.002922058105469, + "learning_rate": 0.00017417519908987487, + "loss": 9.022492980957031, + "step": 4770 + }, + { + "epoch": 5.438371762026758, + "grad_norm": 5.8916707038879395, + "learning_rate": 0.0001741210249742673, + "loss": 8.732567596435548, + "step": 4780 + }, + { + "epoch": 5.44975804155992, + "grad_norm": 6.253623008728027, + "learning_rate": 0.00017406685085865974, + "loss": 8.986526489257812, + "step": 4790 + }, + { + "epoch": 5.461144321093083, + "grad_norm": 6.794173717498779, + "learning_rate": 0.00017401267674305217, + "loss": 8.910897827148437, + "step": 4800 + }, + { + "epoch": 5.4725306006262455, + "grad_norm": 6.226437568664551, + "learning_rate": 0.00017395850262744462, + "loss": 9.053468322753906, + "step": 4810 + }, + { + "epoch": 5.483916880159408, + "grad_norm": 5.871084213256836, + "learning_rate": 0.00017390432851183704, + "loss": 8.758150482177735, + "step": 4820 + }, + { + "epoch": 5.49530315969257, + "grad_norm": 6.246805667877197, + "learning_rate": 0.0001738501543962295, + "loss": 9.195549774169923, + "step": 4830 + }, + { + "epoch": 5.506689439225733, + "grad_norm": 6.047657489776611, + "learning_rate": 0.00017379598028062192, + "loss": 9.35385513305664, + "step": 4840 + }, + { + "epoch": 5.518075718758896, + "grad_norm": 5.7672529220581055, + "learning_rate": 0.00017374180616501437, + "loss": 9.131436157226563, + "step": 4850 + }, + { + "epoch": 5.529461998292058, + "grad_norm": 6.151646614074707, + "learning_rate": 0.00017368763204940682, + "loss": 9.32352523803711, + "step": 4860 + }, + { + "epoch": 5.540848277825221, + "grad_norm": 6.154118537902832, + "learning_rate": 0.00017363345793379925, + "loss": 9.332242584228515, + "step": 4870 + }, + { + "epoch": 5.552234557358383, + "grad_norm": 5.870425224304199, + "learning_rate": 0.00017357928381819167, + "loss": 8.844430541992187, + "step": 4880 + }, + { + "epoch": 5.563620836891546, + "grad_norm": 6.215554237365723, + "learning_rate": 0.0001735251097025841, + "loss": 9.3614013671875, + "step": 4890 + }, + { + "epoch": 5.575007116424708, + "grad_norm": 6.865769386291504, + "learning_rate": 0.00017347093558697655, + "loss": 8.7501708984375, + "step": 4900 + }, + { + "epoch": 5.586393395957871, + "grad_norm": 5.617834091186523, + "learning_rate": 0.00017341676147136897, + "loss": 9.023835754394531, + "step": 4910 + }, + { + "epoch": 5.597779675491033, + "grad_norm": 6.217780590057373, + "learning_rate": 0.00017336258735576142, + "loss": 8.83472900390625, + "step": 4920 + }, + { + "epoch": 5.609165955024196, + "grad_norm": 6.248758316040039, + "learning_rate": 0.00017330841324015385, + "loss": 9.115097808837891, + "step": 4930 + }, + { + "epoch": 5.620552234557358, + "grad_norm": 5.923741817474365, + "learning_rate": 0.0001732542391245463, + "loss": 9.077340698242187, + "step": 4940 + }, + { + "epoch": 5.631938514090521, + "grad_norm": 6.346497058868408, + "learning_rate": 0.00017320006500893872, + "loss": 9.040455627441407, + "step": 4950 + }, + { + "epoch": 5.643324793623684, + "grad_norm": 6.081203937530518, + "learning_rate": 0.00017314589089333118, + "loss": 9.156749725341797, + "step": 4960 + }, + { + "epoch": 5.654711073156846, + "grad_norm": 5.836169719696045, + "learning_rate": 0.00017309171677772363, + "loss": 9.09630126953125, + "step": 4970 + }, + { + "epoch": 5.6660973526900085, + "grad_norm": 6.374985218048096, + "learning_rate": 0.00017303754266211605, + "loss": 9.138162231445312, + "step": 4980 + }, + { + "epoch": 5.677483632223171, + "grad_norm": 6.967176914215088, + "learning_rate": 0.0001729833685465085, + "loss": 9.05536346435547, + "step": 4990 + }, + { + "epoch": 5.688869911756334, + "grad_norm": 6.718258857727051, + "learning_rate": 0.00017292919443090093, + "loss": 8.888587951660156, + "step": 5000 + }, + { + "epoch": 5.700256191289496, + "grad_norm": 6.106222629547119, + "learning_rate": 0.00017287502031529338, + "loss": 9.413323974609375, + "step": 5010 + }, + { + "epoch": 5.7116424708226585, + "grad_norm": 5.6303229331970215, + "learning_rate": 0.0001728208461996858, + "loss": 9.086915588378906, + "step": 5020 + }, + { + "epoch": 5.723028750355821, + "grad_norm": 6.345915794372559, + "learning_rate": 0.00017276667208407823, + "loss": 8.981614685058593, + "step": 5030 + }, + { + "epoch": 5.734415029888984, + "grad_norm": 5.963151454925537, + "learning_rate": 0.00017271249796847065, + "loss": 8.846481323242188, + "step": 5040 + }, + { + "epoch": 5.745801309422147, + "grad_norm": 6.332173824310303, + "learning_rate": 0.0001726583238528631, + "loss": 9.01428451538086, + "step": 5050 + }, + { + "epoch": 5.757187588955309, + "grad_norm": 5.805280685424805, + "learning_rate": 0.00017260414973725553, + "loss": 8.98281021118164, + "step": 5060 + }, + { + "epoch": 5.768573868488471, + "grad_norm": 6.478013038635254, + "learning_rate": 0.00017254997562164798, + "loss": 9.026184844970704, + "step": 5070 + }, + { + "epoch": 5.779960148021634, + "grad_norm": 6.193087100982666, + "learning_rate": 0.00017249580150604043, + "loss": 8.499151611328125, + "step": 5080 + }, + { + "epoch": 5.791346427554797, + "grad_norm": 6.035495758056641, + "learning_rate": 0.00017244162739043286, + "loss": 9.09105224609375, + "step": 5090 + }, + { + "epoch": 5.802732707087959, + "grad_norm": 6.009443759918213, + "learning_rate": 0.0001723874532748253, + "loss": 9.080800628662109, + "step": 5100 + }, + { + "epoch": 5.814118986621121, + "grad_norm": 6.147922992706299, + "learning_rate": 0.00017233327915921773, + "loss": 9.082097625732422, + "step": 5110 + }, + { + "epoch": 5.825505266154284, + "grad_norm": 6.0678019523620605, + "learning_rate": 0.00017227910504361018, + "loss": 8.975556182861329, + "step": 5120 + }, + { + "epoch": 5.836891545687447, + "grad_norm": 5.851400852203369, + "learning_rate": 0.0001722249309280026, + "loss": 8.932150268554688, + "step": 5130 + }, + { + "epoch": 5.84827782522061, + "grad_norm": 5.831391334533691, + "learning_rate": 0.00017217075681239506, + "loss": 8.861358642578125, + "step": 5140 + }, + { + "epoch": 5.859664104753771, + "grad_norm": 6.788999557495117, + "learning_rate": 0.00017211658269678749, + "loss": 9.362552642822266, + "step": 5150 + }, + { + "epoch": 5.871050384286934, + "grad_norm": 6.117767333984375, + "learning_rate": 0.00017206240858117994, + "loss": 9.051403045654297, + "step": 5160 + }, + { + "epoch": 5.882436663820097, + "grad_norm": 5.815280914306641, + "learning_rate": 0.00017200823446557236, + "loss": 8.855480194091797, + "step": 5170 + }, + { + "epoch": 5.89382294335326, + "grad_norm": 6.476868152618408, + "learning_rate": 0.00017195406034996479, + "loss": 9.67321319580078, + "step": 5180 + }, + { + "epoch": 5.9052092228864215, + "grad_norm": 6.074749946594238, + "learning_rate": 0.0001718998862343572, + "loss": 8.859770965576171, + "step": 5190 + }, + { + "epoch": 5.916595502419584, + "grad_norm": 5.674811840057373, + "learning_rate": 0.00017184571211874966, + "loss": 9.397520446777344, + "step": 5200 + }, + { + "epoch": 5.927981781952747, + "grad_norm": 5.898608684539795, + "learning_rate": 0.0001717915380031421, + "loss": 9.098382568359375, + "step": 5210 + }, + { + "epoch": 5.93936806148591, + "grad_norm": 6.260279178619385, + "learning_rate": 0.00017173736388753454, + "loss": 9.085452270507812, + "step": 5220 + }, + { + "epoch": 5.950754341019072, + "grad_norm": 6.2707719802856445, + "learning_rate": 0.000171683189771927, + "loss": 8.759799194335937, + "step": 5230 + }, + { + "epoch": 5.962140620552234, + "grad_norm": 6.20477294921875, + "learning_rate": 0.00017162901565631941, + "loss": 9.199006652832031, + "step": 5240 + }, + { + "epoch": 5.973526900085397, + "grad_norm": 5.900763511657715, + "learning_rate": 0.00017157484154071187, + "loss": 9.182913970947265, + "step": 5250 + }, + { + "epoch": 5.98491317961856, + "grad_norm": 6.241118431091309, + "learning_rate": 0.0001715206674251043, + "loss": 8.96974868774414, + "step": 5260 + }, + { + "epoch": 5.9962994591517225, + "grad_norm": 5.996070384979248, + "learning_rate": 0.00017146649330949674, + "loss": 9.107511138916015, + "step": 5270 + }, + { + "epoch": 6.0068317677198975, + "grad_norm": 6.609316349029541, + "learning_rate": 0.00017141231919388917, + "loss": 7.848031616210937, + "step": 5280 + }, + { + "epoch": 6.01821804725306, + "grad_norm": 6.0844197273254395, + "learning_rate": 0.00017135814507828162, + "loss": 8.242960357666016, + "step": 5290 + }, + { + "epoch": 6.029604326786223, + "grad_norm": 5.959528923034668, + "learning_rate": 0.00017130397096267404, + "loss": 8.17882080078125, + "step": 5300 + }, + { + "epoch": 6.040990606319385, + "grad_norm": 5.812763690948486, + "learning_rate": 0.00017124979684706647, + "loss": 8.189881134033204, + "step": 5310 + }, + { + "epoch": 6.0523768858525475, + "grad_norm": 6.203407287597656, + "learning_rate": 0.00017119562273145892, + "loss": 7.9870758056640625, + "step": 5320 + }, + { + "epoch": 6.06376316538571, + "grad_norm": 6.108156681060791, + "learning_rate": 0.00017114144861585134, + "loss": 8.187033081054688, + "step": 5330 + }, + { + "epoch": 6.075149444918873, + "grad_norm": 6.818889617919922, + "learning_rate": 0.0001710872745002438, + "loss": 8.802757263183594, + "step": 5340 + }, + { + "epoch": 6.086535724452036, + "grad_norm": 6.080479145050049, + "learning_rate": 0.00017103310038463622, + "loss": 8.3635986328125, + "step": 5350 + }, + { + "epoch": 6.097922003985198, + "grad_norm": 6.009644508361816, + "learning_rate": 0.00017097892626902867, + "loss": 8.316793060302734, + "step": 5360 + }, + { + "epoch": 6.10930828351836, + "grad_norm": 5.928283214569092, + "learning_rate": 0.0001709247521534211, + "loss": 8.05916976928711, + "step": 5370 + }, + { + "epoch": 6.120694563051523, + "grad_norm": 6.18734073638916, + "learning_rate": 0.00017087057803781355, + "loss": 8.414893341064452, + "step": 5380 + }, + { + "epoch": 6.132080842584686, + "grad_norm": 5.8552021980285645, + "learning_rate": 0.00017081640392220597, + "loss": 8.328546905517578, + "step": 5390 + }, + { + "epoch": 6.143467122117848, + "grad_norm": 6.283035755157471, + "learning_rate": 0.00017076222980659842, + "loss": 8.294504547119141, + "step": 5400 + }, + { + "epoch": 6.15485340165101, + "grad_norm": 6.527463436126709, + "learning_rate": 0.00017070805569099085, + "loss": 8.205842590332031, + "step": 5410 + }, + { + "epoch": 6.166239681184173, + "grad_norm": 6.081474304199219, + "learning_rate": 0.0001706538815753833, + "loss": 8.215932464599609, + "step": 5420 + }, + { + "epoch": 6.177625960717336, + "grad_norm": 6.173142433166504, + "learning_rate": 0.00017059970745977575, + "loss": 8.617554473876954, + "step": 5430 + }, + { + "epoch": 6.189012240250499, + "grad_norm": 6.146962642669678, + "learning_rate": 0.00017054553334416817, + "loss": 8.21309814453125, + "step": 5440 + }, + { + "epoch": 6.20039851978366, + "grad_norm": 6.535096645355225, + "learning_rate": 0.0001704913592285606, + "loss": 8.393982696533204, + "step": 5450 + }, + { + "epoch": 6.211784799316823, + "grad_norm": 5.731675624847412, + "learning_rate": 0.00017043718511295302, + "loss": 8.488709259033204, + "step": 5460 + }, + { + "epoch": 6.223171078849986, + "grad_norm": 6.357123374938965, + "learning_rate": 0.00017038301099734547, + "loss": 8.502271270751953, + "step": 5470 + }, + { + "epoch": 6.234557358383149, + "grad_norm": 6.319135665893555, + "learning_rate": 0.0001703288368817379, + "loss": 8.702040100097657, + "step": 5480 + }, + { + "epoch": 6.2459436379163105, + "grad_norm": 6.496546268463135, + "learning_rate": 0.00017027466276613035, + "loss": 8.020443725585938, + "step": 5490 + }, + { + "epoch": 6.257329917449473, + "grad_norm": 6.284101963043213, + "learning_rate": 0.00017022048865052278, + "loss": 8.432199096679687, + "step": 5500 + }, + { + "epoch": 6.268716196982636, + "grad_norm": 6.48223876953125, + "learning_rate": 0.00017016631453491523, + "loss": 8.392076873779297, + "step": 5510 + }, + { + "epoch": 6.280102476515799, + "grad_norm": 6.497861862182617, + "learning_rate": 0.00017011214041930765, + "loss": 8.759424591064453, + "step": 5520 + }, + { + "epoch": 6.291488756048961, + "grad_norm": 6.510660648345947, + "learning_rate": 0.0001700579663037001, + "loss": 8.286143493652343, + "step": 5530 + }, + { + "epoch": 6.302875035582123, + "grad_norm": 5.99029541015625, + "learning_rate": 0.00017000379218809255, + "loss": 8.367719268798828, + "step": 5540 + }, + { + "epoch": 6.314261315115286, + "grad_norm": 6.750374794006348, + "learning_rate": 0.00016994961807248498, + "loss": 8.649713134765625, + "step": 5550 + }, + { + "epoch": 6.325647594648449, + "grad_norm": 5.9475202560424805, + "learning_rate": 0.00016989544395687743, + "loss": 8.837234497070312, + "step": 5560 + }, + { + "epoch": 6.3370338741816115, + "grad_norm": 6.30899715423584, + "learning_rate": 0.00016984126984126986, + "loss": 8.32588882446289, + "step": 5570 + }, + { + "epoch": 6.348420153714773, + "grad_norm": 6.342610836029053, + "learning_rate": 0.00016978709572566228, + "loss": 8.394316101074219, + "step": 5580 + }, + { + "epoch": 6.359806433247936, + "grad_norm": 6.095706462860107, + "learning_rate": 0.00016973292161005473, + "loss": 8.327243041992187, + "step": 5590 + }, + { + "epoch": 6.371192712781099, + "grad_norm": 6.3162336349487305, + "learning_rate": 0.00016967874749444716, + "loss": 8.443352508544923, + "step": 5600 + }, + { + "epoch": 6.3825789923142615, + "grad_norm": 6.393646240234375, + "learning_rate": 0.00016962457337883958, + "loss": 8.368497467041015, + "step": 5610 + }, + { + "epoch": 6.393965271847424, + "grad_norm": 6.893944263458252, + "learning_rate": 0.00016957039926323203, + "loss": 8.11514892578125, + "step": 5620 + }, + { + "epoch": 6.405351551380586, + "grad_norm": 6.172192573547363, + "learning_rate": 0.00016951622514762446, + "loss": 8.440129852294922, + "step": 5630 + }, + { + "epoch": 6.416737830913749, + "grad_norm": 6.662540435791016, + "learning_rate": 0.0001694620510320169, + "loss": 8.530294799804688, + "step": 5640 + }, + { + "epoch": 6.428124110446912, + "grad_norm": 6.44663143157959, + "learning_rate": 0.00016940787691640936, + "loss": 8.647301483154298, + "step": 5650 + }, + { + "epoch": 6.439510389980074, + "grad_norm": 7.118195056915283, + "learning_rate": 0.00016935370280080178, + "loss": 8.56711654663086, + "step": 5660 + }, + { + "epoch": 6.450896669513236, + "grad_norm": 5.5173540115356445, + "learning_rate": 0.00016929952868519424, + "loss": 8.647555541992187, + "step": 5670 + }, + { + "epoch": 6.462282949046399, + "grad_norm": 5.75939416885376, + "learning_rate": 0.00016924535456958666, + "loss": 8.223889923095703, + "step": 5680 + }, + { + "epoch": 6.473669228579562, + "grad_norm": 6.170014381408691, + "learning_rate": 0.0001691911804539791, + "loss": 7.9549812316894535, + "step": 5690 + }, + { + "epoch": 6.485055508112724, + "grad_norm": 6.4585490226745605, + "learning_rate": 0.00016913700633837154, + "loss": 8.623312377929688, + "step": 5700 + }, + { + "epoch": 6.496441787645887, + "grad_norm": 6.330554485321045, + "learning_rate": 0.000169082832222764, + "loss": 8.541600036621094, + "step": 5710 + }, + { + "epoch": 6.507828067179049, + "grad_norm": 6.698922634124756, + "learning_rate": 0.0001690286581071564, + "loss": 8.024588012695313, + "step": 5720 + }, + { + "epoch": 6.519214346712212, + "grad_norm": 6.28864049911499, + "learning_rate": 0.00016897448399154884, + "loss": 8.494007110595703, + "step": 5730 + }, + { + "epoch": 6.530600626245374, + "grad_norm": 6.495082378387451, + "learning_rate": 0.00016892030987594126, + "loss": 8.174886322021484, + "step": 5740 + }, + { + "epoch": 6.541986905778537, + "grad_norm": 6.506287097930908, + "learning_rate": 0.0001688661357603337, + "loss": 8.86504898071289, + "step": 5750 + }, + { + "epoch": 6.553373185311699, + "grad_norm": 6.550364971160889, + "learning_rate": 0.00016881196164472616, + "loss": 8.222732543945312, + "step": 5760 + }, + { + "epoch": 6.564759464844862, + "grad_norm": 6.110886096954346, + "learning_rate": 0.0001687577875291186, + "loss": 7.874899291992188, + "step": 5770 + }, + { + "epoch": 6.5761457443780245, + "grad_norm": 5.996333122253418, + "learning_rate": 0.00016870361341351104, + "loss": 8.523206329345703, + "step": 5780 + }, + { + "epoch": 6.587532023911187, + "grad_norm": 6.117859840393066, + "learning_rate": 0.00016864943929790346, + "loss": 8.307976531982423, + "step": 5790 + }, + { + "epoch": 6.59891830344435, + "grad_norm": 6.8809404373168945, + "learning_rate": 0.00016859526518229592, + "loss": 8.540397644042969, + "step": 5800 + }, + { + "epoch": 6.610304582977512, + "grad_norm": 6.469377040863037, + "learning_rate": 0.00016854109106668834, + "loss": 8.485488128662109, + "step": 5810 + }, + { + "epoch": 6.6216908625106745, + "grad_norm": 6.278227806091309, + "learning_rate": 0.0001684869169510808, + "loss": 8.363449096679688, + "step": 5820 + }, + { + "epoch": 6.633077142043837, + "grad_norm": 6.4080586433410645, + "learning_rate": 0.00016843274283547322, + "loss": 8.468556213378907, + "step": 5830 + }, + { + "epoch": 6.644463421577, + "grad_norm": 5.922115802764893, + "learning_rate": 0.00016837856871986567, + "loss": 8.429467010498048, + "step": 5840 + }, + { + "epoch": 6.655849701110162, + "grad_norm": 6.8086018562316895, + "learning_rate": 0.0001683243946042581, + "loss": 8.133071136474609, + "step": 5850 + }, + { + "epoch": 6.667235980643325, + "grad_norm": 6.547855377197266, + "learning_rate": 0.00016827022048865054, + "loss": 7.84344253540039, + "step": 5860 + }, + { + "epoch": 6.678622260176487, + "grad_norm": 6.792558193206787, + "learning_rate": 0.00016821604637304297, + "loss": 8.842105102539062, + "step": 5870 + }, + { + "epoch": 6.69000853970965, + "grad_norm": 6.492033004760742, + "learning_rate": 0.0001681618722574354, + "loss": 8.356997680664062, + "step": 5880 + }, + { + "epoch": 6.701394819242813, + "grad_norm": 6.224930763244629, + "learning_rate": 0.00016810769814182784, + "loss": 8.31610565185547, + "step": 5890 + }, + { + "epoch": 6.712781098775975, + "grad_norm": 6.407049655914307, + "learning_rate": 0.00016805352402622027, + "loss": 8.490274047851562, + "step": 5900 + }, + { + "epoch": 6.724167378309137, + "grad_norm": 6.361656665802002, + "learning_rate": 0.00016799934991061272, + "loss": 8.183870697021485, + "step": 5910 + }, + { + "epoch": 6.7355536578423, + "grad_norm": 6.558957576751709, + "learning_rate": 0.00016794517579500515, + "loss": 8.488135528564452, + "step": 5920 + }, + { + "epoch": 6.746939937375463, + "grad_norm": 7.5453925132751465, + "learning_rate": 0.0001678910016793976, + "loss": 8.479955291748047, + "step": 5930 + }, + { + "epoch": 6.758326216908625, + "grad_norm": 6.576931476593018, + "learning_rate": 0.00016783682756379002, + "loss": 8.354127502441406, + "step": 5940 + }, + { + "epoch": 6.769712496441787, + "grad_norm": 6.558088302612305, + "learning_rate": 0.00016778265344818247, + "loss": 8.517078399658203, + "step": 5950 + }, + { + "epoch": 6.78109877597495, + "grad_norm": 6.338796615600586, + "learning_rate": 0.0001677284793325749, + "loss": 8.582713317871093, + "step": 5960 + }, + { + "epoch": 6.792485055508113, + "grad_norm": 5.990917205810547, + "learning_rate": 0.00016767430521696735, + "loss": 8.49090347290039, + "step": 5970 + }, + { + "epoch": 6.803871335041276, + "grad_norm": 5.543584823608398, + "learning_rate": 0.00016762013110135977, + "loss": 8.585533905029298, + "step": 5980 + }, + { + "epoch": 6.8152576145744375, + "grad_norm": 6.125088214874268, + "learning_rate": 0.00016756595698575223, + "loss": 8.365689086914063, + "step": 5990 + }, + { + "epoch": 6.8266438941076, + "grad_norm": 6.520383358001709, + "learning_rate": 0.00016751178287014465, + "loss": 7.997092437744141, + "step": 6000 + }, + { + "epoch": 6.838030173640763, + "grad_norm": 6.5187530517578125, + "learning_rate": 0.00016745760875453707, + "loss": 8.529048919677734, + "step": 6010 + }, + { + "epoch": 6.849416453173926, + "grad_norm": 6.216691493988037, + "learning_rate": 0.00016740343463892953, + "loss": 8.508572387695313, + "step": 6020 + }, + { + "epoch": 6.8608027327070875, + "grad_norm": 6.541543960571289, + "learning_rate": 0.00016734926052332195, + "loss": 8.271363830566406, + "step": 6030 + }, + { + "epoch": 6.87218901224025, + "grad_norm": 6.519371509552002, + "learning_rate": 0.0001672950864077144, + "loss": 8.481680297851563, + "step": 6040 + }, + { + "epoch": 6.883575291773413, + "grad_norm": 6.069828510284424, + "learning_rate": 0.00016724091229210683, + "loss": 8.080921936035157, + "step": 6050 + }, + { + "epoch": 6.894961571306576, + "grad_norm": 6.756743907928467, + "learning_rate": 0.00016718673817649928, + "loss": 8.416082763671875, + "step": 6060 + }, + { + "epoch": 6.9063478508397385, + "grad_norm": 6.148792266845703, + "learning_rate": 0.0001671325640608917, + "loss": 8.236578369140625, + "step": 6070 + }, + { + "epoch": 6.9177341303729, + "grad_norm": 6.631039619445801, + "learning_rate": 0.00016707838994528415, + "loss": 8.468033599853516, + "step": 6080 + }, + { + "epoch": 6.929120409906063, + "grad_norm": 7.002105236053467, + "learning_rate": 0.00016702421582967658, + "loss": 8.596262359619141, + "step": 6090 + }, + { + "epoch": 6.940506689439226, + "grad_norm": 6.974485397338867, + "learning_rate": 0.00016697004171406903, + "loss": 8.201548767089843, + "step": 6100 + }, + { + "epoch": 6.9518929689723885, + "grad_norm": 6.358666896820068, + "learning_rate": 0.00016691586759846148, + "loss": 8.327249908447266, + "step": 6110 + }, + { + "epoch": 6.96327924850555, + "grad_norm": 6.620755195617676, + "learning_rate": 0.00016686711089441465, + "loss": 8.569712829589843, + "step": 6120 + }, + { + "epoch": 6.974665528038713, + "grad_norm": 6.0767059326171875, + "learning_rate": 0.0001668129367788071, + "loss": 8.352509307861329, + "step": 6130 + }, + { + "epoch": 6.986051807571876, + "grad_norm": 6.384547233581543, + "learning_rate": 0.00016675876266319955, + "loss": 8.438029479980468, + "step": 6140 + }, + { + "epoch": 6.997438087105039, + "grad_norm": 5.849452972412109, + "learning_rate": 0.00016670458854759198, + "loss": 8.576494598388672, + "step": 6150 + }, + { + "epoch": 7.007970395673214, + "grad_norm": 6.016312122344971, + "learning_rate": 0.0001666504144319844, + "loss": 7.190988922119141, + "step": 6160 + }, + { + "epoch": 7.019356675206376, + "grad_norm": 7.674123764038086, + "learning_rate": 0.00016659624031637683, + "loss": 7.777459716796875, + "step": 6170 + }, + { + "epoch": 7.030742954739539, + "grad_norm": 5.954489707946777, + "learning_rate": 0.00016654206620076928, + "loss": 8.038485717773437, + "step": 6180 + }, + { + "epoch": 7.042129234272702, + "grad_norm": 6.098645210266113, + "learning_rate": 0.0001664878920851617, + "loss": 7.921611022949219, + "step": 6190 + }, + { + "epoch": 7.053515513805864, + "grad_norm": 6.347640514373779, + "learning_rate": 0.00016643371796955415, + "loss": 7.475108337402344, + "step": 6200 + }, + { + "epoch": 7.064901793339026, + "grad_norm": 6.295144081115723, + "learning_rate": 0.00016637954385394658, + "loss": 7.842532348632813, + "step": 6210 + }, + { + "epoch": 7.076288072872189, + "grad_norm": 6.518668174743652, + "learning_rate": 0.00016632536973833903, + "loss": 8.195176696777343, + "step": 6220 + }, + { + "epoch": 7.087674352405352, + "grad_norm": 6.452752590179443, + "learning_rate": 0.00016627119562273145, + "loss": 7.876513671875, + "step": 6230 + }, + { + "epoch": 7.099060631938514, + "grad_norm": 6.639153003692627, + "learning_rate": 0.0001662170215071239, + "loss": 7.704139709472656, + "step": 6240 + }, + { + "epoch": 7.110446911471676, + "grad_norm": 6.341755390167236, + "learning_rate": 0.00016616284739151636, + "loss": 7.5064857482910154, + "step": 6250 + }, + { + "epoch": 7.121833191004839, + "grad_norm": 6.495119094848633, + "learning_rate": 0.00016610867327590878, + "loss": 7.757982635498047, + "step": 6260 + }, + { + "epoch": 7.133219470538002, + "grad_norm": 5.995544910430908, + "learning_rate": 0.00016605449916030123, + "loss": 7.433665466308594, + "step": 6270 + }, + { + "epoch": 7.144605750071165, + "grad_norm": 5.881967067718506, + "learning_rate": 0.00016600032504469366, + "loss": 7.579911804199218, + "step": 6280 + }, + { + "epoch": 7.1559920296043265, + "grad_norm": 6.570091724395752, + "learning_rate": 0.0001659461509290861, + "loss": 7.659120178222656, + "step": 6290 + }, + { + "epoch": 7.167378309137489, + "grad_norm": 6.527503490447998, + "learning_rate": 0.00016589197681347853, + "loss": 7.960893249511718, + "step": 6300 + }, + { + "epoch": 7.178764588670652, + "grad_norm": 6.29287576675415, + "learning_rate": 0.00016583780269787096, + "loss": 7.749033355712891, + "step": 6310 + }, + { + "epoch": 7.190150868203815, + "grad_norm": 6.531010627746582, + "learning_rate": 0.00016578362858226338, + "loss": 7.7865547180175785, + "step": 6320 + }, + { + "epoch": 7.2015371477369765, + "grad_norm": 6.445150852203369, + "learning_rate": 0.00016572945446665583, + "loss": 7.650438690185547, + "step": 6330 + }, + { + "epoch": 7.212923427270139, + "grad_norm": 6.2929816246032715, + "learning_rate": 0.00016567528035104826, + "loss": 7.579685974121094, + "step": 6340 + }, + { + "epoch": 7.224309706803302, + "grad_norm": 7.012646675109863, + "learning_rate": 0.0001656211062354407, + "loss": 7.769477844238281, + "step": 6350 + }, + { + "epoch": 7.235695986336465, + "grad_norm": 6.118347644805908, + "learning_rate": 0.00016556693211983316, + "loss": 7.643257904052734, + "step": 6360 + }, + { + "epoch": 7.2470822658696274, + "grad_norm": 5.888443946838379, + "learning_rate": 0.0001655127580042256, + "loss": 7.476705169677734, + "step": 6370 + }, + { + "epoch": 7.258468545402789, + "grad_norm": 6.040239334106445, + "learning_rate": 0.00016545858388861804, + "loss": 7.228121948242188, + "step": 6380 + }, + { + "epoch": 7.269854824935952, + "grad_norm": 6.494739055633545, + "learning_rate": 0.00016540440977301046, + "loss": 7.939952087402344, + "step": 6390 + }, + { + "epoch": 7.281241104469115, + "grad_norm": 6.026639461517334, + "learning_rate": 0.00016535023565740291, + "loss": 7.657688903808594, + "step": 6400 + }, + { + "epoch": 7.2926273840022775, + "grad_norm": 6.798668384552002, + "learning_rate": 0.00016529606154179534, + "loss": 8.28334732055664, + "step": 6410 + }, + { + "epoch": 7.304013663535439, + "grad_norm": 6.844184398651123, + "learning_rate": 0.0001652418874261878, + "loss": 7.631706237792969, + "step": 6420 + }, + { + "epoch": 7.315399943068602, + "grad_norm": 5.855307579040527, + "learning_rate": 0.00016518771331058022, + "loss": 7.990458679199219, + "step": 6430 + }, + { + "epoch": 7.326786222601765, + "grad_norm": 6.2944159507751465, + "learning_rate": 0.00016513353919497264, + "loss": 7.7971031188964846, + "step": 6440 + }, + { + "epoch": 7.3381725021349276, + "grad_norm": 5.8472900390625, + "learning_rate": 0.0001650793650793651, + "loss": 7.843349456787109, + "step": 6450 + }, + { + "epoch": 7.34955878166809, + "grad_norm": 6.461435317993164, + "learning_rate": 0.00016502519096375752, + "loss": 7.716208648681641, + "step": 6460 + }, + { + "epoch": 7.360945061201252, + "grad_norm": 7.011518955230713, + "learning_rate": 0.00016497101684814997, + "loss": 7.972675323486328, + "step": 6470 + }, + { + "epoch": 7.372331340734415, + "grad_norm": 6.545440196990967, + "learning_rate": 0.0001649168427325424, + "loss": 8.097943878173828, + "step": 6480 + }, + { + "epoch": 7.383717620267578, + "grad_norm": 6.5945820808410645, + "learning_rate": 0.00016486266861693484, + "loss": 8.236820220947266, + "step": 6490 + }, + { + "epoch": 7.39510389980074, + "grad_norm": 5.519364833831787, + "learning_rate": 0.00016480849450132727, + "loss": 7.836220550537109, + "step": 6500 + }, + { + "epoch": 7.406490179333902, + "grad_norm": 6.1039910316467285, + "learning_rate": 0.00016475432038571972, + "loss": 7.595920562744141, + "step": 6510 + }, + { + "epoch": 7.417876458867065, + "grad_norm": 6.2389020919799805, + "learning_rate": 0.00016470014627011214, + "loss": 7.607307434082031, + "step": 6520 + }, + { + "epoch": 7.429262738400228, + "grad_norm": 6.249361038208008, + "learning_rate": 0.0001646459721545046, + "loss": 7.690441131591797, + "step": 6530 + }, + { + "epoch": 7.44064901793339, + "grad_norm": 6.085043907165527, + "learning_rate": 0.00016459179803889702, + "loss": 7.78430404663086, + "step": 6540 + }, + { + "epoch": 7.452035297466553, + "grad_norm": 6.791484355926514, + "learning_rate": 0.00016453762392328947, + "loss": 8.073173522949219, + "step": 6550 + }, + { + "epoch": 7.463421576999715, + "grad_norm": 6.072941303253174, + "learning_rate": 0.0001644834498076819, + "loss": 7.5049797058105465, + "step": 6560 + }, + { + "epoch": 7.474807856532878, + "grad_norm": 6.538127422332764, + "learning_rate": 0.00016442927569207435, + "loss": 8.0398193359375, + "step": 6570 + }, + { + "epoch": 7.4861941360660404, + "grad_norm": 6.887174129486084, + "learning_rate": 0.00016437510157646677, + "loss": 7.953376007080078, + "step": 6580 + }, + { + "epoch": 7.497580415599203, + "grad_norm": 6.870800495147705, + "learning_rate": 0.0001643209274608592, + "loss": 7.675375366210938, + "step": 6590 + }, + { + "epoch": 7.508966695132365, + "grad_norm": 6.101712703704834, + "learning_rate": 0.00016426675334525165, + "loss": 7.8550971984863285, + "step": 6600 + }, + { + "epoch": 7.520352974665528, + "grad_norm": 6.480758190155029, + "learning_rate": 0.00016421257922964407, + "loss": 7.9089508056640625, + "step": 6610 + }, + { + "epoch": 7.5317392541986905, + "grad_norm": 6.663425445556641, + "learning_rate": 0.00016415840511403652, + "loss": 7.739738464355469, + "step": 6620 + }, + { + "epoch": 7.543125533731853, + "grad_norm": 6.832794666290283, + "learning_rate": 0.00016410423099842895, + "loss": 7.970989990234375, + "step": 6630 + }, + { + "epoch": 7.554511813265016, + "grad_norm": 6.2909770011901855, + "learning_rate": 0.0001640500568828214, + "loss": 8.1257080078125, + "step": 6640 + }, + { + "epoch": 7.565898092798178, + "grad_norm": 6.659548759460449, + "learning_rate": 0.00016399588276721382, + "loss": 7.914602661132813, + "step": 6650 + }, + { + "epoch": 7.577284372331341, + "grad_norm": 6.749256610870361, + "learning_rate": 0.00016394170865160628, + "loss": 7.901271057128906, + "step": 6660 + }, + { + "epoch": 7.588670651864503, + "grad_norm": 6.320638656616211, + "learning_rate": 0.0001638875345359987, + "loss": 7.549610137939453, + "step": 6670 + }, + { + "epoch": 7.600056931397666, + "grad_norm": 6.600435256958008, + "learning_rate": 0.00016383336042039115, + "loss": 7.900602722167969, + "step": 6680 + }, + { + "epoch": 7.611443210930828, + "grad_norm": 7.011569976806641, + "learning_rate": 0.00016377918630478358, + "loss": 7.672644805908203, + "step": 6690 + }, + { + "epoch": 7.622829490463991, + "grad_norm": 8.137212753295898, + "learning_rate": 0.00016372501218917603, + "loss": 7.771617126464844, + "step": 6700 + }, + { + "epoch": 7.634215769997153, + "grad_norm": 6.862710952758789, + "learning_rate": 0.00016367083807356848, + "loss": 8.049059295654297, + "step": 6710 + }, + { + "epoch": 7.645602049530316, + "grad_norm": 6.581108093261719, + "learning_rate": 0.0001636166639579609, + "loss": 7.8606727600097654, + "step": 6720 + }, + { + "epoch": 7.656988329063479, + "grad_norm": 6.7956647872924805, + "learning_rate": 0.00016356248984235333, + "loss": 8.168240356445313, + "step": 6730 + }, + { + "epoch": 7.668374608596641, + "grad_norm": 6.123491287231445, + "learning_rate": 0.00016350831572674575, + "loss": 7.896241760253906, + "step": 6740 + }, + { + "epoch": 7.679760888129803, + "grad_norm": 6.9219136238098145, + "learning_rate": 0.0001634541416111382, + "loss": 8.173204040527343, + "step": 6750 + }, + { + "epoch": 7.691147167662966, + "grad_norm": 6.664566993713379, + "learning_rate": 0.00016339996749553063, + "loss": 8.276904296875, + "step": 6760 + }, + { + "epoch": 7.702533447196129, + "grad_norm": 7.015839576721191, + "learning_rate": 0.00016334579337992308, + "loss": 7.637835693359375, + "step": 6770 + }, + { + "epoch": 7.713919726729291, + "grad_norm": 7.610071659088135, + "learning_rate": 0.0001632916192643155, + "loss": 7.590660095214844, + "step": 6780 + }, + { + "epoch": 7.7253060062624535, + "grad_norm": 6.450679779052734, + "learning_rate": 0.00016323744514870796, + "loss": 7.862370300292969, + "step": 6790 + }, + { + "epoch": 7.736692285795616, + "grad_norm": 6.770766735076904, + "learning_rate": 0.00016318327103310038, + "loss": 7.959809112548828, + "step": 6800 + }, + { + "epoch": 7.748078565328779, + "grad_norm": 6.543522357940674, + "learning_rate": 0.00016312909691749283, + "loss": 7.892201232910156, + "step": 6810 + }, + { + "epoch": 7.759464844861942, + "grad_norm": 6.228302955627441, + "learning_rate": 0.00016307492280188528, + "loss": 7.535732269287109, + "step": 6820 + }, + { + "epoch": 7.770851124395104, + "grad_norm": 6.860931396484375, + "learning_rate": 0.0001630207486862777, + "loss": 8.195712280273437, + "step": 6830 + }, + { + "epoch": 7.782237403928266, + "grad_norm": 6.871212482452393, + "learning_rate": 0.00016296657457067016, + "loss": 7.680415344238281, + "step": 6840 + }, + { + "epoch": 7.793623683461429, + "grad_norm": 6.185060977935791, + "learning_rate": 0.00016291240045506259, + "loss": 7.795463562011719, + "step": 6850 + }, + { + "epoch": 7.805009962994592, + "grad_norm": 6.6304826736450195, + "learning_rate": 0.000162858226339455, + "loss": 7.885947418212891, + "step": 6860 + }, + { + "epoch": 7.816396242527754, + "grad_norm": 5.791022777557373, + "learning_rate": 0.00016280405222384743, + "loss": 7.429241943359375, + "step": 6870 + }, + { + "epoch": 7.827782522060916, + "grad_norm": 6.679376602172852, + "learning_rate": 0.00016274987810823989, + "loss": 7.767063140869141, + "step": 6880 + }, + { + "epoch": 7.839168801594079, + "grad_norm": 6.607264518737793, + "learning_rate": 0.0001626957039926323, + "loss": 8.051114654541015, + "step": 6890 + }, + { + "epoch": 7.850555081127242, + "grad_norm": 6.548991680145264, + "learning_rate": 0.00016264152987702476, + "loss": 7.440624237060547, + "step": 6900 + }, + { + "epoch": 7.8619413606604045, + "grad_norm": 6.445249557495117, + "learning_rate": 0.00016258735576141719, + "loss": 7.984051513671875, + "step": 6910 + }, + { + "epoch": 7.873327640193567, + "grad_norm": 6.627695560455322, + "learning_rate": 0.00016253318164580964, + "loss": 7.8278968811035154, + "step": 6920 + }, + { + "epoch": 7.884713919726729, + "grad_norm": 6.76314640045166, + "learning_rate": 0.0001624790075302021, + "loss": 7.394937133789062, + "step": 6930 + }, + { + "epoch": 7.896100199259892, + "grad_norm": 6.663532257080078, + "learning_rate": 0.00016242483341459451, + "loss": 8.056930541992188, + "step": 6940 + }, + { + "epoch": 7.9074864787930546, + "grad_norm": 7.132768630981445, + "learning_rate": 0.00016237065929898697, + "loss": 7.636945343017578, + "step": 6950 + }, + { + "epoch": 7.918872758326217, + "grad_norm": 6.137714385986328, + "learning_rate": 0.0001623164851833794, + "loss": 8.028910827636718, + "step": 6960 + }, + { + "epoch": 7.930259037859379, + "grad_norm": 6.413361072540283, + "learning_rate": 0.00016226231106777184, + "loss": 7.650550842285156, + "step": 6970 + }, + { + "epoch": 7.941645317392542, + "grad_norm": 5.806910514831543, + "learning_rate": 0.00016220813695216427, + "loss": 7.590312194824219, + "step": 6980 + }, + { + "epoch": 7.953031596925705, + "grad_norm": 6.411592483520508, + "learning_rate": 0.00016215396283655672, + "loss": 8.053181457519532, + "step": 6990 + }, + { + "epoch": 7.964417876458867, + "grad_norm": 6.627279758453369, + "learning_rate": 0.00016209978872094914, + "loss": 7.878144073486328, + "step": 7000 + }, + { + "epoch": 7.97580415599203, + "grad_norm": 7.155358791351318, + "learning_rate": 0.00016204561460534157, + "loss": 7.987397003173828, + "step": 7010 + }, + { + "epoch": 7.987190435525192, + "grad_norm": 6.340551376342773, + "learning_rate": 0.000161991440489734, + "loss": 7.976909637451172, + "step": 7020 + }, + { + "epoch": 7.998576715058355, + "grad_norm": 6.796611309051514, + "learning_rate": 0.00016193726637412644, + "loss": 7.7033638000488285, + "step": 7030 + }, + { + "epoch": 8.00910902362653, + "grad_norm": 6.260311126708984, + "learning_rate": 0.0001618830922585189, + "loss": 6.682421112060547, + "step": 7040 + }, + { + "epoch": 8.020495303159693, + "grad_norm": 7.427417278289795, + "learning_rate": 0.00016182891814291132, + "loss": 7.5024055480957035, + "step": 7050 + }, + { + "epoch": 8.031881582692854, + "grad_norm": 6.3494486808776855, + "learning_rate": 0.00016177474402730377, + "loss": 7.2604820251464846, + "step": 7060 + }, + { + "epoch": 8.043267862226017, + "grad_norm": 6.655555725097656, + "learning_rate": 0.0001617205699116962, + "loss": 7.514097595214844, + "step": 7070 + }, + { + "epoch": 8.05465414175918, + "grad_norm": 6.684828758239746, + "learning_rate": 0.00016166639579608865, + "loss": 7.446005249023438, + "step": 7080 + }, + { + "epoch": 8.066040421292342, + "grad_norm": 6.890992164611816, + "learning_rate": 0.00016161222168048107, + "loss": 7.443931579589844, + "step": 7090 + }, + { + "epoch": 8.077426700825505, + "grad_norm": 6.673741340637207, + "learning_rate": 0.00016155804756487352, + "loss": 7.16052017211914, + "step": 7100 + }, + { + "epoch": 8.088812980358668, + "grad_norm": 6.923252105712891, + "learning_rate": 0.00016150387344926595, + "loss": 6.938526153564453, + "step": 7110 + }, + { + "epoch": 8.10019925989183, + "grad_norm": 6.302939414978027, + "learning_rate": 0.0001614496993336584, + "loss": 7.243843841552734, + "step": 7120 + }, + { + "epoch": 8.111585539424993, + "grad_norm": 6.899267673492432, + "learning_rate": 0.00016139552521805082, + "loss": 7.297703552246094, + "step": 7130 + }, + { + "epoch": 8.122971818958156, + "grad_norm": 7.268035888671875, + "learning_rate": 0.00016134135110244327, + "loss": 7.230997467041016, + "step": 7140 + }, + { + "epoch": 8.134358098491319, + "grad_norm": 7.285719871520996, + "learning_rate": 0.00016129259439839647, + "loss": 7.024531555175781, + "step": 7150 + }, + { + "epoch": 8.14574437802448, + "grad_norm": 6.868885040283203, + "learning_rate": 0.0001612384202827889, + "loss": 7.714209747314453, + "step": 7160 + }, + { + "epoch": 8.157130657557643, + "grad_norm": 6.319274425506592, + "learning_rate": 0.00016118424616718132, + "loss": 6.995166015625, + "step": 7170 + }, + { + "epoch": 8.168516937090805, + "grad_norm": 6.736001491546631, + "learning_rate": 0.00016113007205157374, + "loss": 7.084954833984375, + "step": 7180 + }, + { + "epoch": 8.179903216623968, + "grad_norm": 6.393606185913086, + "learning_rate": 0.0001610758979359662, + "loss": 7.56659164428711, + "step": 7190 + }, + { + "epoch": 8.19128949615713, + "grad_norm": 6.33527135848999, + "learning_rate": 0.00016102172382035865, + "loss": 7.253793334960937, + "step": 7200 + }, + { + "epoch": 8.202675775690293, + "grad_norm": 7.905284404754639, + "learning_rate": 0.00016096754970475107, + "loss": 7.590048217773438, + "step": 7210 + }, + { + "epoch": 8.214062055223456, + "grad_norm": 6.864686965942383, + "learning_rate": 0.00016091337558914352, + "loss": 7.1065208435058596, + "step": 7220 + }, + { + "epoch": 8.225448334756619, + "grad_norm": 6.355857849121094, + "learning_rate": 0.00016085920147353595, + "loss": 7.114351654052735, + "step": 7230 + }, + { + "epoch": 8.23683461428978, + "grad_norm": 7.217507839202881, + "learning_rate": 0.0001608050273579284, + "loss": 7.072686767578125, + "step": 7240 + }, + { + "epoch": 8.248220893822943, + "grad_norm": 6.846201419830322, + "learning_rate": 0.00016075085324232082, + "loss": 7.156787109375, + "step": 7250 + }, + { + "epoch": 8.259607173356105, + "grad_norm": 7.219080448150635, + "learning_rate": 0.00016069667912671327, + "loss": 7.439443969726563, + "step": 7260 + }, + { + "epoch": 8.270993452889268, + "grad_norm": 6.303773880004883, + "learning_rate": 0.0001606425050111057, + "loss": 7.361647033691407, + "step": 7270 + }, + { + "epoch": 8.28237973242243, + "grad_norm": 7.611039161682129, + "learning_rate": 0.00016058833089549815, + "loss": 7.37061767578125, + "step": 7280 + }, + { + "epoch": 8.293766011955594, + "grad_norm": 6.5654497146606445, + "learning_rate": 0.00016053415677989057, + "loss": 7.721343994140625, + "step": 7290 + }, + { + "epoch": 8.305152291488756, + "grad_norm": 5.921697616577148, + "learning_rate": 0.000160479982664283, + "loss": 7.14508285522461, + "step": 7300 + }, + { + "epoch": 8.316538571021919, + "grad_norm": 6.788980484008789, + "learning_rate": 0.00016042580854867545, + "loss": 7.477375793457031, + "step": 7310 + }, + { + "epoch": 8.327924850555082, + "grad_norm": 7.130054950714111, + "learning_rate": 0.00016037163443306788, + "loss": 6.889068603515625, + "step": 7320 + }, + { + "epoch": 8.339311130088245, + "grad_norm": 6.373105049133301, + "learning_rate": 0.00016031746031746033, + "loss": 7.234850311279297, + "step": 7330 + }, + { + "epoch": 8.350697409621405, + "grad_norm": 7.498371124267578, + "learning_rate": 0.00016026328620185275, + "loss": 7.064684295654297, + "step": 7340 + }, + { + "epoch": 8.362083689154568, + "grad_norm": 6.624664783477783, + "learning_rate": 0.0001602091120862452, + "loss": 7.807039642333985, + "step": 7350 + }, + { + "epoch": 8.373469968687731, + "grad_norm": 6.387814044952393, + "learning_rate": 0.00016015493797063763, + "loss": 7.460430908203125, + "step": 7360 + }, + { + "epoch": 8.384856248220894, + "grad_norm": 7.011965274810791, + "learning_rate": 0.00016010076385503008, + "loss": 7.333013916015625, + "step": 7370 + }, + { + "epoch": 8.396242527754056, + "grad_norm": 6.425861835479736, + "learning_rate": 0.0001600465897394225, + "loss": 7.393313598632813, + "step": 7380 + }, + { + "epoch": 8.40762880728722, + "grad_norm": 6.637088775634766, + "learning_rate": 0.00015999241562381496, + "loss": 7.426857757568359, + "step": 7390 + }, + { + "epoch": 8.419015086820382, + "grad_norm": 7.036196708679199, + "learning_rate": 0.00015993824150820738, + "loss": 7.142207336425781, + "step": 7400 + }, + { + "epoch": 8.430401366353545, + "grad_norm": 6.357840061187744, + "learning_rate": 0.00015988406739259983, + "loss": 7.445525360107422, + "step": 7410 + }, + { + "epoch": 8.441787645886706, + "grad_norm": 6.880340099334717, + "learning_rate": 0.00015982989327699228, + "loss": 7.163670349121094, + "step": 7420 + }, + { + "epoch": 8.453173925419868, + "grad_norm": 6.362428188323975, + "learning_rate": 0.0001597757191613847, + "loss": 7.434315490722656, + "step": 7430 + }, + { + "epoch": 8.464560204953031, + "grad_norm": 6.664401531219482, + "learning_rate": 0.00015972154504577713, + "loss": 7.144511413574219, + "step": 7440 + }, + { + "epoch": 8.475946484486194, + "grad_norm": 6.501242637634277, + "learning_rate": 0.00015966737093016956, + "loss": 7.108064270019531, + "step": 7450 + }, + { + "epoch": 8.487332764019357, + "grad_norm": 6.9143476486206055, + "learning_rate": 0.000159613196814562, + "loss": 7.1984611511230465, + "step": 7460 + }, + { + "epoch": 8.49871904355252, + "grad_norm": 6.557959079742432, + "learning_rate": 0.00015955902269895443, + "loss": 7.022095489501953, + "step": 7470 + }, + { + "epoch": 8.510105323085682, + "grad_norm": 6.557929992675781, + "learning_rate": 0.00015950484858334688, + "loss": 7.145917510986328, + "step": 7480 + }, + { + "epoch": 8.521491602618845, + "grad_norm": 6.603511810302734, + "learning_rate": 0.0001594506744677393, + "loss": 7.341641998291015, + "step": 7490 + }, + { + "epoch": 8.532877882152007, + "grad_norm": 7.191463470458984, + "learning_rate": 0.00015939650035213176, + "loss": 6.9570457458496096, + "step": 7500 + }, + { + "epoch": 8.54426416168517, + "grad_norm": 6.919197082519531, + "learning_rate": 0.00015934232623652418, + "loss": 7.235923004150391, + "step": 7510 + }, + { + "epoch": 8.555650441218331, + "grad_norm": 6.883152008056641, + "learning_rate": 0.00015928815212091664, + "loss": 7.444821166992187, + "step": 7520 + }, + { + "epoch": 8.567036720751494, + "grad_norm": 6.3207244873046875, + "learning_rate": 0.0001592339780053091, + "loss": 7.598611450195312, + "step": 7530 + }, + { + "epoch": 8.578423000284657, + "grad_norm": 6.418648719787598, + "learning_rate": 0.0001591798038897015, + "loss": 7.209278869628906, + "step": 7540 + }, + { + "epoch": 8.58980927981782, + "grad_norm": 6.839104175567627, + "learning_rate": 0.00015912562977409396, + "loss": 7.241474914550781, + "step": 7550 + }, + { + "epoch": 8.601195559350982, + "grad_norm": 6.884853363037109, + "learning_rate": 0.0001590714556584864, + "loss": 7.021162414550782, + "step": 7560 + }, + { + "epoch": 8.612581838884145, + "grad_norm": 6.1609787940979, + "learning_rate": 0.00015901728154287884, + "loss": 7.474005126953125, + "step": 7570 + }, + { + "epoch": 8.623968118417308, + "grad_norm": 6.195271968841553, + "learning_rate": 0.00015896310742727126, + "loss": 6.983114624023438, + "step": 7580 + }, + { + "epoch": 8.63535439795047, + "grad_norm": 6.70726203918457, + "learning_rate": 0.0001589089333116637, + "loss": 7.420458984375, + "step": 7590 + }, + { + "epoch": 8.646740677483631, + "grad_norm": 7.531630516052246, + "learning_rate": 0.0001588547591960561, + "loss": 7.718193817138672, + "step": 7600 + }, + { + "epoch": 8.658126957016794, + "grad_norm": 6.9179534912109375, + "learning_rate": 0.00015880058508044856, + "loss": 7.357187652587891, + "step": 7610 + }, + { + "epoch": 8.669513236549957, + "grad_norm": 6.675024509429932, + "learning_rate": 0.000158746410964841, + "loss": 7.010201263427734, + "step": 7620 + }, + { + "epoch": 8.68089951608312, + "grad_norm": 6.622598648071289, + "learning_rate": 0.00015869223684923344, + "loss": 7.162730407714844, + "step": 7630 + }, + { + "epoch": 8.692285795616282, + "grad_norm": 6.835996627807617, + "learning_rate": 0.0001586380627336259, + "loss": 7.303845977783203, + "step": 7640 + }, + { + "epoch": 8.703672075149445, + "grad_norm": 6.367660045623779, + "learning_rate": 0.00015858388861801832, + "loss": 7.625505065917968, + "step": 7650 + }, + { + "epoch": 8.715058354682608, + "grad_norm": 6.169827938079834, + "learning_rate": 0.00015852971450241077, + "loss": 7.09228744506836, + "step": 7660 + }, + { + "epoch": 8.72644463421577, + "grad_norm": 6.215500354766846, + "learning_rate": 0.0001584755403868032, + "loss": 7.060032653808594, + "step": 7670 + }, + { + "epoch": 8.737830913748933, + "grad_norm": 6.325623035430908, + "learning_rate": 0.00015842136627119564, + "loss": 7.31326904296875, + "step": 7680 + }, + { + "epoch": 8.749217193282096, + "grad_norm": 6.559711933135986, + "learning_rate": 0.00015836719215558807, + "loss": 7.409132385253907, + "step": 7690 + }, + { + "epoch": 8.760603472815257, + "grad_norm": 6.366078853607178, + "learning_rate": 0.00015831301803998052, + "loss": 6.915621948242188, + "step": 7700 + }, + { + "epoch": 8.77198975234842, + "grad_norm": 6.734594345092773, + "learning_rate": 0.00015825884392437294, + "loss": 7.271147155761719, + "step": 7710 + }, + { + "epoch": 8.783376031881582, + "grad_norm": 7.609946250915527, + "learning_rate": 0.00015820466980876537, + "loss": 7.408287048339844, + "step": 7720 + }, + { + "epoch": 8.794762311414745, + "grad_norm": 6.976548671722412, + "learning_rate": 0.00015815049569315782, + "loss": 7.269252777099609, + "step": 7730 + }, + { + "epoch": 8.806148590947908, + "grad_norm": 7.052664279937744, + "learning_rate": 0.00015809632157755025, + "loss": 7.457789611816406, + "step": 7740 + }, + { + "epoch": 8.81753487048107, + "grad_norm": 6.683445453643799, + "learning_rate": 0.0001580421474619427, + "loss": 7.239376831054687, + "step": 7750 + }, + { + "epoch": 8.828921150014233, + "grad_norm": 6.740503311157227, + "learning_rate": 0.00015798797334633512, + "loss": 7.014515686035156, + "step": 7760 + }, + { + "epoch": 8.840307429547396, + "grad_norm": 7.012024879455566, + "learning_rate": 0.00015793379923072757, + "loss": 7.423218536376953, + "step": 7770 + }, + { + "epoch": 8.851693709080559, + "grad_norm": 6.79911994934082, + "learning_rate": 0.00015787962511512, + "loss": 7.527141571044922, + "step": 7780 + }, + { + "epoch": 8.86307998861372, + "grad_norm": 6.133242130279541, + "learning_rate": 0.00015782545099951245, + "loss": 7.559020233154297, + "step": 7790 + }, + { + "epoch": 8.874466268146882, + "grad_norm": 6.954835891723633, + "learning_rate": 0.00015777127688390487, + "loss": 7.530062103271485, + "step": 7800 + }, + { + "epoch": 8.885852547680045, + "grad_norm": 6.71362829208374, + "learning_rate": 0.00015771710276829733, + "loss": 7.113396453857422, + "step": 7810 + }, + { + "epoch": 8.897238827213208, + "grad_norm": 6.9403862953186035, + "learning_rate": 0.00015766292865268975, + "loss": 7.578011322021484, + "step": 7820 + }, + { + "epoch": 8.90862510674637, + "grad_norm": 6.492265701293945, + "learning_rate": 0.0001576087545370822, + "loss": 7.283029937744141, + "step": 7830 + }, + { + "epoch": 8.920011386279533, + "grad_norm": 5.762649059295654, + "learning_rate": 0.00015755458042147463, + "loss": 7.412893676757813, + "step": 7840 + }, + { + "epoch": 8.931397665812696, + "grad_norm": 6.935257911682129, + "learning_rate": 0.00015750040630586708, + "loss": 7.5289451599121096, + "step": 7850 + }, + { + "epoch": 8.942783945345859, + "grad_norm": 6.323666095733643, + "learning_rate": 0.0001574462321902595, + "loss": 7.003956604003906, + "step": 7860 + }, + { + "epoch": 8.954170224879022, + "grad_norm": 5.6248779296875, + "learning_rate": 0.00015739205807465193, + "loss": 7.441354370117187, + "step": 7870 + }, + { + "epoch": 8.965556504412183, + "grad_norm": 6.482755661010742, + "learning_rate": 0.00015733788395904438, + "loss": 7.2617744445800785, + "step": 7880 + }, + { + "epoch": 8.976942783945345, + "grad_norm": 6.673778533935547, + "learning_rate": 0.0001572837098434368, + "loss": 7.212700653076172, + "step": 7890 + }, + { + "epoch": 8.988329063478508, + "grad_norm": 6.261040210723877, + "learning_rate": 0.00015722953572782925, + "loss": 7.402249145507812, + "step": 7900 + }, + { + "epoch": 8.99971534301167, + "grad_norm": 6.743113994598389, + "learning_rate": 0.00015717536161222168, + "loss": 7.82423095703125, + "step": 7910 + }, + { + "epoch": 9.010247651579846, + "grad_norm": 6.245288372039795, + "learning_rate": 0.00015712118749661413, + "loss": 6.120718765258789, + "step": 7920 + }, + { + "epoch": 9.021633931113008, + "grad_norm": 6.373956203460693, + "learning_rate": 0.00015706701338100655, + "loss": 6.5073394775390625, + "step": 7930 + }, + { + "epoch": 9.033020210646171, + "grad_norm": 6.782715320587158, + "learning_rate": 0.000157012839265399, + "loss": 6.719621276855468, + "step": 7940 + }, + { + "epoch": 9.044406490179334, + "grad_norm": 6.204227924346924, + "learning_rate": 0.00015695866514979143, + "loss": 6.846994781494141, + "step": 7950 + }, + { + "epoch": 9.055792769712497, + "grad_norm": 5.706500053405762, + "learning_rate": 0.00015690449103418388, + "loss": 6.545850372314453, + "step": 7960 + }, + { + "epoch": 9.06717904924566, + "grad_norm": 6.953368186950684, + "learning_rate": 0.0001568503169185763, + "loss": 6.754498291015625, + "step": 7970 + }, + { + "epoch": 9.078565328778822, + "grad_norm": 7.3118743896484375, + "learning_rate": 0.00015679614280296876, + "loss": 7.14571533203125, + "step": 7980 + }, + { + "epoch": 9.089951608311985, + "grad_norm": 7.408244609832764, + "learning_rate": 0.00015674196868736118, + "loss": 6.966554260253906, + "step": 7990 + }, + { + "epoch": 9.101337887845146, + "grad_norm": 6.550361156463623, + "learning_rate": 0.00015668779457175363, + "loss": 6.683185577392578, + "step": 8000 + }, + { + "epoch": 9.112724167378309, + "grad_norm": 7.23964262008667, + "learning_rate": 0.00015663362045614606, + "loss": 6.666042327880859, + "step": 8010 + }, + { + "epoch": 9.124110446911471, + "grad_norm": 6.447325706481934, + "learning_rate": 0.00015657944634053848, + "loss": 6.605396270751953, + "step": 8020 + }, + { + "epoch": 9.135496726444634, + "grad_norm": 6.252988338470459, + "learning_rate": 0.00015652527222493093, + "loss": 6.713479614257812, + "step": 8030 + }, + { + "epoch": 9.146883005977797, + "grad_norm": 7.02787971496582, + "learning_rate": 0.00015647109810932336, + "loss": 6.7254997253417965, + "step": 8040 + }, + { + "epoch": 9.15826928551096, + "grad_norm": 6.618356704711914, + "learning_rate": 0.0001564169239937158, + "loss": 6.731692504882813, + "step": 8050 + }, + { + "epoch": 9.169655565044122, + "grad_norm": 6.890060901641846, + "learning_rate": 0.00015636274987810824, + "loss": 6.741816711425781, + "step": 8060 + }, + { + "epoch": 9.181041844577285, + "grad_norm": 7.1875901222229, + "learning_rate": 0.0001563085757625007, + "loss": 6.789379119873047, + "step": 8070 + }, + { + "epoch": 9.192428124110448, + "grad_norm": 7.181351184844971, + "learning_rate": 0.0001562544016468931, + "loss": 6.818449401855469, + "step": 8080 + }, + { + "epoch": 9.203814403643609, + "grad_norm": 7.285808086395264, + "learning_rate": 0.00015620022753128556, + "loss": 6.539177703857422, + "step": 8090 + }, + { + "epoch": 9.215200683176771, + "grad_norm": 6.714638710021973, + "learning_rate": 0.00015614605341567801, + "loss": 6.656063842773437, + "step": 8100 + }, + { + "epoch": 9.226586962709934, + "grad_norm": 7.055619716644287, + "learning_rate": 0.00015609187930007044, + "loss": 6.620281982421875, + "step": 8110 + }, + { + "epoch": 9.237973242243097, + "grad_norm": 6.574080467224121, + "learning_rate": 0.0001560377051844629, + "loss": 6.747283172607422, + "step": 8120 + }, + { + "epoch": 9.24935952177626, + "grad_norm": 6.460296630859375, + "learning_rate": 0.00015598353106885531, + "loss": 6.993648529052734, + "step": 8130 + }, + { + "epoch": 9.260745801309422, + "grad_norm": 7.454887866973877, + "learning_rate": 0.00015592935695324774, + "loss": 6.948415374755859, + "step": 8140 + }, + { + "epoch": 9.272132080842585, + "grad_norm": 6.729802131652832, + "learning_rate": 0.00015587518283764016, + "loss": 6.89251937866211, + "step": 8150 + }, + { + "epoch": 9.283518360375748, + "grad_norm": 6.335530757904053, + "learning_rate": 0.00015582100872203262, + "loss": 6.703284454345703, + "step": 8160 + }, + { + "epoch": 9.29490463990891, + "grad_norm": 7.840336322784424, + "learning_rate": 0.00015576683460642504, + "loss": 7.09771957397461, + "step": 8170 + }, + { + "epoch": 9.306290919442072, + "grad_norm": 6.923723220825195, + "learning_rate": 0.0001557126604908175, + "loss": 6.888484954833984, + "step": 8180 + }, + { + "epoch": 9.317677198975234, + "grad_norm": 6.690946578979492, + "learning_rate": 0.00015565848637520992, + "loss": 7.289754486083984, + "step": 8190 + }, + { + "epoch": 9.329063478508397, + "grad_norm": 6.629517555236816, + "learning_rate": 0.00015560431225960237, + "loss": 7.009606170654297, + "step": 8200 + }, + { + "epoch": 9.34044975804156, + "grad_norm": 7.02864933013916, + "learning_rate": 0.00015555013814399482, + "loss": 6.7587532043457035, + "step": 8210 + }, + { + "epoch": 9.351836037574722, + "grad_norm": 6.852596282958984, + "learning_rate": 0.00015549596402838724, + "loss": 6.829173278808594, + "step": 8220 + }, + { + "epoch": 9.363222317107885, + "grad_norm": 6.9407243728637695, + "learning_rate": 0.0001554417899127797, + "loss": 6.788723754882812, + "step": 8230 + }, + { + "epoch": 9.374608596641048, + "grad_norm": 6.7164812088012695, + "learning_rate": 0.00015538761579717212, + "loss": 6.483773040771484, + "step": 8240 + }, + { + "epoch": 9.38599487617421, + "grad_norm": 6.983746528625488, + "learning_rate": 0.00015533344168156457, + "loss": 6.634527587890625, + "step": 8250 + }, + { + "epoch": 9.397381155707373, + "grad_norm": 7.202597618103027, + "learning_rate": 0.000155279267565957, + "loss": 6.860771942138672, + "step": 8260 + }, + { + "epoch": 9.408767435240534, + "grad_norm": 6.925684452056885, + "learning_rate": 0.00015522509345034945, + "loss": 7.078357696533203, + "step": 8270 + }, + { + "epoch": 9.420153714773697, + "grad_norm": 7.451114177703857, + "learning_rate": 0.00015517091933474187, + "loss": 6.643927764892578, + "step": 8280 + }, + { + "epoch": 9.43153999430686, + "grad_norm": 6.825454235076904, + "learning_rate": 0.0001551167452191343, + "loss": 6.680137634277344, + "step": 8290 + }, + { + "epoch": 9.442926273840023, + "grad_norm": 6.374398708343506, + "learning_rate": 0.00015506257110352672, + "loss": 6.5555778503417965, + "step": 8300 + }, + { + "epoch": 9.454312553373185, + "grad_norm": 7.115841388702393, + "learning_rate": 0.00015500839698791917, + "loss": 7.261045837402344, + "step": 8310 + }, + { + "epoch": 9.465698832906348, + "grad_norm": 6.24588680267334, + "learning_rate": 0.00015495422287231162, + "loss": 6.373281860351563, + "step": 8320 + }, + { + "epoch": 9.47708511243951, + "grad_norm": 6.433933258056641, + "learning_rate": 0.00015490004875670405, + "loss": 7.186137390136719, + "step": 8330 + }, + { + "epoch": 9.488471391972674, + "grad_norm": 6.373920440673828, + "learning_rate": 0.0001548458746410965, + "loss": 6.6659690856933596, + "step": 8340 + }, + { + "epoch": 9.499857671505836, + "grad_norm": 6.638841152191162, + "learning_rate": 0.00015479170052548892, + "loss": 6.709719848632813, + "step": 8350 + }, + { + "epoch": 9.511243951038997, + "grad_norm": 7.437965393066406, + "learning_rate": 0.00015473752640988138, + "loss": 6.974462890625, + "step": 8360 + }, + { + "epoch": 9.52263023057216, + "grad_norm": 7.176238059997559, + "learning_rate": 0.0001546833522942738, + "loss": 7.053614807128906, + "step": 8370 + }, + { + "epoch": 9.534016510105323, + "grad_norm": 6.885675430297852, + "learning_rate": 0.00015462917817866625, + "loss": 6.691897583007813, + "step": 8380 + }, + { + "epoch": 9.545402789638485, + "grad_norm": 5.81322717666626, + "learning_rate": 0.00015457500406305868, + "loss": 6.6090576171875, + "step": 8390 + }, + { + "epoch": 9.556789069171648, + "grad_norm": 6.862832069396973, + "learning_rate": 0.00015452082994745113, + "loss": 7.214694976806641, + "step": 8400 + }, + { + "epoch": 9.568175348704811, + "grad_norm": 7.299694538116455, + "learning_rate": 0.00015446665583184355, + "loss": 6.833685302734375, + "step": 8410 + }, + { + "epoch": 9.579561628237974, + "grad_norm": 6.393362522125244, + "learning_rate": 0.000154412481716236, + "loss": 7.033847045898438, + "step": 8420 + }, + { + "epoch": 9.590947907771136, + "grad_norm": 7.435060977935791, + "learning_rate": 0.00015435830760062843, + "loss": 6.689753723144531, + "step": 8430 + }, + { + "epoch": 9.602334187304299, + "grad_norm": 7.483737468719482, + "learning_rate": 0.00015430413348502085, + "loss": 7.141274261474609, + "step": 8440 + }, + { + "epoch": 9.61372046683746, + "grad_norm": 6.771705150604248, + "learning_rate": 0.0001542499593694133, + "loss": 6.743639373779297, + "step": 8450 + }, + { + "epoch": 9.625106746370623, + "grad_norm": 6.806461334228516, + "learning_rate": 0.00015419578525380573, + "loss": 6.662245178222657, + "step": 8460 + }, + { + "epoch": 9.636493025903786, + "grad_norm": 6.6654372215271, + "learning_rate": 0.00015414161113819818, + "loss": 6.932222747802735, + "step": 8470 + }, + { + "epoch": 9.647879305436948, + "grad_norm": 6.507349491119385, + "learning_rate": 0.0001540874370225906, + "loss": 7.092259216308594, + "step": 8480 + }, + { + "epoch": 9.659265584970111, + "grad_norm": 7.4099202156066895, + "learning_rate": 0.00015403326290698306, + "loss": 6.74954833984375, + "step": 8490 + }, + { + "epoch": 9.670651864503274, + "grad_norm": 6.947366714477539, + "learning_rate": 0.00015397908879137548, + "loss": 6.710967254638672, + "step": 8500 + }, + { + "epoch": 9.682038144036436, + "grad_norm": 7.391485214233398, + "learning_rate": 0.00015392491467576793, + "loss": 6.349737167358398, + "step": 8510 + }, + { + "epoch": 9.6934244235696, + "grad_norm": 6.834575653076172, + "learning_rate": 0.00015387074056016036, + "loss": 6.986430358886719, + "step": 8520 + }, + { + "epoch": 9.704810703102762, + "grad_norm": 6.834963321685791, + "learning_rate": 0.0001538165664445528, + "loss": 6.768232727050782, + "step": 8530 + }, + { + "epoch": 9.716196982635923, + "grad_norm": 6.545167446136475, + "learning_rate": 0.00015376239232894526, + "loss": 7.306761932373047, + "step": 8540 + }, + { + "epoch": 9.727583262169086, + "grad_norm": 6.159801959991455, + "learning_rate": 0.00015370821821333768, + "loss": 6.77575454711914, + "step": 8550 + }, + { + "epoch": 9.738969541702248, + "grad_norm": 6.571531772613525, + "learning_rate": 0.0001536540440977301, + "loss": 6.770185089111328, + "step": 8560 + }, + { + "epoch": 9.750355821235411, + "grad_norm": 7.542463779449463, + "learning_rate": 0.00015359986998212253, + "loss": 7.183005523681641, + "step": 8570 + }, + { + "epoch": 9.761742100768574, + "grad_norm": 6.521090984344482, + "learning_rate": 0.00015354569586651499, + "loss": 6.722921752929688, + "step": 8580 + }, + { + "epoch": 9.773128380301737, + "grad_norm": 6.779551029205322, + "learning_rate": 0.0001534915217509074, + "loss": 6.963375854492187, + "step": 8590 + }, + { + "epoch": 9.7845146598349, + "grad_norm": 7.098196029663086, + "learning_rate": 0.00015343734763529986, + "loss": 7.068116760253906, + "step": 8600 + }, + { + "epoch": 9.795900939368062, + "grad_norm": 6.990663528442383, + "learning_rate": 0.00015338317351969229, + "loss": 6.641743469238281, + "step": 8610 + }, + { + "epoch": 9.807287218901225, + "grad_norm": 6.753698348999023, + "learning_rate": 0.00015332899940408474, + "loss": 6.880230712890625, + "step": 8620 + }, + { + "epoch": 9.818673498434386, + "grad_norm": 7.067351341247559, + "learning_rate": 0.00015327482528847716, + "loss": 6.862619781494141, + "step": 8630 + }, + { + "epoch": 9.830059777967548, + "grad_norm": 7.574794292449951, + "learning_rate": 0.0001532206511728696, + "loss": 6.817996978759766, + "step": 8640 + }, + { + "epoch": 9.841446057500711, + "grad_norm": 6.906538963317871, + "learning_rate": 0.00015316647705726204, + "loss": 6.774374389648438, + "step": 8650 + }, + { + "epoch": 9.852832337033874, + "grad_norm": 6.665681838989258, + "learning_rate": 0.0001531123029416545, + "loss": 6.8718719482421875, + "step": 8660 + }, + { + "epoch": 9.864218616567037, + "grad_norm": 6.376792907714844, + "learning_rate": 0.00015305812882604694, + "loss": 6.672955322265625, + "step": 8670 + }, + { + "epoch": 9.8756048961002, + "grad_norm": 7.075804710388184, + "learning_rate": 0.00015300395471043937, + "loss": 7.0490264892578125, + "step": 8680 + }, + { + "epoch": 9.886991175633362, + "grad_norm": 6.787551403045654, + "learning_rate": 0.00015294978059483182, + "loss": 7.223753356933594, + "step": 8690 + }, + { + "epoch": 9.898377455166525, + "grad_norm": 6.694747447967529, + "learning_rate": 0.00015289560647922424, + "loss": 6.798531341552734, + "step": 8700 + }, + { + "epoch": 9.909763734699688, + "grad_norm": 6.917640686035156, + "learning_rate": 0.00015284143236361667, + "loss": 7.0906829833984375, + "step": 8710 + }, + { + "epoch": 9.921150014232849, + "grad_norm": 7.117720127105713, + "learning_rate": 0.0001527872582480091, + "loss": 7.004290771484375, + "step": 8720 + }, + { + "epoch": 9.932536293766011, + "grad_norm": 7.377773761749268, + "learning_rate": 0.00015273308413240154, + "loss": 6.975653839111328, + "step": 8730 + }, + { + "epoch": 9.943922573299174, + "grad_norm": 6.890756130218506, + "learning_rate": 0.00015267891001679397, + "loss": 6.900422668457031, + "step": 8740 + }, + { + "epoch": 9.955308852832337, + "grad_norm": 7.174078941345215, + "learning_rate": 0.00015262473590118642, + "loss": 7.064826202392578, + "step": 8750 + }, + { + "epoch": 9.9666951323655, + "grad_norm": 6.936395168304443, + "learning_rate": 0.00015257056178557884, + "loss": 6.7657325744628904, + "step": 8760 + }, + { + "epoch": 9.978081411898662, + "grad_norm": 6.216085433959961, + "learning_rate": 0.0001525163876699713, + "loss": 6.81783447265625, + "step": 8770 + }, + { + "epoch": 9.989467691431825, + "grad_norm": 6.8290839195251465, + "learning_rate": 0.00015246221355436375, + "loss": 6.718660736083985, + "step": 8780 + }, + { + "epoch": 10.0, + "grad_norm": 5.303323745727539, + "learning_rate": 0.00015240803943875617, + "loss": 6.455754089355469, + "step": 8790 + }, + { + "epoch": 10.011386279533163, + "grad_norm": 6.515944480895996, + "learning_rate": 0.00015235386532314862, + "loss": 6.108221435546875, + "step": 8800 + }, + { + "epoch": 10.022772559066325, + "grad_norm": 6.763042449951172, + "learning_rate": 0.00015229969120754105, + "loss": 6.451548004150391, + "step": 8810 + }, + { + "epoch": 10.034158838599488, + "grad_norm": 7.289494037628174, + "learning_rate": 0.0001522455170919335, + "loss": 6.422400665283203, + "step": 8820 + }, + { + "epoch": 10.045545118132651, + "grad_norm": 6.849950790405273, + "learning_rate": 0.00015219134297632592, + "loss": 6.369514083862304, + "step": 8830 + }, + { + "epoch": 10.056931397665812, + "grad_norm": 6.839559555053711, + "learning_rate": 0.00015213716886071835, + "loss": 6.199007415771485, + "step": 8840 + }, + { + "epoch": 10.068317677198975, + "grad_norm": 7.444436550140381, + "learning_rate": 0.0001520829947451108, + "loss": 6.206901931762696, + "step": 8850 + }, + { + "epoch": 10.079703956732137, + "grad_norm": 7.2634782791137695, + "learning_rate": 0.00015202882062950322, + "loss": 6.411515808105468, + "step": 8860 + }, + { + "epoch": 10.0910902362653, + "grad_norm": 7.075826168060303, + "learning_rate": 0.00015197464651389565, + "loss": 6.321298980712891, + "step": 8870 + }, + { + "epoch": 10.102476515798463, + "grad_norm": 8.332730293273926, + "learning_rate": 0.0001519204723982881, + "loss": 6.658344268798828, + "step": 8880 + }, + { + "epoch": 10.113862795331626, + "grad_norm": 6.364902019500732, + "learning_rate": 0.00015186629828268055, + "loss": 6.109456634521484, + "step": 8890 + }, + { + "epoch": 10.125249074864788, + "grad_norm": 7.3898138999938965, + "learning_rate": 0.00015181212416707297, + "loss": 6.706474304199219, + "step": 8900 + }, + { + "epoch": 10.136635354397951, + "grad_norm": 7.5353264808654785, + "learning_rate": 0.00015175795005146543, + "loss": 6.191032791137696, + "step": 8910 + }, + { + "epoch": 10.148021633931114, + "grad_norm": 6.525182723999023, + "learning_rate": 0.00015170377593585785, + "loss": 6.050387191772461, + "step": 8920 + }, + { + "epoch": 10.159407913464275, + "grad_norm": 6.768884658813477, + "learning_rate": 0.0001516496018202503, + "loss": 6.448866271972657, + "step": 8930 + }, + { + "epoch": 10.170794192997437, + "grad_norm": 6.508273601531982, + "learning_rate": 0.00015159542770464273, + "loss": 6.556841278076172, + "step": 8940 + }, + { + "epoch": 10.1821804725306, + "grad_norm": 7.144713401794434, + "learning_rate": 0.00015154125358903518, + "loss": 6.7161407470703125, + "step": 8950 + }, + { + "epoch": 10.193566752063763, + "grad_norm": 6.919196128845215, + "learning_rate": 0.0001514870794734276, + "loss": 6.426815032958984, + "step": 8960 + }, + { + "epoch": 10.204953031596926, + "grad_norm": 6.691727161407471, + "learning_rate": 0.00015143290535782005, + "loss": 6.2255096435546875, + "step": 8970 + }, + { + "epoch": 10.216339311130088, + "grad_norm": 6.866335391998291, + "learning_rate": 0.00015137873124221248, + "loss": 6.1784015655517575, + "step": 8980 + }, + { + "epoch": 10.227725590663251, + "grad_norm": 6.891946792602539, + "learning_rate": 0.0001513245571266049, + "loss": 6.152831268310547, + "step": 8990 + }, + { + "epoch": 10.239111870196414, + "grad_norm": 6.802209854125977, + "learning_rate": 0.00015127038301099736, + "loss": 6.270240783691406, + "step": 9000 + }, + { + "epoch": 10.250498149729577, + "grad_norm": 7.624642372131348, + "learning_rate": 0.00015121620889538978, + "loss": 6.247557067871094, + "step": 9010 + }, + { + "epoch": 10.261884429262738, + "grad_norm": 7.880180835723877, + "learning_rate": 0.00015116203477978223, + "loss": 6.371080017089843, + "step": 9020 + }, + { + "epoch": 10.2732707087959, + "grad_norm": 6.9289116859436035, + "learning_rate": 0.00015110786066417466, + "loss": 6.2116645812988285, + "step": 9030 + }, + { + "epoch": 10.284656988329063, + "grad_norm": 7.059987545013428, + "learning_rate": 0.0001510536865485671, + "loss": 6.152032852172852, + "step": 9040 + }, + { + "epoch": 10.296043267862226, + "grad_norm": 7.274111270904541, + "learning_rate": 0.00015099951243295953, + "loss": 6.412078857421875, + "step": 9050 + }, + { + "epoch": 10.307429547395389, + "grad_norm": 6.609608173370361, + "learning_rate": 0.00015094533831735198, + "loss": 6.340757369995117, + "step": 9060 + }, + { + "epoch": 10.318815826928551, + "grad_norm": 6.39749002456665, + "learning_rate": 0.0001508911642017444, + "loss": 6.413553619384766, + "step": 9070 + }, + { + "epoch": 10.330202106461714, + "grad_norm": 6.884727478027344, + "learning_rate": 0.00015083699008613686, + "loss": 6.569889068603516, + "step": 9080 + }, + { + "epoch": 10.341588385994877, + "grad_norm": 7.041878700256348, + "learning_rate": 0.00015078281597052928, + "loss": 6.179785537719726, + "step": 9090 + }, + { + "epoch": 10.35297466552804, + "grad_norm": 6.370310306549072, + "learning_rate": 0.00015072864185492174, + "loss": 6.136286926269531, + "step": 9100 + }, + { + "epoch": 10.3643609450612, + "grad_norm": 7.227022647857666, + "learning_rate": 0.0001506744677393142, + "loss": 6.1931709289550785, + "step": 9110 + }, + { + "epoch": 10.375747224594363, + "grad_norm": 6.813343048095703, + "learning_rate": 0.0001506202936237066, + "loss": 6.568285369873047, + "step": 9120 + }, + { + "epoch": 10.387133504127526, + "grad_norm": 6.486794948577881, + "learning_rate": 0.00015056611950809904, + "loss": 6.357745361328125, + "step": 9130 + }, + { + "epoch": 10.398519783660689, + "grad_norm": 7.465872764587402, + "learning_rate": 0.00015051194539249146, + "loss": 6.26553955078125, + "step": 9140 + }, + { + "epoch": 10.409906063193851, + "grad_norm": 7.438695907592773, + "learning_rate": 0.0001504577712768839, + "loss": 6.602199554443359, + "step": 9150 + }, + { + "epoch": 10.421292342727014, + "grad_norm": 8.035082817077637, + "learning_rate": 0.00015040359716127634, + "loss": 6.462114715576172, + "step": 9160 + }, + { + "epoch": 10.432678622260177, + "grad_norm": 6.581029891967773, + "learning_rate": 0.0001503494230456688, + "loss": 6.531999206542968, + "step": 9170 + }, + { + "epoch": 10.44406490179334, + "grad_norm": 7.6987175941467285, + "learning_rate": 0.0001502952489300612, + "loss": 6.53165283203125, + "step": 9180 + }, + { + "epoch": 10.455451181326502, + "grad_norm": 5.949412822723389, + "learning_rate": 0.00015024107481445366, + "loss": 5.95707893371582, + "step": 9190 + }, + { + "epoch": 10.466837460859663, + "grad_norm": 7.387345314025879, + "learning_rate": 0.0001501869006988461, + "loss": 6.544392395019531, + "step": 9200 + }, + { + "epoch": 10.478223740392826, + "grad_norm": 7.590228080749512, + "learning_rate": 0.00015013272658323854, + "loss": 6.59503173828125, + "step": 9210 + }, + { + "epoch": 10.489610019925989, + "grad_norm": 6.827988147735596, + "learning_rate": 0.000150078552467631, + "loss": 6.5579277038574215, + "step": 9220 + }, + { + "epoch": 10.500996299459151, + "grad_norm": 6.989532470703125, + "learning_rate": 0.00015002437835202342, + "loss": 6.056536102294922, + "step": 9230 + }, + { + "epoch": 10.512382578992314, + "grad_norm": 6.390324592590332, + "learning_rate": 0.00014997020423641587, + "loss": 6.437796783447266, + "step": 9240 + }, + { + "epoch": 10.523768858525477, + "grad_norm": 7.057545185089111, + "learning_rate": 0.0001499160301208083, + "loss": 6.424283599853515, + "step": 9250 + }, + { + "epoch": 10.53515513805864, + "grad_norm": 7.236074924468994, + "learning_rate": 0.00014986185600520072, + "loss": 6.465802001953125, + "step": 9260 + }, + { + "epoch": 10.546541417591802, + "grad_norm": 7.380622863769531, + "learning_rate": 0.00014980768188959314, + "loss": 6.5994873046875, + "step": 9270 + }, + { + "epoch": 10.557927697124965, + "grad_norm": 6.613060474395752, + "learning_rate": 0.0001497535077739856, + "loss": 6.399759292602539, + "step": 9280 + }, + { + "epoch": 10.569313976658126, + "grad_norm": 7.060276508331299, + "learning_rate": 0.00014969933365837802, + "loss": 6.557682800292969, + "step": 9290 + }, + { + "epoch": 10.580700256191289, + "grad_norm": 6.692377090454102, + "learning_rate": 0.00014964515954277047, + "loss": 6.2362712860107425, + "step": 9300 + }, + { + "epoch": 10.592086535724452, + "grad_norm": 7.573443412780762, + "learning_rate": 0.0001495909854271629, + "loss": 6.369208908081054, + "step": 9310 + }, + { + "epoch": 10.603472815257614, + "grad_norm": 7.296620845794678, + "learning_rate": 0.00014953681131155534, + "loss": 6.499317932128906, + "step": 9320 + }, + { + "epoch": 10.614859094790777, + "grad_norm": 7.194809436798096, + "learning_rate": 0.00014948263719594777, + "loss": 6.53308334350586, + "step": 9330 + }, + { + "epoch": 10.62624537432394, + "grad_norm": 7.089186191558838, + "learning_rate": 0.00014942846308034022, + "loss": 6.357146835327148, + "step": 9340 + }, + { + "epoch": 10.637631653857103, + "grad_norm": 6.138644695281982, + "learning_rate": 0.00014937428896473267, + "loss": 6.987527465820312, + "step": 9350 + }, + { + "epoch": 10.649017933390265, + "grad_norm": 6.458044528961182, + "learning_rate": 0.0001493201148491251, + "loss": 6.447171783447265, + "step": 9360 + }, + { + "epoch": 10.660404212923428, + "grad_norm": 6.982960224151611, + "learning_rate": 0.00014926594073351755, + "loss": 6.367984390258789, + "step": 9370 + }, + { + "epoch": 10.67179049245659, + "grad_norm": 7.014771461486816, + "learning_rate": 0.00014921176661790997, + "loss": 6.597218322753906, + "step": 9380 + }, + { + "epoch": 10.683176771989752, + "grad_norm": 7.192128658294678, + "learning_rate": 0.00014915759250230242, + "loss": 6.468855285644532, + "step": 9390 + }, + { + "epoch": 10.694563051522914, + "grad_norm": 6.968006134033203, + "learning_rate": 0.00014910341838669485, + "loss": 6.514220428466797, + "step": 9400 + }, + { + "epoch": 10.705949331056077, + "grad_norm": 7.2908220291137695, + "learning_rate": 0.00014904924427108727, + "loss": 5.984500885009766, + "step": 9410 + }, + { + "epoch": 10.71733561058924, + "grad_norm": 7.383593559265137, + "learning_rate": 0.0001489950701554797, + "loss": 6.390668869018555, + "step": 9420 + }, + { + "epoch": 10.728721890122403, + "grad_norm": 7.428356170654297, + "learning_rate": 0.00014894089603987215, + "loss": 6.504000854492188, + "step": 9430 + }, + { + "epoch": 10.740108169655565, + "grad_norm": 6.935097694396973, + "learning_rate": 0.00014888672192426457, + "loss": 6.668525695800781, + "step": 9440 + }, + { + "epoch": 10.751494449188728, + "grad_norm": 7.8939595222473145, + "learning_rate": 0.00014883254780865703, + "loss": 6.7964630126953125, + "step": 9450 + }, + { + "epoch": 10.76288072872189, + "grad_norm": 7.261744976043701, + "learning_rate": 0.00014877837369304948, + "loss": 6.494899749755859, + "step": 9460 + }, + { + "epoch": 10.774267008255052, + "grad_norm": 7.5321044921875, + "learning_rate": 0.0001487241995774419, + "loss": 6.319461441040039, + "step": 9470 + }, + { + "epoch": 10.785653287788215, + "grad_norm": 7.152090072631836, + "learning_rate": 0.00014867002546183435, + "loss": 6.386759948730469, + "step": 9480 + }, + { + "epoch": 10.797039567321377, + "grad_norm": 6.318842887878418, + "learning_rate": 0.00014861585134622678, + "loss": 6.789165496826172, + "step": 9490 + }, + { + "epoch": 10.80842584685454, + "grad_norm": 7.488574981689453, + "learning_rate": 0.00014856167723061923, + "loss": 6.679385375976563, + "step": 9500 + }, + { + "epoch": 10.819812126387703, + "grad_norm": 7.495085716247559, + "learning_rate": 0.00014851292052657242, + "loss": 6.523779296875, + "step": 9510 + }, + { + "epoch": 10.831198405920865, + "grad_norm": 6.9265522956848145, + "learning_rate": 0.00014845874641096485, + "loss": 6.4424797058105465, + "step": 9520 + }, + { + "epoch": 10.842584685454028, + "grad_norm": 6.2375359535217285, + "learning_rate": 0.0001484045722953573, + "loss": 6.325106811523438, + "step": 9530 + }, + { + "epoch": 10.853970964987191, + "grad_norm": 7.461876392364502, + "learning_rate": 0.00014835039817974973, + "loss": 6.4818115234375, + "step": 9540 + }, + { + "epoch": 10.865357244520354, + "grad_norm": 6.406781196594238, + "learning_rate": 0.00014829622406414218, + "loss": 6.454816436767578, + "step": 9550 + }, + { + "epoch": 10.876743524053516, + "grad_norm": 6.492645263671875, + "learning_rate": 0.0001482420499485346, + "loss": 6.191008377075195, + "step": 9560 + }, + { + "epoch": 10.888129803586677, + "grad_norm": 7.3028645515441895, + "learning_rate": 0.00014818787583292703, + "loss": 6.245122909545898, + "step": 9570 + }, + { + "epoch": 10.89951608311984, + "grad_norm": 6.861218452453613, + "learning_rate": 0.00014813370171731945, + "loss": 6.476416778564453, + "step": 9580 + }, + { + "epoch": 10.910902362653003, + "grad_norm": 7.525650501251221, + "learning_rate": 0.0001480795276017119, + "loss": 6.472120666503907, + "step": 9590 + }, + { + "epoch": 10.922288642186166, + "grad_norm": 7.155680179595947, + "learning_rate": 0.00014802535348610435, + "loss": 6.589835357666016, + "step": 9600 + }, + { + "epoch": 10.933674921719328, + "grad_norm": 7.079594612121582, + "learning_rate": 0.00014797117937049678, + "loss": 6.457518005371094, + "step": 9610 + }, + { + "epoch": 10.945061201252491, + "grad_norm": 7.258676052093506, + "learning_rate": 0.00014791700525488923, + "loss": 6.361143112182617, + "step": 9620 + }, + { + "epoch": 10.956447480785654, + "grad_norm": 7.055770397186279, + "learning_rate": 0.00014786283113928165, + "loss": 6.50369873046875, + "step": 9630 + }, + { + "epoch": 10.967833760318817, + "grad_norm": 6.666393756866455, + "learning_rate": 0.0001478086570236741, + "loss": 6.464437866210938, + "step": 9640 + }, + { + "epoch": 10.979220039851977, + "grad_norm": 6.598408222198486, + "learning_rate": 0.00014775448290806653, + "loss": 6.123414993286133, + "step": 9650 + }, + { + "epoch": 10.99060631938514, + "grad_norm": 6.270415306091309, + "learning_rate": 0.00014770030879245898, + "loss": 6.181151580810547, + "step": 9660 + }, + { + "epoch": 11.001138627953317, + "grad_norm": 6.535286903381348, + "learning_rate": 0.0001476461346768514, + "loss": 5.540151977539063, + "step": 9670 + }, + { + "epoch": 11.012524907486478, + "grad_norm": 6.144158363342285, + "learning_rate": 0.00014759196056124386, + "loss": 5.872829818725586, + "step": 9680 + }, + { + "epoch": 11.02391118701964, + "grad_norm": 7.178324222564697, + "learning_rate": 0.00014753778644563628, + "loss": 6.154612350463867, + "step": 9690 + }, + { + "epoch": 11.035297466552803, + "grad_norm": 6.866833209991455, + "learning_rate": 0.0001474836123300287, + "loss": 5.634689712524414, + "step": 9700 + }, + { + "epoch": 11.046683746085966, + "grad_norm": 7.015516757965088, + "learning_rate": 0.00014742943821442116, + "loss": 5.930116271972656, + "step": 9710 + }, + { + "epoch": 11.058070025619129, + "grad_norm": 7.600584983825684, + "learning_rate": 0.00014737526409881358, + "loss": 5.9960792541503904, + "step": 9720 + }, + { + "epoch": 11.069456305152292, + "grad_norm": 7.366182804107666, + "learning_rate": 0.00014732108998320603, + "loss": 5.631900787353516, + "step": 9730 + }, + { + "epoch": 11.080842584685454, + "grad_norm": 7.046829700469971, + "learning_rate": 0.00014726691586759846, + "loss": 6.252207183837891, + "step": 9740 + }, + { + "epoch": 11.092228864218617, + "grad_norm": 6.92673397064209, + "learning_rate": 0.0001472127417519909, + "loss": 6.086066818237304, + "step": 9750 + }, + { + "epoch": 11.10361514375178, + "grad_norm": 9.307673454284668, + "learning_rate": 0.00014715856763638333, + "loss": 6.022590255737304, + "step": 9760 + }, + { + "epoch": 11.11500142328494, + "grad_norm": 7.187988758087158, + "learning_rate": 0.00014710439352077579, + "loss": 5.973920440673828, + "step": 9770 + }, + { + "epoch": 11.126387702818104, + "grad_norm": 7.109073638916016, + "learning_rate": 0.0001470502194051682, + "loss": 5.864291000366211, + "step": 9780 + }, + { + "epoch": 11.137773982351266, + "grad_norm": 7.3360276222229, + "learning_rate": 0.00014699604528956066, + "loss": 6.124382781982422, + "step": 9790 + }, + { + "epoch": 11.149160261884429, + "grad_norm": 7.7672343254089355, + "learning_rate": 0.0001469418711739531, + "loss": 6.148440170288086, + "step": 9800 + }, + { + "epoch": 11.160546541417592, + "grad_norm": 6.8055100440979, + "learning_rate": 0.00014688769705834554, + "loss": 5.931351852416992, + "step": 9810 + }, + { + "epoch": 11.171932820950754, + "grad_norm": 7.583737373352051, + "learning_rate": 0.000146833522942738, + "loss": 5.668278503417969, + "step": 9820 + }, + { + "epoch": 11.183319100483917, + "grad_norm": 7.5180463790893555, + "learning_rate": 0.00014677934882713041, + "loss": 6.020656585693359, + "step": 9830 + }, + { + "epoch": 11.19470538001708, + "grad_norm": 7.665243625640869, + "learning_rate": 0.00014672517471152284, + "loss": 5.651744842529297, + "step": 9840 + }, + { + "epoch": 11.206091659550243, + "grad_norm": 7.284232139587402, + "learning_rate": 0.00014667100059591526, + "loss": 5.927288818359375, + "step": 9850 + }, + { + "epoch": 11.217477939083404, + "grad_norm": 6.677280426025391, + "learning_rate": 0.00014661682648030772, + "loss": 5.800758743286133, + "step": 9860 + }, + { + "epoch": 11.228864218616566, + "grad_norm": 6.858931064605713, + "learning_rate": 0.00014656265236470014, + "loss": 5.876351928710937, + "step": 9870 + }, + { + "epoch": 11.240250498149729, + "grad_norm": 6.993981838226318, + "learning_rate": 0.0001465084782490926, + "loss": 5.805530166625976, + "step": 9880 + }, + { + "epoch": 11.251636777682892, + "grad_norm": 7.795031547546387, + "learning_rate": 0.00014645430413348502, + "loss": 6.2553356170654295, + "step": 9890 + }, + { + "epoch": 11.263023057216055, + "grad_norm": 7.493925094604492, + "learning_rate": 0.00014640013001787747, + "loss": 6.046601867675781, + "step": 9900 + }, + { + "epoch": 11.274409336749217, + "grad_norm": 7.555092811584473, + "learning_rate": 0.0001463459559022699, + "loss": 5.973406982421875, + "step": 9910 + }, + { + "epoch": 11.28579561628238, + "grad_norm": 6.385543346405029, + "learning_rate": 0.00014629178178666234, + "loss": 6.176846694946289, + "step": 9920 + }, + { + "epoch": 11.297181895815543, + "grad_norm": 7.017746448516846, + "learning_rate": 0.00014623760767105477, + "loss": 6.001755142211914, + "step": 9930 + }, + { + "epoch": 11.308568175348706, + "grad_norm": 7.1694416999816895, + "learning_rate": 0.00014618343355544722, + "loss": 6.172482299804687, + "step": 9940 + }, + { + "epoch": 11.319954454881866, + "grad_norm": 7.279458999633789, + "learning_rate": 0.00014612925943983967, + "loss": 6.369392013549804, + "step": 9950 + }, + { + "epoch": 11.33134073441503, + "grad_norm": 6.9396772384643555, + "learning_rate": 0.0001460750853242321, + "loss": 5.813999176025391, + "step": 9960 + }, + { + "epoch": 11.342727013948192, + "grad_norm": 7.31268310546875, + "learning_rate": 0.00014602091120862455, + "loss": 5.91674575805664, + "step": 9970 + }, + { + "epoch": 11.354113293481355, + "grad_norm": 6.88545560836792, + "learning_rate": 0.00014596673709301697, + "loss": 5.994848251342773, + "step": 9980 + }, + { + "epoch": 11.365499573014517, + "grad_norm": 7.708160400390625, + "learning_rate": 0.0001459125629774094, + "loss": 5.773896408081055, + "step": 9990 + }, + { + "epoch": 11.37688585254768, + "grad_norm": 7.004419803619385, + "learning_rate": 0.00014585838886180182, + "loss": 5.685453796386719, + "step": 10000 + }, + { + "epoch": 11.388272132080843, + "grad_norm": 6.840615749359131, + "learning_rate": 0.00014580421474619427, + "loss": 5.64319953918457, + "step": 10010 + }, + { + "epoch": 11.399658411614006, + "grad_norm": 7.056737899780273, + "learning_rate": 0.0001457500406305867, + "loss": 5.996237945556641, + "step": 10020 + }, + { + "epoch": 11.411044691147168, + "grad_norm": 7.051031589508057, + "learning_rate": 0.00014569586651497915, + "loss": 6.065059661865234, + "step": 10030 + }, + { + "epoch": 11.422430970680331, + "grad_norm": 6.999020576477051, + "learning_rate": 0.00014564169239937157, + "loss": 5.551327896118164, + "step": 10040 + }, + { + "epoch": 11.433817250213492, + "grad_norm": 7.3235650062561035, + "learning_rate": 0.00014558751828376402, + "loss": 5.727095031738282, + "step": 10050 + }, + { + "epoch": 11.445203529746655, + "grad_norm": 6.848404884338379, + "learning_rate": 0.00014553334416815648, + "loss": 6.206461334228516, + "step": 10060 + }, + { + "epoch": 11.456589809279818, + "grad_norm": 6.96482515335083, + "learning_rate": 0.0001454791700525489, + "loss": 6.19579963684082, + "step": 10070 + }, + { + "epoch": 11.46797608881298, + "grad_norm": 7.321053981781006, + "learning_rate": 0.00014542499593694135, + "loss": 6.439669799804688, + "step": 10080 + }, + { + "epoch": 11.479362368346143, + "grad_norm": 6.996761798858643, + "learning_rate": 0.00014537082182133378, + "loss": 5.96948356628418, + "step": 10090 + }, + { + "epoch": 11.490748647879306, + "grad_norm": 6.9859466552734375, + "learning_rate": 0.00014531664770572623, + "loss": 5.9932861328125, + "step": 10100 + }, + { + "epoch": 11.502134927412468, + "grad_norm": 6.946524620056152, + "learning_rate": 0.00014526247359011865, + "loss": 5.791952514648438, + "step": 10110 + }, + { + "epoch": 11.513521206945631, + "grad_norm": 7.48444938659668, + "learning_rate": 0.00014520829947451108, + "loss": 6.16157112121582, + "step": 10120 + }, + { + "epoch": 11.524907486478792, + "grad_norm": 7.355890274047852, + "learning_rate": 0.0001451541253589035, + "loss": 5.91258659362793, + "step": 10130 + }, + { + "epoch": 11.536293766011955, + "grad_norm": 7.450939178466797, + "learning_rate": 0.00014509995124329595, + "loss": 6.787471008300781, + "step": 10140 + }, + { + "epoch": 11.547680045545118, + "grad_norm": 7.768852710723877, + "learning_rate": 0.00014504577712768838, + "loss": 6.156349563598633, + "step": 10150 + }, + { + "epoch": 11.55906632507828, + "grad_norm": 7.6208953857421875, + "learning_rate": 0.00014499160301208083, + "loss": 6.0792900085449215, + "step": 10160 + }, + { + "epoch": 11.570452604611443, + "grad_norm": 6.2815632820129395, + "learning_rate": 0.00014493742889647328, + "loss": 5.962456130981446, + "step": 10170 + }, + { + "epoch": 11.581838884144606, + "grad_norm": 7.529908180236816, + "learning_rate": 0.0001448832547808657, + "loss": 6.132223510742188, + "step": 10180 + }, + { + "epoch": 11.593225163677769, + "grad_norm": 7.08761739730835, + "learning_rate": 0.00014482908066525816, + "loss": 6.127777099609375, + "step": 10190 + }, + { + "epoch": 11.604611443210931, + "grad_norm": 8.239002227783203, + "learning_rate": 0.00014477490654965058, + "loss": 5.923508834838867, + "step": 10200 + }, + { + "epoch": 11.615997722744094, + "grad_norm": 7.077022552490234, + "learning_rate": 0.00014472073243404303, + "loss": 5.75263786315918, + "step": 10210 + }, + { + "epoch": 11.627384002277257, + "grad_norm": 7.765398979187012, + "learning_rate": 0.00014466655831843546, + "loss": 6.193927001953125, + "step": 10220 + }, + { + "epoch": 11.638770281810418, + "grad_norm": 7.009711265563965, + "learning_rate": 0.0001446123842028279, + "loss": 5.800412368774414, + "step": 10230 + }, + { + "epoch": 11.65015656134358, + "grad_norm": 7.33494234085083, + "learning_rate": 0.00014455821008722033, + "loss": 5.910943222045899, + "step": 10240 + }, + { + "epoch": 11.661542840876743, + "grad_norm": 7.698012351989746, + "learning_rate": 0.00014450403597161278, + "loss": 6.299586486816406, + "step": 10250 + }, + { + "epoch": 11.672929120409906, + "grad_norm": 7.655066967010498, + "learning_rate": 0.0001444498618560052, + "loss": 6.288556671142578, + "step": 10260 + }, + { + "epoch": 11.684315399943069, + "grad_norm": 7.222828388214111, + "learning_rate": 0.00014439568774039763, + "loss": 5.758666610717773, + "step": 10270 + }, + { + "epoch": 11.695701679476231, + "grad_norm": 7.702454566955566, + "learning_rate": 0.00014434151362479008, + "loss": 6.061179733276367, + "step": 10280 + }, + { + "epoch": 11.707087959009394, + "grad_norm": 8.399511337280273, + "learning_rate": 0.0001442873395091825, + "loss": 6.088422012329102, + "step": 10290 + }, + { + "epoch": 11.718474238542557, + "grad_norm": 6.886438846588135, + "learning_rate": 0.00014423316539357496, + "loss": 6.215003585815429, + "step": 10300 + }, + { + "epoch": 11.729860518075718, + "grad_norm": 6.97210168838501, + "learning_rate": 0.00014417899127796739, + "loss": 5.770806121826172, + "step": 10310 + }, + { + "epoch": 11.74124679760888, + "grad_norm": 6.872792720794678, + "learning_rate": 0.00014412481716235984, + "loss": 6.1828559875488285, + "step": 10320 + }, + { + "epoch": 11.752633077142043, + "grad_norm": 7.779046535491943, + "learning_rate": 0.00014407064304675226, + "loss": 5.819805908203125, + "step": 10330 + }, + { + "epoch": 11.764019356675206, + "grad_norm": 6.890455722808838, + "learning_rate": 0.0001440164689311447, + "loss": 5.905824279785156, + "step": 10340 + }, + { + "epoch": 11.775405636208369, + "grad_norm": 7.692152976989746, + "learning_rate": 0.00014396229481553714, + "loss": 5.950515365600586, + "step": 10350 + }, + { + "epoch": 11.786791915741532, + "grad_norm": 7.703096866607666, + "learning_rate": 0.0001439081206999296, + "loss": 6.3083740234375, + "step": 10360 + }, + { + "epoch": 11.798178195274694, + "grad_norm": 7.19696044921875, + "learning_rate": 0.00014385394658432201, + "loss": 6.3550971984863285, + "step": 10370 + }, + { + "epoch": 11.809564474807857, + "grad_norm": 6.914282321929932, + "learning_rate": 0.00014379977246871447, + "loss": 5.792062759399414, + "step": 10380 + }, + { + "epoch": 11.82095075434102, + "grad_norm": 7.224524974822998, + "learning_rate": 0.0001437455983531069, + "loss": 5.989566802978516, + "step": 10390 + }, + { + "epoch": 11.832337033874182, + "grad_norm": 7.418369293212891, + "learning_rate": 0.00014369142423749934, + "loss": 5.995355224609375, + "step": 10400 + }, + { + "epoch": 11.843723313407343, + "grad_norm": 6.645310878753662, + "learning_rate": 0.00014363725012189177, + "loss": 5.791942596435547, + "step": 10410 + }, + { + "epoch": 11.855109592940506, + "grad_norm": 7.1590375900268555, + "learning_rate": 0.0001435830760062842, + "loss": 5.956428146362304, + "step": 10420 + }, + { + "epoch": 11.866495872473669, + "grad_norm": 7.662755489349365, + "learning_rate": 0.00014352890189067664, + "loss": 6.0646930694580075, + "step": 10430 + }, + { + "epoch": 11.877882152006832, + "grad_norm": 7.7084550857543945, + "learning_rate": 0.00014347472777506907, + "loss": 6.142111206054688, + "step": 10440 + }, + { + "epoch": 11.889268431539994, + "grad_norm": 6.89467191696167, + "learning_rate": 0.00014342055365946152, + "loss": 5.906317138671875, + "step": 10450 + }, + { + "epoch": 11.900654711073157, + "grad_norm": 7.120398044586182, + "learning_rate": 0.00014336637954385394, + "loss": 6.146725463867187, + "step": 10460 + }, + { + "epoch": 11.91204099060632, + "grad_norm": 7.415362358093262, + "learning_rate": 0.0001433122054282464, + "loss": 6.037253570556641, + "step": 10470 + }, + { + "epoch": 11.923427270139483, + "grad_norm": 7.9902262687683105, + "learning_rate": 0.00014325803131263882, + "loss": 6.28980598449707, + "step": 10480 + }, + { + "epoch": 11.934813549672645, + "grad_norm": 6.856546878814697, + "learning_rate": 0.00014320385719703127, + "loss": 6.058418273925781, + "step": 10490 + }, + { + "epoch": 11.946199829205806, + "grad_norm": 7.198825836181641, + "learning_rate": 0.00014314968308142372, + "loss": 6.076106262207031, + "step": 10500 + }, + { + "epoch": 11.957586108738969, + "grad_norm": 7.608924388885498, + "learning_rate": 0.00014309550896581615, + "loss": 5.793802261352539, + "step": 10510 + }, + { + "epoch": 11.968972388272132, + "grad_norm": 7.0075225830078125, + "learning_rate": 0.0001430413348502086, + "loss": 6.062651443481445, + "step": 10520 + }, + { + "epoch": 11.980358667805294, + "grad_norm": 6.545762062072754, + "learning_rate": 0.00014298716073460102, + "loss": 5.8560432434082035, + "step": 10530 + }, + { + "epoch": 11.991744947338457, + "grad_norm": 7.019240379333496, + "learning_rate": 0.00014293298661899345, + "loss": 6.309774398803711, + "step": 10540 + }, + { + "epoch": 12.002277255906632, + "grad_norm": 6.560851097106934, + "learning_rate": 0.00014287881250338587, + "loss": 5.244910812377929, + "step": 10550 + }, + { + "epoch": 12.013663535439795, + "grad_norm": 6.521633625030518, + "learning_rate": 0.00014282463838777832, + "loss": 5.584713363647461, + "step": 10560 + }, + { + "epoch": 12.025049814972958, + "grad_norm": 7.09952974319458, + "learning_rate": 0.00014277046427217075, + "loss": 5.414521789550781, + "step": 10570 + }, + { + "epoch": 12.03643609450612, + "grad_norm": 7.587541103363037, + "learning_rate": 0.0001427162901565632, + "loss": 5.422273635864258, + "step": 10580 + }, + { + "epoch": 12.047822374039283, + "grad_norm": 8.005366325378418, + "learning_rate": 0.00014266211604095562, + "loss": 5.438925552368164, + "step": 10590 + }, + { + "epoch": 12.059208653572446, + "grad_norm": 7.477734565734863, + "learning_rate": 0.00014260794192534807, + "loss": 5.222858810424805, + "step": 10600 + }, + { + "epoch": 12.070594933105609, + "grad_norm": 7.249375343322754, + "learning_rate": 0.00014255376780974053, + "loss": 5.38745231628418, + "step": 10610 + }, + { + "epoch": 12.08198121263877, + "grad_norm": 7.713613510131836, + "learning_rate": 0.00014249959369413295, + "loss": 5.812319946289063, + "step": 10620 + }, + { + "epoch": 12.093367492171932, + "grad_norm": 7.597423553466797, + "learning_rate": 0.0001424454195785254, + "loss": 5.536786270141602, + "step": 10630 + }, + { + "epoch": 12.104753771705095, + "grad_norm": 6.994418621063232, + "learning_rate": 0.00014239124546291783, + "loss": 5.595279312133789, + "step": 10640 + }, + { + "epoch": 12.116140051238258, + "grad_norm": 7.918735027313232, + "learning_rate": 0.00014233707134731028, + "loss": 5.551082611083984, + "step": 10650 + }, + { + "epoch": 12.12752633077142, + "grad_norm": 7.9564666748046875, + "learning_rate": 0.0001422828972317027, + "loss": 5.564613342285156, + "step": 10660 + }, + { + "epoch": 12.138912610304583, + "grad_norm": 6.786741733551025, + "learning_rate": 0.00014222872311609515, + "loss": 5.819846725463867, + "step": 10670 + }, + { + "epoch": 12.150298889837746, + "grad_norm": 6.291092872619629, + "learning_rate": 0.00014217454900048758, + "loss": 5.597542190551758, + "step": 10680 + }, + { + "epoch": 12.161685169370909, + "grad_norm": 8.643335342407227, + "learning_rate": 0.00014212037488488, + "loss": 5.279590225219726, + "step": 10690 + }, + { + "epoch": 12.173071448904071, + "grad_norm": 8.687835693359375, + "learning_rate": 0.00014206620076927243, + "loss": 5.638618087768554, + "step": 10700 + }, + { + "epoch": 12.184457728437232, + "grad_norm": 7.45721960067749, + "learning_rate": 0.00014201202665366488, + "loss": 5.268462371826172, + "step": 10710 + }, + { + "epoch": 12.195844007970395, + "grad_norm": 7.256382942199707, + "learning_rate": 0.0001419578525380573, + "loss": 5.795388412475586, + "step": 10720 + }, + { + "epoch": 12.207230287503558, + "grad_norm": 7.054190635681152, + "learning_rate": 0.00014190367842244976, + "loss": 5.79822006225586, + "step": 10730 + }, + { + "epoch": 12.21861656703672, + "grad_norm": 6.79879093170166, + "learning_rate": 0.0001418495043068422, + "loss": 5.52197036743164, + "step": 10740 + }, + { + "epoch": 12.230002846569883, + "grad_norm": 7.19872522354126, + "learning_rate": 0.00014179533019123463, + "loss": 5.513645935058594, + "step": 10750 + }, + { + "epoch": 12.241389126103046, + "grad_norm": 6.726789951324463, + "learning_rate": 0.00014174115607562708, + "loss": 5.6524711608886715, + "step": 10760 + }, + { + "epoch": 12.252775405636209, + "grad_norm": 7.368878364562988, + "learning_rate": 0.0001416869819600195, + "loss": 5.634532928466797, + "step": 10770 + }, + { + "epoch": 12.264161685169372, + "grad_norm": 7.050151348114014, + "learning_rate": 0.00014163280784441196, + "loss": 5.614359283447266, + "step": 10780 + }, + { + "epoch": 12.275547964702534, + "grad_norm": 7.243103981018066, + "learning_rate": 0.00014157863372880438, + "loss": 5.495789337158203, + "step": 10790 + }, + { + "epoch": 12.286934244235695, + "grad_norm": 7.616795063018799, + "learning_rate": 0.00014152445961319684, + "loss": 5.778979110717773, + "step": 10800 + }, + { + "epoch": 12.298320523768858, + "grad_norm": 7.822340488433838, + "learning_rate": 0.00014147028549758926, + "loss": 5.528537750244141, + "step": 10810 + }, + { + "epoch": 12.30970680330202, + "grad_norm": 7.852270603179932, + "learning_rate": 0.00014141611138198168, + "loss": 5.403163909912109, + "step": 10820 + }, + { + "epoch": 12.321093082835183, + "grad_norm": 7.205753326416016, + "learning_rate": 0.00014136193726637414, + "loss": 5.604093551635742, + "step": 10830 + }, + { + "epoch": 12.332479362368346, + "grad_norm": 6.92218017578125, + "learning_rate": 0.00014130776315076656, + "loss": 5.867033004760742, + "step": 10840 + }, + { + "epoch": 12.343865641901509, + "grad_norm": 7.30786657333374, + "learning_rate": 0.000141253589035159, + "loss": 5.671744537353516, + "step": 10850 + }, + { + "epoch": 12.355251921434672, + "grad_norm": 7.897021293640137, + "learning_rate": 0.00014119941491955144, + "loss": 5.346432495117187, + "step": 10860 + }, + { + "epoch": 12.366638200967834, + "grad_norm": 7.8339738845825195, + "learning_rate": 0.0001411452408039439, + "loss": 5.636806488037109, + "step": 10870 + }, + { + "epoch": 12.378024480500997, + "grad_norm": 7.503673553466797, + "learning_rate": 0.0001410910666883363, + "loss": 5.6904136657714846, + "step": 10880 + }, + { + "epoch": 12.389410760034158, + "grad_norm": 7.782256603240967, + "learning_rate": 0.00014103689257272876, + "loss": 5.786883544921875, + "step": 10890 + }, + { + "epoch": 12.40079703956732, + "grad_norm": 7.115951061248779, + "learning_rate": 0.0001409827184571212, + "loss": 5.459210968017578, + "step": 10900 + }, + { + "epoch": 12.412183319100484, + "grad_norm": 7.4705281257629395, + "learning_rate": 0.00014092854434151364, + "loss": 5.9279014587402346, + "step": 10910 + }, + { + "epoch": 12.423569598633646, + "grad_norm": 7.553955554962158, + "learning_rate": 0.00014087437022590606, + "loss": 5.823044204711914, + "step": 10920 + }, + { + "epoch": 12.434955878166809, + "grad_norm": 7.552109241485596, + "learning_rate": 0.00014082019611029852, + "loss": 5.539102172851562, + "step": 10930 + }, + { + "epoch": 12.446342157699972, + "grad_norm": 6.597175598144531, + "learning_rate": 0.00014076602199469094, + "loss": 6.030828857421875, + "step": 10940 + }, + { + "epoch": 12.457728437233134, + "grad_norm": 6.793281078338623, + "learning_rate": 0.0001407118478790834, + "loss": 5.719361114501953, + "step": 10950 + }, + { + "epoch": 12.469114716766297, + "grad_norm": 7.471744537353516, + "learning_rate": 0.00014065767376347582, + "loss": 5.419223403930664, + "step": 10960 + }, + { + "epoch": 12.48050099629946, + "grad_norm": 6.892999649047852, + "learning_rate": 0.00014060349964786824, + "loss": 5.7255298614501955, + "step": 10970 + }, + { + "epoch": 12.491887275832621, + "grad_norm": 7.7696533203125, + "learning_rate": 0.0001405493255322607, + "loss": 6.058211898803711, + "step": 10980 + }, + { + "epoch": 12.503273555365784, + "grad_norm": 7.769250869750977, + "learning_rate": 0.00014049515141665312, + "loss": 5.425541687011719, + "step": 10990 + }, + { + "epoch": 12.514659834898946, + "grad_norm": 7.765056610107422, + "learning_rate": 0.00014044097730104557, + "loss": 5.620434951782227, + "step": 11000 + }, + { + "epoch": 12.52604611443211, + "grad_norm": 7.338669300079346, + "learning_rate": 0.000140386803185438, + "loss": 5.756977081298828, + "step": 11010 + }, + { + "epoch": 12.537432393965272, + "grad_norm": 7.880843162536621, + "learning_rate": 0.00014033262906983044, + "loss": 5.7971046447753904, + "step": 11020 + }, + { + "epoch": 12.548818673498435, + "grad_norm": 7.511135101318359, + "learning_rate": 0.00014027845495422287, + "loss": 5.453466415405273, + "step": 11030 + }, + { + "epoch": 12.560204953031597, + "grad_norm": 7.005084037780762, + "learning_rate": 0.00014022428083861532, + "loss": 5.595069503784179, + "step": 11040 + }, + { + "epoch": 12.57159123256476, + "grad_norm": 7.802360534667969, + "learning_rate": 0.00014017010672300775, + "loss": 6.111663436889648, + "step": 11050 + }, + { + "epoch": 12.582977512097923, + "grad_norm": 7.821834564208984, + "learning_rate": 0.0001401159326074002, + "loss": 5.398255920410156, + "step": 11060 + }, + { + "epoch": 12.594363791631084, + "grad_norm": 7.565220832824707, + "learning_rate": 0.00014006175849179265, + "loss": 5.964350891113281, + "step": 11070 + }, + { + "epoch": 12.605750071164247, + "grad_norm": 7.540068626403809, + "learning_rate": 0.00014000758437618507, + "loss": 5.82092399597168, + "step": 11080 + }, + { + "epoch": 12.61713635069741, + "grad_norm": 7.710811138153076, + "learning_rate": 0.00013995341026057752, + "loss": 5.725724411010742, + "step": 11090 + }, + { + "epoch": 12.628522630230572, + "grad_norm": 6.454217433929443, + "learning_rate": 0.00013989923614496995, + "loss": 5.528153991699218, + "step": 11100 + }, + { + "epoch": 12.639908909763735, + "grad_norm": 6.6477837562561035, + "learning_rate": 0.00013984506202936237, + "loss": 5.676776885986328, + "step": 11110 + }, + { + "epoch": 12.651295189296897, + "grad_norm": 6.51104736328125, + "learning_rate": 0.0001397908879137548, + "loss": 5.7555488586425785, + "step": 11120 + }, + { + "epoch": 12.66268146883006, + "grad_norm": 7.661293029785156, + "learning_rate": 0.00013973671379814725, + "loss": 5.601126861572266, + "step": 11130 + }, + { + "epoch": 12.674067748363223, + "grad_norm": 7.333502292633057, + "learning_rate": 0.00013968253968253967, + "loss": 5.845965957641601, + "step": 11140 + }, + { + "epoch": 12.685454027896386, + "grad_norm": 8.663681030273438, + "learning_rate": 0.00013962836556693213, + "loss": 5.837159729003906, + "step": 11150 + }, + { + "epoch": 12.696840307429547, + "grad_norm": 6.259514808654785, + "learning_rate": 0.00013957419145132455, + "loss": 5.721532821655273, + "step": 11160 + }, + { + "epoch": 12.70822658696271, + "grad_norm": 6.932978630065918, + "learning_rate": 0.000139520017335717, + "loss": 5.9294178009033205, + "step": 11170 + }, + { + "epoch": 12.719612866495872, + "grad_norm": 7.781189441680908, + "learning_rate": 0.00013946584322010945, + "loss": 5.403998184204101, + "step": 11180 + }, + { + "epoch": 12.730999146029035, + "grad_norm": 7.4265336990356445, + "learning_rate": 0.00013941166910450188, + "loss": 5.511170196533203, + "step": 11190 + }, + { + "epoch": 12.742385425562198, + "grad_norm": 8.001691818237305, + "learning_rate": 0.00013935749498889433, + "loss": 5.863291931152344, + "step": 11200 + }, + { + "epoch": 12.75377170509536, + "grad_norm": 6.850292682647705, + "learning_rate": 0.00013930332087328675, + "loss": 5.665401458740234, + "step": 11210 + }, + { + "epoch": 12.765157984628523, + "grad_norm": 8.020200729370117, + "learning_rate": 0.0001392491467576792, + "loss": 5.8318031311035154, + "step": 11220 + }, + { + "epoch": 12.776544264161686, + "grad_norm": 7.251684188842773, + "learning_rate": 0.00013919497264207163, + "loss": 5.751366806030274, + "step": 11230 + }, + { + "epoch": 12.787930543694848, + "grad_norm": 8.099172592163086, + "learning_rate": 0.00013914079852646405, + "loss": 5.816202545166016, + "step": 11240 + }, + { + "epoch": 12.79931682322801, + "grad_norm": 7.1152024269104, + "learning_rate": 0.00013908662441085648, + "loss": 5.4952960968017575, + "step": 11250 + }, + { + "epoch": 12.810703102761172, + "grad_norm": 7.486706256866455, + "learning_rate": 0.00013903245029524893, + "loss": 5.637163162231445, + "step": 11260 + }, + { + "epoch": 12.822089382294335, + "grad_norm": 7.652496337890625, + "learning_rate": 0.00013897827617964135, + "loss": 5.713247680664063, + "step": 11270 + }, + { + "epoch": 12.833475661827498, + "grad_norm": 7.462140083312988, + "learning_rate": 0.0001389241020640338, + "loss": 5.825135040283203, + "step": 11280 + }, + { + "epoch": 12.84486194136066, + "grad_norm": 7.004937648773193, + "learning_rate": 0.00013886992794842626, + "loss": 5.827632904052734, + "step": 11290 + }, + { + "epoch": 12.856248220893823, + "grad_norm": 7.434920310974121, + "learning_rate": 0.00013881575383281868, + "loss": 5.4522960662841795, + "step": 11300 + }, + { + "epoch": 12.867634500426986, + "grad_norm": 6.466522693634033, + "learning_rate": 0.00013876157971721113, + "loss": 5.210875701904297, + "step": 11310 + }, + { + "epoch": 12.879020779960149, + "grad_norm": 7.1903510093688965, + "learning_rate": 0.00013870740560160356, + "loss": 5.339904022216797, + "step": 11320 + }, + { + "epoch": 12.890407059493311, + "grad_norm": 7.564988613128662, + "learning_rate": 0.000138653231485996, + "loss": 5.6416679382324215, + "step": 11330 + }, + { + "epoch": 12.901793339026472, + "grad_norm": 8.242656707763672, + "learning_rate": 0.00013859905737038843, + "loss": 5.6682594299316404, + "step": 11340 + }, + { + "epoch": 12.913179618559635, + "grad_norm": 7.32057523727417, + "learning_rate": 0.00013854488325478089, + "loss": 5.620843124389649, + "step": 11350 + }, + { + "epoch": 12.924565898092798, + "grad_norm": 7.37946891784668, + "learning_rate": 0.0001384907091391733, + "loss": 5.613275527954102, + "step": 11360 + }, + { + "epoch": 12.93595217762596, + "grad_norm": 7.773562431335449, + "learning_rate": 0.00013843653502356576, + "loss": 5.346442794799804, + "step": 11370 + }, + { + "epoch": 12.947338457159123, + "grad_norm": 7.526298999786377, + "learning_rate": 0.00013838236090795819, + "loss": 5.606759643554687, + "step": 11380 + }, + { + "epoch": 12.958724736692286, + "grad_norm": 7.332315921783447, + "learning_rate": 0.0001383281867923506, + "loss": 5.419976425170899, + "step": 11390 + }, + { + "epoch": 12.970111016225449, + "grad_norm": 7.382599830627441, + "learning_rate": 0.00013827401267674304, + "loss": 5.998534393310547, + "step": 11400 + }, + { + "epoch": 12.981497295758611, + "grad_norm": 7.097250461578369, + "learning_rate": 0.0001382198385611355, + "loss": 5.930072021484375, + "step": 11410 + }, + { + "epoch": 12.992883575291774, + "grad_norm": 6.900557041168213, + "learning_rate": 0.00013816566444552794, + "loss": 5.5991252899169925, + "step": 11420 + }, + { + "epoch": 13.00341588385995, + "grad_norm": 6.776562213897705, + "learning_rate": 0.00013811149032992036, + "loss": 4.849653244018555, + "step": 11430 + }, + { + "epoch": 13.014802163393112, + "grad_norm": 7.419433116912842, + "learning_rate": 0.00013805731621431281, + "loss": 5.020238876342773, + "step": 11440 + }, + { + "epoch": 13.026188442926275, + "grad_norm": 6.931766986846924, + "learning_rate": 0.00013800314209870524, + "loss": 5.281552124023437, + "step": 11450 + }, + { + "epoch": 13.037574722459436, + "grad_norm": 7.03234338760376, + "learning_rate": 0.0001379489679830977, + "loss": 5.1979625701904295, + "step": 11460 + }, + { + "epoch": 13.048961001992598, + "grad_norm": 7.978625774383545, + "learning_rate": 0.00013789479386749011, + "loss": 5.348671722412109, + "step": 11470 + }, + { + "epoch": 13.060347281525761, + "grad_norm": 7.255367755889893, + "learning_rate": 0.00013784061975188257, + "loss": 5.117027282714844, + "step": 11480 + }, + { + "epoch": 13.071733561058924, + "grad_norm": 7.438676834106445, + "learning_rate": 0.000137786445636275, + "loss": 5.207336044311523, + "step": 11490 + }, + { + "epoch": 13.083119840592087, + "grad_norm": 6.7722578048706055, + "learning_rate": 0.00013773227152066744, + "loss": 4.885826110839844, + "step": 11500 + }, + { + "epoch": 13.09450612012525, + "grad_norm": 6.830091953277588, + "learning_rate": 0.00013767809740505987, + "loss": 4.998377990722656, + "step": 11510 + }, + { + "epoch": 13.105892399658412, + "grad_norm": 7.623549461364746, + "learning_rate": 0.00013762392328945232, + "loss": 5.187086486816407, + "step": 11520 + }, + { + "epoch": 13.117278679191575, + "grad_norm": 8.049860954284668, + "learning_rate": 0.00013756974917384474, + "loss": 4.911079788208008, + "step": 11530 + }, + { + "epoch": 13.128664958724737, + "grad_norm": 7.040311336517334, + "learning_rate": 0.00013751557505823717, + "loss": 5.190642547607422, + "step": 11540 + }, + { + "epoch": 13.140051238257898, + "grad_norm": 7.5759806632995605, + "learning_rate": 0.00013746140094262962, + "loss": 5.331892776489258, + "step": 11550 + }, + { + "epoch": 13.151437517791061, + "grad_norm": 7.5590362548828125, + "learning_rate": 0.00013740722682702204, + "loss": 5.309208679199219, + "step": 11560 + }, + { + "epoch": 13.162823797324224, + "grad_norm": 7.6602559089660645, + "learning_rate": 0.0001373530527114145, + "loss": 5.0000556945800785, + "step": 11570 + }, + { + "epoch": 13.174210076857387, + "grad_norm": 6.687933921813965, + "learning_rate": 0.00013729887859580692, + "loss": 5.226931381225586, + "step": 11580 + }, + { + "epoch": 13.18559635639055, + "grad_norm": 7.443079471588135, + "learning_rate": 0.00013724470448019937, + "loss": 5.208231735229492, + "step": 11590 + }, + { + "epoch": 13.196982635923712, + "grad_norm": 7.559839725494385, + "learning_rate": 0.0001371905303645918, + "loss": 5.439892578125, + "step": 11600 + }, + { + "epoch": 13.208368915456875, + "grad_norm": 7.104702472686768, + "learning_rate": 0.00013713635624898425, + "loss": 5.115517807006836, + "step": 11610 + }, + { + "epoch": 13.219755194990038, + "grad_norm": NaN, + "learning_rate": 0.00013708218213337667, + "loss": 4.99559440612793, + "step": 11620 + }, + { + "epoch": 13.2311414745232, + "grad_norm": 7.759593486785889, + "learning_rate": 0.00013703342542932987, + "loss": 5.390767288208008, + "step": 11630 + }, + { + "epoch": 13.242527754056361, + "grad_norm": 7.8164286613464355, + "learning_rate": 0.00013697925131372232, + "loss": 5.311045455932617, + "step": 11640 + }, + { + "epoch": 13.253914033589524, + "grad_norm": 6.780811309814453, + "learning_rate": 0.00013692507719811474, + "loss": 5.088868713378906, + "step": 11650 + }, + { + "epoch": 13.265300313122687, + "grad_norm": 7.4553022384643555, + "learning_rate": 0.0001368709030825072, + "loss": 5.636979675292968, + "step": 11660 + }, + { + "epoch": 13.27668659265585, + "grad_norm": 7.379472255706787, + "learning_rate": 0.00013681672896689962, + "loss": 5.3572509765625, + "step": 11670 + }, + { + "epoch": 13.288072872189012, + "grad_norm": 7.313671588897705, + "learning_rate": 0.00013676255485129204, + "loss": 5.362029647827148, + "step": 11680 + }, + { + "epoch": 13.299459151722175, + "grad_norm": 7.911259174346924, + "learning_rate": 0.0001367083807356845, + "loss": 5.288186645507812, + "step": 11690 + }, + { + "epoch": 13.310845431255338, + "grad_norm": 7.032965183258057, + "learning_rate": 0.00013665420662007692, + "loss": 5.303681182861328, + "step": 11700 + }, + { + "epoch": 13.3222317107885, + "grad_norm": 8.510554313659668, + "learning_rate": 0.00013660003250446937, + "loss": 5.175273513793945, + "step": 11710 + }, + { + "epoch": 13.333617990321663, + "grad_norm": 7.989207744598389, + "learning_rate": 0.0001365458583888618, + "loss": 4.8652587890625, + "step": 11720 + }, + { + "epoch": 13.345004269854824, + "grad_norm": 8.093276977539062, + "learning_rate": 0.00013649168427325425, + "loss": 5.2994426727294925, + "step": 11730 + }, + { + "epoch": 13.356390549387987, + "grad_norm": 6.790765285491943, + "learning_rate": 0.00013643751015764667, + "loss": 5.194287109375, + "step": 11740 + }, + { + "epoch": 13.36777682892115, + "grad_norm": 7.429324626922607, + "learning_rate": 0.00013638333604203912, + "loss": 5.5465232849121096, + "step": 11750 + }, + { + "epoch": 13.379163108454312, + "grad_norm": 7.065948486328125, + "learning_rate": 0.00013632916192643155, + "loss": 5.191884613037109, + "step": 11760 + }, + { + "epoch": 13.390549387987475, + "grad_norm": 7.3118510246276855, + "learning_rate": 0.000136274987810824, + "loss": 5.531578826904297, + "step": 11770 + }, + { + "epoch": 13.401935667520638, + "grad_norm": 7.449267864227295, + "learning_rate": 0.00013622081369521645, + "loss": 5.3741718292236325, + "step": 11780 + }, + { + "epoch": 13.4133219470538, + "grad_norm": 7.919931411743164, + "learning_rate": 0.00013616663957960888, + "loss": 5.327036666870117, + "step": 11790 + }, + { + "epoch": 13.424708226586963, + "grad_norm": 7.673199653625488, + "learning_rate": 0.00013611246546400133, + "loss": 5.344607925415039, + "step": 11800 + }, + { + "epoch": 13.436094506120126, + "grad_norm": 7.837534427642822, + "learning_rate": 0.00013605829134839375, + "loss": 5.430044174194336, + "step": 11810 + }, + { + "epoch": 13.447480785653287, + "grad_norm": 7.384720325469971, + "learning_rate": 0.00013600411723278618, + "loss": 5.121067810058594, + "step": 11820 + }, + { + "epoch": 13.45886706518645, + "grad_norm": 7.407566070556641, + "learning_rate": 0.0001359499431171786, + "loss": 5.254208755493164, + "step": 11830 + }, + { + "epoch": 13.470253344719612, + "grad_norm": 7.422298431396484, + "learning_rate": 0.00013589576900157105, + "loss": 5.210581207275391, + "step": 11840 + }, + { + "epoch": 13.481639624252775, + "grad_norm": 7.2973737716674805, + "learning_rate": 0.00013584159488596348, + "loss": 5.4987037658691404, + "step": 11850 + }, + { + "epoch": 13.493025903785938, + "grad_norm": 7.023255348205566, + "learning_rate": 0.00013578742077035593, + "loss": 5.042602920532227, + "step": 11860 + }, + { + "epoch": 13.5044121833191, + "grad_norm": 7.912652492523193, + "learning_rate": 0.00013573324665474835, + "loss": 5.281571578979492, + "step": 11870 + }, + { + "epoch": 13.515798462852263, + "grad_norm": 7.739786148071289, + "learning_rate": 0.0001356790725391408, + "loss": 5.43682861328125, + "step": 11880 + }, + { + "epoch": 13.527184742385426, + "grad_norm": 7.652828216552734, + "learning_rate": 0.00013562489842353326, + "loss": 5.211148071289062, + "step": 11890 + }, + { + "epoch": 13.538571021918589, + "grad_norm": 8.158793449401855, + "learning_rate": 0.00013557072430792568, + "loss": 5.4678295135498045, + "step": 11900 + }, + { + "epoch": 13.54995730145175, + "grad_norm": 7.335878372192383, + "learning_rate": 0.00013551655019231813, + "loss": 5.306196975708008, + "step": 11910 + }, + { + "epoch": 13.561343580984913, + "grad_norm": 7.801717758178711, + "learning_rate": 0.00013546237607671056, + "loss": 5.475741195678711, + "step": 11920 + }, + { + "epoch": 13.572729860518075, + "grad_norm": 8.175219535827637, + "learning_rate": 0.000135408201961103, + "loss": 5.457925796508789, + "step": 11930 + }, + { + "epoch": 13.584116140051238, + "grad_norm": 6.884897232055664, + "learning_rate": 0.00013535402784549543, + "loss": 5.128502655029297, + "step": 11940 + }, + { + "epoch": 13.5955024195844, + "grad_norm": 6.6634521484375, + "learning_rate": 0.00013529985372988788, + "loss": 5.319187164306641, + "step": 11950 + }, + { + "epoch": 13.606888699117563, + "grad_norm": 8.205178260803223, + "learning_rate": 0.0001352456796142803, + "loss": 5.198386001586914, + "step": 11960 + }, + { + "epoch": 13.618274978650726, + "grad_norm": 6.997448444366455, + "learning_rate": 0.00013519150549867273, + "loss": 5.356737899780273, + "step": 11970 + }, + { + "epoch": 13.629661258183889, + "grad_norm": 7.976472854614258, + "learning_rate": 0.00013513733138306516, + "loss": 5.224060440063477, + "step": 11980 + }, + { + "epoch": 13.641047537717052, + "grad_norm": 7.6008477210998535, + "learning_rate": 0.0001350831572674576, + "loss": 5.205660629272461, + "step": 11990 + }, + { + "epoch": 13.652433817250213, + "grad_norm": 8.134635925292969, + "learning_rate": 0.00013502898315185003, + "loss": 5.284549331665039, + "step": 12000 + }, + { + "epoch": 13.663820096783375, + "grad_norm": 7.697743892669678, + "learning_rate": 0.00013497480903624249, + "loss": 5.365811157226562, + "step": 12010 + }, + { + "epoch": 13.675206376316538, + "grad_norm": 6.987123966217041, + "learning_rate": 0.00013492063492063494, + "loss": 5.420026779174805, + "step": 12020 + }, + { + "epoch": 13.6865926558497, + "grad_norm": 7.670924663543701, + "learning_rate": 0.00013486646080502736, + "loss": 5.44122543334961, + "step": 12030 + }, + { + "epoch": 13.697978935382864, + "grad_norm": 7.6146955490112305, + "learning_rate": 0.0001348122866894198, + "loss": 5.368587493896484, + "step": 12040 + }, + { + "epoch": 13.709365214916026, + "grad_norm": 7.290314674377441, + "learning_rate": 0.00013475811257381224, + "loss": 4.899549102783203, + "step": 12050 + }, + { + "epoch": 13.720751494449189, + "grad_norm": 7.6227569580078125, + "learning_rate": 0.0001347039384582047, + "loss": 5.226024627685547, + "step": 12060 + }, + { + "epoch": 13.732137773982352, + "grad_norm": 8.122727394104004, + "learning_rate": 0.0001346497643425971, + "loss": 5.342987060546875, + "step": 12070 + }, + { + "epoch": 13.743524053515515, + "grad_norm": 7.191249847412109, + "learning_rate": 0.00013459559022698956, + "loss": 4.999538421630859, + "step": 12080 + }, + { + "epoch": 13.754910333048677, + "grad_norm": 7.330814361572266, + "learning_rate": 0.000134541416111382, + "loss": 5.499901199340821, + "step": 12090 + }, + { + "epoch": 13.766296612581838, + "grad_norm": 7.275107383728027, + "learning_rate": 0.00013448724199577441, + "loss": 5.174909591674805, + "step": 12100 + }, + { + "epoch": 13.777682892115001, + "grad_norm": 7.071986675262451, + "learning_rate": 0.00013443306788016687, + "loss": 5.219038391113282, + "step": 12110 + }, + { + "epoch": 13.789069171648164, + "grad_norm": 6.817813396453857, + "learning_rate": 0.0001343788937645593, + "loss": 5.07507553100586, + "step": 12120 + }, + { + "epoch": 13.800455451181326, + "grad_norm": 7.419076442718506, + "learning_rate": 0.00013432471964895174, + "loss": 5.351544952392578, + "step": 12130 + }, + { + "epoch": 13.81184173071449, + "grad_norm": 7.717519283294678, + "learning_rate": 0.00013427054553334417, + "loss": 5.706594085693359, + "step": 12140 + }, + { + "epoch": 13.823228010247652, + "grad_norm": 7.404618263244629, + "learning_rate": 0.00013421637141773662, + "loss": 5.376433563232422, + "step": 12150 + }, + { + "epoch": 13.834614289780815, + "grad_norm": 7.158414363861084, + "learning_rate": 0.00013416219730212904, + "loss": 5.363436508178711, + "step": 12160 + }, + { + "epoch": 13.846000569313977, + "grad_norm": 7.699437618255615, + "learning_rate": 0.0001341080231865215, + "loss": 5.6122314453125, + "step": 12170 + }, + { + "epoch": 13.857386848847138, + "grad_norm": 7.237250804901123, + "learning_rate": 0.00013405384907091392, + "loss": 5.395546340942383, + "step": 12180 + }, + { + "epoch": 13.868773128380301, + "grad_norm": 7.333144187927246, + "learning_rate": 0.00013399967495530637, + "loss": 5.211500549316407, + "step": 12190 + }, + { + "epoch": 13.880159407913464, + "grad_norm": 6.954226493835449, + "learning_rate": 0.0001339455008396988, + "loss": 5.263496017456054, + "step": 12200 + }, + { + "epoch": 13.891545687446627, + "grad_norm": 7.646538734436035, + "learning_rate": 0.00013389132672409125, + "loss": 5.6256969451904295, + "step": 12210 + }, + { + "epoch": 13.90293196697979, + "grad_norm": 7.714359760284424, + "learning_rate": 0.00013383715260848367, + "loss": 5.669702529907227, + "step": 12220 + }, + { + "epoch": 13.914318246512952, + "grad_norm": 7.628199577331543, + "learning_rate": 0.00013378297849287612, + "loss": 5.399072265625, + "step": 12230 + }, + { + "epoch": 13.925704526046115, + "grad_norm": 8.523447036743164, + "learning_rate": 0.00013372880437726855, + "loss": 5.366733169555664, + "step": 12240 + }, + { + "epoch": 13.937090805579277, + "grad_norm": 7.897779941558838, + "learning_rate": 0.00013367463026166097, + "loss": 5.20464973449707, + "step": 12250 + }, + { + "epoch": 13.94847708511244, + "grad_norm": 7.602762699127197, + "learning_rate": 0.00013362045614605342, + "loss": 5.602539443969727, + "step": 12260 + }, + { + "epoch": 13.959863364645603, + "grad_norm": 7.468645095825195, + "learning_rate": 0.00013356628203044585, + "loss": 5.513700485229492, + "step": 12270 + }, + { + "epoch": 13.971249644178764, + "grad_norm": 7.841520309448242, + "learning_rate": 0.0001335121079148383, + "loss": 5.51887321472168, + "step": 12280 + }, + { + "epoch": 13.982635923711927, + "grad_norm": 7.047943115234375, + "learning_rate": 0.00013345793379923072, + "loss": 5.111991500854492, + "step": 12290 + }, + { + "epoch": 13.99402220324509, + "grad_norm": 6.829543113708496, + "learning_rate": 0.00013340375968362317, + "loss": 5.058186340332031, + "step": 12300 + }, + { + "epoch": 14.004554511813264, + "grad_norm": 7.048587799072266, + "learning_rate": 0.0001333495855680156, + "loss": 4.963529586791992, + "step": 12310 + }, + { + "epoch": 14.015940791346427, + "grad_norm": 6.852132797241211, + "learning_rate": 0.00013329541145240805, + "loss": 4.623985290527344, + "step": 12320 + }, + { + "epoch": 14.02732707087959, + "grad_norm": 7.333976745605469, + "learning_rate": 0.00013324123733680047, + "loss": 5.1697029113769535, + "step": 12330 + }, + { + "epoch": 14.038713350412753, + "grad_norm": 8.014049530029297, + "learning_rate": 0.00013318706322119293, + "loss": 4.859414672851562, + "step": 12340 + }, + { + "epoch": 14.050099629945915, + "grad_norm": 7.355411529541016, + "learning_rate": 0.00013313288910558538, + "loss": 4.995968627929687, + "step": 12350 + }, + { + "epoch": 14.061485909479078, + "grad_norm": 7.705524921417236, + "learning_rate": 0.0001330787149899778, + "loss": 5.119239807128906, + "step": 12360 + }, + { + "epoch": 14.07287218901224, + "grad_norm": 7.468197822570801, + "learning_rate": 0.00013302454087437023, + "loss": 5.107388305664062, + "step": 12370 + }, + { + "epoch": 14.084258468545404, + "grad_norm": 6.574582099914551, + "learning_rate": 0.00013297036675876268, + "loss": 4.7201496124267575, + "step": 12380 + }, + { + "epoch": 14.095644748078564, + "grad_norm": 6.222922325134277, + "learning_rate": 0.0001329161926431551, + "loss": 4.612395477294922, + "step": 12390 + }, + { + "epoch": 14.107031027611727, + "grad_norm": 7.6845011711120605, + "learning_rate": 0.00013286201852754753, + "loss": 5.049299621582032, + "step": 12400 + }, + { + "epoch": 14.11841730714489, + "grad_norm": 7.420602798461914, + "learning_rate": 0.00013280784441193998, + "loss": 4.884346389770508, + "step": 12410 + }, + { + "epoch": 14.129803586678053, + "grad_norm": 7.716628074645996, + "learning_rate": 0.0001327536702963324, + "loss": 4.882117080688476, + "step": 12420 + }, + { + "epoch": 14.141189866211215, + "grad_norm": 7.8805646896362305, + "learning_rate": 0.00013269949618072486, + "loss": 4.686184692382812, + "step": 12430 + }, + { + "epoch": 14.152576145744378, + "grad_norm": 7.1464948654174805, + "learning_rate": 0.00013264532206511728, + "loss": 4.810683822631836, + "step": 12440 + }, + { + "epoch": 14.163962425277541, + "grad_norm": 7.473194599151611, + "learning_rate": 0.00013259114794950973, + "loss": 4.700112533569336, + "step": 12450 + }, + { + "epoch": 14.175348704810704, + "grad_norm": 6.588601589202881, + "learning_rate": 0.00013253697383390218, + "loss": 4.817057037353516, + "step": 12460 + }, + { + "epoch": 14.186734984343866, + "grad_norm": 7.659865856170654, + "learning_rate": 0.0001324827997182946, + "loss": 4.8424022674560545, + "step": 12470 + }, + { + "epoch": 14.198121263877027, + "grad_norm": 7.2345194816589355, + "learning_rate": 0.00013242862560268706, + "loss": 4.720790100097656, + "step": 12480 + }, + { + "epoch": 14.20950754341019, + "grad_norm": 7.407914161682129, + "learning_rate": 0.00013237445148707948, + "loss": 4.918620681762695, + "step": 12490 + }, + { + "epoch": 14.220893822943353, + "grad_norm": 8.086915969848633, + "learning_rate": 0.00013232027737147193, + "loss": 5.120816421508789, + "step": 12500 + }, + { + "epoch": 14.232280102476516, + "grad_norm": 7.739943981170654, + "learning_rate": 0.00013226610325586436, + "loss": 4.804103851318359, + "step": 12510 + }, + { + "epoch": 14.243666382009678, + "grad_norm": 6.809656620025635, + "learning_rate": 0.00013221192914025678, + "loss": 4.88025016784668, + "step": 12520 + }, + { + "epoch": 14.255052661542841, + "grad_norm": 7.113979816436768, + "learning_rate": 0.0001321577550246492, + "loss": 5.060388565063477, + "step": 12530 + }, + { + "epoch": 14.266438941076004, + "grad_norm": 7.600307941436768, + "learning_rate": 0.00013210358090904166, + "loss": 4.81654167175293, + "step": 12540 + }, + { + "epoch": 14.277825220609166, + "grad_norm": 7.201952934265137, + "learning_rate": 0.00013204940679343408, + "loss": 4.638751220703125, + "step": 12550 + }, + { + "epoch": 14.28921150014233, + "grad_norm": 7.577877521514893, + "learning_rate": 0.00013199523267782654, + "loss": 5.024491500854492, + "step": 12560 + }, + { + "epoch": 14.30059777967549, + "grad_norm": 7.727479934692383, + "learning_rate": 0.000131941058562219, + "loss": 5.108154296875, + "step": 12570 + }, + { + "epoch": 14.311984059208653, + "grad_norm": 7.286412239074707, + "learning_rate": 0.0001318868844466114, + "loss": 5.149254608154297, + "step": 12580 + }, + { + "epoch": 14.323370338741816, + "grad_norm": 7.6460723876953125, + "learning_rate": 0.00013183271033100386, + "loss": 4.752541351318359, + "step": 12590 + }, + { + "epoch": 14.334756618274978, + "grad_norm": 7.025350093841553, + "learning_rate": 0.0001317785362153963, + "loss": 4.839694213867188, + "step": 12600 + }, + { + "epoch": 14.346142897808141, + "grad_norm": 7.519998073577881, + "learning_rate": 0.00013172436209978874, + "loss": 4.982858276367187, + "step": 12610 + }, + { + "epoch": 14.357529177341304, + "grad_norm": 7.484885215759277, + "learning_rate": 0.00013167018798418116, + "loss": 4.672926330566407, + "step": 12620 + }, + { + "epoch": 14.368915456874467, + "grad_norm": 8.382508277893066, + "learning_rate": 0.00013161601386857362, + "loss": 5.045550537109375, + "step": 12630 + }, + { + "epoch": 14.38030173640763, + "grad_norm": 7.656764030456543, + "learning_rate": 0.00013156183975296604, + "loss": 5.17280158996582, + "step": 12640 + }, + { + "epoch": 14.391688015940792, + "grad_norm": 7.431979179382324, + "learning_rate": 0.0001315076656373585, + "loss": 4.8266761779785154, + "step": 12650 + }, + { + "epoch": 14.403074295473953, + "grad_norm": 7.853142738342285, + "learning_rate": 0.00013145349152175092, + "loss": 4.80140266418457, + "step": 12660 + }, + { + "epoch": 14.414460575007116, + "grad_norm": 7.3754987716674805, + "learning_rate": 0.00013139931740614334, + "loss": 4.92738037109375, + "step": 12670 + }, + { + "epoch": 14.425846854540278, + "grad_norm": 6.4924116134643555, + "learning_rate": 0.0001313451432905358, + "loss": 4.55872802734375, + "step": 12680 + }, + { + "epoch": 14.437233134073441, + "grad_norm": 8.214619636535645, + "learning_rate": 0.00013129096917492822, + "loss": 4.9627235412597654, + "step": 12690 + }, + { + "epoch": 14.448619413606604, + "grad_norm": 7.3021559715271, + "learning_rate": 0.00013123679505932067, + "loss": 5.223368072509766, + "step": 12700 + }, + { + "epoch": 14.460005693139767, + "grad_norm": 7.899365425109863, + "learning_rate": 0.0001311826209437131, + "loss": 5.1207012176513675, + "step": 12710 + }, + { + "epoch": 14.47139197267293, + "grad_norm": 7.248983860015869, + "learning_rate": 0.00013112844682810554, + "loss": 4.907715606689453, + "step": 12720 + }, + { + "epoch": 14.482778252206092, + "grad_norm": 7.069028377532959, + "learning_rate": 0.00013107427271249797, + "loss": 4.999393463134766, + "step": 12730 + }, + { + "epoch": 14.494164531739255, + "grad_norm": 7.262919902801514, + "learning_rate": 0.00013102009859689042, + "loss": 4.917198562622071, + "step": 12740 + }, + { + "epoch": 14.505550811272418, + "grad_norm": 7.535858154296875, + "learning_rate": 0.00013096592448128284, + "loss": 4.810692596435547, + "step": 12750 + }, + { + "epoch": 14.516937090805579, + "grad_norm": 7.142795562744141, + "learning_rate": 0.0001309117503656753, + "loss": 5.230551910400391, + "step": 12760 + }, + { + "epoch": 14.528323370338741, + "grad_norm": 7.323062419891357, + "learning_rate": 0.00013085757625006772, + "loss": 5.168463134765625, + "step": 12770 + }, + { + "epoch": 14.539709649871904, + "grad_norm": 8.29926872253418, + "learning_rate": 0.00013080340213446017, + "loss": 5.1340171813964846, + "step": 12780 + }, + { + "epoch": 14.551095929405067, + "grad_norm": 8.317549705505371, + "learning_rate": 0.0001307492280188526, + "loss": 4.967045593261719, + "step": 12790 + }, + { + "epoch": 14.56248220893823, + "grad_norm": 7.742959499359131, + "learning_rate": 0.00013069505390324505, + "loss": 4.707772827148437, + "step": 12800 + }, + { + "epoch": 14.573868488471392, + "grad_norm": 7.057898044586182, + "learning_rate": 0.00013064087978763747, + "loss": 4.755387496948242, + "step": 12810 + }, + { + "epoch": 14.585254768004555, + "grad_norm": 7.152231216430664, + "learning_rate": 0.0001305867056720299, + "loss": 5.241105270385742, + "step": 12820 + }, + { + "epoch": 14.596641047537718, + "grad_norm": 8.085304260253906, + "learning_rate": 0.00013053253155642235, + "loss": 4.788356781005859, + "step": 12830 + }, + { + "epoch": 14.608027327070879, + "grad_norm": 7.510035037994385, + "learning_rate": 0.00013047835744081477, + "loss": 5.091434478759766, + "step": 12840 + }, + { + "epoch": 14.619413606604041, + "grad_norm": 7.749240875244141, + "learning_rate": 0.00013042418332520723, + "loss": 4.968235778808594, + "step": 12850 + }, + { + "epoch": 14.630799886137204, + "grad_norm": 6.937013626098633, + "learning_rate": 0.00013037000920959965, + "loss": 5.242551040649414, + "step": 12860 + }, + { + "epoch": 14.642186165670367, + "grad_norm": 7.75348424911499, + "learning_rate": 0.0001303158350939921, + "loss": 4.967795944213867, + "step": 12870 + }, + { + "epoch": 14.65357244520353, + "grad_norm": 8.130202293395996, + "learning_rate": 0.00013026166097838453, + "loss": 5.108961868286133, + "step": 12880 + }, + { + "epoch": 14.664958724736692, + "grad_norm": 6.920219898223877, + "learning_rate": 0.00013020748686277698, + "loss": 4.6893165588378904, + "step": 12890 + }, + { + "epoch": 14.676345004269855, + "grad_norm": 7.73007869720459, + "learning_rate": 0.0001301533127471694, + "loss": 5.106839370727539, + "step": 12900 + }, + { + "epoch": 14.687731283803018, + "grad_norm": 7.3374924659729, + "learning_rate": 0.00013009913863156185, + "loss": 5.362062072753906, + "step": 12910 + }, + { + "epoch": 14.69911756333618, + "grad_norm": 7.493285655975342, + "learning_rate": 0.0001300449645159543, + "loss": 5.187427139282226, + "step": 12920 + }, + { + "epoch": 14.710503842869343, + "grad_norm": 7.747722625732422, + "learning_rate": 0.00012999079040034673, + "loss": 4.937312316894531, + "step": 12930 + }, + { + "epoch": 14.721890122402504, + "grad_norm": 7.39769983291626, + "learning_rate": 0.00012993661628473915, + "loss": 5.141265869140625, + "step": 12940 + }, + { + "epoch": 14.733276401935667, + "grad_norm": 6.812027931213379, + "learning_rate": 0.00012988244216913158, + "loss": 5.5300537109375, + "step": 12950 + }, + { + "epoch": 14.74466268146883, + "grad_norm": 7.199182987213135, + "learning_rate": 0.00012982826805352403, + "loss": 4.976782608032226, + "step": 12960 + }, + { + "epoch": 14.756048961001992, + "grad_norm": 7.161105155944824, + "learning_rate": 0.00012977409393791645, + "loss": 4.928173828125, + "step": 12970 + }, + { + "epoch": 14.767435240535155, + "grad_norm": 8.261829376220703, + "learning_rate": 0.0001297199198223089, + "loss": 4.858361053466797, + "step": 12980 + }, + { + "epoch": 14.778821520068318, + "grad_norm": 6.941257953643799, + "learning_rate": 0.00012966574570670133, + "loss": 4.880601119995117, + "step": 12990 + }, + { + "epoch": 14.79020779960148, + "grad_norm": 7.993049144744873, + "learning_rate": 0.00012961157159109378, + "loss": 5.036775970458985, + "step": 13000 + }, + { + "epoch": 14.801594079134643, + "grad_norm": 8.901403427124023, + "learning_rate": 0.0001295573974754862, + "loss": 5.106240844726562, + "step": 13010 + }, + { + "epoch": 14.812980358667804, + "grad_norm": 8.168084144592285, + "learning_rate": 0.00012950322335987866, + "loss": 5.213864135742187, + "step": 13020 + }, + { + "epoch": 14.824366638200967, + "grad_norm": 7.145936965942383, + "learning_rate": 0.0001294490492442711, + "loss": 5.022368621826172, + "step": 13030 + }, + { + "epoch": 14.83575291773413, + "grad_norm": 8.161090850830078, + "learning_rate": 0.00012939487512866353, + "loss": 5.109722518920899, + "step": 13040 + }, + { + "epoch": 14.847139197267293, + "grad_norm": 7.26900053024292, + "learning_rate": 0.00012934070101305599, + "loss": 5.046876907348633, + "step": 13050 + }, + { + "epoch": 14.858525476800455, + "grad_norm": 7.036923408508301, + "learning_rate": 0.0001292865268974484, + "loss": 4.823116683959961, + "step": 13060 + }, + { + "epoch": 14.869911756333618, + "grad_norm": 7.964908599853516, + "learning_rate": 0.00012923235278184086, + "loss": 5.159635925292969, + "step": 13070 + }, + { + "epoch": 14.88129803586678, + "grad_norm": 7.919378757476807, + "learning_rate": 0.00012917817866623329, + "loss": 4.9494384765625, + "step": 13080 + }, + { + "epoch": 14.892684315399944, + "grad_norm": 7.289670944213867, + "learning_rate": 0.0001291240045506257, + "loss": 5.084150695800782, + "step": 13090 + }, + { + "epoch": 14.904070594933106, + "grad_norm": 7.601792335510254, + "learning_rate": 0.00012906983043501813, + "loss": 4.942096328735351, + "step": 13100 + }, + { + "epoch": 14.915456874466269, + "grad_norm": 8.224839210510254, + "learning_rate": 0.0001290156563194106, + "loss": 4.804985046386719, + "step": 13110 + }, + { + "epoch": 14.92684315399943, + "grad_norm": 7.499617099761963, + "learning_rate": 0.000128961482203803, + "loss": 5.03905029296875, + "step": 13120 + }, + { + "epoch": 14.938229433532593, + "grad_norm": 8.021928787231445, + "learning_rate": 0.00012890730808819546, + "loss": 5.192089080810547, + "step": 13130 + }, + { + "epoch": 14.949615713065755, + "grad_norm": 7.354033470153809, + "learning_rate": 0.00012885313397258791, + "loss": 5.053718948364258, + "step": 13140 + }, + { + "epoch": 14.961001992598918, + "grad_norm": 7.821739673614502, + "learning_rate": 0.00012879895985698034, + "loss": 5.054132080078125, + "step": 13150 + }, + { + "epoch": 14.972388272132081, + "grad_norm": 7.435361862182617, + "learning_rate": 0.0001287447857413728, + "loss": 5.183145904541016, + "step": 13160 + }, + { + "epoch": 14.983774551665244, + "grad_norm": 7.934272289276123, + "learning_rate": 0.00012869061162576521, + "loss": 4.926422882080078, + "step": 13170 + }, + { + "epoch": 14.995160831198406, + "grad_norm": 8.532721519470215, + "learning_rate": 0.00012863643751015767, + "loss": 5.203876113891601, + "step": 13180 + }, + { + "epoch": 15.005693139766581, + "grad_norm": 7.0397629737854, + "learning_rate": 0.0001285822633945501, + "loss": 4.161993408203125, + "step": 13190 + }, + { + "epoch": 15.017079419299744, + "grad_norm": 8.11047649383545, + "learning_rate": 0.00012852808927894254, + "loss": 4.52406120300293, + "step": 13200 + }, + { + "epoch": 15.028465698832907, + "grad_norm": 7.387153148651123, + "learning_rate": 0.00012847391516333497, + "loss": 4.291901397705078, + "step": 13210 + }, + { + "epoch": 15.03985197836607, + "grad_norm": 7.610694885253906, + "learning_rate": 0.0001284197410477274, + "loss": 4.618151473999023, + "step": 13220 + }, + { + "epoch": 15.051238257899232, + "grad_norm": 8.041857719421387, + "learning_rate": 0.00012836556693211984, + "loss": 4.415428543090821, + "step": 13230 + }, + { + "epoch": 15.062624537432393, + "grad_norm": 8.153491020202637, + "learning_rate": 0.00012831139281651227, + "loss": 4.578215026855469, + "step": 13240 + }, + { + "epoch": 15.074010816965556, + "grad_norm": 7.969481468200684, + "learning_rate": 0.00012825721870090472, + "loss": 4.715361785888672, + "step": 13250 + }, + { + "epoch": 15.085397096498719, + "grad_norm": 6.474404811859131, + "learning_rate": 0.00012820304458529714, + "loss": 4.324704360961914, + "step": 13260 + }, + { + "epoch": 15.096783376031881, + "grad_norm": 7.767828464508057, + "learning_rate": 0.0001281488704696896, + "loss": 5.102642440795899, + "step": 13270 + }, + { + "epoch": 15.108169655565044, + "grad_norm": 6.66921854019165, + "learning_rate": 0.00012809469635408202, + "loss": 4.494071960449219, + "step": 13280 + }, + { + "epoch": 15.119555935098207, + "grad_norm": 7.946902275085449, + "learning_rate": 0.00012804052223847447, + "loss": 4.48153076171875, + "step": 13290 + }, + { + "epoch": 15.13094221463137, + "grad_norm": 7.2356486320495605, + "learning_rate": 0.0001279863481228669, + "loss": 4.334786224365234, + "step": 13300 + }, + { + "epoch": 15.142328494164532, + "grad_norm": 8.00400161743164, + "learning_rate": 0.00012793217400725935, + "loss": 4.645303344726562, + "step": 13310 + }, + { + "epoch": 15.153714773697695, + "grad_norm": 7.499370098114014, + "learning_rate": 0.00012787799989165177, + "loss": 4.6021240234375, + "step": 13320 + }, + { + "epoch": 15.165101053230856, + "grad_norm": 7.6751580238342285, + "learning_rate": 0.00012782382577604422, + "loss": 4.559888076782227, + "step": 13330 + }, + { + "epoch": 15.176487332764019, + "grad_norm": 7.785442352294922, + "learning_rate": 0.00012776965166043665, + "loss": 4.634941864013672, + "step": 13340 + }, + { + "epoch": 15.187873612297182, + "grad_norm": 7.800275802612305, + "learning_rate": 0.0001277154775448291, + "loss": 4.987696838378906, + "step": 13350 + }, + { + "epoch": 15.199259891830344, + "grad_norm": 7.645476341247559, + "learning_rate": 0.00012766130342922152, + "loss": 4.864948272705078, + "step": 13360 + }, + { + "epoch": 15.210646171363507, + "grad_norm": 7.993826866149902, + "learning_rate": 0.00012760712931361395, + "loss": 4.6918384552001955, + "step": 13370 + }, + { + "epoch": 15.22203245089667, + "grad_norm": 7.791032791137695, + "learning_rate": 0.0001275529551980064, + "loss": 4.629275131225586, + "step": 13380 + }, + { + "epoch": 15.233418730429833, + "grad_norm": 7.296651363372803, + "learning_rate": 0.00012749878108239882, + "loss": 4.688332748413086, + "step": 13390 + }, + { + "epoch": 15.244805009962995, + "grad_norm": 6.546830654144287, + "learning_rate": 0.00012744460696679128, + "loss": 4.458778381347656, + "step": 13400 + }, + { + "epoch": 15.256191289496158, + "grad_norm": 6.901986598968506, + "learning_rate": 0.0001273904328511837, + "loss": 4.439383316040039, + "step": 13410 + }, + { + "epoch": 15.267577569029319, + "grad_norm": 7.9615559577941895, + "learning_rate": 0.00012733625873557615, + "loss": 4.456288528442383, + "step": 13420 + }, + { + "epoch": 15.278963848562482, + "grad_norm": 7.0012593269348145, + "learning_rate": 0.00012728208461996858, + "loss": 4.7562213897705075, + "step": 13430 + }, + { + "epoch": 15.290350128095644, + "grad_norm": 8.112042427062988, + "learning_rate": 0.00012722791050436103, + "loss": 4.588046646118164, + "step": 13440 + }, + { + "epoch": 15.301736407628807, + "grad_norm": 8.545401573181152, + "learning_rate": 0.00012717373638875345, + "loss": 4.876891708374023, + "step": 13450 + }, + { + "epoch": 15.31312268716197, + "grad_norm": 7.401144981384277, + "learning_rate": 0.0001271195622731459, + "loss": 4.7802989959716795, + "step": 13460 + }, + { + "epoch": 15.324508966695133, + "grad_norm": 9.017389297485352, + "learning_rate": 0.00012706538815753836, + "loss": 4.951491165161133, + "step": 13470 + }, + { + "epoch": 15.335895246228295, + "grad_norm": 7.437997817993164, + "learning_rate": 0.00012701121404193078, + "loss": 4.486794662475586, + "step": 13480 + }, + { + "epoch": 15.347281525761458, + "grad_norm": 7.787178039550781, + "learning_rate": 0.00012695703992632323, + "loss": 4.632095718383789, + "step": 13490 + }, + { + "epoch": 15.35866780529462, + "grad_norm": 7.672351360321045, + "learning_rate": 0.00012690286581071566, + "loss": 4.595958709716797, + "step": 13500 + }, + { + "epoch": 15.370054084827782, + "grad_norm": 9.20431900024414, + "learning_rate": 0.00012684869169510808, + "loss": 4.731208038330078, + "step": 13510 + }, + { + "epoch": 15.381440364360945, + "grad_norm": 7.591139793395996, + "learning_rate": 0.0001267945175795005, + "loss": 4.76176643371582, + "step": 13520 + }, + { + "epoch": 15.392826643894107, + "grad_norm": 7.419577121734619, + "learning_rate": 0.00012674034346389296, + "loss": 4.38856201171875, + "step": 13530 + }, + { + "epoch": 15.40421292342727, + "grad_norm": 7.263993740081787, + "learning_rate": 0.00012668616934828538, + "loss": 4.695225524902344, + "step": 13540 + }, + { + "epoch": 15.415599202960433, + "grad_norm": 7.4275126457214355, + "learning_rate": 0.00012663199523267783, + "loss": 4.705525970458984, + "step": 13550 + }, + { + "epoch": 15.426985482493595, + "grad_norm": 8.560776710510254, + "learning_rate": 0.00012657782111707026, + "loss": 4.849109268188476, + "step": 13560 + }, + { + "epoch": 15.438371762026758, + "grad_norm": 8.176137924194336, + "learning_rate": 0.0001265236470014627, + "loss": 4.634634780883789, + "step": 13570 + }, + { + "epoch": 15.449758041559921, + "grad_norm": 7.90976095199585, + "learning_rate": 0.00012646947288585513, + "loss": 4.362587356567383, + "step": 13580 + }, + { + "epoch": 15.461144321093084, + "grad_norm": 8.50546646118164, + "learning_rate": 0.00012641529877024758, + "loss": 4.5653434753417965, + "step": 13590 + }, + { + "epoch": 15.472530600626245, + "grad_norm": 7.390838146209717, + "learning_rate": 0.00012636112465464004, + "loss": 4.620360565185547, + "step": 13600 + }, + { + "epoch": 15.483916880159407, + "grad_norm": 7.908941745758057, + "learning_rate": 0.00012630695053903246, + "loss": 4.444548416137695, + "step": 13610 + }, + { + "epoch": 15.49530315969257, + "grad_norm": 7.579942226409912, + "learning_rate": 0.0001262527764234249, + "loss": 4.685836410522461, + "step": 13620 + }, + { + "epoch": 15.506689439225733, + "grad_norm": 7.692448616027832, + "learning_rate": 0.00012619860230781734, + "loss": 4.817267227172851, + "step": 13630 + }, + { + "epoch": 15.518075718758896, + "grad_norm": 7.596404552459717, + "learning_rate": 0.00012614442819220976, + "loss": 4.794780349731445, + "step": 13640 + }, + { + "epoch": 15.529461998292058, + "grad_norm": 8.101449966430664, + "learning_rate": 0.00012609025407660219, + "loss": 4.803411483764648, + "step": 13650 + }, + { + "epoch": 15.540848277825221, + "grad_norm": 7.359433650970459, + "learning_rate": 0.00012603607996099464, + "loss": 4.751076889038086, + "step": 13660 + }, + { + "epoch": 15.552234557358384, + "grad_norm": 6.875908374786377, + "learning_rate": 0.00012598190584538706, + "loss": 4.687049484252929, + "step": 13670 + }, + { + "epoch": 15.563620836891547, + "grad_norm": 7.818774700164795, + "learning_rate": 0.0001259277317297795, + "loss": 4.953191757202148, + "step": 13680 + }, + { + "epoch": 15.575007116424707, + "grad_norm": 7.3973917961120605, + "learning_rate": 0.00012587355761417194, + "loss": 4.424762725830078, + "step": 13690 + }, + { + "epoch": 15.58639339595787, + "grad_norm": 8.235298156738281, + "learning_rate": 0.0001258193834985644, + "loss": 5.169184875488281, + "step": 13700 + }, + { + "epoch": 15.597779675491033, + "grad_norm": 7.2172532081604, + "learning_rate": 0.00012576520938295684, + "loss": 4.731855392456055, + "step": 13710 + }, + { + "epoch": 15.609165955024196, + "grad_norm": 7.717671871185303, + "learning_rate": 0.00012571103526734927, + "loss": 4.4836174011230465, + "step": 13720 + }, + { + "epoch": 15.620552234557358, + "grad_norm": 8.239593505859375, + "learning_rate": 0.00012565686115174172, + "loss": 4.932541275024414, + "step": 13730 + }, + { + "epoch": 15.631938514090521, + "grad_norm": 7.339993476867676, + "learning_rate": 0.00012560268703613414, + "loss": 4.306424331665039, + "step": 13740 + }, + { + "epoch": 15.643324793623684, + "grad_norm": 7.40126371383667, + "learning_rate": 0.0001255485129205266, + "loss": 4.77293815612793, + "step": 13750 + }, + { + "epoch": 15.654711073156847, + "grad_norm": 6.878884792327881, + "learning_rate": 0.00012549433880491902, + "loss": 4.673749923706055, + "step": 13760 + }, + { + "epoch": 15.66609735269001, + "grad_norm": 8.765450477600098, + "learning_rate": 0.00012544016468931147, + "loss": 4.610214614868164, + "step": 13770 + }, + { + "epoch": 15.67748363222317, + "grad_norm": 8.095568656921387, + "learning_rate": 0.0001253859905737039, + "loss": 4.4134258270263675, + "step": 13780 + }, + { + "epoch": 15.688869911756333, + "grad_norm": 8.046052932739258, + "learning_rate": 0.00012533181645809632, + "loss": 4.598631286621094, + "step": 13790 + }, + { + "epoch": 15.700256191289496, + "grad_norm": 7.999307155609131, + "learning_rate": 0.00012527764234248874, + "loss": 4.934955596923828, + "step": 13800 + }, + { + "epoch": 15.711642470822659, + "grad_norm": 6.909728050231934, + "learning_rate": 0.0001252234682268812, + "loss": 4.704043960571289, + "step": 13810 + }, + { + "epoch": 15.723028750355821, + "grad_norm": 7.83714485168457, + "learning_rate": 0.00012516929411127365, + "loss": 4.726215362548828, + "step": 13820 + }, + { + "epoch": 15.734415029888984, + "grad_norm": 7.958460330963135, + "learning_rate": 0.00012511511999566607, + "loss": 4.580567932128906, + "step": 13830 + }, + { + "epoch": 15.745801309422147, + "grad_norm": 8.28144359588623, + "learning_rate": 0.00012506094588005852, + "loss": 4.877122116088867, + "step": 13840 + }, + { + "epoch": 15.75718758895531, + "grad_norm": 7.191257953643799, + "learning_rate": 0.00012500677176445095, + "loss": 4.728401184082031, + "step": 13850 + }, + { + "epoch": 15.768573868488472, + "grad_norm": 8.373869895935059, + "learning_rate": 0.0001249525976488434, + "loss": 4.692030715942383, + "step": 13860 + }, + { + "epoch": 15.779960148021633, + "grad_norm": 7.541042804718018, + "learning_rate": 0.00012489842353323582, + "loss": 4.673214721679687, + "step": 13870 + }, + { + "epoch": 15.791346427554796, + "grad_norm": 7.176820278167725, + "learning_rate": 0.00012484424941762827, + "loss": 4.742586135864258, + "step": 13880 + }, + { + "epoch": 15.802732707087959, + "grad_norm": 7.285589694976807, + "learning_rate": 0.0001247900753020207, + "loss": 4.660622406005859, + "step": 13890 + }, + { + "epoch": 15.814118986621121, + "grad_norm": 7.229033946990967, + "learning_rate": 0.0001247413185979739, + "loss": 4.864823532104492, + "step": 13900 + }, + { + "epoch": 15.825505266154284, + "grad_norm": 8.045304298400879, + "learning_rate": 0.00012468714448236635, + "loss": 4.963446807861328, + "step": 13910 + }, + { + "epoch": 15.836891545687447, + "grad_norm": 8.248867988586426, + "learning_rate": 0.00012463297036675877, + "loss": 4.45933837890625, + "step": 13920 + }, + { + "epoch": 15.84827782522061, + "grad_norm": 7.996612071990967, + "learning_rate": 0.00012457879625115122, + "loss": 4.713712310791015, + "step": 13930 + }, + { + "epoch": 15.859664104753772, + "grad_norm": 7.0257568359375, + "learning_rate": 0.00012452462213554365, + "loss": 4.95684814453125, + "step": 13940 + }, + { + "epoch": 15.871050384286935, + "grad_norm": 7.490429401397705, + "learning_rate": 0.00012447044801993607, + "loss": 5.097381210327148, + "step": 13950 + }, + { + "epoch": 15.882436663820096, + "grad_norm": 8.141972541809082, + "learning_rate": 0.00012441627390432852, + "loss": 4.907619476318359, + "step": 13960 + }, + { + "epoch": 15.893822943353259, + "grad_norm": 7.669741153717041, + "learning_rate": 0.00012436209978872095, + "loss": 4.52638053894043, + "step": 13970 + }, + { + "epoch": 15.905209222886421, + "grad_norm": 7.623498439788818, + "learning_rate": 0.0001243079256731134, + "loss": 4.6605175018310545, + "step": 13980 + }, + { + "epoch": 15.916595502419584, + "grad_norm": 7.301693916320801, + "learning_rate": 0.00012425375155750582, + "loss": 4.762905883789062, + "step": 13990 + }, + { + "epoch": 15.927981781952747, + "grad_norm": 7.472468376159668, + "learning_rate": 0.00012419957744189827, + "loss": 4.756846237182617, + "step": 14000 + }, + { + "epoch": 15.93936806148591, + "grad_norm": 7.766934871673584, + "learning_rate": 0.0001241454033262907, + "loss": 4.748290252685547, + "step": 14010 + }, + { + "epoch": 15.950754341019072, + "grad_norm": 6.7247796058654785, + "learning_rate": 0.00012409122921068315, + "loss": 4.70723876953125, + "step": 14020 + }, + { + "epoch": 15.962140620552235, + "grad_norm": 8.007941246032715, + "learning_rate": 0.00012403705509507557, + "loss": 4.603389739990234, + "step": 14030 + }, + { + "epoch": 15.973526900085398, + "grad_norm": 7.72274923324585, + "learning_rate": 0.00012398288097946803, + "loss": 4.74395751953125, + "step": 14040 + }, + { + "epoch": 15.984913179618559, + "grad_norm": 7.55423641204834, + "learning_rate": 0.00012392870686386045, + "loss": 4.812894439697265, + "step": 14050 + }, + { + "epoch": 15.996299459151722, + "grad_norm": 7.276206016540527, + "learning_rate": 0.0001238745327482529, + "loss": 4.808458709716797, + "step": 14060 + }, + { + "epoch": 16.0068317677199, + "grad_norm": 7.715972900390625, + "learning_rate": 0.00012382035863264533, + "loss": 4.121777725219727, + "step": 14070 + }, + { + "epoch": 16.01821804725306, + "grad_norm": 7.861795425415039, + "learning_rate": 0.00012376618451703775, + "loss": 4.087036895751953, + "step": 14080 + }, + { + "epoch": 16.029604326786224, + "grad_norm": 7.377951622009277, + "learning_rate": 0.0001237120104014302, + "loss": 4.159128570556641, + "step": 14090 + }, + { + "epoch": 16.040990606319387, + "grad_norm": 6.9755730628967285, + "learning_rate": 0.00012365783628582263, + "loss": 4.118164443969727, + "step": 14100 + }, + { + "epoch": 16.05237688585255, + "grad_norm": 7.6896467208862305, + "learning_rate": 0.00012360366217021508, + "loss": 4.3465576171875, + "step": 14110 + }, + { + "epoch": 16.06376316538571, + "grad_norm": 7.475165367126465, + "learning_rate": 0.0001235494880546075, + "loss": 4.084910202026367, + "step": 14120 + }, + { + "epoch": 16.07514944491887, + "grad_norm": 7.571929931640625, + "learning_rate": 0.00012349531393899995, + "loss": 4.203293228149414, + "step": 14130 + }, + { + "epoch": 16.086535724452034, + "grad_norm": 7.941225051879883, + "learning_rate": 0.00012344113982339238, + "loss": 4.56109733581543, + "step": 14140 + }, + { + "epoch": 16.097922003985197, + "grad_norm": 7.033837795257568, + "learning_rate": 0.00012338696570778483, + "loss": 4.267443466186523, + "step": 14150 + }, + { + "epoch": 16.10930828351836, + "grad_norm": 7.488673210144043, + "learning_rate": 0.00012333279159217726, + "loss": 4.451668167114258, + "step": 14160 + }, + { + "epoch": 16.120694563051522, + "grad_norm": 7.623510837554932, + "learning_rate": 0.0001232786174765697, + "loss": 4.3589630126953125, + "step": 14170 + }, + { + "epoch": 16.132080842584685, + "grad_norm": 7.737654685974121, + "learning_rate": 0.00012322444336096213, + "loss": 4.383876037597656, + "step": 14180 + }, + { + "epoch": 16.143467122117848, + "grad_norm": 8.437525749206543, + "learning_rate": 0.00012317026924535458, + "loss": 4.350270843505859, + "step": 14190 + }, + { + "epoch": 16.15485340165101, + "grad_norm": 7.971991539001465, + "learning_rate": 0.00012311609512974703, + "loss": 4.457265472412109, + "step": 14200 + }, + { + "epoch": 16.166239681184173, + "grad_norm": 7.322634220123291, + "learning_rate": 0.00012306192101413946, + "loss": 4.124166488647461, + "step": 14210 + }, + { + "epoch": 16.177625960717336, + "grad_norm": 6.804745674133301, + "learning_rate": 0.00012300774689853188, + "loss": 4.338391876220703, + "step": 14220 + }, + { + "epoch": 16.1890122402505, + "grad_norm": 7.241665840148926, + "learning_rate": 0.0001229535727829243, + "loss": 3.9033172607421873, + "step": 14230 + }, + { + "epoch": 16.20039851978366, + "grad_norm": 7.835842609405518, + "learning_rate": 0.00012289939866731676, + "loss": 4.350611877441406, + "step": 14240 + }, + { + "epoch": 16.211784799316824, + "grad_norm": 7.062921047210693, + "learning_rate": 0.00012284522455170918, + "loss": 4.4512939453125, + "step": 14250 + }, + { + "epoch": 16.223171078849987, + "grad_norm": 7.936232566833496, + "learning_rate": 0.00012279105043610164, + "loss": 4.246225738525391, + "step": 14260 + }, + { + "epoch": 16.23455735838315, + "grad_norm": 8.720023155212402, + "learning_rate": 0.00012273687632049406, + "loss": 4.419322967529297, + "step": 14270 + }, + { + "epoch": 16.245943637916312, + "grad_norm": 7.396350383758545, + "learning_rate": 0.0001226827022048865, + "loss": 4.686096954345703, + "step": 14280 + }, + { + "epoch": 16.257329917449475, + "grad_norm": 7.3385515213012695, + "learning_rate": 0.00012262852808927894, + "loss": 4.519886016845703, + "step": 14290 + }, + { + "epoch": 16.268716196982638, + "grad_norm": 7.164921283721924, + "learning_rate": 0.0001225743539736714, + "loss": 4.170539474487304, + "step": 14300 + }, + { + "epoch": 16.280102476515797, + "grad_norm": 6.846629619598389, + "learning_rate": 0.00012252017985806384, + "loss": 4.29273796081543, + "step": 14310 + }, + { + "epoch": 16.29148875604896, + "grad_norm": 8.011646270751953, + "learning_rate": 0.00012246600574245626, + "loss": 4.424296951293945, + "step": 14320 + }, + { + "epoch": 16.302875035582122, + "grad_norm": 7.801056861877441, + "learning_rate": 0.00012241183162684872, + "loss": 4.3867851257324215, + "step": 14330 + }, + { + "epoch": 16.314261315115285, + "grad_norm": 7.563156604766846, + "learning_rate": 0.00012235765751124114, + "loss": 4.142354583740234, + "step": 14340 + }, + { + "epoch": 16.325647594648448, + "grad_norm": 7.874721527099609, + "learning_rate": 0.0001223034833956336, + "loss": 4.672331619262695, + "step": 14350 + }, + { + "epoch": 16.33703387418161, + "grad_norm": 8.173314094543457, + "learning_rate": 0.00012224930928002602, + "loss": 4.456505584716797, + "step": 14360 + }, + { + "epoch": 16.348420153714773, + "grad_norm": 7.8878092765808105, + "learning_rate": 0.00012219513516441844, + "loss": 4.445765686035156, + "step": 14370 + }, + { + "epoch": 16.359806433247936, + "grad_norm": 8.385254859924316, + "learning_rate": 0.00012214096104881086, + "loss": 4.569265747070313, + "step": 14380 + }, + { + "epoch": 16.3711927127811, + "grad_norm": 8.698989868164062, + "learning_rate": 0.00012208678693320332, + "loss": 4.381620788574219, + "step": 14390 + }, + { + "epoch": 16.38257899231426, + "grad_norm": 6.675107479095459, + "learning_rate": 0.00012203261281759575, + "loss": 4.386390686035156, + "step": 14400 + }, + { + "epoch": 16.393965271847424, + "grad_norm": 7.738358020782471, + "learning_rate": 0.00012197843870198819, + "loss": 3.9419265747070313, + "step": 14410 + }, + { + "epoch": 16.405351551380587, + "grad_norm": 7.995990753173828, + "learning_rate": 0.00012192426458638064, + "loss": 4.322775268554688, + "step": 14420 + }, + { + "epoch": 16.41673783091375, + "grad_norm": 7.4500813484191895, + "learning_rate": 0.00012187009047077307, + "loss": 4.260313415527344, + "step": 14430 + }, + { + "epoch": 16.428124110446912, + "grad_norm": 8.59277057647705, + "learning_rate": 0.00012181591635516552, + "loss": 4.660084533691406, + "step": 14440 + }, + { + "epoch": 16.439510389980075, + "grad_norm": 7.587380409240723, + "learning_rate": 0.00012176174223955794, + "loss": 4.50414924621582, + "step": 14450 + }, + { + "epoch": 16.450896669513238, + "grad_norm": 7.656953811645508, + "learning_rate": 0.00012170756812395038, + "loss": 4.5958251953125, + "step": 14460 + }, + { + "epoch": 16.4622829490464, + "grad_norm": 7.589241981506348, + "learning_rate": 0.00012165339400834281, + "loss": 4.327259063720703, + "step": 14470 + }, + { + "epoch": 16.47366922857956, + "grad_norm": 7.885225296020508, + "learning_rate": 0.00012159921989273526, + "loss": 4.439897537231445, + "step": 14480 + }, + { + "epoch": 16.485055508112723, + "grad_norm": 7.397141456604004, + "learning_rate": 0.00012154504577712768, + "loss": 4.272943115234375, + "step": 14490 + }, + { + "epoch": 16.496441787645885, + "grad_norm": 6.983608245849609, + "learning_rate": 0.00012149087166152013, + "loss": 4.209513092041016, + "step": 14500 + }, + { + "epoch": 16.507828067179048, + "grad_norm": 8.37636947631836, + "learning_rate": 0.00012143669754591256, + "loss": 4.410722732543945, + "step": 14510 + }, + { + "epoch": 16.51921434671221, + "grad_norm": 8.623056411743164, + "learning_rate": 0.00012138252343030501, + "loss": 4.736907577514648, + "step": 14520 + }, + { + "epoch": 16.530600626245374, + "grad_norm": 7.334944725036621, + "learning_rate": 0.00012132834931469745, + "loss": 4.085528945922851, + "step": 14530 + }, + { + "epoch": 16.541986905778536, + "grad_norm": 7.641763210296631, + "learning_rate": 0.00012127417519908987, + "loss": 4.196076202392578, + "step": 14540 + }, + { + "epoch": 16.5533731853117, + "grad_norm": 7.118811130523682, + "learning_rate": 0.00012122000108348232, + "loss": 4.219401931762695, + "step": 14550 + }, + { + "epoch": 16.56475946484486, + "grad_norm": 7.666287422180176, + "learning_rate": 0.00012116582696787475, + "loss": 4.500454330444336, + "step": 14560 + }, + { + "epoch": 16.576145744378024, + "grad_norm": 8.155136108398438, + "learning_rate": 0.0001211116528522672, + "loss": 4.450511932373047, + "step": 14570 + }, + { + "epoch": 16.587532023911187, + "grad_norm": 7.149616241455078, + "learning_rate": 0.00012105747873665963, + "loss": 4.229332733154297, + "step": 14580 + }, + { + "epoch": 16.59891830344435, + "grad_norm": 8.195096969604492, + "learning_rate": 0.00012100330462105208, + "loss": 4.491680908203125, + "step": 14590 + }, + { + "epoch": 16.610304582977513, + "grad_norm": 6.949581146240234, + "learning_rate": 0.0001209491305054445, + "loss": 4.443110656738281, + "step": 14600 + }, + { + "epoch": 16.621690862510675, + "grad_norm": 7.297284126281738, + "learning_rate": 0.00012089495638983694, + "loss": 4.731317520141602, + "step": 14610 + }, + { + "epoch": 16.633077142043838, + "grad_norm": 6.88108491897583, + "learning_rate": 0.00012084078227422936, + "loss": 4.465324783325196, + "step": 14620 + }, + { + "epoch": 16.644463421577, + "grad_norm": 8.48362922668457, + "learning_rate": 0.00012078660815862182, + "loss": 4.320807266235351, + "step": 14630 + }, + { + "epoch": 16.655849701110164, + "grad_norm": 8.934884071350098, + "learning_rate": 0.00012073243404301427, + "loss": 4.569874954223633, + "step": 14640 + }, + { + "epoch": 16.667235980643326, + "grad_norm": 7.942513942718506, + "learning_rate": 0.00012067825992740669, + "loss": 4.774484634399414, + "step": 14650 + }, + { + "epoch": 16.67862226017649, + "grad_norm": 7.406705379486084, + "learning_rate": 0.00012062408581179914, + "loss": 4.290068817138672, + "step": 14660 + }, + { + "epoch": 16.69000853970965, + "grad_norm": 7.856991291046143, + "learning_rate": 0.00012056991169619157, + "loss": 4.307246398925781, + "step": 14670 + }, + { + "epoch": 16.70139481924281, + "grad_norm": 8.263503074645996, + "learning_rate": 0.000120515737580584, + "loss": 4.2724559783935545, + "step": 14680 + }, + { + "epoch": 16.712781098775974, + "grad_norm": 8.12948226928711, + "learning_rate": 0.00012046156346497643, + "loss": 4.421898651123047, + "step": 14690 + }, + { + "epoch": 16.724167378309136, + "grad_norm": 8.351273536682129, + "learning_rate": 0.00012040738934936888, + "loss": 4.521099090576172, + "step": 14700 + }, + { + "epoch": 16.7355536578423, + "grad_norm": 7.658185958862305, + "learning_rate": 0.0001203532152337613, + "loss": 4.358420562744141, + "step": 14710 + }, + { + "epoch": 16.746939937375462, + "grad_norm": 7.626881122589111, + "learning_rate": 0.00012029904111815376, + "loss": 4.407142257690429, + "step": 14720 + }, + { + "epoch": 16.758326216908625, + "grad_norm": 7.826634407043457, + "learning_rate": 0.00012024486700254618, + "loss": 4.7284095764160154, + "step": 14730 + }, + { + "epoch": 16.769712496441787, + "grad_norm": 7.835376262664795, + "learning_rate": 0.00012019069288693863, + "loss": 4.649514007568359, + "step": 14740 + }, + { + "epoch": 16.78109877597495, + "grad_norm": 7.345791816711426, + "learning_rate": 0.00012013651877133107, + "loss": 4.5487213134765625, + "step": 14750 + }, + { + "epoch": 16.792485055508113, + "grad_norm": 7.653363227844238, + "learning_rate": 0.0001200823446557235, + "loss": 4.464413833618164, + "step": 14760 + }, + { + "epoch": 16.803871335041276, + "grad_norm": 7.4951043128967285, + "learning_rate": 0.00012002817054011595, + "loss": 4.552109146118164, + "step": 14770 + }, + { + "epoch": 16.81525761457444, + "grad_norm": 6.739938259124756, + "learning_rate": 0.00011997399642450837, + "loss": 4.435204315185547, + "step": 14780 + }, + { + "epoch": 16.8266438941076, + "grad_norm": 7.946920871734619, + "learning_rate": 0.00011991982230890082, + "loss": 4.394256591796875, + "step": 14790 + }, + { + "epoch": 16.838030173640764, + "grad_norm": 7.2037224769592285, + "learning_rate": 0.00011986564819329325, + "loss": 4.656013107299804, + "step": 14800 + }, + { + "epoch": 16.849416453173927, + "grad_norm": 7.150942325592041, + "learning_rate": 0.00011981147407768569, + "loss": 4.620296478271484, + "step": 14810 + }, + { + "epoch": 16.86080273270709, + "grad_norm": 8.865056037902832, + "learning_rate": 0.00011975729996207812, + "loss": 4.58060302734375, + "step": 14820 + }, + { + "epoch": 16.872189012240252, + "grad_norm": 7.587070941925049, + "learning_rate": 0.00011970312584647056, + "loss": 4.879767990112304, + "step": 14830 + }, + { + "epoch": 16.88357529177341, + "grad_norm": 8.456056594848633, + "learning_rate": 0.00011964895173086299, + "loss": 4.5918418884277346, + "step": 14840 + }, + { + "epoch": 16.894961571306574, + "grad_norm": 7.7380900382995605, + "learning_rate": 0.00011959477761525544, + "loss": 4.7449501037597654, + "step": 14850 + }, + { + "epoch": 16.906347850839737, + "grad_norm": 7.467341423034668, + "learning_rate": 0.00011954060349964786, + "loss": 4.479496765136719, + "step": 14860 + }, + { + "epoch": 16.9177341303729, + "grad_norm": 8.6979398727417, + "learning_rate": 0.00011948642938404031, + "loss": 4.7719371795654295, + "step": 14870 + }, + { + "epoch": 16.929120409906062, + "grad_norm": 7.696811676025391, + "learning_rate": 0.00011943225526843275, + "loss": 4.6034080505371096, + "step": 14880 + }, + { + "epoch": 16.940506689439225, + "grad_norm": 6.743338584899902, + "learning_rate": 0.00011937808115282518, + "loss": 4.397494888305664, + "step": 14890 + }, + { + "epoch": 16.951892968972388, + "grad_norm": 6.509140491485596, + "learning_rate": 0.00011932390703721763, + "loss": 4.695284271240235, + "step": 14900 + }, + { + "epoch": 16.96327924850555, + "grad_norm": 7.908071994781494, + "learning_rate": 0.00011926973292161005, + "loss": 4.142197799682617, + "step": 14910 + }, + { + "epoch": 16.974665528038713, + "grad_norm": 8.120683670043945, + "learning_rate": 0.0001192155588060025, + "loss": 4.365460968017578, + "step": 14920 + }, + { + "epoch": 16.986051807571876, + "grad_norm": 7.481232643127441, + "learning_rate": 0.00011916138469039493, + "loss": 4.682232284545899, + "step": 14930 + }, + { + "epoch": 16.99743808710504, + "grad_norm": 7.541874408721924, + "learning_rate": 0.00011910721057478738, + "loss": 4.875133895874024, + "step": 14940 + }, + { + "epoch": 17.007970395673215, + "grad_norm": 6.461215019226074, + "learning_rate": 0.0001190530364591798, + "loss": 3.5713672637939453, + "step": 14950 + }, + { + "epoch": 17.019356675206378, + "grad_norm": 8.02785873413086, + "learning_rate": 0.00011899886234357224, + "loss": 3.8977893829345702, + "step": 14960 + }, + { + "epoch": 17.030742954739537, + "grad_norm": 7.833250522613525, + "learning_rate": 0.00011894468822796467, + "loss": 4.051911926269531, + "step": 14970 + }, + { + "epoch": 17.0421292342727, + "grad_norm": 7.147907257080078, + "learning_rate": 0.00011889051411235712, + "loss": 4.067853546142578, + "step": 14980 + }, + { + "epoch": 17.053515513805863, + "grad_norm": 8.514575958251953, + "learning_rate": 0.00011883633999674957, + "loss": 4.16202392578125, + "step": 14990 + }, + { + "epoch": 17.064901793339025, + "grad_norm": 6.706639766693115, + "learning_rate": 0.000118782165881142, + "loss": 3.879001998901367, + "step": 15000 + }, + { + "epoch": 17.076288072872188, + "grad_norm": 7.3274827003479, + "learning_rate": 0.00011872799176553445, + "loss": 3.8930648803710937, + "step": 15010 + }, + { + "epoch": 17.08767435240535, + "grad_norm": 8.865731239318848, + "learning_rate": 0.00011867381764992687, + "loss": 4.000261688232422, + "step": 15020 + }, + { + "epoch": 17.099060631938514, + "grad_norm": 8.527647018432617, + "learning_rate": 0.00011861964353431931, + "loss": 4.380818557739258, + "step": 15030 + }, + { + "epoch": 17.110446911471676, + "grad_norm": 6.795366287231445, + "learning_rate": 0.00011856546941871173, + "loss": 4.000965118408203, + "step": 15040 + }, + { + "epoch": 17.12183319100484, + "grad_norm": 8.024996757507324, + "learning_rate": 0.00011851129530310419, + "loss": 3.8776901245117186, + "step": 15050 + }, + { + "epoch": 17.133219470538002, + "grad_norm": 7.76874303817749, + "learning_rate": 0.00011845712118749661, + "loss": 3.907266616821289, + "step": 15060 + }, + { + "epoch": 17.144605750071165, + "grad_norm": 7.170513153076172, + "learning_rate": 0.00011840294707188906, + "loss": 4.122880172729492, + "step": 15070 + }, + { + "epoch": 17.155992029604327, + "grad_norm": 7.131034851074219, + "learning_rate": 0.00011834877295628149, + "loss": 4.134088134765625, + "step": 15080 + }, + { + "epoch": 17.16737830913749, + "grad_norm": 9.287558555603027, + "learning_rate": 0.00011829459884067394, + "loss": 4.137316131591797, + "step": 15090 + }, + { + "epoch": 17.178764588670653, + "grad_norm": 6.616999626159668, + "learning_rate": 0.00011824042472506638, + "loss": 4.016845321655273, + "step": 15100 + }, + { + "epoch": 17.190150868203816, + "grad_norm": 7.829199314117432, + "learning_rate": 0.0001181862506094588, + "loss": 4.34521369934082, + "step": 15110 + }, + { + "epoch": 17.20153714773698, + "grad_norm": 8.117554664611816, + "learning_rate": 0.00011813207649385125, + "loss": 3.9818496704101562, + "step": 15120 + }, + { + "epoch": 17.21292342727014, + "grad_norm": 8.301651954650879, + "learning_rate": 0.00011807790237824368, + "loss": 4.036884307861328, + "step": 15130 + }, + { + "epoch": 17.224309706803304, + "grad_norm": 8.472175598144531, + "learning_rate": 0.00011802372826263613, + "loss": 4.04235725402832, + "step": 15140 + }, + { + "epoch": 17.235695986336463, + "grad_norm": 8.00710391998291, + "learning_rate": 0.00011796955414702855, + "loss": 4.428823089599609, + "step": 15150 + }, + { + "epoch": 17.247082265869626, + "grad_norm": 7.678086757659912, + "learning_rate": 0.00011791538003142099, + "loss": 4.123222351074219, + "step": 15160 + }, + { + "epoch": 17.25846854540279, + "grad_norm": 7.292410850524902, + "learning_rate": 0.00011786120591581343, + "loss": 4.134667205810547, + "step": 15170 + }, + { + "epoch": 17.26985482493595, + "grad_norm": 7.336828231811523, + "learning_rate": 0.00011780703180020587, + "loss": 4.142428970336914, + "step": 15180 + }, + { + "epoch": 17.281241104469114, + "grad_norm": 6.939968109130859, + "learning_rate": 0.00011775285768459829, + "loss": 4.088996505737304, + "step": 15190 + }, + { + "epoch": 17.292627384002277, + "grad_norm": 6.900147438049316, + "learning_rate": 0.00011769868356899074, + "loss": 4.092143630981445, + "step": 15200 + }, + { + "epoch": 17.30401366353544, + "grad_norm": 8.642024993896484, + "learning_rate": 0.0001176445094533832, + "loss": 4.337299728393555, + "step": 15210 + }, + { + "epoch": 17.315399943068602, + "grad_norm": 7.520287036895752, + "learning_rate": 0.00011759033533777562, + "loss": 3.809254837036133, + "step": 15220 + }, + { + "epoch": 17.326786222601765, + "grad_norm": 8.518157958984375, + "learning_rate": 0.00011753616122216806, + "loss": 4.095516967773437, + "step": 15230 + }, + { + "epoch": 17.338172502134928, + "grad_norm": 8.106553077697754, + "learning_rate": 0.00011748198710656048, + "loss": 4.244467163085938, + "step": 15240 + }, + { + "epoch": 17.34955878166809, + "grad_norm": 7.119726181030273, + "learning_rate": 0.00011742781299095293, + "loss": 4.04039192199707, + "step": 15250 + }, + { + "epoch": 17.360945061201253, + "grad_norm": 8.118093490600586, + "learning_rate": 0.00011737363887534536, + "loss": 4.112255859375, + "step": 15260 + }, + { + "epoch": 17.372331340734416, + "grad_norm": 7.7591729164123535, + "learning_rate": 0.00011731946475973781, + "loss": 3.9870590209960937, + "step": 15270 + }, + { + "epoch": 17.38371762026758, + "grad_norm": 6.926348686218262, + "learning_rate": 0.00011726529064413023, + "loss": 4.05236701965332, + "step": 15280 + }, + { + "epoch": 17.39510389980074, + "grad_norm": 7.686641216278076, + "learning_rate": 0.00011721111652852268, + "loss": 3.9766483306884766, + "step": 15290 + }, + { + "epoch": 17.406490179333904, + "grad_norm": 7.820587635040283, + "learning_rate": 0.00011715694241291511, + "loss": 4.117971801757813, + "step": 15300 + }, + { + "epoch": 17.417876458867067, + "grad_norm": 9.149397850036621, + "learning_rate": 0.00011710276829730755, + "loss": 4.0301368713378904, + "step": 15310 + }, + { + "epoch": 17.42926273840023, + "grad_norm": 7.755941390991211, + "learning_rate": 0.0001170485941817, + "loss": 4.266557312011718, + "step": 15320 + }, + { + "epoch": 17.44064901793339, + "grad_norm": 7.495479106903076, + "learning_rate": 0.00011699442006609242, + "loss": 4.452923202514649, + "step": 15330 + }, + { + "epoch": 17.45203529746655, + "grad_norm": 8.05996322631836, + "learning_rate": 0.00011694024595048487, + "loss": 4.292583847045899, + "step": 15340 + }, + { + "epoch": 17.463421576999714, + "grad_norm": 9.973164558410645, + "learning_rate": 0.0001168860718348773, + "loss": 4.290217590332031, + "step": 15350 + }, + { + "epoch": 17.474807856532877, + "grad_norm": 8.033830642700195, + "learning_rate": 0.00011683189771926975, + "loss": 4.155846405029297, + "step": 15360 + }, + { + "epoch": 17.48619413606604, + "grad_norm": 7.986049652099609, + "learning_rate": 0.00011677772360366217, + "loss": 4.047249603271484, + "step": 15370 + }, + { + "epoch": 17.497580415599202, + "grad_norm": 7.257813930511475, + "learning_rate": 0.00011672354948805461, + "loss": 4.1319633483886715, + "step": 15380 + }, + { + "epoch": 17.508966695132365, + "grad_norm": 8.351729393005371, + "learning_rate": 0.00011666937537244704, + "loss": 4.184175872802735, + "step": 15390 + }, + { + "epoch": 17.520352974665528, + "grad_norm": 7.465641021728516, + "learning_rate": 0.00011661520125683949, + "loss": 4.227563858032227, + "step": 15400 + }, + { + "epoch": 17.53173925419869, + "grad_norm": 7.468196392059326, + "learning_rate": 0.00011656102714123191, + "loss": 4.250960922241211, + "step": 15410 + }, + { + "epoch": 17.543125533731853, + "grad_norm": 7.476719856262207, + "learning_rate": 0.00011650685302562437, + "loss": 4.10565414428711, + "step": 15420 + }, + { + "epoch": 17.554511813265016, + "grad_norm": 7.9970269203186035, + "learning_rate": 0.00011645267891001682, + "loss": 4.217912292480468, + "step": 15430 + }, + { + "epoch": 17.56589809279818, + "grad_norm": 7.878209114074707, + "learning_rate": 0.00011639850479440924, + "loss": 4.038910293579102, + "step": 15440 + }, + { + "epoch": 17.57728437233134, + "grad_norm": 7.499887466430664, + "learning_rate": 0.00011634433067880168, + "loss": 4.3691764831542965, + "step": 15450 + }, + { + "epoch": 17.588670651864504, + "grad_norm": 7.418756484985352, + "learning_rate": 0.0001162901565631941, + "loss": 4.344051361083984, + "step": 15460 + }, + { + "epoch": 17.600056931397667, + "grad_norm": 8.542085647583008, + "learning_rate": 0.00011623598244758656, + "loss": 4.26641731262207, + "step": 15470 + }, + { + "epoch": 17.61144321093083, + "grad_norm": 8.016523361206055, + "learning_rate": 0.00011618180833197898, + "loss": 4.250433349609375, + "step": 15480 + }, + { + "epoch": 17.622829490463992, + "grad_norm": 7.668478488922119, + "learning_rate": 0.00011612763421637143, + "loss": 4.095648956298828, + "step": 15490 + }, + { + "epoch": 17.634215769997155, + "grad_norm": 7.5998101234436035, + "learning_rate": 0.00011607346010076386, + "loss": 4.096474075317383, + "step": 15500 + }, + { + "epoch": 17.645602049530314, + "grad_norm": 8.259299278259277, + "learning_rate": 0.00011601928598515631, + "loss": 4.329125213623047, + "step": 15510 + }, + { + "epoch": 17.656988329063477, + "grad_norm": 7.811594009399414, + "learning_rate": 0.00011596511186954873, + "loss": 4.401547622680664, + "step": 15520 + }, + { + "epoch": 17.66837460859664, + "grad_norm": 7.514590740203857, + "learning_rate": 0.00011591093775394117, + "loss": 4.361583709716797, + "step": 15530 + }, + { + "epoch": 17.679760888129803, + "grad_norm": 7.787680625915527, + "learning_rate": 0.00011585676363833362, + "loss": 4.375646591186523, + "step": 15540 + }, + { + "epoch": 17.691147167662965, + "grad_norm": 6.984362602233887, + "learning_rate": 0.00011580258952272605, + "loss": 4.231984329223633, + "step": 15550 + }, + { + "epoch": 17.702533447196128, + "grad_norm": 8.019062042236328, + "learning_rate": 0.0001157484154071185, + "loss": 4.141991806030274, + "step": 15560 + }, + { + "epoch": 17.71391972672929, + "grad_norm": 8.658064842224121, + "learning_rate": 0.00011569424129151092, + "loss": 4.37116813659668, + "step": 15570 + }, + { + "epoch": 17.725306006262453, + "grad_norm": 8.997333526611328, + "learning_rate": 0.00011564006717590336, + "loss": 4.313310241699218, + "step": 15580 + }, + { + "epoch": 17.736692285795616, + "grad_norm": 8.168388366699219, + "learning_rate": 0.0001155858930602958, + "loss": 4.10748291015625, + "step": 15590 + }, + { + "epoch": 17.74807856532878, + "grad_norm": 8.176225662231445, + "learning_rate": 0.00011553171894468824, + "loss": 4.249481964111328, + "step": 15600 + }, + { + "epoch": 17.75946484486194, + "grad_norm": 8.127287864685059, + "learning_rate": 0.00011547754482908066, + "loss": 4.257102203369141, + "step": 15610 + }, + { + "epoch": 17.770851124395104, + "grad_norm": 7.9209794998168945, + "learning_rate": 0.00011542337071347311, + "loss": 4.099342727661133, + "step": 15620 + }, + { + "epoch": 17.782237403928267, + "grad_norm": 7.507110118865967, + "learning_rate": 0.00011536919659786554, + "loss": 4.057368469238281, + "step": 15630 + }, + { + "epoch": 17.79362368346143, + "grad_norm": 7.258909702301025, + "learning_rate": 0.00011531502248225799, + "loss": 4.278286743164062, + "step": 15640 + }, + { + "epoch": 17.805009962994593, + "grad_norm": 8.5214204788208, + "learning_rate": 0.00011526084836665041, + "loss": 4.387587356567383, + "step": 15650 + }, + { + "epoch": 17.816396242527755, + "grad_norm": 6.49737548828125, + "learning_rate": 0.00011520667425104285, + "loss": 3.87442626953125, + "step": 15660 + }, + { + "epoch": 17.827782522060918, + "grad_norm": 8.327667236328125, + "learning_rate": 0.0001151525001354353, + "loss": 4.007180404663086, + "step": 15670 + }, + { + "epoch": 17.83916880159408, + "grad_norm": 8.513700485229492, + "learning_rate": 0.00011509832601982773, + "loss": 4.101554870605469, + "step": 15680 + }, + { + "epoch": 17.85055508112724, + "grad_norm": 8.8079833984375, + "learning_rate": 0.00011504415190422018, + "loss": 4.265046310424805, + "step": 15690 + }, + { + "epoch": 17.861941360660403, + "grad_norm": 7.73423957824707, + "learning_rate": 0.0001149899777886126, + "loss": 4.407987976074219, + "step": 15700 + }, + { + "epoch": 17.873327640193565, + "grad_norm": 7.101474285125732, + "learning_rate": 0.00011493580367300505, + "loss": 4.4353492736816404, + "step": 15710 + }, + { + "epoch": 17.884713919726728, + "grad_norm": 7.298971652984619, + "learning_rate": 0.00011488162955739748, + "loss": 4.033404159545898, + "step": 15720 + }, + { + "epoch": 17.89610019925989, + "grad_norm": 7.350695610046387, + "learning_rate": 0.00011482745544178992, + "loss": 4.468685150146484, + "step": 15730 + }, + { + "epoch": 17.907486478793054, + "grad_norm": 7.839898109436035, + "learning_rate": 0.00011477328132618234, + "loss": 4.322669601440429, + "step": 15740 + }, + { + "epoch": 17.918872758326216, + "grad_norm": 7.159393787384033, + "learning_rate": 0.00011471910721057479, + "loss": 4.0323749542236325, + "step": 15750 + }, + { + "epoch": 17.93025903785938, + "grad_norm": 7.601138591766357, + "learning_rate": 0.00011466493309496722, + "loss": 4.170376586914062, + "step": 15760 + }, + { + "epoch": 17.941645317392542, + "grad_norm": 7.914937496185303, + "learning_rate": 0.00011461075897935967, + "loss": 4.421757507324219, + "step": 15770 + }, + { + "epoch": 17.953031596925705, + "grad_norm": 7.36163854598999, + "learning_rate": 0.00011455658486375212, + "loss": 4.248828506469726, + "step": 15780 + }, + { + "epoch": 17.964417876458867, + "grad_norm": 7.326484680175781, + "learning_rate": 0.00011450241074814454, + "loss": 4.062700271606445, + "step": 15790 + }, + { + "epoch": 17.97580415599203, + "grad_norm": 7.293338775634766, + "learning_rate": 0.00011444823663253698, + "loss": 4.451213455200195, + "step": 15800 + }, + { + "epoch": 17.987190435525193, + "grad_norm": 7.929836273193359, + "learning_rate": 0.00011439406251692941, + "loss": 4.008718109130859, + "step": 15810 + }, + { + "epoch": 17.998576715058356, + "grad_norm": 7.278785705566406, + "learning_rate": 0.00011433988840132186, + "loss": 4.249869537353516, + "step": 15820 + }, + { + "epoch": 18.00910902362653, + "grad_norm": 7.38397216796875, + "learning_rate": 0.00011428571428571428, + "loss": 3.487931823730469, + "step": 15830 + }, + { + "epoch": 18.02049530315969, + "grad_norm": 7.779951572418213, + "learning_rate": 0.00011423154017010674, + "loss": 3.6420997619628905, + "step": 15840 + }, + { + "epoch": 18.031881582692854, + "grad_norm": 7.421573162078857, + "learning_rate": 0.00011417736605449916, + "loss": 3.697202682495117, + "step": 15850 + }, + { + "epoch": 18.043267862226017, + "grad_norm": 7.718578815460205, + "learning_rate": 0.00011412319193889161, + "loss": 3.6515987396240233, + "step": 15860 + }, + { + "epoch": 18.05465414175918, + "grad_norm": 8.21097183227539, + "learning_rate": 0.0001140744352348448, + "loss": 4.043758392333984, + "step": 15870 + }, + { + "epoch": 18.066040421292342, + "grad_norm": 8.3043794631958, + "learning_rate": 0.00011402026111923723, + "loss": 3.8113327026367188, + "step": 15880 + }, + { + "epoch": 18.077426700825505, + "grad_norm": 7.186902046203613, + "learning_rate": 0.00011396608700362967, + "loss": 3.746054458618164, + "step": 15890 + }, + { + "epoch": 18.088812980358668, + "grad_norm": 7.123048305511475, + "learning_rate": 0.0001139119128880221, + "loss": 3.691455078125, + "step": 15900 + }, + { + "epoch": 18.10019925989183, + "grad_norm": 7.646660804748535, + "learning_rate": 0.00011385773877241455, + "loss": 3.715811538696289, + "step": 15910 + }, + { + "epoch": 18.111585539424993, + "grad_norm": 7.243655204772949, + "learning_rate": 0.000113803564656807, + "loss": 3.995361328125, + "step": 15920 + }, + { + "epoch": 18.122971818958156, + "grad_norm": 7.36957311630249, + "learning_rate": 0.00011374939054119942, + "loss": 3.660219192504883, + "step": 15930 + }, + { + "epoch": 18.13435809849132, + "grad_norm": 8.030665397644043, + "learning_rate": 0.00011369521642559187, + "loss": 3.9243896484375, + "step": 15940 + }, + { + "epoch": 18.14574437802448, + "grad_norm": 7.206502914428711, + "learning_rate": 0.0001136410423099843, + "loss": 3.791769027709961, + "step": 15950 + }, + { + "epoch": 18.157130657557644, + "grad_norm": 8.390379905700684, + "learning_rate": 0.00011358686819437674, + "loss": 3.6325794219970704, + "step": 15960 + }, + { + "epoch": 18.168516937090807, + "grad_norm": 7.630048751831055, + "learning_rate": 0.00011353269407876916, + "loss": 3.683578872680664, + "step": 15970 + }, + { + "epoch": 18.17990321662397, + "grad_norm": 7.336859703063965, + "learning_rate": 0.00011347851996316161, + "loss": 3.736345672607422, + "step": 15980 + }, + { + "epoch": 18.19128949615713, + "grad_norm": 6.535525798797607, + "learning_rate": 0.00011342434584755404, + "loss": 3.869864654541016, + "step": 15990 + }, + { + "epoch": 18.20267577569029, + "grad_norm": 7.941544532775879, + "learning_rate": 0.00011337017173194649, + "loss": 3.8768684387207033, + "step": 16000 + }, + { + "epoch": 18.214062055223454, + "grad_norm": 8.93309497833252, + "learning_rate": 0.00011331599761633891, + "loss": 4.1240489959716795, + "step": 16010 + }, + { + "epoch": 18.225448334756617, + "grad_norm": 8.11670970916748, + "learning_rate": 0.00011326182350073135, + "loss": 3.7689064025878904, + "step": 16020 + }, + { + "epoch": 18.23683461428978, + "grad_norm": 8.06661319732666, + "learning_rate": 0.0001132076493851238, + "loss": 3.8555408477783204, + "step": 16030 + }, + { + "epoch": 18.248220893822943, + "grad_norm": 7.596058368682861, + "learning_rate": 0.00011315347526951623, + "loss": 3.899448013305664, + "step": 16040 + }, + { + "epoch": 18.259607173356105, + "grad_norm": 7.202267646789551, + "learning_rate": 0.00011309930115390868, + "loss": 3.954229736328125, + "step": 16050 + }, + { + "epoch": 18.270993452889268, + "grad_norm": 6.913619518280029, + "learning_rate": 0.0001130451270383011, + "loss": 4.055387496948242, + "step": 16060 + }, + { + "epoch": 18.28237973242243, + "grad_norm": 7.013467311859131, + "learning_rate": 0.00011299095292269355, + "loss": 3.91961784362793, + "step": 16070 + }, + { + "epoch": 18.293766011955594, + "grad_norm": 8.460749626159668, + "learning_rate": 0.00011293677880708598, + "loss": 3.9316688537597657, + "step": 16080 + }, + { + "epoch": 18.305152291488756, + "grad_norm": 6.905110836029053, + "learning_rate": 0.00011288260469147842, + "loss": 4.052811813354492, + "step": 16090 + }, + { + "epoch": 18.31653857102192, + "grad_norm": 6.9299116134643555, + "learning_rate": 0.00011282843057587084, + "loss": 3.9361194610595702, + "step": 16100 + }, + { + "epoch": 18.327924850555082, + "grad_norm": 6.814231872558594, + "learning_rate": 0.00011277425646026329, + "loss": 3.73162841796875, + "step": 16110 + }, + { + "epoch": 18.339311130088245, + "grad_norm": 7.320927619934082, + "learning_rate": 0.00011272008234465572, + "loss": 3.8358139038085937, + "step": 16120 + }, + { + "epoch": 18.350697409621407, + "grad_norm": 7.525396823883057, + "learning_rate": 0.00011266590822904817, + "loss": 3.948038864135742, + "step": 16130 + }, + { + "epoch": 18.36208368915457, + "grad_norm": 8.188949584960938, + "learning_rate": 0.00011261173411344062, + "loss": 4.153988647460937, + "step": 16140 + }, + { + "epoch": 18.373469968687733, + "grad_norm": 7.659078121185303, + "learning_rate": 0.00011255755999783304, + "loss": 3.8572532653808596, + "step": 16150 + }, + { + "epoch": 18.384856248220895, + "grad_norm": 7.888333320617676, + "learning_rate": 0.00011250338588222548, + "loss": 4.0505119323730465, + "step": 16160 + }, + { + "epoch": 18.396242527754055, + "grad_norm": 9.681838989257812, + "learning_rate": 0.0001124492117666179, + "loss": 3.6657783508300783, + "step": 16170 + }, + { + "epoch": 18.407628807287217, + "grad_norm": 7.816950798034668, + "learning_rate": 0.00011239503765101036, + "loss": 4.182400894165039, + "step": 16180 + }, + { + "epoch": 18.41901508682038, + "grad_norm": 8.955124855041504, + "learning_rate": 0.00011234086353540278, + "loss": 4.1284526824951175, + "step": 16190 + }, + { + "epoch": 18.430401366353543, + "grad_norm": 7.874047756195068, + "learning_rate": 0.00011228668941979523, + "loss": 4.089582824707032, + "step": 16200 + }, + { + "epoch": 18.441787645886706, + "grad_norm": 7.964369297027588, + "learning_rate": 0.00011223251530418766, + "loss": 4.226484298706055, + "step": 16210 + }, + { + "epoch": 18.45317392541987, + "grad_norm": 7.5684614181518555, + "learning_rate": 0.00011217834118858011, + "loss": 3.7357833862304686, + "step": 16220 + }, + { + "epoch": 18.46456020495303, + "grad_norm": 8.19275188446045, + "learning_rate": 0.00011212416707297253, + "loss": 3.658647918701172, + "step": 16230 + }, + { + "epoch": 18.475946484486194, + "grad_norm": 7.265321731567383, + "learning_rate": 0.00011206999295736497, + "loss": 4.110098266601563, + "step": 16240 + }, + { + "epoch": 18.487332764019357, + "grad_norm": 8.811004638671875, + "learning_rate": 0.0001120158188417574, + "loss": 4.014685821533203, + "step": 16250 + }, + { + "epoch": 18.49871904355252, + "grad_norm": 8.132495880126953, + "learning_rate": 0.00011196164472614985, + "loss": 3.972939682006836, + "step": 16260 + }, + { + "epoch": 18.510105323085682, + "grad_norm": 7.8982672691345215, + "learning_rate": 0.0001119074706105423, + "loss": 3.900859069824219, + "step": 16270 + }, + { + "epoch": 18.521491602618845, + "grad_norm": 8.57484245300293, + "learning_rate": 0.00011185329649493472, + "loss": 3.8809505462646485, + "step": 16280 + }, + { + "epoch": 18.532877882152007, + "grad_norm": 8.151063919067383, + "learning_rate": 0.00011179912237932718, + "loss": 3.9721370697021485, + "step": 16290 + }, + { + "epoch": 18.54426416168517, + "grad_norm": 7.32830810546875, + "learning_rate": 0.0001117449482637196, + "loss": 4.091455078125, + "step": 16300 + }, + { + "epoch": 18.555650441218333, + "grad_norm": 7.877411842346191, + "learning_rate": 0.00011169077414811204, + "loss": 3.9833248138427733, + "step": 16310 + }, + { + "epoch": 18.567036720751496, + "grad_norm": 7.7397379875183105, + "learning_rate": 0.00011163660003250446, + "loss": 3.792256164550781, + "step": 16320 + }, + { + "epoch": 18.57842300028466, + "grad_norm": 7.100308418273926, + "learning_rate": 0.00011158242591689692, + "loss": 3.683748245239258, + "step": 16330 + }, + { + "epoch": 18.58980927981782, + "grad_norm": 8.094539642333984, + "learning_rate": 0.00011152825180128934, + "loss": 4.14416732788086, + "step": 16340 + }, + { + "epoch": 18.60119555935098, + "grad_norm": 7.870161056518555, + "learning_rate": 0.00011147407768568179, + "loss": 3.89129753112793, + "step": 16350 + }, + { + "epoch": 18.612581838884143, + "grad_norm": 7.73281192779541, + "learning_rate": 0.00011141990357007422, + "loss": 4.153730773925782, + "step": 16360 + }, + { + "epoch": 18.623968118417306, + "grad_norm": 7.23974084854126, + "learning_rate": 0.00011136572945446667, + "loss": 3.8958515167236327, + "step": 16370 + }, + { + "epoch": 18.63535439795047, + "grad_norm": 11.619794845581055, + "learning_rate": 0.0001113115553388591, + "loss": 4.148723220825195, + "step": 16380 + }, + { + "epoch": 18.64674067748363, + "grad_norm": 7.968571186065674, + "learning_rate": 0.00011125738122325153, + "loss": 4.051975250244141, + "step": 16390 + }, + { + "epoch": 18.658126957016794, + "grad_norm": 8.169166564941406, + "learning_rate": 0.00011120320710764398, + "loss": 3.804905319213867, + "step": 16400 + }, + { + "epoch": 18.669513236549957, + "grad_norm": 7.773144721984863, + "learning_rate": 0.0001111490329920364, + "loss": 3.9082794189453125, + "step": 16410 + }, + { + "epoch": 18.68089951608312, + "grad_norm": 8.190699577331543, + "learning_rate": 0.00011109485887642886, + "loss": 4.108190536499023, + "step": 16420 + }, + { + "epoch": 18.692285795616282, + "grad_norm": 7.462385177612305, + "learning_rate": 0.00011104068476082128, + "loss": 4.018275451660156, + "step": 16430 + }, + { + "epoch": 18.703672075149445, + "grad_norm": 8.5626859664917, + "learning_rate": 0.00011098651064521372, + "loss": 3.7356529235839844, + "step": 16440 + }, + { + "epoch": 18.715058354682608, + "grad_norm": 8.161300659179688, + "learning_rate": 0.00011093233652960616, + "loss": 4.018674850463867, + "step": 16450 + }, + { + "epoch": 18.72644463421577, + "grad_norm": 7.462896347045898, + "learning_rate": 0.0001108781624139986, + "loss": 3.7080970764160157, + "step": 16460 + }, + { + "epoch": 18.737830913748933, + "grad_norm": 7.781260967254639, + "learning_rate": 0.00011082398829839102, + "loss": 4.161582183837891, + "step": 16470 + }, + { + "epoch": 18.749217193282096, + "grad_norm": 7.212603569030762, + "learning_rate": 0.00011076981418278347, + "loss": 3.770726776123047, + "step": 16480 + }, + { + "epoch": 18.76060347281526, + "grad_norm": 8.329229354858398, + "learning_rate": 0.00011071564006717592, + "loss": 4.128889083862305, + "step": 16490 + }, + { + "epoch": 18.77198975234842, + "grad_norm": 7.012000560760498, + "learning_rate": 0.00011066146595156835, + "loss": 4.022500991821289, + "step": 16500 + }, + { + "epoch": 18.783376031881584, + "grad_norm": 7.675992965698242, + "learning_rate": 0.00011060729183596079, + "loss": 3.725032424926758, + "step": 16510 + }, + { + "epoch": 18.794762311414747, + "grad_norm": 7.538510322570801, + "learning_rate": 0.00011055311772035321, + "loss": 3.657960891723633, + "step": 16520 + }, + { + "epoch": 18.806148590947906, + "grad_norm": 6.402365207672119, + "learning_rate": 0.00011049894360474566, + "loss": 3.9226734161376955, + "step": 16530 + }, + { + "epoch": 18.81753487048107, + "grad_norm": 6.849089622497559, + "learning_rate": 0.00011044476948913809, + "loss": 3.683644104003906, + "step": 16540 + }, + { + "epoch": 18.82892115001423, + "grad_norm": 7.539620399475098, + "learning_rate": 0.00011039059537353054, + "loss": 3.788827896118164, + "step": 16550 + }, + { + "epoch": 18.840307429547394, + "grad_norm": 8.003284454345703, + "learning_rate": 0.00011033642125792296, + "loss": 4.023321533203125, + "step": 16560 + }, + { + "epoch": 18.851693709080557, + "grad_norm": 8.320795059204102, + "learning_rate": 0.00011028224714231541, + "loss": 3.955525588989258, + "step": 16570 + }, + { + "epoch": 18.86307998861372, + "grad_norm": 8.324459075927734, + "learning_rate": 0.00011022807302670784, + "loss": 3.9589923858642577, + "step": 16580 + }, + { + "epoch": 18.874466268146882, + "grad_norm": 8.919875144958496, + "learning_rate": 0.00011017389891110028, + "loss": 4.231829833984375, + "step": 16590 + }, + { + "epoch": 18.885852547680045, + "grad_norm": 7.729628562927246, + "learning_rate": 0.00011011972479549273, + "loss": 3.9778141021728515, + "step": 16600 + }, + { + "epoch": 18.897238827213208, + "grad_norm": 7.793543815612793, + "learning_rate": 0.00011006555067988515, + "loss": 4.103470230102539, + "step": 16610 + }, + { + "epoch": 18.90862510674637, + "grad_norm": 7.603933334350586, + "learning_rate": 0.0001100113765642776, + "loss": 3.918026351928711, + "step": 16620 + }, + { + "epoch": 18.920011386279533, + "grad_norm": 9.164023399353027, + "learning_rate": 0.00010995720244867003, + "loss": 4.239971542358399, + "step": 16630 + }, + { + "epoch": 18.931397665812696, + "grad_norm": 8.179018020629883, + "learning_rate": 0.00010990302833306248, + "loss": 4.0833740234375, + "step": 16640 + }, + { + "epoch": 18.94278394534586, + "grad_norm": 8.107283592224121, + "learning_rate": 0.0001098488542174549, + "loss": 3.886615753173828, + "step": 16650 + }, + { + "epoch": 18.95417022487902, + "grad_norm": 8.483085632324219, + "learning_rate": 0.00010979468010184734, + "loss": 4.466776275634766, + "step": 16660 + }, + { + "epoch": 18.965556504412184, + "grad_norm": 7.803193092346191, + "learning_rate": 0.00010974050598623977, + "loss": 3.9009479522705077, + "step": 16670 + }, + { + "epoch": 18.976942783945347, + "grad_norm": 8.053387641906738, + "learning_rate": 0.00010968633187063222, + "loss": 4.092340469360352, + "step": 16680 + }, + { + "epoch": 18.98832906347851, + "grad_norm": 7.657060623168945, + "learning_rate": 0.00010963215775502464, + "loss": 4.109097290039062, + "step": 16690 + }, + { + "epoch": 18.999715343011673, + "grad_norm": 7.931739807128906, + "learning_rate": 0.0001095779836394171, + "loss": 3.970943069458008, + "step": 16700 + }, + { + "epoch": 19.010247651579846, + "grad_norm": 8.304778099060059, + "learning_rate": 0.00010952380952380953, + "loss": 3.4373973846435546, + "step": 16710 + }, + { + "epoch": 19.02163393111301, + "grad_norm": 8.502395629882812, + "learning_rate": 0.00010946963540820197, + "loss": 3.6647174835205076, + "step": 16720 + }, + { + "epoch": 19.03302021064617, + "grad_norm": 8.930685043334961, + "learning_rate": 0.00010941546129259441, + "loss": 3.4595081329345705, + "step": 16730 + }, + { + "epoch": 19.044406490179334, + "grad_norm": 7.8097920417785645, + "learning_rate": 0.00010936128717698683, + "loss": 3.548003005981445, + "step": 16740 + }, + { + "epoch": 19.055792769712497, + "grad_norm": 6.608278751373291, + "learning_rate": 0.00010930711306137929, + "loss": 3.3133895874023436, + "step": 16750 + }, + { + "epoch": 19.06717904924566, + "grad_norm": 7.428323745727539, + "learning_rate": 0.00010925293894577171, + "loss": 3.576006317138672, + "step": 16760 + }, + { + "epoch": 19.078565328778822, + "grad_norm": 7.900843620300293, + "learning_rate": 0.00010919876483016416, + "loss": 3.676096725463867, + "step": 16770 + }, + { + "epoch": 19.089951608311985, + "grad_norm": 7.658603191375732, + "learning_rate": 0.00010914459071455659, + "loss": 3.7328716278076173, + "step": 16780 + }, + { + "epoch": 19.101337887845148, + "grad_norm": 7.360223293304443, + "learning_rate": 0.00010909041659894902, + "loss": 3.652063751220703, + "step": 16790 + }, + { + "epoch": 19.11272416737831, + "grad_norm": 7.122159957885742, + "learning_rate": 0.00010903624248334146, + "loss": 3.690329360961914, + "step": 16800 + }, + { + "epoch": 19.124110446911473, + "grad_norm": 6.8670501708984375, + "learning_rate": 0.0001089820683677339, + "loss": 3.667049026489258, + "step": 16810 + }, + { + "epoch": 19.135496726444636, + "grad_norm": 7.987428665161133, + "learning_rate": 0.00010892789425212635, + "loss": 3.607272720336914, + "step": 16820 + }, + { + "epoch": 19.146883005977795, + "grad_norm": 7.282992839813232, + "learning_rate": 0.00010887372013651878, + "loss": 3.4856422424316404, + "step": 16830 + }, + { + "epoch": 19.158269285510958, + "grad_norm": 8.548340797424316, + "learning_rate": 0.00010881954602091123, + "loss": 3.8047679901123046, + "step": 16840 + }, + { + "epoch": 19.16965556504412, + "grad_norm": 7.989595413208008, + "learning_rate": 0.00010876537190530365, + "loss": 3.664881134033203, + "step": 16850 + }, + { + "epoch": 19.181041844577283, + "grad_norm": 7.799252033233643, + "learning_rate": 0.00010871119778969609, + "loss": 4.083931350708008, + "step": 16860 + }, + { + "epoch": 19.192428124110446, + "grad_norm": 7.790616512298584, + "learning_rate": 0.00010865702367408851, + "loss": 3.279020309448242, + "step": 16870 + }, + { + "epoch": 19.20381440364361, + "grad_norm": 7.22285795211792, + "learning_rate": 0.00010860284955848097, + "loss": 3.5854366302490233, + "step": 16880 + }, + { + "epoch": 19.21520068317677, + "grad_norm": 8.332810401916504, + "learning_rate": 0.00010854867544287339, + "loss": 3.9076087951660154, + "step": 16890 + }, + { + "epoch": 19.226586962709934, + "grad_norm": 6.827261924743652, + "learning_rate": 0.00010849450132726584, + "loss": 3.7439952850341798, + "step": 16900 + }, + { + "epoch": 19.237973242243097, + "grad_norm": 7.577035427093506, + "learning_rate": 0.00010844032721165827, + "loss": 3.5914188385009767, + "step": 16910 + }, + { + "epoch": 19.24935952177626, + "grad_norm": 8.372159957885742, + "learning_rate": 0.00010838615309605072, + "loss": 3.650664520263672, + "step": 16920 + }, + { + "epoch": 19.260745801309422, + "grad_norm": 7.719248294830322, + "learning_rate": 0.00010833197898044314, + "loss": 3.8901805877685547, + "step": 16930 + }, + { + "epoch": 19.272132080842585, + "grad_norm": 6.905912399291992, + "learning_rate": 0.00010827780486483558, + "loss": 3.5376930236816406, + "step": 16940 + }, + { + "epoch": 19.283518360375748, + "grad_norm": 6.742889404296875, + "learning_rate": 0.00010822363074922803, + "loss": 3.475457763671875, + "step": 16950 + }, + { + "epoch": 19.29490463990891, + "grad_norm": 7.5838446617126465, + "learning_rate": 0.00010816945663362046, + "loss": 3.4266204833984375, + "step": 16960 + }, + { + "epoch": 19.306290919442073, + "grad_norm": 7.7189836502075195, + "learning_rate": 0.00010811528251801291, + "loss": 3.5107288360595703, + "step": 16970 + }, + { + "epoch": 19.317677198975236, + "grad_norm": 7.462057590484619, + "learning_rate": 0.00010806110840240533, + "loss": 3.7074642181396484, + "step": 16980 + }, + { + "epoch": 19.3290634785084, + "grad_norm": 8.049522399902344, + "learning_rate": 0.00010800693428679778, + "loss": 3.521027374267578, + "step": 16990 + }, + { + "epoch": 19.34044975804156, + "grad_norm": 7.91304349899292, + "learning_rate": 0.00010795276017119021, + "loss": 3.9731681823730467, + "step": 17000 + }, + { + "epoch": 19.35183603757472, + "grad_norm": 8.217313766479492, + "learning_rate": 0.00010789858605558265, + "loss": 3.8252967834472655, + "step": 17010 + }, + { + "epoch": 19.363222317107883, + "grad_norm": 8.448884963989258, + "learning_rate": 0.00010784441193997507, + "loss": 3.677239990234375, + "step": 17020 + }, + { + "epoch": 19.374608596641046, + "grad_norm": 7.586091041564941, + "learning_rate": 0.00010779023782436752, + "loss": 3.6121559143066406, + "step": 17030 + }, + { + "epoch": 19.38599487617421, + "grad_norm": 7.399051189422607, + "learning_rate": 0.00010773606370875995, + "loss": 3.6527149200439455, + "step": 17040 + }, + { + "epoch": 19.39738115570737, + "grad_norm": 8.452932357788086, + "learning_rate": 0.0001076818895931524, + "loss": 3.5067722320556642, + "step": 17050 + }, + { + "epoch": 19.408767435240534, + "grad_norm": 7.3610382080078125, + "learning_rate": 0.00010762771547754485, + "loss": 3.557135009765625, + "step": 17060 + }, + { + "epoch": 19.420153714773697, + "grad_norm": 7.684902191162109, + "learning_rate": 0.00010757354136193727, + "loss": 3.7063770294189453, + "step": 17070 + }, + { + "epoch": 19.43153999430686, + "grad_norm": 7.2472825050354, + "learning_rate": 0.00010751936724632971, + "loss": 3.5020095825195314, + "step": 17080 + }, + { + "epoch": 19.442926273840023, + "grad_norm": 8.710956573486328, + "learning_rate": 0.00010746519313072214, + "loss": 3.7144115447998045, + "step": 17090 + }, + { + "epoch": 19.454312553373185, + "grad_norm": 7.865204811096191, + "learning_rate": 0.00010741101901511459, + "loss": 3.860599136352539, + "step": 17100 + }, + { + "epoch": 19.465698832906348, + "grad_norm": 7.315329551696777, + "learning_rate": 0.00010735684489950701, + "loss": 3.8806697845458986, + "step": 17110 + }, + { + "epoch": 19.47708511243951, + "grad_norm": 8.162956237792969, + "learning_rate": 0.00010730267078389946, + "loss": 3.6206798553466797, + "step": 17120 + }, + { + "epoch": 19.488471391972674, + "grad_norm": 7.473461627960205, + "learning_rate": 0.00010724849666829189, + "loss": 3.7630165100097654, + "step": 17130 + }, + { + "epoch": 19.499857671505836, + "grad_norm": 7.086675643920898, + "learning_rate": 0.00010719432255268434, + "loss": 3.6255001068115233, + "step": 17140 + }, + { + "epoch": 19.511243951039, + "grad_norm": 7.986152172088623, + "learning_rate": 0.00010714014843707677, + "loss": 3.654983139038086, + "step": 17150 + }, + { + "epoch": 19.52263023057216, + "grad_norm": 6.950078010559082, + "learning_rate": 0.0001070859743214692, + "loss": 3.5265995025634767, + "step": 17160 + }, + { + "epoch": 19.534016510105324, + "grad_norm": 8.496960639953613, + "learning_rate": 0.00010703180020586165, + "loss": 3.8141246795654298, + "step": 17170 + }, + { + "epoch": 19.545402789638487, + "grad_norm": 7.757366180419922, + "learning_rate": 0.00010697762609025408, + "loss": 3.748949432373047, + "step": 17180 + }, + { + "epoch": 19.55678906917165, + "grad_norm": 8.865738868713379, + "learning_rate": 0.00010692345197464653, + "loss": 3.841624069213867, + "step": 17190 + }, + { + "epoch": 19.56817534870481, + "grad_norm": 8.705485343933105, + "learning_rate": 0.00010686927785903896, + "loss": 3.5993213653564453, + "step": 17200 + }, + { + "epoch": 19.579561628237972, + "grad_norm": 8.318339347839355, + "learning_rate": 0.0001068151037434314, + "loss": 3.897611618041992, + "step": 17210 + }, + { + "epoch": 19.590947907771135, + "grad_norm": 8.347416877746582, + "learning_rate": 0.00010676092962782382, + "loss": 3.714302825927734, + "step": 17220 + }, + { + "epoch": 19.602334187304297, + "grad_norm": 6.775099754333496, + "learning_rate": 0.00010670675551221627, + "loss": 3.8760570526123046, + "step": 17230 + }, + { + "epoch": 19.61372046683746, + "grad_norm": 8.375359535217285, + "learning_rate": 0.0001066525813966087, + "loss": 3.7001274108886717, + "step": 17240 + }, + { + "epoch": 19.625106746370623, + "grad_norm": 8.114470481872559, + "learning_rate": 0.00010659840728100115, + "loss": 3.857496643066406, + "step": 17250 + }, + { + "epoch": 19.636493025903786, + "grad_norm": 8.257019996643066, + "learning_rate": 0.00010654423316539357, + "loss": 3.6363399505615233, + "step": 17260 + }, + { + "epoch": 19.64787930543695, + "grad_norm": 7.418999671936035, + "learning_rate": 0.00010649005904978602, + "loss": 3.6371334075927733, + "step": 17270 + }, + { + "epoch": 19.65926558497011, + "grad_norm": 8.225639343261719, + "learning_rate": 0.00010643588493417846, + "loss": 3.6147201538085936, + "step": 17280 + }, + { + "epoch": 19.670651864503274, + "grad_norm": 7.634995937347412, + "learning_rate": 0.00010638171081857088, + "loss": 3.783678436279297, + "step": 17290 + }, + { + "epoch": 19.682038144036436, + "grad_norm": 7.631542682647705, + "learning_rate": 0.00010632753670296334, + "loss": 3.7224945068359374, + "step": 17300 + }, + { + "epoch": 19.6934244235696, + "grad_norm": 8.868590354919434, + "learning_rate": 0.00010627336258735576, + "loss": 3.8564884185791017, + "step": 17310 + }, + { + "epoch": 19.704810703102762, + "grad_norm": 7.66297721862793, + "learning_rate": 0.00010621918847174821, + "loss": 3.8662670135498045, + "step": 17320 + }, + { + "epoch": 19.716196982635925, + "grad_norm": 7.4999589920043945, + "learning_rate": 0.00010616501435614064, + "loss": 3.648085021972656, + "step": 17330 + }, + { + "epoch": 19.727583262169087, + "grad_norm": 7.6754984855651855, + "learning_rate": 0.00010611084024053309, + "loss": 3.6652565002441406, + "step": 17340 + }, + { + "epoch": 19.73896954170225, + "grad_norm": 7.409824371337891, + "learning_rate": 0.00010605666612492551, + "loss": 3.8480754852294923, + "step": 17350 + }, + { + "epoch": 19.750355821235413, + "grad_norm": 7.804997444152832, + "learning_rate": 0.00010600249200931795, + "loss": 3.830272674560547, + "step": 17360 + }, + { + "epoch": 19.761742100768572, + "grad_norm": 9.533950805664062, + "learning_rate": 0.00010594831789371037, + "loss": 4.024043273925781, + "step": 17370 + }, + { + "epoch": 19.773128380301735, + "grad_norm": 7.675684452056885, + "learning_rate": 0.00010589414377810283, + "loss": 3.693775939941406, + "step": 17380 + }, + { + "epoch": 19.784514659834898, + "grad_norm": 8.96830940246582, + "learning_rate": 0.00010583996966249528, + "loss": 3.858016586303711, + "step": 17390 + }, + { + "epoch": 19.79590093936806, + "grad_norm": 6.618140697479248, + "learning_rate": 0.0001057857955468877, + "loss": 3.8667743682861326, + "step": 17400 + }, + { + "epoch": 19.807287218901223, + "grad_norm": 8.536417961120605, + "learning_rate": 0.00010573162143128015, + "loss": 3.633957290649414, + "step": 17410 + }, + { + "epoch": 19.818673498434386, + "grad_norm": 8.237014770507812, + "learning_rate": 0.00010567744731567258, + "loss": 3.944471740722656, + "step": 17420 + }, + { + "epoch": 19.83005977796755, + "grad_norm": 7.2452545166015625, + "learning_rate": 0.00010562327320006502, + "loss": 3.8378509521484374, + "step": 17430 + }, + { + "epoch": 19.84144605750071, + "grad_norm": 6.854305744171143, + "learning_rate": 0.00010556909908445744, + "loss": 3.6727760314941404, + "step": 17440 + }, + { + "epoch": 19.852832337033874, + "grad_norm": 7.254054546356201, + "learning_rate": 0.00010551492496884989, + "loss": 3.783167266845703, + "step": 17450 + }, + { + "epoch": 19.864218616567037, + "grad_norm": 7.683404445648193, + "learning_rate": 0.00010546075085324232, + "loss": 3.615367889404297, + "step": 17460 + }, + { + "epoch": 19.8756048961002, + "grad_norm": 7.9448065757751465, + "learning_rate": 0.00010540657673763477, + "loss": 3.5793502807617186, + "step": 17470 + }, + { + "epoch": 19.886991175633362, + "grad_norm": 7.222136974334717, + "learning_rate": 0.00010535240262202719, + "loss": 3.8391284942626953, + "step": 17480 + }, + { + "epoch": 19.898377455166525, + "grad_norm": 8.161956787109375, + "learning_rate": 0.00010529822850641964, + "loss": 3.8968780517578123, + "step": 17490 + }, + { + "epoch": 19.909763734699688, + "grad_norm": 7.755995750427246, + "learning_rate": 0.00010524405439081208, + "loss": 3.5014904022216795, + "step": 17500 + }, + { + "epoch": 19.92115001423285, + "grad_norm": 7.664623737335205, + "learning_rate": 0.00010518988027520451, + "loss": 3.861225128173828, + "step": 17510 + }, + { + "epoch": 19.932536293766013, + "grad_norm": 8.745182037353516, + "learning_rate": 0.00010513570615959696, + "loss": 3.8261661529541016, + "step": 17520 + }, + { + "epoch": 19.943922573299176, + "grad_norm": 7.4443206787109375, + "learning_rate": 0.00010508153204398938, + "loss": 3.686481475830078, + "step": 17530 + }, + { + "epoch": 19.95530885283234, + "grad_norm": 8.966758728027344, + "learning_rate": 0.00010502735792838183, + "loss": 3.7222663879394533, + "step": 17540 + }, + { + "epoch": 19.9666951323655, + "grad_norm": 9.107354164123535, + "learning_rate": 0.00010497318381277426, + "loss": 3.9835369110107424, + "step": 17550 + }, + { + "epoch": 19.97808141189866, + "grad_norm": 7.190637111663818, + "learning_rate": 0.0001049190096971667, + "loss": 3.5804458618164063, + "step": 17560 + }, + { + "epoch": 19.989467691431823, + "grad_norm": 8.058660507202148, + "learning_rate": 0.00010486483558155914, + "loss": 3.7318145751953127, + "step": 17570 + }, + { + "epoch": 20.0, + "grad_norm": 5.256486892700195, + "learning_rate": 0.00010481066146595157, + "loss": 3.4294113159179687, + "step": 17580 + }, + { + "epoch": 20.011386279533163, + "grad_norm": 6.450402736663818, + "learning_rate": 0.000104756487350344, + "loss": 3.1654720306396484, + "step": 17590 + }, + { + "epoch": 20.022772559066325, + "grad_norm": 8.31795597076416, + "learning_rate": 0.00010470231323473645, + "loss": 3.4220176696777345, + "step": 17600 + }, + { + "epoch": 20.034158838599488, + "grad_norm": 7.338985443115234, + "learning_rate": 0.0001046481391191289, + "loss": 3.2845233917236327, + "step": 17610 + }, + { + "epoch": 20.04554511813265, + "grad_norm": 7.033785820007324, + "learning_rate": 0.00010459396500352133, + "loss": 3.3925506591796877, + "step": 17620 + }, + { + "epoch": 20.056931397665814, + "grad_norm": 8.403705596923828, + "learning_rate": 0.00010453979088791376, + "loss": 3.3477813720703127, + "step": 17630 + }, + { + "epoch": 20.068317677198976, + "grad_norm": 8.051742553710938, + "learning_rate": 0.00010448561677230619, + "loss": 3.4361392974853517, + "step": 17640 + }, + { + "epoch": 20.07970395673214, + "grad_norm": 8.155322074890137, + "learning_rate": 0.00010443144265669864, + "loss": 3.4342941284179687, + "step": 17650 + }, + { + "epoch": 20.091090236265302, + "grad_norm": 7.207828998565674, + "learning_rate": 0.00010437726854109106, + "loss": 3.397259521484375, + "step": 17660 + }, + { + "epoch": 20.102476515798465, + "grad_norm": 8.539883613586426, + "learning_rate": 0.00010432309442548352, + "loss": 3.2583744049072267, + "step": 17670 + }, + { + "epoch": 20.113862795331624, + "grad_norm": 7.494978904724121, + "learning_rate": 0.00010426892030987594, + "loss": 3.2995159149169924, + "step": 17680 + }, + { + "epoch": 20.125249074864787, + "grad_norm": 8.173662185668945, + "learning_rate": 0.00010421474619426839, + "loss": 3.5137447357177733, + "step": 17690 + }, + { + "epoch": 20.13663535439795, + "grad_norm": 7.584948539733887, + "learning_rate": 0.00010416057207866082, + "loss": 3.537697601318359, + "step": 17700 + }, + { + "epoch": 20.148021633931112, + "grad_norm": 7.191871643066406, + "learning_rate": 0.00010410639796305325, + "loss": 3.5673038482666017, + "step": 17710 + }, + { + "epoch": 20.159407913464275, + "grad_norm": 7.3911356925964355, + "learning_rate": 0.00010405222384744568, + "loss": 3.557468032836914, + "step": 17720 + }, + { + "epoch": 20.170794192997437, + "grad_norm": 6.331779479980469, + "learning_rate": 0.00010399804973183813, + "loss": 3.5360912322998046, + "step": 17730 + }, + { + "epoch": 20.1821804725306, + "grad_norm": 7.614666938781738, + "learning_rate": 0.00010394387561623058, + "loss": 3.3655654907226564, + "step": 17740 + }, + { + "epoch": 20.193566752063763, + "grad_norm": 7.935511112213135, + "learning_rate": 0.000103889701500623, + "loss": 3.340676498413086, + "step": 17750 + }, + { + "epoch": 20.204953031596926, + "grad_norm": 8.085060119628906, + "learning_rate": 0.00010383552738501546, + "loss": 3.363076400756836, + "step": 17760 + }, + { + "epoch": 20.21633931113009, + "grad_norm": 7.689126491546631, + "learning_rate": 0.00010378135326940788, + "loss": 3.4896373748779297, + "step": 17770 + }, + { + "epoch": 20.22772559066325, + "grad_norm": 8.10032844543457, + "learning_rate": 0.00010372717915380032, + "loss": 3.325590133666992, + "step": 17780 + }, + { + "epoch": 20.239111870196414, + "grad_norm": 8.31187629699707, + "learning_rate": 0.00010367300503819274, + "loss": 3.2680404663085936, + "step": 17790 + }, + { + "epoch": 20.250498149729577, + "grad_norm": 7.616562366485596, + "learning_rate": 0.0001036188309225852, + "loss": 3.3835872650146483, + "step": 17800 + }, + { + "epoch": 20.26188442926274, + "grad_norm": 7.7585344314575195, + "learning_rate": 0.00010356465680697762, + "loss": 3.3168540954589845, + "step": 17810 + }, + { + "epoch": 20.273270708795902, + "grad_norm": 7.622613430023193, + "learning_rate": 0.00010351048269137007, + "loss": 3.4925868988037108, + "step": 17820 + }, + { + "epoch": 20.284656988329065, + "grad_norm": 7.538808345794678, + "learning_rate": 0.0001034563085757625, + "loss": 3.5138877868652343, + "step": 17830 + }, + { + "epoch": 20.296043267862228, + "grad_norm": 7.818309783935547, + "learning_rate": 0.00010340213446015495, + "loss": 3.594626617431641, + "step": 17840 + }, + { + "epoch": 20.30742954739539, + "grad_norm": 7.420721530914307, + "learning_rate": 0.00010334796034454739, + "loss": 3.3362926483154296, + "step": 17850 + }, + { + "epoch": 20.31881582692855, + "grad_norm": 8.078766822814941, + "learning_rate": 0.00010329378622893981, + "loss": 3.376148986816406, + "step": 17860 + }, + { + "epoch": 20.330202106461712, + "grad_norm": 8.338269233703613, + "learning_rate": 0.00010323961211333226, + "loss": 3.398274230957031, + "step": 17870 + }, + { + "epoch": 20.341588385994875, + "grad_norm": 8.21082592010498, + "learning_rate": 0.00010318543799772469, + "loss": 3.543021011352539, + "step": 17880 + }, + { + "epoch": 20.352974665528038, + "grad_norm": 7.475930213928223, + "learning_rate": 0.00010313126388211714, + "loss": 3.3988021850585937, + "step": 17890 + }, + { + "epoch": 20.3643609450612, + "grad_norm": 8.236494064331055, + "learning_rate": 0.00010307708976650956, + "loss": 3.582375717163086, + "step": 17900 + }, + { + "epoch": 20.375747224594363, + "grad_norm": 6.666841506958008, + "learning_rate": 0.000103022915650902, + "loss": 3.476043701171875, + "step": 17910 + }, + { + "epoch": 20.387133504127526, + "grad_norm": 8.686334609985352, + "learning_rate": 0.00010296874153529444, + "loss": 3.4657127380371096, + "step": 17920 + }, + { + "epoch": 20.39851978366069, + "grad_norm": 7.781078338623047, + "learning_rate": 0.00010291456741968688, + "loss": 3.4596488952636717, + "step": 17930 + }, + { + "epoch": 20.40990606319385, + "grad_norm": 6.893237590789795, + "learning_rate": 0.0001028603933040793, + "loss": 3.6022926330566407, + "step": 17940 + }, + { + "epoch": 20.421292342727014, + "grad_norm": 8.505749702453613, + "learning_rate": 0.00010280621918847175, + "loss": 3.511888122558594, + "step": 17950 + }, + { + "epoch": 20.432678622260177, + "grad_norm": 7.29093074798584, + "learning_rate": 0.0001027520450728642, + "loss": 3.803249740600586, + "step": 17960 + }, + { + "epoch": 20.44406490179334, + "grad_norm": 7.3153462409973145, + "learning_rate": 0.00010269787095725663, + "loss": 3.3840301513671873, + "step": 17970 + }, + { + "epoch": 20.455451181326502, + "grad_norm": 7.5833306312561035, + "learning_rate": 0.00010264369684164907, + "loss": 3.48956413269043, + "step": 17980 + }, + { + "epoch": 20.466837460859665, + "grad_norm": 8.305935859680176, + "learning_rate": 0.00010258952272604149, + "loss": 3.399882507324219, + "step": 17990 + }, + { + "epoch": 20.478223740392828, + "grad_norm": 8.22060775756836, + "learning_rate": 0.00010253534861043394, + "loss": 3.2761825561523437, + "step": 18000 + }, + { + "epoch": 20.48961001992599, + "grad_norm": 8.05759334564209, + "learning_rate": 0.00010248117449482637, + "loss": 3.5770782470703124, + "step": 18010 + }, + { + "epoch": 20.500996299459153, + "grad_norm": 8.089456558227539, + "learning_rate": 0.00010242700037921882, + "loss": 3.424999237060547, + "step": 18020 + }, + { + "epoch": 20.512382578992316, + "grad_norm": 7.183095932006836, + "learning_rate": 0.00010237282626361124, + "loss": 3.1551158905029295, + "step": 18030 + }, + { + "epoch": 20.523768858525475, + "grad_norm": 7.149391174316406, + "learning_rate": 0.0001023186521480037, + "loss": 3.384687805175781, + "step": 18040 + }, + { + "epoch": 20.535155138058638, + "grad_norm": 6.47637939453125, + "learning_rate": 0.00010226447803239612, + "loss": 3.601530075073242, + "step": 18050 + }, + { + "epoch": 20.5465414175918, + "grad_norm": 8.601676940917969, + "learning_rate": 0.00010221030391678856, + "loss": 3.6373306274414063, + "step": 18060 + }, + { + "epoch": 20.557927697124963, + "grad_norm": 8.294757843017578, + "learning_rate": 0.00010215612980118101, + "loss": 3.5030460357666016, + "step": 18070 + }, + { + "epoch": 20.569313976658126, + "grad_norm": 7.412970066070557, + "learning_rate": 0.00010210195568557343, + "loss": 3.206818389892578, + "step": 18080 + }, + { + "epoch": 20.58070025619129, + "grad_norm": 7.663300514221191, + "learning_rate": 0.00010204778156996589, + "loss": 3.5179752349853515, + "step": 18090 + }, + { + "epoch": 20.59208653572445, + "grad_norm": 7.107182025909424, + "learning_rate": 0.00010199360745435831, + "loss": 3.446649932861328, + "step": 18100 + }, + { + "epoch": 20.603472815257614, + "grad_norm": 7.1297688484191895, + "learning_rate": 0.00010193943333875076, + "loss": 3.4354694366455076, + "step": 18110 + }, + { + "epoch": 20.614859094790777, + "grad_norm": 7.961850643157959, + "learning_rate": 0.00010188525922314319, + "loss": 3.3658203125, + "step": 18120 + }, + { + "epoch": 20.62624537432394, + "grad_norm": 7.603863716125488, + "learning_rate": 0.00010183108510753562, + "loss": 3.439925765991211, + "step": 18130 + }, + { + "epoch": 20.637631653857103, + "grad_norm": 7.988809108734131, + "learning_rate": 0.00010177691099192805, + "loss": 3.531351089477539, + "step": 18140 + }, + { + "epoch": 20.649017933390265, + "grad_norm": 8.041953086853027, + "learning_rate": 0.0001017227368763205, + "loss": 3.673780822753906, + "step": 18150 + }, + { + "epoch": 20.660404212923428, + "grad_norm": 8.37690258026123, + "learning_rate": 0.00010166856276071292, + "loss": 3.794148254394531, + "step": 18160 + }, + { + "epoch": 20.67179049245659, + "grad_norm": 8.250043869018555, + "learning_rate": 0.00010161438864510538, + "loss": 3.5557697296142576, + "step": 18170 + }, + { + "epoch": 20.683176771989753, + "grad_norm": 8.75919246673584, + "learning_rate": 0.00010156021452949783, + "loss": 3.7775032043457033, + "step": 18180 + }, + { + "epoch": 20.694563051522916, + "grad_norm": 8.309146881103516, + "learning_rate": 0.00010150604041389025, + "loss": 3.4888229370117188, + "step": 18190 + }, + { + "epoch": 20.70594933105608, + "grad_norm": 8.30295467376709, + "learning_rate": 0.00010145186629828269, + "loss": 3.7718467712402344, + "step": 18200 + }, + { + "epoch": 20.71733561058924, + "grad_norm": 7.974371910095215, + "learning_rate": 0.00010139769218267511, + "loss": 3.401205062866211, + "step": 18210 + }, + { + "epoch": 20.7287218901224, + "grad_norm": 8.394783973693848, + "learning_rate": 0.00010134351806706757, + "loss": 3.713974380493164, + "step": 18220 + }, + { + "epoch": 20.740108169655564, + "grad_norm": 8.151832580566406, + "learning_rate": 0.00010128934395145999, + "loss": 3.8104171752929688, + "step": 18230 + }, + { + "epoch": 20.751494449188726, + "grad_norm": 7.931938171386719, + "learning_rate": 0.00010123516983585244, + "loss": 3.417683410644531, + "step": 18240 + }, + { + "epoch": 20.76288072872189, + "grad_norm": 7.705905914306641, + "learning_rate": 0.00010118099572024487, + "loss": 3.6185550689697266, + "step": 18250 + }, + { + "epoch": 20.774267008255052, + "grad_norm": 9.284891128540039, + "learning_rate": 0.00010112682160463732, + "loss": 3.7930877685546873, + "step": 18260 + }, + { + "epoch": 20.785653287788215, + "grad_norm": 7.982856750488281, + "learning_rate": 0.00010107264748902974, + "loss": 3.4403953552246094, + "step": 18270 + }, + { + "epoch": 20.797039567321377, + "grad_norm": 8.39303207397461, + "learning_rate": 0.00010101847337342218, + "loss": 3.756923294067383, + "step": 18280 + }, + { + "epoch": 20.80842584685454, + "grad_norm": 7.579355239868164, + "learning_rate": 0.00010096429925781463, + "loss": 3.770177459716797, + "step": 18290 + }, + { + "epoch": 20.819812126387703, + "grad_norm": 7.803186416625977, + "learning_rate": 0.00010091012514220706, + "loss": 3.6111522674560548, + "step": 18300 + }, + { + "epoch": 20.831198405920865, + "grad_norm": 7.868539810180664, + "learning_rate": 0.00010085595102659951, + "loss": 3.5871612548828127, + "step": 18310 + }, + { + "epoch": 20.842584685454028, + "grad_norm": 8.147734642028809, + "learning_rate": 0.00010080177691099193, + "loss": 3.5940319061279298, + "step": 18320 + }, + { + "epoch": 20.85397096498719, + "grad_norm": 7.550434589385986, + "learning_rate": 0.00010074760279538437, + "loss": 3.7070236206054688, + "step": 18330 + }, + { + "epoch": 20.865357244520354, + "grad_norm": 7.593775272369385, + "learning_rate": 0.00010069342867977681, + "loss": 3.5908023834228517, + "step": 18340 + }, + { + "epoch": 20.876743524053516, + "grad_norm": 9.302807807922363, + "learning_rate": 0.00010063925456416925, + "loss": 3.5747802734375, + "step": 18350 + }, + { + "epoch": 20.88812980358668, + "grad_norm": 8.078301429748535, + "learning_rate": 0.00010058508044856167, + "loss": 3.404593658447266, + "step": 18360 + }, + { + "epoch": 20.899516083119842, + "grad_norm": 7.820472240447998, + "learning_rate": 0.00010053090633295412, + "loss": 3.6110614776611327, + "step": 18370 + }, + { + "epoch": 20.910902362653005, + "grad_norm": 7.314359664916992, + "learning_rate": 0.00010047673221734655, + "loss": 3.4265087127685545, + "step": 18380 + }, + { + "epoch": 20.922288642186167, + "grad_norm": 7.681244373321533, + "learning_rate": 0.000100422558101739, + "loss": 3.6056175231933594, + "step": 18390 + }, + { + "epoch": 20.933674921719327, + "grad_norm": 8.012112617492676, + "learning_rate": 0.00010036838398613142, + "loss": 3.682481384277344, + "step": 18400 + }, + { + "epoch": 20.94506120125249, + "grad_norm": 7.457361221313477, + "learning_rate": 0.00010031420987052386, + "loss": 3.843639373779297, + "step": 18410 + }, + { + "epoch": 20.956447480785652, + "grad_norm": 8.518708229064941, + "learning_rate": 0.00010026003575491631, + "loss": 3.5751590728759766, + "step": 18420 + }, + { + "epoch": 20.967833760318815, + "grad_norm": 7.371086597442627, + "learning_rate": 0.00010020586163930874, + "loss": 3.6571311950683594, + "step": 18430 + }, + { + "epoch": 20.979220039851977, + "grad_norm": 7.497313976287842, + "learning_rate": 0.00010015168752370119, + "loss": 3.5427356719970704, + "step": 18440 + }, + { + "epoch": 20.99060631938514, + "grad_norm": 8.249246597290039, + "learning_rate": 0.00010009751340809361, + "loss": 3.492108154296875, + "step": 18450 + }, + { + "epoch": 21.001138627953317, + "grad_norm": 7.12626314163208, + "learning_rate": 0.00010004333929248607, + "loss": 3.4620594024658202, + "step": 18460 + }, + { + "epoch": 21.01252490748648, + "grad_norm": 7.223207473754883, + "learning_rate": 9.99891651768785e-05, + "loss": 2.993937301635742, + "step": 18470 + }, + { + "epoch": 21.023911187019642, + "grad_norm": 8.60847282409668, + "learning_rate": 9.993499106127093e-05, + "loss": 3.044596481323242, + "step": 18480 + }, + { + "epoch": 21.035297466552805, + "grad_norm": 7.056385040283203, + "learning_rate": 9.988081694566337e-05, + "loss": 3.329801559448242, + "step": 18490 + }, + { + "epoch": 21.046683746085968, + "grad_norm": 8.232537269592285, + "learning_rate": 9.98266428300558e-05, + "loss": 3.359906005859375, + "step": 18500 + }, + { + "epoch": 21.05807002561913, + "grad_norm": 6.871865272521973, + "learning_rate": 9.977246871444824e-05, + "loss": 3.1155139923095705, + "step": 18510 + }, + { + "epoch": 21.06945630515229, + "grad_norm": 8.1388578414917, + "learning_rate": 9.971829459884068e-05, + "loss": 3.1933937072753906, + "step": 18520 + }, + { + "epoch": 21.080842584685453, + "grad_norm": 7.495057106018066, + "learning_rate": 9.966412048323312e-05, + "loss": 3.4371055603027343, + "step": 18530 + }, + { + "epoch": 21.092228864218615, + "grad_norm": 6.833533763885498, + "learning_rate": 9.960994636762556e-05, + "loss": 3.0802495956420897, + "step": 18540 + }, + { + "epoch": 21.103615143751778, + "grad_norm": 6.975308895111084, + "learning_rate": 9.955577225201798e-05, + "loss": 3.318703842163086, + "step": 18550 + }, + { + "epoch": 21.11500142328494, + "grad_norm": 8.011495590209961, + "learning_rate": 9.950159813641042e-05, + "loss": 3.2149578094482423, + "step": 18560 + }, + { + "epoch": 21.126387702818104, + "grad_norm": 7.448869705200195, + "learning_rate": 9.944742402080287e-05, + "loss": 3.2515499114990236, + "step": 18570 + }, + { + "epoch": 21.137773982351266, + "grad_norm": 7.280409812927246, + "learning_rate": 9.939324990519531e-05, + "loss": 3.5192329406738283, + "step": 18580 + }, + { + "epoch": 21.14916026188443, + "grad_norm": 7.909364700317383, + "learning_rate": 9.933907578958775e-05, + "loss": 3.397074890136719, + "step": 18590 + }, + { + "epoch": 21.16054654141759, + "grad_norm": 7.611769676208496, + "learning_rate": 9.928490167398018e-05, + "loss": 3.0764091491699217, + "step": 18600 + }, + { + "epoch": 21.171932820950754, + "grad_norm": 7.476166248321533, + "learning_rate": 9.923072755837262e-05, + "loss": 3.295338821411133, + "step": 18610 + }, + { + "epoch": 21.183319100483917, + "grad_norm": 6.955434799194336, + "learning_rate": 9.917655344276505e-05, + "loss": 3.357422637939453, + "step": 18620 + }, + { + "epoch": 21.19470538001708, + "grad_norm": 8.147130966186523, + "learning_rate": 9.912237932715748e-05, + "loss": 3.1608652114868163, + "step": 18630 + }, + { + "epoch": 21.206091659550243, + "grad_norm": 8.146050453186035, + "learning_rate": 9.906820521154992e-05, + "loss": 3.1724903106689455, + "step": 18640 + }, + { + "epoch": 21.217477939083405, + "grad_norm": 7.68185567855835, + "learning_rate": 9.901403109594236e-05, + "loss": 3.063700485229492, + "step": 18650 + }, + { + "epoch": 21.228864218616568, + "grad_norm": 7.399899005889893, + "learning_rate": 9.89598569803348e-05, + "loss": 3.2527538299560548, + "step": 18660 + }, + { + "epoch": 21.24025049814973, + "grad_norm": 8.725862503051758, + "learning_rate": 9.890568286472724e-05, + "loss": 3.3397960662841797, + "step": 18670 + }, + { + "epoch": 21.251636777682894, + "grad_norm": 8.439692497253418, + "learning_rate": 9.885150874911967e-05, + "loss": 3.127163314819336, + "step": 18680 + }, + { + "epoch": 21.263023057216056, + "grad_norm": 7.275774955749512, + "learning_rate": 9.879733463351211e-05, + "loss": 3.3399112701416014, + "step": 18690 + }, + { + "epoch": 21.274409336749216, + "grad_norm": 8.231805801391602, + "learning_rate": 9.874316051790455e-05, + "loss": 3.3167404174804687, + "step": 18700 + }, + { + "epoch": 21.28579561628238, + "grad_norm": 8.807353019714355, + "learning_rate": 9.868898640229699e-05, + "loss": 3.2518096923828126, + "step": 18710 + }, + { + "epoch": 21.29718189581554, + "grad_norm": 7.745279788970947, + "learning_rate": 9.863481228668943e-05, + "loss": 2.9932077407836912, + "step": 18720 + }, + { + "epoch": 21.308568175348704, + "grad_norm": 8.119501113891602, + "learning_rate": 9.858063817108186e-05, + "loss": 3.360261917114258, + "step": 18730 + }, + { + "epoch": 21.319954454881866, + "grad_norm": 7.655581951141357, + "learning_rate": 9.85264640554743e-05, + "loss": 3.311513900756836, + "step": 18740 + }, + { + "epoch": 21.33134073441503, + "grad_norm": 7.7340240478515625, + "learning_rate": 9.847228993986674e-05, + "loss": 3.241596221923828, + "step": 18750 + }, + { + "epoch": 21.342727013948192, + "grad_norm": 7.133695602416992, + "learning_rate": 9.841811582425917e-05, + "loss": 3.315245819091797, + "step": 18760 + }, + { + "epoch": 21.354113293481355, + "grad_norm": 7.775753021240234, + "learning_rate": 9.83639417086516e-05, + "loss": 3.2109920501708986, + "step": 18770 + }, + { + "epoch": 21.365499573014517, + "grad_norm": 7.912374019622803, + "learning_rate": 9.830976759304404e-05, + "loss": 3.256867218017578, + "step": 18780 + }, + { + "epoch": 21.37688585254768, + "grad_norm": 7.180131435394287, + "learning_rate": 9.825559347743648e-05, + "loss": 3.2446563720703123, + "step": 18790 + }, + { + "epoch": 21.388272132080843, + "grad_norm": 8.159407615661621, + "learning_rate": 9.820141936182893e-05, + "loss": 3.262491226196289, + "step": 18800 + }, + { + "epoch": 21.399658411614006, + "grad_norm": 7.895713806152344, + "learning_rate": 9.814724524622137e-05, + "loss": 3.2164894104003907, + "step": 18810 + }, + { + "epoch": 21.41104469114717, + "grad_norm": 8.5209321975708, + "learning_rate": 9.809307113061381e-05, + "loss": 3.3610679626464846, + "step": 18820 + }, + { + "epoch": 21.42243097068033, + "grad_norm": 6.682577610015869, + "learning_rate": 9.803889701500623e-05, + "loss": 3.2081188201904296, + "step": 18830 + }, + { + "epoch": 21.433817250213494, + "grad_norm": 7.8088812828063965, + "learning_rate": 9.798472289939867e-05, + "loss": 3.5212165832519533, + "step": 18840 + }, + { + "epoch": 21.445203529746657, + "grad_norm": 7.557262897491455, + "learning_rate": 9.793054878379111e-05, + "loss": 3.161390495300293, + "step": 18850 + }, + { + "epoch": 21.45658980927982, + "grad_norm": 6.671349048614502, + "learning_rate": 9.787637466818355e-05, + "loss": 3.4181884765625, + "step": 18860 + }, + { + "epoch": 21.467976088812982, + "grad_norm": 8.085090637207031, + "learning_rate": 9.782220055257598e-05, + "loss": 3.2536998748779298, + "step": 18870 + }, + { + "epoch": 21.47936236834614, + "grad_norm": 7.863136291503906, + "learning_rate": 9.776802643696842e-05, + "loss": 3.254865264892578, + "step": 18880 + }, + { + "epoch": 21.490748647879304, + "grad_norm": 8.205495834350586, + "learning_rate": 9.771385232136086e-05, + "loss": 3.396427536010742, + "step": 18890 + }, + { + "epoch": 21.502134927412467, + "grad_norm": 7.5775861740112305, + "learning_rate": 9.76596782057533e-05, + "loss": 3.1754524230957033, + "step": 18900 + }, + { + "epoch": 21.51352120694563, + "grad_norm": 7.762941837310791, + "learning_rate": 9.760550409014574e-05, + "loss": 3.3684154510498048, + "step": 18910 + }, + { + "epoch": 21.524907486478792, + "grad_norm": 7.523013114929199, + "learning_rate": 9.755132997453817e-05, + "loss": 3.414249801635742, + "step": 18920 + }, + { + "epoch": 21.536293766011955, + "grad_norm": 8.015236854553223, + "learning_rate": 9.749715585893061e-05, + "loss": 3.2509552001953126, + "step": 18930 + }, + { + "epoch": 21.547680045545118, + "grad_norm": 9.543829917907715, + "learning_rate": 9.744298174332305e-05, + "loss": 3.6638416290283202, + "step": 18940 + }, + { + "epoch": 21.55906632507828, + "grad_norm": 8.284276008605957, + "learning_rate": 9.738880762771549e-05, + "loss": 3.4558319091796874, + "step": 18950 + }, + { + "epoch": 21.570452604611443, + "grad_norm": 8.424079895019531, + "learning_rate": 9.733463351210793e-05, + "loss": 3.319378662109375, + "step": 18960 + }, + { + "epoch": 21.581838884144606, + "grad_norm": 7.417494773864746, + "learning_rate": 9.728045939650035e-05, + "loss": 3.0912622451782226, + "step": 18970 + }, + { + "epoch": 21.59322516367777, + "grad_norm": 8.443395614624023, + "learning_rate": 9.722628528089279e-05, + "loss": 3.599252700805664, + "step": 18980 + }, + { + "epoch": 21.60461144321093, + "grad_norm": 7.053211688995361, + "learning_rate": 9.717211116528523e-05, + "loss": 3.4184902191162108, + "step": 18990 + }, + { + "epoch": 21.615997722744094, + "grad_norm": 7.322805881500244, + "learning_rate": 9.711793704967766e-05, + "loss": 3.3825252532958983, + "step": 19000 + }, + { + "epoch": 21.627384002277257, + "grad_norm": 7.776912212371826, + "learning_rate": 9.70637629340701e-05, + "loss": 3.413040542602539, + "step": 19010 + }, + { + "epoch": 21.63877028181042, + "grad_norm": 7.955855846405029, + "learning_rate": 9.700958881846254e-05, + "loss": 3.3618423461914064, + "step": 19020 + }, + { + "epoch": 21.650156561343582, + "grad_norm": 7.364816188812256, + "learning_rate": 9.695541470285499e-05, + "loss": 3.470820999145508, + "step": 19030 + }, + { + "epoch": 21.661542840876745, + "grad_norm": 7.257011413574219, + "learning_rate": 9.690124058724742e-05, + "loss": 3.3685569763183594, + "step": 19040 + }, + { + "epoch": 21.672929120409908, + "grad_norm": 7.3103485107421875, + "learning_rate": 9.684706647163985e-05, + "loss": 3.5275413513183596, + "step": 19050 + }, + { + "epoch": 21.684315399943067, + "grad_norm": 8.958032608032227, + "learning_rate": 9.679289235603229e-05, + "loss": 3.6240314483642577, + "step": 19060 + }, + { + "epoch": 21.69570167947623, + "grad_norm": 7.291479110717773, + "learning_rate": 9.673871824042473e-05, + "loss": 3.4977294921875, + "step": 19070 + }, + { + "epoch": 21.707087959009392, + "grad_norm": 7.926621437072754, + "learning_rate": 9.668454412481717e-05, + "loss": 3.3956787109375, + "step": 19080 + }, + { + "epoch": 21.718474238542555, + "grad_norm": 7.5746588706970215, + "learning_rate": 9.66303700092096e-05, + "loss": 3.4186546325683596, + "step": 19090 + }, + { + "epoch": 21.729860518075718, + "grad_norm": 8.237885475158691, + "learning_rate": 9.657619589360204e-05, + "loss": 3.315296173095703, + "step": 19100 + }, + { + "epoch": 21.74124679760888, + "grad_norm": 7.103343963623047, + "learning_rate": 9.652202177799447e-05, + "loss": 3.2063358306884764, + "step": 19110 + }, + { + "epoch": 21.752633077142043, + "grad_norm": 7.7268500328063965, + "learning_rate": 9.646784766238691e-05, + "loss": 3.576313781738281, + "step": 19120 + }, + { + "epoch": 21.764019356675206, + "grad_norm": 8.264037132263184, + "learning_rate": 9.641367354677935e-05, + "loss": 3.452584075927734, + "step": 19130 + }, + { + "epoch": 21.77540563620837, + "grad_norm": 7.947970390319824, + "learning_rate": 9.63594994311718e-05, + "loss": 3.2328662872314453, + "step": 19140 + }, + { + "epoch": 21.78679191574153, + "grad_norm": 8.009827613830566, + "learning_rate": 9.630532531556423e-05, + "loss": 3.3568878173828125, + "step": 19150 + }, + { + "epoch": 21.798178195274694, + "grad_norm": 7.520967483520508, + "learning_rate": 9.625115119995667e-05, + "loss": 3.2189865112304688, + "step": 19160 + }, + { + "epoch": 21.809564474807857, + "grad_norm": 6.894745349884033, + "learning_rate": 9.619697708434911e-05, + "loss": 3.3676876068115233, + "step": 19170 + }, + { + "epoch": 21.82095075434102, + "grad_norm": 7.391615867614746, + "learning_rate": 9.614280296874154e-05, + "loss": 3.1733936309814452, + "step": 19180 + }, + { + "epoch": 21.832337033874182, + "grad_norm": 8.549541473388672, + "learning_rate": 9.608862885313397e-05, + "loss": 3.4626365661621095, + "step": 19190 + }, + { + "epoch": 21.843723313407345, + "grad_norm": 7.7757182121276855, + "learning_rate": 9.603445473752641e-05, + "loss": 3.3878768920898437, + "step": 19200 + }, + { + "epoch": 21.855109592940508, + "grad_norm": 9.261902809143066, + "learning_rate": 9.598028062191885e-05, + "loss": 3.0820999145507812, + "step": 19210 + }, + { + "epoch": 21.86649587247367, + "grad_norm": 7.974034309387207, + "learning_rate": 9.592610650631129e-05, + "loss": 3.532638931274414, + "step": 19220 + }, + { + "epoch": 21.877882152006833, + "grad_norm": 7.275153160095215, + "learning_rate": 9.587193239070373e-05, + "loss": 3.409342956542969, + "step": 19230 + }, + { + "epoch": 21.889268431539996, + "grad_norm": 7.821238040924072, + "learning_rate": 9.581775827509616e-05, + "loss": 3.2864406585693358, + "step": 19240 + }, + { + "epoch": 21.900654711073155, + "grad_norm": 6.836928844451904, + "learning_rate": 9.57635841594886e-05, + "loss": 3.399283218383789, + "step": 19250 + }, + { + "epoch": 21.912040990606318, + "grad_norm": 8.900579452514648, + "learning_rate": 9.570941004388104e-05, + "loss": 3.277374267578125, + "step": 19260 + }, + { + "epoch": 21.92342727013948, + "grad_norm": 7.689421653747559, + "learning_rate": 9.565523592827348e-05, + "loss": 3.502179718017578, + "step": 19270 + }, + { + "epoch": 21.934813549672644, + "grad_norm": 7.474170684814453, + "learning_rate": 9.560106181266592e-05, + "loss": 3.3328819274902344, + "step": 19280 + }, + { + "epoch": 21.946199829205806, + "grad_norm": 7.40029764175415, + "learning_rate": 9.554688769705835e-05, + "loss": 3.339038848876953, + "step": 19290 + }, + { + "epoch": 21.95758610873897, + "grad_norm": 8.339094161987305, + "learning_rate": 9.549271358145079e-05, + "loss": 3.642498779296875, + "step": 19300 + }, + { + "epoch": 21.96897238827213, + "grad_norm": 8.725251197814941, + "learning_rate": 9.543853946584323e-05, + "loss": 3.3834274291992186, + "step": 19310 + }, + { + "epoch": 21.980358667805294, + "grad_norm": 7.735589504241943, + "learning_rate": 9.538436535023565e-05, + "loss": 3.362496185302734, + "step": 19320 + }, + { + "epoch": 21.991744947338457, + "grad_norm": 8.0684175491333, + "learning_rate": 9.533019123462809e-05, + "loss": 3.6285259246826174, + "step": 19330 + }, + { + "epoch": 22.002277255906634, + "grad_norm": 7.213688850402832, + "learning_rate": 9.527601711902053e-05, + "loss": 3.0752532958984373, + "step": 19340 + }, + { + "epoch": 22.013663535439797, + "grad_norm": 7.230661392211914, + "learning_rate": 9.522184300341297e-05, + "loss": 3.045562171936035, + "step": 19350 + }, + { + "epoch": 22.025049814972956, + "grad_norm": 7.4309587478637695, + "learning_rate": 9.51676688878054e-05, + "loss": 3.1042001724243162, + "step": 19360 + }, + { + "epoch": 22.03643609450612, + "grad_norm": 7.176165580749512, + "learning_rate": 9.511349477219786e-05, + "loss": 2.9163785934448243, + "step": 19370 + }, + { + "epoch": 22.04782237403928, + "grad_norm": 7.511384963989258, + "learning_rate": 9.50593206565903e-05, + "loss": 2.967784118652344, + "step": 19380 + }, + { + "epoch": 22.059208653572444, + "grad_norm": 7.659293174743652, + "learning_rate": 9.500514654098272e-05, + "loss": 3.1063777923583986, + "step": 19390 + }, + { + "epoch": 22.070594933105607, + "grad_norm": 7.430224895477295, + "learning_rate": 9.495097242537516e-05, + "loss": 3.028996467590332, + "step": 19400 + }, + { + "epoch": 22.08198121263877, + "grad_norm": 7.423468589782715, + "learning_rate": 9.48967983097676e-05, + "loss": 3.157891273498535, + "step": 19410 + }, + { + "epoch": 22.093367492171932, + "grad_norm": 7.423767566680908, + "learning_rate": 9.484262419416003e-05, + "loss": 2.9452770233154295, + "step": 19420 + }, + { + "epoch": 22.104753771705095, + "grad_norm": 7.975661754608154, + "learning_rate": 9.478845007855247e-05, + "loss": 3.0032615661621094, + "step": 19430 + }, + { + "epoch": 22.116140051238258, + "grad_norm": 7.372896194458008, + "learning_rate": 9.473427596294491e-05, + "loss": 3.0240869522094727, + "step": 19440 + }, + { + "epoch": 22.12752633077142, + "grad_norm": 8.073140144348145, + "learning_rate": 9.468010184733735e-05, + "loss": 3.047717475891113, + "step": 19450 + }, + { + "epoch": 22.138912610304583, + "grad_norm": 7.785470008850098, + "learning_rate": 9.462592773172979e-05, + "loss": 3.0755859375, + "step": 19460 + }, + { + "epoch": 22.150298889837746, + "grad_norm": 8.51516342163086, + "learning_rate": 9.457175361612221e-05, + "loss": 3.0527263641357423, + "step": 19470 + }, + { + "epoch": 22.16168516937091, + "grad_norm": 7.476900577545166, + "learning_rate": 9.451757950051466e-05, + "loss": 3.286362075805664, + "step": 19480 + }, + { + "epoch": 22.17307144890407, + "grad_norm": 9.160499572753906, + "learning_rate": 9.44634053849071e-05, + "loss": 3.2243637084960937, + "step": 19490 + }, + { + "epoch": 22.184457728437234, + "grad_norm": 7.854064464569092, + "learning_rate": 9.440923126929954e-05, + "loss": 2.984610748291016, + "step": 19500 + }, + { + "epoch": 22.195844007970397, + "grad_norm": 7.314716815948486, + "learning_rate": 9.435505715369198e-05, + "loss": 3.109171676635742, + "step": 19510 + }, + { + "epoch": 22.20723028750356, + "grad_norm": 6.959289073944092, + "learning_rate": 9.430088303808441e-05, + "loss": 3.1544565200805663, + "step": 19520 + }, + { + "epoch": 22.218616567036722, + "grad_norm": 7.609602928161621, + "learning_rate": 9.424670892247684e-05, + "loss": 3.071408271789551, + "step": 19530 + }, + { + "epoch": 22.23000284656988, + "grad_norm": 7.927331924438477, + "learning_rate": 9.419253480686928e-05, + "loss": 3.055502510070801, + "step": 19540 + }, + { + "epoch": 22.241389126103044, + "grad_norm": 8.021471977233887, + "learning_rate": 9.413836069126171e-05, + "loss": 2.963591194152832, + "step": 19550 + }, + { + "epoch": 22.252775405636207, + "grad_norm": 7.8159260749816895, + "learning_rate": 9.408418657565415e-05, + "loss": 3.1434553146362303, + "step": 19560 + }, + { + "epoch": 22.26416168516937, + "grad_norm": 7.591141223907471, + "learning_rate": 9.403001246004659e-05, + "loss": 3.1236942291259764, + "step": 19570 + }, + { + "epoch": 22.275547964702533, + "grad_norm": 7.809198379516602, + "learning_rate": 9.397583834443903e-05, + "loss": 2.9383840560913086, + "step": 19580 + }, + { + "epoch": 22.286934244235695, + "grad_norm": 7.057039737701416, + "learning_rate": 9.392166422883148e-05, + "loss": 2.704611396789551, + "step": 19590 + }, + { + "epoch": 22.298320523768858, + "grad_norm": 8.17033863067627, + "learning_rate": 9.38674901132239e-05, + "loss": 3.0170907974243164, + "step": 19600 + }, + { + "epoch": 22.30970680330202, + "grad_norm": 8.437378883361816, + "learning_rate": 9.381331599761634e-05, + "loss": 3.192144012451172, + "step": 19610 + }, + { + "epoch": 22.321093082835183, + "grad_norm": 8.427281379699707, + "learning_rate": 9.375914188200878e-05, + "loss": 3.265237808227539, + "step": 19620 + }, + { + "epoch": 22.332479362368346, + "grad_norm": 7.595215797424316, + "learning_rate": 9.370496776640122e-05, + "loss": 3.1780752182006835, + "step": 19630 + }, + { + "epoch": 22.34386564190151, + "grad_norm": 7.446441173553467, + "learning_rate": 9.365079365079366e-05, + "loss": 3.126137542724609, + "step": 19640 + }, + { + "epoch": 22.35525192143467, + "grad_norm": 8.297301292419434, + "learning_rate": 9.35966195351861e-05, + "loss": 3.281117630004883, + "step": 19650 + }, + { + "epoch": 22.366638200967834, + "grad_norm": 7.048451900482178, + "learning_rate": 9.354244541957853e-05, + "loss": 3.1563640594482423, + "step": 19660 + }, + { + "epoch": 22.378024480500997, + "grad_norm": 7.006303787231445, + "learning_rate": 9.348827130397097e-05, + "loss": 2.991481971740723, + "step": 19670 + }, + { + "epoch": 22.38941076003416, + "grad_norm": 6.830575942993164, + "learning_rate": 9.34340971883634e-05, + "loss": 3.372071075439453, + "step": 19680 + }, + { + "epoch": 22.400797039567323, + "grad_norm": 8.474934577941895, + "learning_rate": 9.337992307275583e-05, + "loss": 3.1545629501342773, + "step": 19690 + }, + { + "epoch": 22.412183319100485, + "grad_norm": 8.20909595489502, + "learning_rate": 9.332574895714827e-05, + "loss": 3.1375335693359374, + "step": 19700 + }, + { + "epoch": 22.423569598633648, + "grad_norm": 7.151069164276123, + "learning_rate": 9.327157484154072e-05, + "loss": 3.147255706787109, + "step": 19710 + }, + { + "epoch": 22.434955878166807, + "grad_norm": 8.133870124816895, + "learning_rate": 9.321740072593316e-05, + "loss": 3.096915435791016, + "step": 19720 + }, + { + "epoch": 22.44634215769997, + "grad_norm": 7.726839065551758, + "learning_rate": 9.31632266103256e-05, + "loss": 3.2699676513671876, + "step": 19730 + }, + { + "epoch": 22.457728437233133, + "grad_norm": 7.36300802230835, + "learning_rate": 9.310905249471802e-05, + "loss": 3.1540500640869142, + "step": 19740 + }, + { + "epoch": 22.469114716766295, + "grad_norm": 8.004473686218262, + "learning_rate": 9.305487837911046e-05, + "loss": 3.1127681732177734, + "step": 19750 + }, + { + "epoch": 22.480500996299458, + "grad_norm": 7.39958381652832, + "learning_rate": 9.30007042635029e-05, + "loss": 2.9369550704956056, + "step": 19760 + }, + { + "epoch": 22.49188727583262, + "grad_norm": 7.182417869567871, + "learning_rate": 9.294653014789534e-05, + "loss": 3.0352306365966797, + "step": 19770 + }, + { + "epoch": 22.503273555365784, + "grad_norm": 7.487427711486816, + "learning_rate": 9.289235603228778e-05, + "loss": 3.1564273834228516, + "step": 19780 + }, + { + "epoch": 22.514659834898946, + "grad_norm": 8.101351737976074, + "learning_rate": 9.283818191668021e-05, + "loss": 3.135332489013672, + "step": 19790 + }, + { + "epoch": 22.52604611443211, + "grad_norm": 7.65117883682251, + "learning_rate": 9.278400780107265e-05, + "loss": 3.1288909912109375, + "step": 19800 + }, + { + "epoch": 22.537432393965272, + "grad_norm": 10.03753662109375, + "learning_rate": 9.272983368546509e-05, + "loss": 2.8136066436767577, + "step": 19810 + }, + { + "epoch": 22.548818673498435, + "grad_norm": 7.884417533874512, + "learning_rate": 9.267565956985753e-05, + "loss": 3.3287643432617187, + "step": 19820 + }, + { + "epoch": 22.560204953031597, + "grad_norm": 7.343411922454834, + "learning_rate": 9.262148545424997e-05, + "loss": 3.0845478057861326, + "step": 19830 + }, + { + "epoch": 22.57159123256476, + "grad_norm": 7.07990026473999, + "learning_rate": 9.25673113386424e-05, + "loss": 3.0407432556152343, + "step": 19840 + }, + { + "epoch": 22.582977512097923, + "grad_norm": 7.047083377838135, + "learning_rate": 9.251313722303484e-05, + "loss": 2.975852394104004, + "step": 19850 + }, + { + "epoch": 22.594363791631086, + "grad_norm": 7.662022590637207, + "learning_rate": 9.245896310742728e-05, + "loss": 3.1719154357910155, + "step": 19860 + }, + { + "epoch": 22.60575007116425, + "grad_norm": 7.884767532348633, + "learning_rate": 9.240478899181972e-05, + "loss": 3.1837724685668944, + "step": 19870 + }, + { + "epoch": 22.61713635069741, + "grad_norm": 7.272804260253906, + "learning_rate": 9.235061487621214e-05, + "loss": 2.9902538299560546, + "step": 19880 + }, + { + "epoch": 22.628522630230574, + "grad_norm": 8.392932891845703, + "learning_rate": 9.229644076060458e-05, + "loss": 3.203661346435547, + "step": 19890 + }, + { + "epoch": 22.639908909763733, + "grad_norm": 7.606863975524902, + "learning_rate": 9.224226664499702e-05, + "loss": 3.245991516113281, + "step": 19900 + }, + { + "epoch": 22.651295189296896, + "grad_norm": 8.281023979187012, + "learning_rate": 9.218809252938946e-05, + "loss": 3.256304168701172, + "step": 19910 + }, + { + "epoch": 22.66268146883006, + "grad_norm": 9.497676849365234, + "learning_rate": 9.21339184137819e-05, + "loss": 3.2775390625, + "step": 19920 + }, + { + "epoch": 22.67406774836322, + "grad_norm": 8.350468635559082, + "learning_rate": 9.207974429817435e-05, + "loss": 3.1510446548461912, + "step": 19930 + }, + { + "epoch": 22.685454027896384, + "grad_norm": 8.590188980102539, + "learning_rate": 9.202557018256678e-05, + "loss": 3.364548110961914, + "step": 19940 + }, + { + "epoch": 22.696840307429547, + "grad_norm": 8.386192321777344, + "learning_rate": 9.197139606695921e-05, + "loss": 3.446294403076172, + "step": 19950 + }, + { + "epoch": 22.70822658696271, + "grad_norm": 8.23085880279541, + "learning_rate": 9.191722195135165e-05, + "loss": 3.0785924911499025, + "step": 19960 + }, + { + "epoch": 22.719612866495872, + "grad_norm": 7.4072771072387695, + "learning_rate": 9.186304783574408e-05, + "loss": 3.299879455566406, + "step": 19970 + }, + { + "epoch": 22.730999146029035, + "grad_norm": 6.338870048522949, + "learning_rate": 9.180887372013652e-05, + "loss": 3.1906932830810546, + "step": 19980 + }, + { + "epoch": 22.742385425562198, + "grad_norm": 7.546908378601074, + "learning_rate": 9.175469960452896e-05, + "loss": 3.0080894470214843, + "step": 19990 + }, + { + "epoch": 22.75377170509536, + "grad_norm": 8.439484596252441, + "learning_rate": 9.17005254889214e-05, + "loss": 3.1106647491455077, + "step": 20000 + }, + { + "epoch": 22.765157984628523, + "grad_norm": 8.297350883483887, + "learning_rate": 9.164635137331384e-05, + "loss": 2.9686201095581053, + "step": 20010 + }, + { + "epoch": 22.776544264161686, + "grad_norm": 7.963090896606445, + "learning_rate": 9.159217725770628e-05, + "loss": 3.098881721496582, + "step": 20020 + }, + { + "epoch": 22.78793054369485, + "grad_norm": 8.618762016296387, + "learning_rate": 9.15380031420987e-05, + "loss": 3.041916084289551, + "step": 20030 + }, + { + "epoch": 22.79931682322801, + "grad_norm": 7.110545635223389, + "learning_rate": 9.148382902649115e-05, + "loss": 3.047221565246582, + "step": 20040 + }, + { + "epoch": 22.810703102761174, + "grad_norm": 8.43532657623291, + "learning_rate": 9.142965491088359e-05, + "loss": 3.025491142272949, + "step": 20050 + }, + { + "epoch": 22.822089382294337, + "grad_norm": 7.121822834014893, + "learning_rate": 9.137548079527603e-05, + "loss": 3.299955368041992, + "step": 20060 + }, + { + "epoch": 22.8334756618275, + "grad_norm": 7.654300212860107, + "learning_rate": 9.132130667966847e-05, + "loss": 3.023772430419922, + "step": 20070 + }, + { + "epoch": 22.844861941360662, + "grad_norm": 7.399035930633545, + "learning_rate": 9.12671325640609e-05, + "loss": 3.1401979446411135, + "step": 20080 + }, + { + "epoch": 22.85624822089382, + "grad_norm": 8.279058456420898, + "learning_rate": 9.121295844845333e-05, + "loss": 3.2731346130371093, + "step": 20090 + }, + { + "epoch": 22.867634500426984, + "grad_norm": 8.097005844116211, + "learning_rate": 9.115878433284577e-05, + "loss": 3.2051586151123046, + "step": 20100 + }, + { + "epoch": 22.879020779960147, + "grad_norm": 7.273082256317139, + "learning_rate": 9.11046102172382e-05, + "loss": 3.2332443237304687, + "step": 20110 + }, + { + "epoch": 22.89040705949331, + "grad_norm": 8.696020126342773, + "learning_rate": 9.105043610163064e-05, + "loss": 3.323190689086914, + "step": 20120 + }, + { + "epoch": 22.901793339026472, + "grad_norm": 7.149501323699951, + "learning_rate": 9.099626198602308e-05, + "loss": 3.2843936920166015, + "step": 20130 + }, + { + "epoch": 22.913179618559635, + "grad_norm": 8.37787914276123, + "learning_rate": 9.094208787041552e-05, + "loss": 3.404499053955078, + "step": 20140 + }, + { + "epoch": 22.924565898092798, + "grad_norm": 8.074170112609863, + "learning_rate": 9.088791375480796e-05, + "loss": 3.290465545654297, + "step": 20150 + }, + { + "epoch": 22.93595217762596, + "grad_norm": 7.932847023010254, + "learning_rate": 9.08337396392004e-05, + "loss": 3.2561920166015623, + "step": 20160 + }, + { + "epoch": 22.947338457159123, + "grad_norm": 11.540017127990723, + "learning_rate": 9.077956552359283e-05, + "loss": 3.1390569686889647, + "step": 20170 + }, + { + "epoch": 22.958724736692286, + "grad_norm": 7.136811256408691, + "learning_rate": 9.072539140798527e-05, + "loss": 3.2981540679931642, + "step": 20180 + }, + { + "epoch": 22.97011101622545, + "grad_norm": 7.825453758239746, + "learning_rate": 9.067121729237771e-05, + "loss": 3.0614255905151366, + "step": 20190 + }, + { + "epoch": 22.98149729575861, + "grad_norm": 7.4651007652282715, + "learning_rate": 9.061704317677015e-05, + "loss": 3.2027751922607424, + "step": 20200 + }, + { + "epoch": 22.992883575291774, + "grad_norm": 7.570321559906006, + "learning_rate": 9.056286906116258e-05, + "loss": 3.1534257888793946, + "step": 20210 + }, + { + "epoch": 23.003415883859947, + "grad_norm": 6.957921028137207, + "learning_rate": 9.050869494555502e-05, + "loss": 2.89981746673584, + "step": 20220 + }, + { + "epoch": 23.01480216339311, + "grad_norm": 7.340804576873779, + "learning_rate": 9.045452082994746e-05, + "loss": 2.898666000366211, + "step": 20230 + }, + { + "epoch": 23.026188442926273, + "grad_norm": 7.689297676086426, + "learning_rate": 9.040034671433988e-05, + "loss": 3.076976776123047, + "step": 20240 + }, + { + "epoch": 23.037574722459436, + "grad_norm": 7.607706069946289, + "learning_rate": 9.034617259873232e-05, + "loss": 3.11181697845459, + "step": 20250 + }, + { + "epoch": 23.0489610019926, + "grad_norm": 7.3909502029418945, + "learning_rate": 9.029199848312476e-05, + "loss": 3.0006891250610352, + "step": 20260 + }, + { + "epoch": 23.06034728152576, + "grad_norm": 6.984787464141846, + "learning_rate": 9.023782436751721e-05, + "loss": 2.905653381347656, + "step": 20270 + }, + { + "epoch": 23.071733561058924, + "grad_norm": 8.02444839477539, + "learning_rate": 9.018365025190965e-05, + "loss": 2.8838172912597657, + "step": 20280 + }, + { + "epoch": 23.083119840592087, + "grad_norm": 7.629673480987549, + "learning_rate": 9.012947613630209e-05, + "loss": 3.2186203002929688, + "step": 20290 + }, + { + "epoch": 23.09450612012525, + "grad_norm": 7.7079877853393555, + "learning_rate": 9.007530202069451e-05, + "loss": 2.9591360092163086, + "step": 20300 + }, + { + "epoch": 23.105892399658412, + "grad_norm": 6.760339736938477, + "learning_rate": 9.002112790508695e-05, + "loss": 2.7239349365234373, + "step": 20310 + }, + { + "epoch": 23.117278679191575, + "grad_norm": 9.964285850524902, + "learning_rate": 8.996695378947939e-05, + "loss": 2.7104719161987303, + "step": 20320 + }, + { + "epoch": 23.128664958724737, + "grad_norm": 7.464658737182617, + "learning_rate": 8.991277967387183e-05, + "loss": 2.8070682525634765, + "step": 20330 + }, + { + "epoch": 23.1400512382579, + "grad_norm": 7.5616960525512695, + "learning_rate": 8.985860555826426e-05, + "loss": 2.708431434631348, + "step": 20340 + }, + { + "epoch": 23.151437517791063, + "grad_norm": 7.403007984161377, + "learning_rate": 8.98044314426567e-05, + "loss": 2.8072072982788088, + "step": 20350 + }, + { + "epoch": 23.162823797324226, + "grad_norm": 8.316858291625977, + "learning_rate": 8.975025732704914e-05, + "loss": 3.012776565551758, + "step": 20360 + }, + { + "epoch": 23.17421007685739, + "grad_norm": 7.397007942199707, + "learning_rate": 8.969608321144158e-05, + "loss": 3.178554344177246, + "step": 20370 + }, + { + "epoch": 23.185596356390548, + "grad_norm": 7.289053916931152, + "learning_rate": 8.964190909583402e-05, + "loss": 2.6939666748046873, + "step": 20380 + }, + { + "epoch": 23.19698263592371, + "grad_norm": 7.612484455108643, + "learning_rate": 8.958773498022645e-05, + "loss": 3.1522052764892576, + "step": 20390 + }, + { + "epoch": 23.208368915456873, + "grad_norm": 7.743655681610107, + "learning_rate": 8.953356086461889e-05, + "loss": 2.716688346862793, + "step": 20400 + }, + { + "epoch": 23.219755194990036, + "grad_norm": 10.625493049621582, + "learning_rate": 8.947938674901133e-05, + "loss": 3.075759506225586, + "step": 20410 + }, + { + "epoch": 23.2311414745232, + "grad_norm": 7.2747883796691895, + "learning_rate": 8.942521263340377e-05, + "loss": 2.8906108856201174, + "step": 20420 + }, + { + "epoch": 23.24252775405636, + "grad_norm": 8.012664794921875, + "learning_rate": 8.937103851779621e-05, + "loss": 2.891446113586426, + "step": 20430 + }, + { + "epoch": 23.253914033589524, + "grad_norm": 6.6297430992126465, + "learning_rate": 8.931686440218863e-05, + "loss": 2.9877086639404298, + "step": 20440 + }, + { + "epoch": 23.265300313122687, + "grad_norm": 7.2411274909973145, + "learning_rate": 8.926269028658107e-05, + "loss": 2.8750194549560546, + "step": 20450 + }, + { + "epoch": 23.27668659265585, + "grad_norm": 7.526220321655273, + "learning_rate": 8.920851617097351e-05, + "loss": 2.7831699371337892, + "step": 20460 + }, + { + "epoch": 23.288072872189012, + "grad_norm": 7.155262470245361, + "learning_rate": 8.915434205536595e-05, + "loss": 2.864755630493164, + "step": 20470 + }, + { + "epoch": 23.299459151722175, + "grad_norm": 8.7533540725708, + "learning_rate": 8.910016793975838e-05, + "loss": 3.0703664779663087, + "step": 20480 + }, + { + "epoch": 23.310845431255338, + "grad_norm": 8.419981956481934, + "learning_rate": 8.904599382415082e-05, + "loss": 3.047661781311035, + "step": 20490 + }, + { + "epoch": 23.3222317107885, + "grad_norm": 7.7640204429626465, + "learning_rate": 8.899181970854327e-05, + "loss": 2.887657356262207, + "step": 20500 + }, + { + "epoch": 23.333617990321663, + "grad_norm": 7.903047561645508, + "learning_rate": 8.89376455929357e-05, + "loss": 2.9257579803466798, + "step": 20510 + }, + { + "epoch": 23.345004269854826, + "grad_norm": 7.807440757751465, + "learning_rate": 8.888347147732814e-05, + "loss": 2.984356689453125, + "step": 20520 + }, + { + "epoch": 23.35639054938799, + "grad_norm": 7.732547760009766, + "learning_rate": 8.882929736172057e-05, + "loss": 3.2712615966796874, + "step": 20530 + }, + { + "epoch": 23.36777682892115, + "grad_norm": 9.053909301757812, + "learning_rate": 8.877512324611301e-05, + "loss": 2.9967681884765627, + "step": 20540 + }, + { + "epoch": 23.379163108454314, + "grad_norm": 7.440546035766602, + "learning_rate": 8.872094913050545e-05, + "loss": 3.0605968475341796, + "step": 20550 + }, + { + "epoch": 23.390549387987477, + "grad_norm": 6.801850318908691, + "learning_rate": 8.866677501489789e-05, + "loss": 3.1235517501831054, + "step": 20560 + }, + { + "epoch": 23.401935667520636, + "grad_norm": 8.020711898803711, + "learning_rate": 8.861260089929033e-05, + "loss": 2.8574342727661133, + "step": 20570 + }, + { + "epoch": 23.4133219470538, + "grad_norm": 7.850024223327637, + "learning_rate": 8.855842678368276e-05, + "loss": 3.050294303894043, + "step": 20580 + }, + { + "epoch": 23.42470822658696, + "grad_norm": 7.764718532562256, + "learning_rate": 8.850425266807519e-05, + "loss": 3.0289308547973635, + "step": 20590 + }, + { + "epoch": 23.436094506120124, + "grad_norm": 6.614105701446533, + "learning_rate": 8.845007855246763e-05, + "loss": 2.7201135635375975, + "step": 20600 + }, + { + "epoch": 23.447480785653287, + "grad_norm": 7.509239673614502, + "learning_rate": 8.839590443686008e-05, + "loss": 3.0174678802490233, + "step": 20610 + }, + { + "epoch": 23.45886706518645, + "grad_norm": 7.190005302429199, + "learning_rate": 8.834173032125252e-05, + "loss": 2.841200828552246, + "step": 20620 + }, + { + "epoch": 23.470253344719612, + "grad_norm": 8.263400077819824, + "learning_rate": 8.828755620564495e-05, + "loss": 2.7141368865966795, + "step": 20630 + }, + { + "epoch": 23.481639624252775, + "grad_norm": 6.695593357086182, + "learning_rate": 8.823338209003739e-05, + "loss": 2.7828269958496095, + "step": 20640 + }, + { + "epoch": 23.493025903785938, + "grad_norm": 8.470293998718262, + "learning_rate": 8.817920797442982e-05, + "loss": 3.0112863540649415, + "step": 20650 + }, + { + "epoch": 23.5044121833191, + "grad_norm": 7.583410263061523, + "learning_rate": 8.812503385882225e-05, + "loss": 2.9472209930419924, + "step": 20660 + }, + { + "epoch": 23.515798462852263, + "grad_norm": 7.364217758178711, + "learning_rate": 8.807085974321469e-05, + "loss": 2.9927459716796876, + "step": 20670 + }, + { + "epoch": 23.527184742385426, + "grad_norm": 8.428694725036621, + "learning_rate": 8.801668562760713e-05, + "loss": 3.037489128112793, + "step": 20680 + }, + { + "epoch": 23.53857102191859, + "grad_norm": 8.37234115600586, + "learning_rate": 8.796251151199957e-05, + "loss": 2.8356271743774415, + "step": 20690 + }, + { + "epoch": 23.54995730145175, + "grad_norm": 7.239820957183838, + "learning_rate": 8.7908337396392e-05, + "loss": 2.9129606246948243, + "step": 20700 + }, + { + "epoch": 23.561343580984914, + "grad_norm": 7.503726959228516, + "learning_rate": 8.785416328078444e-05, + "loss": 2.9780012130737306, + "step": 20710 + }, + { + "epoch": 23.572729860518077, + "grad_norm": 6.911001205444336, + "learning_rate": 8.779998916517688e-05, + "loss": 2.829633331298828, + "step": 20720 + }, + { + "epoch": 23.58411614005124, + "grad_norm": 8.57630729675293, + "learning_rate": 8.774581504956932e-05, + "loss": 3.0809423446655275, + "step": 20730 + }, + { + "epoch": 23.5955024195844, + "grad_norm": 7.118391990661621, + "learning_rate": 8.769164093396176e-05, + "loss": 2.9418434143066405, + "step": 20740 + }, + { + "epoch": 23.60688869911756, + "grad_norm": 8.470577239990234, + "learning_rate": 8.76374668183542e-05, + "loss": 3.073319435119629, + "step": 20750 + }, + { + "epoch": 23.618274978650724, + "grad_norm": 7.737377643585205, + "learning_rate": 8.758329270274663e-05, + "loss": 2.973114013671875, + "step": 20760 + }, + { + "epoch": 23.629661258183887, + "grad_norm": 7.212495803833008, + "learning_rate": 8.752911858713907e-05, + "loss": 3.256830596923828, + "step": 20770 + }, + { + "epoch": 23.64104753771705, + "grad_norm": 8.123977661132812, + "learning_rate": 8.747494447153151e-05, + "loss": 2.8294878005981445, + "step": 20780 + }, + { + "epoch": 23.652433817250213, + "grad_norm": 8.355793952941895, + "learning_rate": 8.742077035592395e-05, + "loss": 2.869054412841797, + "step": 20790 + }, + { + "epoch": 23.663820096783375, + "grad_norm": 8.310219764709473, + "learning_rate": 8.736659624031637e-05, + "loss": 3.0746349334716796, + "step": 20800 + }, + { + "epoch": 23.675206376316538, + "grad_norm": 7.146482944488525, + "learning_rate": 8.731242212470881e-05, + "loss": 2.7864452362060548, + "step": 20810 + }, + { + "epoch": 23.6865926558497, + "grad_norm": 7.662703990936279, + "learning_rate": 8.725824800910125e-05, + "loss": 2.792169952392578, + "step": 20820 + }, + { + "epoch": 23.697978935382864, + "grad_norm": 7.96389627456665, + "learning_rate": 8.720407389349369e-05, + "loss": 3.1457258224487306, + "step": 20830 + }, + { + "epoch": 23.709365214916026, + "grad_norm": 6.582839012145996, + "learning_rate": 8.714989977788614e-05, + "loss": 2.910396385192871, + "step": 20840 + }, + { + "epoch": 23.72075149444919, + "grad_norm": 7.634960651397705, + "learning_rate": 8.709572566227858e-05, + "loss": 3.0249223709106445, + "step": 20850 + }, + { + "epoch": 23.732137773982352, + "grad_norm": 8.20224380493164, + "learning_rate": 8.7041551546671e-05, + "loss": 2.8246204376220705, + "step": 20860 + }, + { + "epoch": 23.743524053515515, + "grad_norm": 7.890283584594727, + "learning_rate": 8.698737743106344e-05, + "loss": 3.098486137390137, + "step": 20870 + }, + { + "epoch": 23.754910333048677, + "grad_norm": 7.693148612976074, + "learning_rate": 8.693320331545588e-05, + "loss": 2.9983917236328126, + "step": 20880 + }, + { + "epoch": 23.76629661258184, + "grad_norm": 7.275303363800049, + "learning_rate": 8.687902919984832e-05, + "loss": 3.028023529052734, + "step": 20890 + }, + { + "epoch": 23.777682892115003, + "grad_norm": 9.735662460327148, + "learning_rate": 8.682485508424075e-05, + "loss": 2.907429313659668, + "step": 20900 + }, + { + "epoch": 23.789069171648165, + "grad_norm": 7.711111545562744, + "learning_rate": 8.677068096863319e-05, + "loss": 2.9936874389648436, + "step": 20910 + }, + { + "epoch": 23.800455451181328, + "grad_norm": 7.945051193237305, + "learning_rate": 8.671650685302563e-05, + "loss": 3.211922836303711, + "step": 20920 + }, + { + "epoch": 23.811841730714487, + "grad_norm": 7.160228729248047, + "learning_rate": 8.666233273741807e-05, + "loss": 2.911302375793457, + "step": 20930 + }, + { + "epoch": 23.82322801024765, + "grad_norm": 7.800594329833984, + "learning_rate": 8.660815862181049e-05, + "loss": 3.017009735107422, + "step": 20940 + }, + { + "epoch": 23.834614289780813, + "grad_norm": 7.576048851013184, + "learning_rate": 8.655398450620294e-05, + "loss": 3.0092130661010743, + "step": 20950 + }, + { + "epoch": 23.846000569313976, + "grad_norm": 7.396543979644775, + "learning_rate": 8.649981039059538e-05, + "loss": 3.004311180114746, + "step": 20960 + }, + { + "epoch": 23.85738684884714, + "grad_norm": 7.172977447509766, + "learning_rate": 8.644563627498782e-05, + "loss": 2.650033187866211, + "step": 20970 + }, + { + "epoch": 23.8687731283803, + "grad_norm": 7.44117546081543, + "learning_rate": 8.639146215938026e-05, + "loss": 2.9172443389892577, + "step": 20980 + }, + { + "epoch": 23.880159407913464, + "grad_norm": 8.184832572937012, + "learning_rate": 8.63372880437727e-05, + "loss": 3.129201316833496, + "step": 20990 + }, + { + "epoch": 23.891545687446627, + "grad_norm": 7.225976467132568, + "learning_rate": 8.628311392816512e-05, + "loss": 2.9618593215942384, + "step": 21000 + }, + { + "epoch": 23.90293196697979, + "grad_norm": 7.948321342468262, + "learning_rate": 8.622893981255756e-05, + "loss": 3.0456144332885744, + "step": 21010 + }, + { + "epoch": 23.914318246512952, + "grad_norm": 8.000722885131836, + "learning_rate": 8.617476569695e-05, + "loss": 3.007844924926758, + "step": 21020 + }, + { + "epoch": 23.925704526046115, + "grad_norm": 7.9927215576171875, + "learning_rate": 8.612059158134243e-05, + "loss": 2.997426223754883, + "step": 21030 + }, + { + "epoch": 23.937090805579277, + "grad_norm": 9.083395957946777, + "learning_rate": 8.606641746573487e-05, + "loss": 3.4572025299072267, + "step": 21040 + }, + { + "epoch": 23.94847708511244, + "grad_norm": 7.858376502990723, + "learning_rate": 8.601224335012731e-05, + "loss": 2.95123291015625, + "step": 21050 + }, + { + "epoch": 23.959863364645603, + "grad_norm": 6.826962471008301, + "learning_rate": 8.595806923451976e-05, + "loss": 3.145720672607422, + "step": 21060 + }, + { + "epoch": 23.971249644178766, + "grad_norm": 8.091140747070312, + "learning_rate": 8.590389511891219e-05, + "loss": 2.7883310317993164, + "step": 21070 + }, + { + "epoch": 23.98263592371193, + "grad_norm": 7.429564952850342, + "learning_rate": 8.584972100330462e-05, + "loss": 3.2133480072021485, + "step": 21080 + }, + { + "epoch": 23.99402220324509, + "grad_norm": 6.603694438934326, + "learning_rate": 8.579554688769706e-05, + "loss": 2.7781740188598634, + "step": 21090 + } + ], + "logging_steps": 10, + "max_steps": 36918, + "num_input_tokens_seen": 0, + "num_train_epochs": 42, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.6981989206528e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}