{
  "best_global_step": 6878,
  "best_metric": 0.8412191271781921,
  "best_model_checkpoint": "outputs/vlm-age-rating-qwen25vl/checkpoint-6878",
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 6878,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.002907822041291073,
      "grad_norm": 4.79532527923584,
      "learning_rate": 8.695652173913044e-06,
      "loss": 2.4354520797729493,
      "step": 10
    },
    {
      "epoch": 0.005815644082582146,
      "grad_norm": 4.398916721343994,
      "learning_rate": 1.8357487922705315e-05,
      "loss": 2.348348045349121,
      "step": 20
    },
    {
      "epoch": 0.008723466123873218,
      "grad_norm": 3.8363561630249023,
      "learning_rate": 2.8019323671497587e-05,
      "loss": 2.2402442932128905,
      "step": 30
    },
    {
      "epoch": 0.011631288165164292,
      "grad_norm": 3.9996328353881836,
      "learning_rate": 3.7681159420289856e-05,
      "loss": 2.0806962966918947,
      "step": 40
    },
    {
      "epoch": 0.014539110206455364,
      "grad_norm": 3.623289108276367,
      "learning_rate": 4.7342995169082125e-05,
      "loss": 1.9209724426269532,
      "step": 50
    },
    {
      "epoch": 0.017446932247746436,
      "grad_norm": 3.6283390522003174,
      "learning_rate": 5.7004830917874394e-05,
      "loss": 1.8231483459472657,
      "step": 60
    },
    {
      "epoch": 0.020354754289037512,
      "grad_norm": 3.6183929443359375,
      "learning_rate": 6.666666666666667e-05,
      "loss": 1.7064151763916016,
      "step": 70
    },
    {
      "epoch": 0.023262576330328584,
      "grad_norm": 4.0795063972473145,
      "learning_rate": 7.632850241545893e-05,
      "loss": 1.6121023178100586,
      "step": 80
    },
    {
      "epoch": 0.026170398371619656,
      "grad_norm": 3.4447927474975586,
      "learning_rate": 8.599033816425122e-05,
      "loss": 1.6030376434326172,
      "step": 90
    },
    {
      "epoch": 0.02907822041291073,
      "grad_norm": 3.6108040809631348,
      "learning_rate": 9.565217391304348e-05,
      "loss": 1.5841608047485352,
      "step": 100
    },
    {
      "epoch": 0.0319860424542018,
      "grad_norm": 3.598888635635376,
      "learning_rate": 0.00010531400966183576,
      "loss": 1.4951010704040528,
      "step": 110
    },
    {
      "epoch": 0.03489386449549287,
      "grad_norm": 3.70912766456604,
      "learning_rate": 0.00011497584541062802,
      "loss": 1.482225227355957,
      "step": 120
    },
    {
      "epoch": 0.03780168653678395,
      "grad_norm": 3.4434823989868164,
      "learning_rate": 0.0001246376811594203,
      "loss": 1.5170270919799804,
      "step": 130
    },
    {
      "epoch": 0.040709508578075024,
      "grad_norm": 3.361293315887451,
      "learning_rate": 0.00013429951690821257,
      "loss": 1.4422802925109863,
      "step": 140
    },
    {
      "epoch": 0.043617330619366096,
      "grad_norm": 3.285003900527954,
      "learning_rate": 0.00014396135265700482,
      "loss": 1.4413420677185058,
      "step": 150
    },
    {
      "epoch": 0.04652515266065717,
      "grad_norm": 3.6356050968170166,
      "learning_rate": 0.0001536231884057971,
      "loss": 1.4118947982788086,
      "step": 160
    },
    {
      "epoch": 0.04943297470194824,
      "grad_norm": 3.8388562202453613,
      "learning_rate": 0.00016328502415458937,
      "loss": 1.4001873970031737,
      "step": 170
    },
    {
      "epoch": 0.05234079674323931,
      "grad_norm": 3.3433616161346436,
      "learning_rate": 0.00017294685990338165,
      "loss": 1.3969547271728515,
      "step": 180
    },
    {
      "epoch": 0.055248618784530384,
      "grad_norm": 3.4669270515441895,
      "learning_rate": 0.00018260869565217392,
      "loss": 1.337031650543213,
      "step": 190
    },
    {
      "epoch": 0.05815644082582146,
      "grad_norm": 3.1025021076202393,
      "learning_rate": 0.0001922705314009662,
      "loss": 1.435082244873047,
      "step": 200
    },
    {
      "epoch": 0.061064262867112536,
      "grad_norm": 3.3911843299865723,
      "learning_rate": 0.00019994003897466647,
      "loss": 1.392270278930664,
      "step": 210
    },
    {
      "epoch": 0.0639720849084036,
      "grad_norm": 3.142547130584717,
      "learning_rate": 0.0001996402338479988,
      "loss": 1.243046760559082,
      "step": 220
    },
    {
      "epoch": 0.06687990694969467,
      "grad_norm": 3.35446834564209,
      "learning_rate": 0.00019934042872133114,
      "loss": 1.367037868499756,
      "step": 230
    },
    {
      "epoch": 0.06978772899098575,
      "grad_norm": 3.458134651184082,
      "learning_rate": 0.0001990406235946635,
      "loss": 1.3419992446899414,
      "step": 240
    },
    {
      "epoch": 0.07269555103227683,
      "grad_norm": 3.352386474609375,
      "learning_rate": 0.0001987408184679958,
      "loss": 1.2986759185791015,
      "step": 250
    },
    {
      "epoch": 0.0756033730735679,
      "grad_norm": 3.1780903339385986,
      "learning_rate": 0.00019844101334132814,
      "loss": 1.2988534927368165,
      "step": 260
    },
    {
      "epoch": 0.07851119511485898,
      "grad_norm": 2.86576247215271,
      "learning_rate": 0.00019814120821466048,
      "loss": 1.3682787895202637,
      "step": 270
    },
    {
      "epoch": 0.08141901715615005,
      "grad_norm": 3.3476641178131104,
      "learning_rate": 0.0001978414030879928,
      "loss": 1.3652713775634766,
      "step": 280
    },
    {
      "epoch": 0.08432683919744112,
      "grad_norm": 3.2246811389923096,
      "learning_rate": 0.00019754159796132515,
      "loss": 1.2679337501525878,
      "step": 290
    },
    {
      "epoch": 0.08723466123873219,
      "grad_norm": 3.134486675262451,
      "learning_rate": 0.00019724179283465748,
      "loss": 1.211684513092041,
      "step": 300
    },
    {
      "epoch": 0.09014248328002326,
      "grad_norm": 3.142294406890869,
      "learning_rate": 0.00019694198770798982,
      "loss": 1.3083333015441894,
      "step": 310
    },
    {
      "epoch": 0.09305030532131434,
      "grad_norm": 3.2110912799835205,
      "learning_rate": 0.00019664218258132215,
      "loss": 1.2491901397705079,
      "step": 320
    },
    {
      "epoch": 0.09595812736260541,
      "grad_norm": 3.0253570079803467,
      "learning_rate": 0.00019634237745465449,
      "loss": 1.3311482429504395,
      "step": 330
    },
    {
      "epoch": 0.09886594940389648,
      "grad_norm": 2.7756271362304688,
      "learning_rate": 0.00019604257232798682,
      "loss": 1.2524259567260743,
      "step": 340
    },
    {
      "epoch": 0.10177377144518755,
      "grad_norm": 3.243698835372925,
      "learning_rate": 0.00019574276720131915,
      "loss": 1.293147087097168,
      "step": 350
    },
    {
      "epoch": 0.10468159348647862,
      "grad_norm": 2.8504862785339355,
      "learning_rate": 0.0001954429620746515,
      "loss": 1.2383965492248534,
      "step": 360
    },
    {
      "epoch": 0.1075894155277697,
      "grad_norm": 3.280348062515259,
      "learning_rate": 0.00019514315694798382,
      "loss": 1.2218168258666993,
      "step": 370
    },
    {
      "epoch": 0.11049723756906077,
      "grad_norm": 3.1593716144561768,
      "learning_rate": 0.00019484335182131616,
      "loss": 1.2068678855895996,
      "step": 380
    },
    {
      "epoch": 0.11340505961035184,
      "grad_norm": 3.1796255111694336,
      "learning_rate": 0.0001945435466946485,
      "loss": 1.2717905998229981,
      "step": 390
    },
    {
      "epoch": 0.11631288165164291,
      "grad_norm": 2.8659069538116455,
      "learning_rate": 0.00019424374156798083,
      "loss": 1.2112377166748047,
      "step": 400
    },
    {
      "epoch": 0.119220703692934,
      "grad_norm": 2.9140565395355225,
      "learning_rate": 0.00019394393644131313,
      "loss": 1.2414008140563966,
      "step": 410
    },
    {
      "epoch": 0.12212852573422507,
      "grad_norm": 3.0246622562408447,
      "learning_rate": 0.0001936441313146455,
      "loss": 1.2327194213867188,
      "step": 420
    },
    {
      "epoch": 0.12503634777551614,
      "grad_norm": 2.7067604064941406,
      "learning_rate": 0.00019334432618797783,
      "loss": 1.1890941619873048,
      "step": 430
    },
    {
      "epoch": 0.1279441698168072,
      "grad_norm": 3.0323450565338135,
      "learning_rate": 0.00019304452106131016,
      "loss": 1.2328312873840332,
      "step": 440
    },
    {
      "epoch": 0.1308519918580983,
      "grad_norm": 3.0487864017486572,
      "learning_rate": 0.0001927447159346425,
      "loss": 1.1881980895996094,
      "step": 450
    },
    {
      "epoch": 0.13375981389938935,
      "grad_norm": 3.0193893909454346,
      "learning_rate": 0.00019244491080797483,
      "loss": 1.2217585563659668,
      "step": 460
    },
    {
      "epoch": 0.13666763594068043,
      "grad_norm": 3.0121376514434814,
      "learning_rate": 0.00019214510568130717,
      "loss": 1.1492167472839356,
      "step": 470
    },
    {
      "epoch": 0.1395754579819715,
      "grad_norm": 2.9854133129119873,
      "learning_rate": 0.0001918453005546395,
      "loss": 1.2008344650268554,
      "step": 480
    },
    {
      "epoch": 0.14248328002326258,
      "grad_norm": 2.982191562652588,
      "learning_rate": 0.0001915454954279718,
      "loss": 1.1686691284179687,
      "step": 490
    },
    {
      "epoch": 0.14539110206455366,
      "grad_norm": 4.596578598022461,
      "learning_rate": 0.00019124569030130417,
      "loss": 1.2213269233703614,
      "step": 500
    },
    {
      "epoch": 0.14539110206455366,
      "eval_loss": 1.1948493719100952,
      "eval_runtime": 471.9433,
      "eval_samples_per_second": 7.287,
      "eval_steps_per_second": 7.287,
      "step": 500
    },
    {
      "epoch": 0.14829892410584472,
      "grad_norm": 2.8607027530670166,
      "learning_rate": 0.0001909458851746365,
      "loss": 1.2157353401184081,
      "step": 510
    },
    {
      "epoch": 0.1512067461471358,
      "grad_norm": 3.3106703758239746,
      "learning_rate": 0.0001906460800479688,
      "loss": 1.2436732292175292,
      "step": 520
    },
    {
      "epoch": 0.15411456818842686,
      "grad_norm": 2.8542511463165283,
      "learning_rate": 0.00019034627492130118,
      "loss": 1.2832223892211914,
      "step": 530
    },
    {
      "epoch": 0.15702239022971795,
      "grad_norm": 2.961954355239868,
      "learning_rate": 0.0001900464697946335,
      "loss": 1.214921474456787,
      "step": 540
    },
    {
      "epoch": 0.159930212271009,
      "grad_norm": 3.2760446071624756,
      "learning_rate": 0.00018974666466796582,
      "loss": 1.2085216522216797,
      "step": 550
    },
    {
      "epoch": 0.1628380343123001,
      "grad_norm": 3.117536783218384,
      "learning_rate": 0.00018944685954129818,
      "loss": 1.1909247398376466,
      "step": 560
    },
    {
      "epoch": 0.16574585635359115,
      "grad_norm": 3.1558895111083984,
      "learning_rate": 0.00018914705441463049,
      "loss": 1.2084429740905762,
      "step": 570
    },
    {
      "epoch": 0.16865367839488224,
      "grad_norm": 2.6539628505706787,
      "learning_rate": 0.00018884724928796282,
      "loss": 1.2048931121826172,
      "step": 580
    },
    {
      "epoch": 0.1715615004361733,
      "grad_norm": 2.4215893745422363,
      "learning_rate": 0.00018854744416129518,
      "loss": 1.1528879165649415,
      "step": 590
    },
    {
      "epoch": 0.17446932247746438,
      "grad_norm": 3.247122049331665,
      "learning_rate": 0.0001882476390346275,
      "loss": 1.1167863845825194,
      "step": 600
    },
    {
      "epoch": 0.17737714451875544,
      "grad_norm": 2.5841548442840576,
      "learning_rate": 0.00018794783390795982,
      "loss": 1.1510201454162599,
      "step": 610
    },
    {
      "epoch": 0.18028496656004653,
      "grad_norm": 2.8135194778442383,
      "learning_rate": 0.00018764802878129219,
      "loss": 1.1705083847045898,
      "step": 620
    },
    {
      "epoch": 0.1831927886013376,
      "grad_norm": 2.7703614234924316,
      "learning_rate": 0.0001873482236546245,
      "loss": 1.1498478889465331,
      "step": 630
    },
    {
      "epoch": 0.18610061064262867,
      "grad_norm": 2.933802843093872,
      "learning_rate": 0.00018704841852795685,
      "loss": 1.1263715744018554,
      "step": 640
    },
    {
      "epoch": 0.18900843268391973,
      "grad_norm": 2.8813698291778564,
      "learning_rate": 0.00018674861340128916,
      "loss": 1.0713846206665039,
      "step": 650
    },
    {
      "epoch": 0.19191625472521082,
      "grad_norm": 4.469653606414795,
      "learning_rate": 0.0001864488082746215,
      "loss": 1.1526763916015625,
      "step": 660
    },
    {
      "epoch": 0.1948240767665019,
      "grad_norm": 2.608485698699951,
      "learning_rate": 0.00018614900314795386,
      "loss": 1.1115928649902345,
      "step": 670
    },
    {
      "epoch": 0.19773189880779296,
      "grad_norm": 2.5391645431518555,
      "learning_rate": 0.00018584919802128617,
      "loss": 1.1768034934997558,
      "step": 680
    },
    {
      "epoch": 0.20063972084908405,
      "grad_norm": 2.8083078861236572,
      "learning_rate": 0.0001855493928946185,
      "loss": 1.1693111419677735,
      "step": 690
    },
    {
      "epoch": 0.2035475428903751,
      "grad_norm": 2.7247753143310547,
      "learning_rate": 0.00018524958776795086,
      "loss": 1.1793179512023926,
      "step": 700
    },
    {
      "epoch": 0.2064553649316662,
      "grad_norm": 2.5489275455474854,
      "learning_rate": 0.00018494978264128317,
      "loss": 1.153579044342041,
      "step": 710
    },
    {
      "epoch": 0.20936318697295725,
      "grad_norm": 2.848568916320801,
      "learning_rate": 0.0001846499775146155,
      "loss": 1.1616141319274902,
      "step": 720
    },
    {
      "epoch": 0.21227100901424834,
      "grad_norm": 2.6661322116851807,
      "learning_rate": 0.00018435017238794784,
      "loss": 1.185004997253418,
      "step": 730
    },
    {
      "epoch": 0.2151788310555394,
      "grad_norm": 2.6399037837982178,
      "learning_rate": 0.00018405036726128017,
      "loss": 1.1467523574829102,
      "step": 740
    },
    {
      "epoch": 0.21808665309683048,
      "grad_norm": 2.8910253047943115,
      "learning_rate": 0.0001837505621346125,
      "loss": 1.1423175811767579,
      "step": 750
    },
    {
      "epoch": 0.22099447513812154,
      "grad_norm": 2.6750359535217285,
      "learning_rate": 0.00018345075700794484,
      "loss": 1.1023540496826172,
      "step": 760
    },
    {
      "epoch": 0.22390229717941262,
      "grad_norm": 2.7184813022613525,
      "learning_rate": 0.00018315095188127718,
      "loss": 1.0655659675598144,
      "step": 770
    },
    {
      "epoch": 0.22681011922070368,
      "grad_norm": 2.352343797683716,
      "learning_rate": 0.0001828511467546095,
      "loss": 1.0686527252197267,
      "step": 780
    },
    {
      "epoch": 0.22971794126199477,
      "grad_norm": 2.9609851837158203,
      "learning_rate": 0.00018255134162794185,
      "loss": 1.0448270797729493,
      "step": 790
    },
    {
      "epoch": 0.23262576330328583,
      "grad_norm": 2.629925489425659,
      "learning_rate": 0.00018225153650127418,
      "loss": 1.2246409416198731,
      "step": 800
    },
    {
      "epoch": 0.2355335853445769,
      "grad_norm": 2.443338632583618,
      "learning_rate": 0.00018195173137460651,
      "loss": 1.0720739364624023,
      "step": 810
    },
    {
      "epoch": 0.238441407385868,
      "grad_norm": 2.612340211868286,
      "learning_rate": 0.00018165192624793885,
      "loss": 1.135690975189209,
      "step": 820
    },
    {
      "epoch": 0.24134922942715906,
      "grad_norm": 2.6910789012908936,
      "learning_rate": 0.00018135212112127118,
      "loss": 1.160903263092041,
      "step": 830
    },
    {
      "epoch": 0.24425705146845014,
      "grad_norm": 2.729325532913208,
      "learning_rate": 0.00018105231599460352,
      "loss": 1.1131114959716797,
      "step": 840
    },
    {
      "epoch": 0.2471648735097412,
      "grad_norm": 2.6346912384033203,
      "learning_rate": 0.00018075251086793585,
      "loss": 1.1584989547729492,
      "step": 850
    },
    {
      "epoch": 0.2500726955510323,
      "grad_norm": 2.4764983654022217,
      "learning_rate": 0.0001804527057412682,
      "loss": 1.1217921257019043,
      "step": 860
    },
    {
      "epoch": 0.2529805175923234,
      "grad_norm": 2.8104140758514404,
      "learning_rate": 0.00018015290061460052,
      "loss": 1.1900800704956054,
      "step": 870
    },
    {
      "epoch": 0.2558883396336144,
      "grad_norm": 2.6423113346099854,
      "learning_rate": 0.00017985309548793286,
      "loss": 1.1654984474182128,
      "step": 880
    },
    {
      "epoch": 0.2587961616749055,
      "grad_norm": 2.456171751022339,
      "learning_rate": 0.0001795532903612652,
      "loss": 1.1631184577941895,
      "step": 890
    },
    {
      "epoch": 0.2617039837161966,
      "grad_norm": 2.355860471725464,
      "learning_rate": 0.00017925348523459752,
      "loss": 1.1408035278320312,
      "step": 900
    },
    {
      "epoch": 0.26461180575748766,
      "grad_norm": 2.7640347480773926,
      "learning_rate": 0.00017895368010792986,
      "loss": 1.095857810974121,
      "step": 910
    },
    {
      "epoch": 0.2675196277987787,
      "grad_norm": 2.518118143081665,
      "learning_rate": 0.0001786538749812622,
      "loss": 1.1492120742797851,
      "step": 920
    },
    {
      "epoch": 0.2704274498400698,
      "grad_norm": 2.478942632675171,
      "learning_rate": 0.00017835406985459453,
      "loss": 1.095020580291748,
      "step": 930
    },
    {
      "epoch": 0.27333527188136086,
      "grad_norm": 2.5483644008636475,
      "learning_rate": 0.00017805426472792686,
      "loss": 1.1031085014343263,
      "step": 940
    },
    {
      "epoch": 0.27624309392265195,
      "grad_norm": 3.0256600379943848,
      "learning_rate": 0.00017775445960125917,
      "loss": 1.1728601455688477,
      "step": 950
    },
    {
      "epoch": 0.279150915963943,
      "grad_norm": 2.417307138442993,
      "learning_rate": 0.00017745465447459153,
      "loss": 1.1077165603637695,
      "step": 960
    },
    {
      "epoch": 0.28205873800523407,
      "grad_norm": 2.5772206783294678,
      "learning_rate": 0.00017715484934792387,
      "loss": 1.1050203323364258,
      "step": 970
    },
    {
      "epoch": 0.28496656004652515,
      "grad_norm": 2.4329328536987305,
      "learning_rate": 0.00017685504422125617,
      "loss": 1.0694819450378419,
      "step": 980
    },
    {
      "epoch": 0.28787438208781624,
      "grad_norm": 2.2022688388824463,
      "learning_rate": 0.00017655523909458854,
      "loss": 1.0653936386108398,
      "step": 990
    },
    {
      "epoch": 0.2907822041291073,
      "grad_norm": 2.5784685611724854,
      "learning_rate": 0.00017625543396792087,
      "loss": 1.1188321113586426,
      "step": 1000
    },
    {
      "epoch": 0.2907822041291073,
      "eval_loss": 1.088085651397705,
      "eval_runtime": 466.4025,
      "eval_samples_per_second": 7.373,
      "eval_steps_per_second": 7.373,
      "step": 1000
    },
    {
      "epoch": 0.29369002617039836,
      "grad_norm": 2.1797893047332764,
      "learning_rate": 0.00017595562884125318,
      "loss": 1.0641416549682616,
      "step": 1010
    },
    {
      "epoch": 0.29659784821168944,
      "grad_norm": 2.351658344268799,
      "learning_rate": 0.00017565582371458554,
      "loss": 1.0726565361022948,
      "step": 1020
    },
    {
      "epoch": 0.2995056702529805,
      "grad_norm": 2.5299222469329834,
      "learning_rate": 0.00017535601858791785,
      "loss": 1.110872459411621,
      "step": 1030
    },
    {
      "epoch": 0.3024134922942716,
      "grad_norm": 2.492405652999878,
      "learning_rate": 0.00017505621346125018,
      "loss": 1.0856511116027832,
      "step": 1040
    },
    {
      "epoch": 0.30532131433556264,
      "grad_norm": 2.490410089492798,
      "learning_rate": 0.00017475640833458254,
      "loss": 1.053286075592041,
      "step": 1050
    },
    {
      "epoch": 0.30822913637685373,
      "grad_norm": 2.451176404953003,
      "learning_rate": 0.00017445660320791485,
      "loss": 1.1099212646484375,
      "step": 1060
    },
    {
      "epoch": 0.3111369584181448,
      "grad_norm": 2.2699105739593506,
      "learning_rate": 0.0001741567980812472,
      "loss": 0.9613182067871093,
      "step": 1070
    },
    {
      "epoch": 0.3140447804594359,
      "grad_norm": 2.1652886867523193,
      "learning_rate": 0.00017385699295457955,
      "loss": 1.0921488761901856,
      "step": 1080
    },
    {
      "epoch": 0.31695260250072693,
      "grad_norm": 2.385770797729492,
      "learning_rate": 0.00017355718782791185,
      "loss": 1.1041201591491698,
      "step": 1090
    },
    {
      "epoch": 0.319860424542018,
      "grad_norm": 2.5070347785949707,
      "learning_rate": 0.00017325738270124421,
      "loss": 1.0626919746398926,
      "step": 1100
    },
    {
      "epoch": 0.3227682465833091,
      "grad_norm": 2.5580434799194336,
      "learning_rate": 0.00017295757757457652,
      "loss": 1.0355568885803224,
      "step": 1110
    },
    {
      "epoch": 0.3256760686246002,
      "grad_norm": 2.285900592803955,
      "learning_rate": 0.00017265777244790886,
      "loss": 1.094072151184082,
      "step": 1120
    },
    {
      "epoch": 0.3285838906658912,
      "grad_norm": 2.2862985134124756,
      "learning_rate": 0.00017235796732124122,
      "loss": 1.1321526527404786,
      "step": 1130
    },
    {
      "epoch": 0.3314917127071823,
      "grad_norm": 2.3698503971099854,
      "learning_rate": 0.00017205816219457353,
      "loss": 1.1155936241149902,
      "step": 1140
    },
    {
      "epoch": 0.3343995347484734,
      "grad_norm": 2.2256312370300293,
      "learning_rate": 0.00017175835706790586,
      "loss": 1.0627290725708007,
      "step": 1150
    },
    {
      "epoch": 0.3373073567897645,
      "grad_norm": 2.3896291255950928,
      "learning_rate": 0.00017145855194123822,
      "loss": 1.0547872543334962,
      "step": 1160
    },
    {
      "epoch": 0.34021517883105556,
      "grad_norm": 2.1930463314056396,
      "learning_rate": 0.00017115874681457053,
      "loss": 1.0243175506591797,
      "step": 1170
    },
    {
      "epoch": 0.3431230008723466,
      "grad_norm": 2.0768635272979736,
      "learning_rate": 0.00017085894168790286,
      "loss": 1.1152023315429687,
      "step": 1180
    },
    {
      "epoch": 0.3460308229136377,
      "grad_norm": 2.179349422454834,
      "learning_rate": 0.0001705591365612352,
      "loss": 1.1170848846435546,
      "step": 1190
    },
    {
      "epoch": 0.34893864495492877,
      "grad_norm": 2.2244808673858643,
      "learning_rate": 0.00017025933143456753,
      "loss": 0.964411735534668,
      "step": 1200
    },
    {
      "epoch": 0.35184646699621985,
      "grad_norm": 2.39132022857666,
      "learning_rate": 0.00016995952630789987,
      "loss": 1.0051309585571289,
      "step": 1210
    },
    {
      "epoch": 0.3547542890375109,
      "grad_norm": 2.1408185958862305,
      "learning_rate": 0.0001696597211812322,
      "loss": 1.1022598266601562,
      "step": 1220
    },
    {
      "epoch": 0.35766211107880197,
      "grad_norm": 2.3732504844665527,
      "learning_rate": 0.00016935991605456454,
      "loss": 1.0292579650878906,
      "step": 1230
    },
    {
      "epoch": 0.36056993312009306,
      "grad_norm": 2.5366053581237793,
      "learning_rate": 0.00016906011092789687,
      "loss": 1.0912357330322267,
      "step": 1240
    },
    {
      "epoch": 0.36347775516138414,
      "grad_norm": 2.2400059700012207,
      "learning_rate": 0.0001687603058012292,
      "loss": 1.033323383331299,
      "step": 1250
    },
    {
      "epoch": 0.3663855772026752,
      "grad_norm": 2.2703261375427246,
      "learning_rate": 0.00016846050067456154,
      "loss": 1.0109487533569337,
      "step": 1260
    },
    {
      "epoch": 0.36929339924396626,
      "grad_norm": 2.280935764312744,
      "learning_rate": 0.00016816069554789387,
      "loss": 1.0813608169555664,
      "step": 1270
    },
    {
      "epoch": 0.37220122128525734,
      "grad_norm": 2.168682098388672,
      "learning_rate": 0.0001678608904212262,
      "loss": 1.0790273666381835,
      "step": 1280
    },
    {
      "epoch": 0.37510904332654843,
      "grad_norm": 2.1177940368652344,
      "learning_rate": 0.00016756108529455854,
      "loss": 1.031337356567383,
      "step": 1290
    },
    {
      "epoch": 0.37801686536783946,
      "grad_norm": 2.4730405807495117,
      "learning_rate": 0.00016726128016789088,
      "loss": 1.129223918914795,
      "step": 1300
    },
    {
      "epoch": 0.38092468740913055,
      "grad_norm": 2.095201015472412,
      "learning_rate": 0.0001669614750412232,
      "loss": 1.097676658630371,
      "step": 1310
    },
    {
      "epoch": 0.38383250945042163,
      "grad_norm": 5.1267242431640625,
      "learning_rate": 0.00016666166991455555,
      "loss": 1.0067487716674806,
      "step": 1320
    },
    {
      "epoch": 0.3867403314917127,
      "grad_norm": 2.3142173290252686,
      "learning_rate": 0.00016636186478788788,
      "loss": 1.0111748695373535,
      "step": 1330
    },
    {
      "epoch": 0.3896481535330038,
      "grad_norm": 2.5803937911987305,
      "learning_rate": 0.00016606205966122022,
      "loss": 1.0414213180541991,
      "step": 1340
    },
    {
      "epoch": 0.39255597557429484,
      "grad_norm": 2.057889223098755,
      "learning_rate": 0.00016576225453455255,
      "loss": 1.0709516525268554,
      "step": 1350
    },
    {
      "epoch": 0.3954637976155859,
      "grad_norm": 2.221109628677368,
      "learning_rate": 0.00016546244940788488,
      "loss": 0.9829123497009278,
      "step": 1360
    },
    {
      "epoch": 0.398371619656877,
      "grad_norm": 2.2963709831237793,
      "learning_rate": 0.00016516264428121722,
      "loss": 1.0530911445617677,
      "step": 1370
    },
    {
      "epoch": 0.4012794416981681,
      "grad_norm": 2.225609064102173,
      "learning_rate": 0.00016486283915454955,
      "loss": 0.9976913452148437,
      "step": 1380
    },
    {
      "epoch": 0.4041872637394591,
      "grad_norm": 2.186084032058716,
      "learning_rate": 0.0001645630340278819,
      "loss": 0.9746930122375488,
      "step": 1390
    },
    {
      "epoch": 0.4070950857807502,
      "grad_norm": 2.3998475074768066,
      "learning_rate": 0.00016426322890121422,
      "loss": 0.995127010345459,
      "step": 1400
    },
    {
      "epoch": 0.4100029078220413,
      "grad_norm": 2.2103660106658936,
      "learning_rate": 0.00016396342377454653,
      "loss": 0.9968692779541015,
      "step": 1410
    },
    {
      "epoch": 0.4129107298633324,
      "grad_norm": 2.228457450866699,
      "learning_rate": 0.0001636636186478789,
      "loss": 1.0345491409301757,
      "step": 1420
    },
    {
      "epoch": 0.4158185519046234,
      "grad_norm": 2.2369489669799805,
      "learning_rate": 0.00016336381352121123,
      "loss": 1.0916296005249024,
      "step": 1430
    },
    {
      "epoch": 0.4187263739459145,
      "grad_norm": 2.4540905952453613,
      "learning_rate": 0.00016306400839454353,
      "loss": 1.0946255683898927,
      "step": 1440
    },
    {
      "epoch": 0.4216341959872056,
      "grad_norm": 2.272212505340576,
      "learning_rate": 0.0001627642032678759,
      "loss": 1.0748573303222657,
      "step": 1450
    },
    {
      "epoch": 0.42454201802849667,
      "grad_norm": 2.1080758571624756,
      "learning_rate": 0.00016246439814120823,
      "loss": 1.0456165313720702,
      "step": 1460
    },
    {
      "epoch": 0.42744984006978776,
      "grad_norm": 2.1176912784576416,
      "learning_rate": 0.00016216459301454056,
      "loss": 0.9982593536376954,
      "step": 1470
    },
    {
      "epoch": 0.4303576621110788,
      "grad_norm": 2.0624117851257324,
      "learning_rate": 0.0001618647878878729,
      "loss": 1.067337417602539,
      "step": 1480
    },
    {
      "epoch": 0.4332654841523699,
      "grad_norm": 2.2433207035064697,
      "learning_rate": 0.0001615649827612052,
      "loss": 1.1099421501159668,
      "step": 1490
    },
    {
      "epoch": 0.43617330619366096,
      "grad_norm": 2.0459365844726562,
      "learning_rate": 0.00016126517763453757,
      "loss": 1.0837010383605956,
      "step": 1500
    },
    {
      "epoch": 0.43617330619366096,
      "eval_loss": 1.0330508947372437,
      "eval_runtime": 467.9014,
      "eval_samples_per_second": 7.35,
      "eval_steps_per_second": 7.35,
      "step": 1500
    },
    {
      "epoch": 0.43908112823495205,
      "grad_norm": 1.9743572473526,
      "learning_rate": 0.0001609653725078699,
      "loss": 0.9442779541015625,
      "step": 1510
    },
    {
      "epoch": 0.4419889502762431,
      "grad_norm": 2.259799003601074,
      "learning_rate": 0.0001606655673812022,
      "loss": 1.0716489791870116,
      "step": 1520
    },
    {
      "epoch": 0.44489677231753416,
      "grad_norm": 2.1605937480926514,
      "learning_rate": 0.00016036576225453457,
      "loss": 1.0564517974853516,
      "step": 1530
    },
    {
      "epoch": 0.44780459435882525,
      "grad_norm": 2.2314250469207764,
      "learning_rate": 0.0001600659571278669,
      "loss": 1.0048583030700684,
      "step": 1540
    },
    {
      "epoch": 0.45071241640011633,
      "grad_norm": 2.099571943283081,
      "learning_rate": 0.0001597661520011992,
      "loss": 1.0006441116333007,
      "step": 1550
    },
    {
      "epoch": 0.45362023844140736,
      "grad_norm": 1.9105018377304077,
      "learning_rate": 0.00015946634687453157,
      "loss": 0.9975058555603027,
      "step": 1560
    },
    {
      "epoch": 0.45652806048269845,
      "grad_norm": 2.057384729385376,
      "learning_rate": 0.00015916654174786388,
      "loss": 1.087107276916504,
      "step": 1570
    },
    {
      "epoch": 0.45943588252398954,
      "grad_norm": 1.9773719310760498,
      "learning_rate": 0.00015886673662119622,
      "loss": 1.0378877639770507,
      "step": 1580
    },
    {
      "epoch": 0.4623437045652806,
      "grad_norm": 2.122605323791504,
      "learning_rate": 0.00015856693149452858,
      "loss": 1.0330044746398925,
      "step": 1590
    },
    {
      "epoch": 0.46525152660657165,
      "grad_norm": 2.409942388534546,
      "learning_rate": 0.00015826712636786089,
      "loss": 1.0367450714111328,
      "step": 1600
    },
    {
      "epoch": 0.46815934864786274,
      "grad_norm": 2.112682342529297,
      "learning_rate": 0.00015796732124119322,
      "loss": 1.0601228713989257,
      "step": 1610
    },
    {
      "epoch": 0.4710671706891538,
      "grad_norm": 2.8438005447387695,
      "learning_rate": 0.00015766751611452558,
      "loss": 1.013437271118164,
      "step": 1620
    },
    {
      "epoch": 0.4739749927304449,
      "grad_norm": 2.1778664588928223,
      "learning_rate": 0.0001573677109878579,
      "loss": 0.9827444076538085,
      "step": 1630
    },
    {
      "epoch": 0.476882814771736,
      "grad_norm": 2.261019468307495,
      "learning_rate": 0.00015706790586119022,
      "loss": 1.019342803955078,
      "step": 1640
    },
    {
      "epoch": 0.479790636813027,
      "grad_norm": 2.01497483253479,
      "learning_rate": 0.00015676810073452258,
      "loss": 0.9160719871520996,
      "step": 1650
    },
    {
      "epoch": 0.4826984588543181,
      "grad_norm": 2.2591161727905273,
      "learning_rate": 0.0001564682956078549,
      "loss": 0.9794828414916992,
      "step": 1660
    },
    {
      "epoch": 0.4856062808956092,
      "grad_norm": 2.303205966949463,
      "learning_rate": 0.00015616849048118725,
      "loss": 1.070561408996582,
      "step": 1670
    },
    {
      "epoch": 0.4885141029369003,
      "grad_norm": 2.075136661529541,
      "learning_rate": 0.00015586868535451956,
      "loss": 1.0001187324523926,
      "step": 1680
    },
    {
      "epoch": 0.4914219249781913,
      "grad_norm": 2.0339841842651367,
      "learning_rate": 0.0001555688802278519,
      "loss": 1.042809009552002,
      "step": 1690
    },
    {
      "epoch": 0.4943297470194824,
      "grad_norm": 2.23016357421875,
      "learning_rate": 0.00015526907510118426,
      "loss": 1.0348269462585449,
      "step": 1700
    },
    {
      "epoch": 0.4972375690607735,
      "grad_norm": 2.620288372039795,
      "learning_rate": 0.00015496926997451656,
      "loss": 0.9250534057617188,
      "step": 1710
    },
    {
      "epoch": 0.5001453911020646,
      "grad_norm": 2.0876739025115967,
      "learning_rate": 0.0001546694648478489,
      "loss": 1.0507851600646974,
      "step": 1720
    },
    {
      "epoch": 0.5030532131433556,
      "grad_norm": 2.0192840099334717,
      "learning_rate": 0.00015436965972118126,
      "loss": 0.9346181869506835,
      "step": 1730
    },
    {
      "epoch": 0.5059610351846467,
      "grad_norm": 2.198378086090088,
      "learning_rate": 0.00015406985459451357,
      "loss": 1.008394145965576,
      "step": 1740
    },
    {
      "epoch": 0.5088688572259378,
      "grad_norm": 2.028977394104004,
      "learning_rate": 0.0001537700494678459,
      "loss": 1.0137055397033692,
      "step": 1750
    },
    {
      "epoch": 0.5117766792672288,
      "grad_norm": 2.091458320617676,
      "learning_rate": 0.00015347024434117824,
      "loss": 0.9982250213623047,
      "step": 1760
    },
    {
      "epoch": 0.51468450130852,
      "grad_norm": 1.993230938911438,
      "learning_rate": 0.00015317043921451057,
      "loss": 0.9881864547729492,
      "step": 1770
    },
    {
      "epoch": 0.517592323349811,
      "grad_norm": 1.9999781847000122,
      "learning_rate": 0.0001528706340878429,
      "loss": 0.9195417404174805,
      "step": 1780
    },
    {
      "epoch": 0.520500145391102,
      "grad_norm": 2.0336172580718994,
      "learning_rate": 0.00015257082896117524,
      "loss": 1.040501880645752,
      "step": 1790
    },
    {
      "epoch": 0.5234079674323932,
      "grad_norm": 2.3573317527770996,
      "learning_rate": 0.00015227102383450758,
      "loss": 1.0388559341430663,
      "step": 1800
    },
    {
      "epoch": 0.5263157894736842,
      "grad_norm": 2.2667224407196045,
      "learning_rate": 0.0001519712187078399,
      "loss": 1.0280171394348145,
      "step": 1810
    },
    {
      "epoch": 0.5292236115149753,
      "grad_norm": 2.0019094944000244,
      "learning_rate": 0.00015167141358117224,
      "loss": 1.054083251953125,
      "step": 1820
    },
    {
      "epoch": 0.5321314335562664,
      "grad_norm": 2.1444737911224365,
      "learning_rate": 0.00015137160845450458,
      "loss": 1.0175910949707032,
      "step": 1830
    },
    {
      "epoch": 0.5350392555975574,
      "grad_norm": 2.2335095405578613,
      "learning_rate": 0.0001510718033278369,
      "loss": 1.0406922340393066,
      "step": 1840
    },
    {
      "epoch": 0.5379470776388485,
      "grad_norm": 1.984129786491394,
      "learning_rate": 0.00015077199820116925,
      "loss": 0.9368982315063477,
      "step": 1850
    },
    {
      "epoch": 0.5408548996801396,
      "grad_norm": 1.9685107469558716,
      "learning_rate": 0.00015047219307450158,
      "loss": 0.9616594314575195,
      "step": 1860
    },
    {
      "epoch": 0.5437627217214307,
      "grad_norm": 1.945898413658142,
      "learning_rate": 0.00015017238794783392,
      "loss": 0.8819991111755371,
      "step": 1870
    },
    {
      "epoch": 0.5466705437627217,
      "grad_norm": 1.999085783958435,
      "learning_rate": 0.00014987258282116625,
      "loss": 0.9666972160339355,
      "step": 1880
    },
    {
      "epoch": 0.5495783658040128,
      "grad_norm": 2.3765146732330322,
      "learning_rate": 0.00014957277769449859,
      "loss": 0.959471607208252,
      "step": 1890
    },
    {
      "epoch": 0.5524861878453039,
      "grad_norm": 2.3136610984802246,
      "learning_rate": 0.00014927297256783092,
      "loss": 1.027439785003662,
      "step": 1900
    },
    {
      "epoch": 0.5553940098865949,
      "grad_norm": 2.017866373062134,
      "learning_rate": 0.00014897316744116325,
      "loss": 0.9374051094055176,
      "step": 1910
    },
    {
      "epoch": 0.558301831927886,
      "grad_norm": 2.01604962348938,
      "learning_rate": 0.0001486733623144956,
      "loss": 0.9679468154907227,
      "step": 1920
    },
    {
      "epoch": 0.5612096539691771,
      "grad_norm": 2.088538646697998,
      "learning_rate": 0.00014837355718782792,
      "loss": 1.0370773315429687,
      "step": 1930
    },
    {
      "epoch": 0.5641174760104681,
      "grad_norm": 2.233410120010376,
      "learning_rate": 0.00014807375206116026,
      "loss": 1.0147683143615722,
      "step": 1940
    },
    {
      "epoch": 0.5670252980517593,
      "grad_norm": 2.323624610900879,
      "learning_rate": 0.0001477739469344926,
      "loss": 0.8868412017822266,
      "step": 1950
    },
    {
      "epoch": 0.5699331200930503,
      "grad_norm": 2.008387327194214,
      "learning_rate": 0.00014747414180782493,
      "loss": 1.042736530303955,
      "step": 1960
    },
    {
      "epoch": 0.5728409421343413,
      "grad_norm": 2.069681406021118,
      "learning_rate": 0.00014717433668115726,
      "loss": 1.0332537651062013,
      "step": 1970
    },
    {
      "epoch": 0.5757487641756325,
      "grad_norm": 2.0033297538757324,
      "learning_rate": 0.00014687453155448957,
      "loss": 1.033948040008545,
      "step": 1980
    },
    {
      "epoch": 0.5786565862169235,
      "grad_norm": 2.320786237716675,
      "learning_rate": 0.00014657472642782193,
      "loss": 0.9264739036560059,
      "step": 1990
    },
    {
      "epoch": 0.5815644082582146,
      "grad_norm": 2.1512808799743652,
      "learning_rate": 0.00014627492130115427,
      "loss": 0.9679861068725586,
      "step": 2000
    },
    {
      "epoch": 0.5815644082582146,
      "eval_loss": 0.9911443591117859,
      "eval_runtime": 466.7402,
      "eval_samples_per_second": 7.368,
      "eval_steps_per_second": 7.368,
      "step": 2000
    },
    {
      "epoch": 0.5844722302995057,
      "grad_norm": 2.1983020305633545,
      "learning_rate": 0.00014597511617448657,
      "loss": 0.9210161209106446,
      "step": 2010
    },
    {
      "epoch": 0.5873800523407967,
      "grad_norm": 2.4881699085235596,
      "learning_rate": 0.00014567531104781893,
      "loss": 0.9766405105590821,
      "step": 2020
    },
    {
      "epoch": 0.5902878743820879,
      "grad_norm": 2.123229503631592,
      "learning_rate": 0.00014537550592115127,
      "loss": 0.9692230224609375,
      "step": 2030
    },
    {
      "epoch": 0.5931956964233789,
      "grad_norm": 2.089709758758545,
      "learning_rate": 0.00014507570079448358,
      "loss": 0.9490997314453125,
      "step": 2040
    },
    {
      "epoch": 0.5961035184646699,
      "grad_norm": 2.2401466369628906,
      "learning_rate": 0.00014477589566781594,
      "loss": 1.010519027709961,
      "step": 2050
    },
    {
      "epoch": 0.599011340505961,
      "grad_norm": 1.967027187347412,
      "learning_rate": 0.00014447609054114825,
      "loss": 0.9748605728149414,
      "step": 2060
    },
    {
      "epoch": 0.6019191625472521,
      "grad_norm": 2.350550889968872,
      "learning_rate": 0.0001441762854144806,
      "loss": 1.0137447357177733,
      "step": 2070
    },
    {
      "epoch": 0.6048269845885432,
      "grad_norm": 2.527585744857788,
      "learning_rate": 0.00014387648028781294,
      "loss": 0.9678420066833496,
      "step": 2080
    },
    {
      "epoch": 0.6077348066298343,
      "grad_norm": 2.1940855979919434,
      "learning_rate": 0.00014357667516114525,
      "loss": 0.9550240516662598,
      "step": 2090
    },
    {
      "epoch": 0.6106426286711253,
      "grad_norm": 1.9863537549972534,
      "learning_rate": 0.0001432768700344776,
      "loss": 0.941413688659668,
      "step": 2100
    },
    {
      "epoch": 0.6135504507124164,
      "grad_norm": 2.2011659145355225,
      "learning_rate": 0.00014297706490780994,
      "loss": 0.947486686706543,
      "step": 2110
    },
    {
      "epoch": 0.6164582727537075,
      "grad_norm": 2.1515135765075684,
      "learning_rate": 0.00014267725978114225,
      "loss": 0.9121341705322266,
      "step": 2120
    },
    {
      "epoch": 0.6193660947949986,
      "grad_norm": 2.002952814102173,
      "learning_rate": 0.00014237745465447461,
      "loss": 0.9708009719848633,
      "step": 2130
    },
    {
      "epoch": 0.6222739168362896,
      "grad_norm": 1.953120231628418,
      "learning_rate": 0.00014207764952780692,
      "loss": 0.9927172660827637,
      "step": 2140
    },
    {
      "epoch": 0.6251817388775807,
      "grad_norm": 1.9617403745651245,
      "learning_rate": 0.00014177784440113926,
      "loss": 0.9382636070251464,
      "step": 2150
    },
    {
      "epoch": 0.6280895609188718,
      "grad_norm": 2.2497262954711914,
      "learning_rate": 0.00014147803927447162,
      "loss": 0.9852985382080078,
      "step": 2160
    },
    {
      "epoch": 0.6309973829601628,
      "grad_norm": 2.1998257637023926,
      "learning_rate": 0.00014117823414780392,
      "loss": 1.0225582122802734,
      "step": 2170
    },
    {
      "epoch": 0.6339052050014539,
      "grad_norm": 1.931475281715393,
      "learning_rate": 0.00014087842902113626,
      "loss": 0.951665210723877,
      "step": 2180
    },
    {
      "epoch": 0.636813027042745,
      "grad_norm": 2.1101760864257812,
      "learning_rate": 0.00014057862389446862,
      "loss": 0.9361721038818359,
      "step": 2190
    },
    {
      "epoch": 0.639720849084036,
      "grad_norm": 2.0846989154815674,
      "learning_rate": 0.00014027881876780093,
      "loss": 0.9642247200012207,
      "step": 2200
    },
    {
      "epoch": 0.6426286711253272,
      "grad_norm": 2.221968650817871,
      "learning_rate": 0.00013997901364113326,
      "loss": 0.9930216789245605,
      "step": 2210
    },
    {
      "epoch": 0.6455364931666182,
      "grad_norm": 1.9288430213928223,
      "learning_rate": 0.0001396792085144656,
      "loss": 0.934235668182373,
      "step": 2220
    },
    {
      "epoch": 0.6484443152079092,
      "grad_norm": 1.9532197713851929,
      "learning_rate": 0.00013937940338779793,
      "loss": 0.9925549507141114,
      "step": 2230
    },
    {
      "epoch": 0.6513521372492004,
      "grad_norm": 1.9111248254776,
      "learning_rate": 0.00013907959826113027,
      "loss": 0.9969470024108886,
      "step": 2240
    },
    {
      "epoch": 0.6542599592904914,
      "grad_norm": 2.285212278366089,
      "learning_rate": 0.0001387797931344626,
      "loss": 0.9836911201477051,
      "step": 2250
    },
    {
      "epoch": 0.6571677813317824,
      "grad_norm": 2.1522858142852783,
      "learning_rate": 0.00013847998800779494,
      "loss": 0.9622378349304199,
      "step": 2260
    },
    {
      "epoch": 0.6600756033730736,
      "grad_norm": 2.1318211555480957,
      "learning_rate": 0.0001381801828811273,
      "loss": 0.9809438705444335,
      "step": 2270
    },
    {
      "epoch": 0.6629834254143646,
      "grad_norm": 2.0285794734954834,
      "learning_rate": 0.0001378803777544596,
      "loss": 1.0077838897705078,
      "step": 2280
    },
    {
      "epoch": 0.6658912474556558,
      "grad_norm": 1.9738699197769165,
      "learning_rate": 0.00013758057262779194,
      "loss": 0.9947647094726563,
      "step": 2290
    },
    {
      "epoch": 0.6687990694969468,
      "grad_norm": 2.0177745819091797,
      "learning_rate": 0.00013728076750112427,
      "loss": 1.000431442260742,
      "step": 2300
    },
    {
      "epoch": 0.6717068915382378,
      "grad_norm": 2.348642110824585,
      "learning_rate": 0.0001369809623744566,
      "loss": 1.0012446403503419,
      "step": 2310
    },
    {
      "epoch": 0.674614713579529,
      "grad_norm": 2.1052684783935547,
      "learning_rate": 0.00013668115724778894,
      "loss": 1.0000034332275392,
      "step": 2320
    },
    {
      "epoch": 0.67752253562082,
      "grad_norm": 2.2087337970733643,
      "learning_rate": 0.00013638135212112128,
      "loss": 0.946139907836914,
      "step": 2330
    },
    {
      "epoch": 0.6804303576621111,
      "grad_norm": 2.344235420227051,
      "learning_rate": 0.0001360815469944536,
      "loss": 0.9389586448669434,
      "step": 2340
    },
    {
      "epoch": 0.6833381797034022,
      "grad_norm": 1.8961925506591797,
      "learning_rate": 0.00013578174186778595,
      "loss": 0.9666190147399902,
      "step": 2350
    },
    {
      "epoch": 0.6862460017446932,
      "grad_norm": 2.05000901222229,
      "learning_rate": 0.00013548193674111828,
      "loss": 0.9825595855712891,
      "step": 2360
    },
    {
      "epoch": 0.6891538237859843,
      "grad_norm": 2.0229718685150146,
      "learning_rate": 0.00013518213161445061,
      "loss": 1.0510747909545899,
      "step": 2370
    },
    {
      "epoch": 0.6920616458272754,
      "grad_norm": 1.9893312454223633,
      "learning_rate": 0.00013488232648778295,
      "loss": 0.954042911529541,
      "step": 2380
    },
    {
      "epoch": 0.6949694678685664,
      "grad_norm": 1.9798680543899536,
      "learning_rate": 0.00013458252136111528,
      "loss": 0.9377657890319824,
      "step": 2390
    },
    {
      "epoch": 0.6978772899098575,
      "grad_norm": 2.1432321071624756,
      "learning_rate": 0.00013428271623444762,
      "loss": 0.9713227272033691,
      "step": 2400
    },
    {
      "epoch": 0.7007851119511486,
      "grad_norm": 2.111288070678711,
      "learning_rate": 0.00013398291110777995,
      "loss": 0.9865769386291504,
      "step": 2410
    },
    {
      "epoch": 0.7036929339924397,
      "grad_norm": 1.9824166297912598,
      "learning_rate": 0.0001336831059811123,
      "loss": 0.9995452880859375,
      "step": 2420
    },
    {
      "epoch": 0.7066007560337307,
      "grad_norm": 1.9680949449539185,
      "learning_rate": 0.00013338330085444462,
      "loss": 0.909880256652832,
      "step": 2430
    },
    {
      "epoch": 0.7095085780750218,
      "grad_norm": 2.0280842781066895,
      "learning_rate": 0.00013308349572777693,
      "loss": 0.8853329658508301,
      "step": 2440
    },
    {
      "epoch": 0.7124164001163129,
      "grad_norm": 1.9791338443756104,
      "learning_rate": 0.0001327836906011093,
      "loss": 1.0020163536071778,
      "step": 2450
    },
    {
      "epoch": 0.7153242221576039,
      "grad_norm": 2.158463954925537,
      "learning_rate": 0.00013248388547444163,
      "loss": 0.915585708618164,
      "step": 2460
    },
    {
      "epoch": 0.7182320441988951,
      "grad_norm": 2.0322635173797607,
      "learning_rate": 0.00013218408034777396,
      "loss": 0.9437061309814453,
      "step": 2470
    },
    {
      "epoch": 0.7211398662401861,
      "grad_norm": 2.1616227626800537,
      "learning_rate": 0.0001318842752211063,
      "loss": 0.9552411079406739,
      "step": 2480
    },
    {
      "epoch": 0.7240476882814771,
      "grad_norm": 1.9678977727890015,
      "learning_rate": 0.00013158447009443863,
      "loss": 0.9155937194824219,
      "step": 2490
    },
    {
      "epoch": 0.7269555103227683,
      "grad_norm": 1.7541477680206299,
      "learning_rate": 0.00013128466496777096,
      "loss": 0.9028853416442871,
      "step": 2500
    },
    {
      "epoch": 0.7269555103227683,
      "eval_loss": 0.9591814279556274,
      "eval_runtime": 473.0242,
      "eval_samples_per_second": 7.27,
      "eval_steps_per_second": 7.27,
      "step": 2500
    },
    {
      "epoch": 0.7298633323640593,
      "grad_norm": 2.0649845600128174,
      "learning_rate": 0.0001309848598411033,
      "loss": 0.9972336769104004,
      "step": 2510
    },
    {
      "epoch": 0.7327711544053503,
      "grad_norm": 1.8874973058700562,
      "learning_rate": 0.0001306850547144356,
      "loss": 0.9446205139160156,
      "step": 2520
    },
    {
      "epoch": 0.7356789764466415,
      "grad_norm": 2.4050185680389404,
      "learning_rate": 0.00013038524958776797,
      "loss": 1.0329988479614258,
      "step": 2530
    },
    {
      "epoch": 0.7385867984879325,
      "grad_norm": 2.0193111896514893,
      "learning_rate": 0.0001300854444611003,
      "loss": 0.910318660736084,
      "step": 2540
    },
    {
      "epoch": 0.7414946205292237,
      "grad_norm": 2.376051902770996,
      "learning_rate": 0.0001297856393344326,
      "loss": 1.0247613906860351,
      "step": 2550
    },
    {
      "epoch": 0.7444024425705147,
      "grad_norm": 2.1105611324310303,
      "learning_rate": 0.00012948583420776497,
      "loss": 1.016015625,
      "step": 2560
    },
    {
      "epoch": 0.7473102646118057,
      "grad_norm": 2.1081364154815674,
      "learning_rate": 0.0001291860290810973,
      "loss": 0.909546184539795,
      "step": 2570
    },
    {
      "epoch": 0.7502180866530969,
      "grad_norm": 2.046095132827759,
      "learning_rate": 0.0001288862239544296,
      "loss": 0.9204464912414551,
      "step": 2580
    },
    {
      "epoch": 0.7531259086943879,
      "grad_norm": 1.9615705013275146,
      "learning_rate": 0.00012858641882776197,
      "loss": 0.9793522834777832,
      "step": 2590
    },
    {
      "epoch": 0.7560337307356789,
      "grad_norm": 1.9187848567962646,
      "learning_rate": 0.00012828661370109428,
      "loss": 0.9414368629455566,
      "step": 2600
    },
    {
      "epoch": 0.7589415527769701,
      "grad_norm": 2.0125985145568848,
      "learning_rate": 0.00012798680857442662,
      "loss": 1.0450970649719238,
      "step": 2610
    },
    {
      "epoch": 0.7618493748182611,
      "grad_norm": 2.2543137073516846,
      "learning_rate": 0.00012768700344775898,
      "loss": 1.0270273208618164,
      "step": 2620
    },
    {
      "epoch": 0.7647571968595522,
      "grad_norm": 2.00054669380188,
      "learning_rate": 0.00012738719832109128,
      "loss": 1.0244030952453613,
      "step": 2630
    },
    {
      "epoch": 0.7676650189008433,
      "grad_norm": 1.8571268320083618,
      "learning_rate": 0.00012708739319442362,
      "loss": 0.9479835510253907,
      "step": 2640
    },
    {
      "epoch": 0.7705728409421343,
      "grad_norm": 2.0844404697418213,
      "learning_rate": 0.00012678758806775598,
      "loss": 0.8867239952087402,
      "step": 2650
    },
    {
      "epoch": 0.7734806629834254,
      "grad_norm": 2.023630142211914,
      "learning_rate": 0.0001264877829410883,
      "loss": 0.9323092460632324,
      "step": 2660
    },
    {
      "epoch": 0.7763884850247165,
      "grad_norm": 1.935514211654663,
      "learning_rate": 0.00012618797781442065,
      "loss": 0.997464942932129,
      "step": 2670
    },
    {
      "epoch": 0.7792963070660076,
      "grad_norm": 2.1317570209503174,
      "learning_rate": 0.00012588817268775296,
      "loss": 0.9762091636657715,
      "step": 2680
    },
    {
      "epoch": 0.7822041291072986,
      "grad_norm": 2.094515323638916,
      "learning_rate": 0.0001255883675610853,
      "loss": 0.919368839263916,
      "step": 2690
    },
    {
      "epoch": 0.7851119511485897,
      "grad_norm": 2.0645945072174072,
      "learning_rate": 0.00012528856243441765,
      "loss": 0.917721176147461,
      "step": 2700
    },
    {
      "epoch": 0.7880197731898808,
      "grad_norm": 2.178105592727661,
      "learning_rate": 0.00012498875730774996,
      "loss": 0.9504012107849121,
      "step": 2710
    },
    {
      "epoch": 0.7909275952311718,
      "grad_norm": 2.3576605319976807,
      "learning_rate": 0.0001246889521810823,
      "loss": 1.0032535552978517,
      "step": 2720
    },
    {
      "epoch": 0.7938354172724629,
      "grad_norm": 1.9736145734786987,
      "learning_rate": 0.00012438914705441466,
      "loss": 0.9342514991760253,
      "step": 2730
    },
    {
      "epoch": 0.796743239313754,
      "grad_norm": 2.074565887451172,
      "learning_rate": 0.00012408934192774696,
      "loss": 0.951146125793457,
      "step": 2740
    },
    {
      "epoch": 0.799651061355045,
      "grad_norm": 2.1545727252960205,
      "learning_rate": 0.0001237895368010793,
      "loss": 0.9272260665893555,
      "step": 2750
    },
    {
      "epoch": 0.8025588833963362,
      "grad_norm": 2.0214531421661377,
      "learning_rate": 0.00012348973167441163,
      "loss": 0.9915397644042969,
      "step": 2760
    },
    {
      "epoch": 0.8054667054376272,
      "grad_norm": 2.0707271099090576,
      "learning_rate": 0.00012318992654774397,
      "loss": 0.9817545890808106,
      "step": 2770
    },
    {
      "epoch": 0.8083745274789182,
      "grad_norm": 1.9729819297790527,
      "learning_rate": 0.0001228901214210763,
      "loss": 0.9666746139526368,
      "step": 2780
    },
    {
      "epoch": 0.8112823495202094,
      "grad_norm": 2.0239417552948,
      "learning_rate": 0.00012259031629440864,
      "loss": 0.9174615859985351,
      "step": 2790
    },
    {
      "epoch": 0.8141901715615004,
      "grad_norm": 2.0677332878112793,
      "learning_rate": 0.00012229051116774097,
      "loss": 0.9489200592041016,
      "step": 2800
    },
    {
      "epoch": 0.8170979936027916,
      "grad_norm": 2.0116071701049805,
      "learning_rate": 0.0001219907060410733,
      "loss": 0.8491521835327148,
      "step": 2810
    },
    {
      "epoch": 0.8200058156440826,
      "grad_norm": 2.005683422088623,
      "learning_rate": 0.00012169090091440565,
      "loss": 0.8922024726867676,
      "step": 2820
    },
    {
      "epoch": 0.8229136376853736,
      "grad_norm": 2.201763868331909,
      "learning_rate": 0.00012139109578773797,
      "loss": 0.9258706092834472,
      "step": 2830
    },
    {
      "epoch": 0.8258214597266648,
      "grad_norm": 1.9857138395309448,
      "learning_rate": 0.0001210912906610703,
      "loss": 0.9566280364990234,
      "step": 2840
    },
    {
      "epoch": 0.8287292817679558,
      "grad_norm": 2.107966899871826,
      "learning_rate": 0.00012079148553440264,
      "loss": 0.924662971496582,
      "step": 2850
    },
    {
      "epoch": 0.8316371038092468,
      "grad_norm": 2.1448616981506348,
      "learning_rate": 0.00012049168040773498,
      "loss": 0.9880090713500976,
      "step": 2860
    },
    {
      "epoch": 0.834544925850538,
      "grad_norm": 2.1623342037200928,
      "learning_rate": 0.0001201918752810673,
      "loss": 0.9296030044555664,
      "step": 2870
    },
    {
      "epoch": 0.837452747891829,
      "grad_norm": 2.149792194366455,
      "learning_rate": 0.00011989207015439965,
      "loss": 0.8926197052001953,
      "step": 2880
    },
    {
      "epoch": 0.8403605699331201,
      "grad_norm": 1.7601749897003174,
      "learning_rate": 0.00011959226502773198,
      "loss": 0.937494945526123,
      "step": 2890
    },
    {
      "epoch": 0.8432683919744112,
      "grad_norm": 1.8277206420898438,
      "learning_rate": 0.00011929245990106433,
      "loss": 0.9348894119262695,
      "step": 2900
    },
    {
      "epoch": 0.8461762140157022,
      "grad_norm": 2.417834520339966,
      "learning_rate": 0.00011899265477439665,
      "loss": 0.9014430046081543,
      "step": 2910
    },
    {
      "epoch": 0.8490840360569933,
      "grad_norm": 1.9275659322738647,
      "learning_rate": 0.00011869284964772897,
      "loss": 0.904062557220459,
      "step": 2920
    },
    {
      "epoch": 0.8519918580982844,
      "grad_norm": 1.8992774486541748,
      "learning_rate": 0.00011839304452106132,
      "loss": 0.9319318771362305,
      "step": 2930
    },
    {
      "epoch": 0.8548996801395755,
      "grad_norm": 1.7830348014831543,
      "learning_rate": 0.00011809323939439365,
      "loss": 0.9675899505615234,
      "step": 2940
    },
    {
      "epoch": 0.8578075021808665,
      "grad_norm": 1.9002785682678223,
      "learning_rate": 0.00011779343426772598,
      "loss": 0.9884864807128906,
      "step": 2950
    },
    {
      "epoch": 0.8607153242221576,
      "grad_norm": 2.256084442138672,
      "learning_rate": 0.00011749362914105832,
      "loss": 0.9169980049133301,
      "step": 2960
    },
    {
      "epoch": 0.8636231462634487,
      "grad_norm": 1.977953553199768,
      "learning_rate": 0.00011719382401439066,
      "loss": 0.9288657188415528,
      "step": 2970
    },
    {
      "epoch": 0.8665309683047397,
      "grad_norm": 2.223893404006958,
      "learning_rate": 0.00011689401888772298,
      "loss": 0.8921388626098633,
      "step": 2980
    },
    {
      "epoch": 0.8694387903460308,
      "grad_norm": 1.8232818841934204,
      "learning_rate": 0.00011659421376105533,
      "loss": 0.9274946212768554,
      "step": 2990
    },
    {
      "epoch": 0.8723466123873219,
      "grad_norm": 2.229280471801758,
      "learning_rate": 0.00011629440863438765,
      "loss": 0.9193262100219727,
      "step": 3000
    },
    {
      "epoch": 0.8723466123873219,
      "eval_loss": 0.9303778409957886,
      "eval_runtime": 469.5057,
      "eval_samples_per_second": 7.325,
      "eval_steps_per_second": 7.325,
      "step": 3000
    },
    {
      "epoch": 0.875254434428613,
      "grad_norm": 1.7871650457382202,
      "learning_rate": 0.00011599460350771998,
      "loss": 1.0009597778320312,
      "step": 3010
    },
    {
      "epoch": 0.8781622564699041,
      "grad_norm": 1.9837925434112549,
      "learning_rate": 0.00011569479838105233,
      "loss": 0.9461586952209473,
      "step": 3020
    },
    {
      "epoch": 0.8810700785111951,
      "grad_norm": 2.1220059394836426,
      "learning_rate": 0.00011539499325438465,
      "loss": 0.8776027679443359,
      "step": 3030
    },
    {
      "epoch": 0.8839779005524862,
      "grad_norm": 2.0386621952056885,
      "learning_rate": 0.00011509518812771699,
      "loss": 0.8935223579406738,
      "step": 3040
    },
    {
      "epoch": 0.8868857225937773,
      "grad_norm": 2.0495152473449707,
      "learning_rate": 0.00011479538300104933,
      "loss": 0.9414477348327637,
      "step": 3050
    },
    {
      "epoch": 0.8897935446350683,
      "grad_norm": 2.1676785945892334,
      "learning_rate": 0.00011449557787438165,
      "loss": 0.9668932914733886,
      "step": 3060
    },
    {
      "epoch": 0.8927013666763594,
      "grad_norm": 1.8518208265304565,
      "learning_rate": 0.00011419577274771398,
      "loss": 0.83909912109375,
      "step": 3070
    },
    {
      "epoch": 0.8956091887176505,
      "grad_norm": 2.6842968463897705,
      "learning_rate": 0.00011389596762104632,
      "loss": 1.001734161376953,
      "step": 3080
    },
    {
      "epoch": 0.8985170107589415,
      "grad_norm": 2.093907594680786,
      "learning_rate": 0.00011359616249437866,
      "loss": 0.9194964408874512,
      "step": 3090
    },
    {
      "epoch": 0.9014248328002327,
      "grad_norm": 1.7096366882324219,
      "learning_rate": 0.000113296357367711,
      "loss": 0.8888837814331054,
      "step": 3100
    },
    {
      "epoch": 0.9043326548415237,
      "grad_norm": 2.0421509742736816,
      "learning_rate": 0.00011299655224104333,
      "loss": 0.9483534812927246,
      "step": 3110
    },
    {
      "epoch": 0.9072404768828147,
      "grad_norm": 2.220430612564087,
      "learning_rate": 0.00011269674711437566,
      "loss": 0.855626106262207,
      "step": 3120
    },
    {
      "epoch": 0.9101482989241059,
      "grad_norm": 2.721151351928711,
      "learning_rate": 0.00011239694198770801,
      "loss": 0.9696638107299804,
      "step": 3130
    },
    {
      "epoch": 0.9130561209653969,
      "grad_norm": 2.0278444290161133,
      "learning_rate": 0.00011209713686104033,
      "loss": 0.9626810073852539,
      "step": 3140
    },
    {
      "epoch": 0.915963943006688,
      "grad_norm": 1.9929981231689453,
      "learning_rate": 0.00011179733173437265,
      "loss": 0.9121280670166015,
      "step": 3150
    },
    {
      "epoch": 0.9188717650479791,
      "grad_norm": 1.8903363943099976,
      "learning_rate": 0.000111497526607705,
      "loss": 0.8926810264587403,
      "step": 3160
    },
    {
      "epoch": 0.9217795870892701,
      "grad_norm": 1.9016224145889282,
      "learning_rate": 0.00011119772148103733,
      "loss": 0.9811866760253907,
      "step": 3170
    },
    {
      "epoch": 0.9246874091305612,
      "grad_norm": 1.7538851499557495,
      "learning_rate": 0.00011089791635436966,
      "loss": 0.9020210266113281,
      "step": 3180
    },
    {
      "epoch": 0.9275952311718523,
      "grad_norm": 1.8952163457870483,
      "learning_rate": 0.000110598111227702,
      "loss": 0.9186564445495605,
      "step": 3190
    },
    {
      "epoch": 0.9305030532131433,
      "grad_norm": 2.0162456035614014,
      "learning_rate": 0.00011029830610103434,
      "loss": 0.9249841690063476,
      "step": 3200
    },
    {
      "epoch": 0.9334108752544344,
      "grad_norm": 1.9944981336593628,
      "learning_rate": 0.00010999850097436666,
      "loss": 0.8584202766418457,
      "step": 3210
    },
    {
      "epoch": 0.9363186972957255,
      "grad_norm": 1.9651786088943481,
      "learning_rate": 0.000109698695847699,
      "loss": 0.8806270599365235,
      "step": 3220
    },
    {
      "epoch": 0.9392265193370166,
      "grad_norm": 1.7801518440246582,
      "learning_rate": 0.00010939889072103133,
      "loss": 0.9265996932983398,
      "step": 3230
    },
    {
      "epoch": 0.9421343413783076,
      "grad_norm": 2.065995454788208,
      "learning_rate": 0.00010909908559436366,
      "loss": 0.9499953269958497,
      "step": 3240
    },
    {
      "epoch": 0.9450421634195987,
      "grad_norm": 1.797984004020691,
      "learning_rate": 0.00010879928046769601,
      "loss": 0.8817252159118653,
      "step": 3250
    },
    {
      "epoch": 0.9479499854608898,
      "grad_norm": 1.7669072151184082,
      "learning_rate": 0.00010849947534102833,
      "loss": 0.9481887817382812,
      "step": 3260
    },
    {
      "epoch": 0.9508578075021809,
      "grad_norm": 2.1424496173858643,
      "learning_rate": 0.00010819967021436067,
      "loss": 0.9072481155395508,
      "step": 3270
    },
    {
      "epoch": 0.953765629543472,
      "grad_norm": 2.006564140319824,
      "learning_rate": 0.00010789986508769301,
      "loss": 0.9786685943603516,
      "step": 3280
    },
    {
      "epoch": 0.956673451584763,
      "grad_norm": 1.8808116912841797,
      "learning_rate": 0.00010760005996102533,
      "loss": 0.9131714820861816,
      "step": 3290
    },
    {
      "epoch": 0.959581273626054,
      "grad_norm": 1.8156124353408813,
      "learning_rate": 0.00010730025483435768,
      "loss": 0.9058806419372558,
      "step": 3300
    },
    {
      "epoch": 0.9624890956673452,
      "grad_norm": 2.1244983673095703,
      "learning_rate": 0.00010700044970769,
      "loss": 0.8985400199890137,
      "step": 3310
    },
    {
      "epoch": 0.9653969177086362,
      "grad_norm": 1.979709506034851,
      "learning_rate": 0.00010670064458102234,
      "loss": 0.8887692451477051,
      "step": 3320
    },
    {
      "epoch": 0.9683047397499273,
      "grad_norm": 1.8038535118103027,
      "learning_rate": 0.00010640083945435469,
      "loss": 0.9474887847900391,
      "step": 3330
    },
    {
      "epoch": 0.9712125617912184,
      "grad_norm": 1.883305549621582,
      "learning_rate": 0.00010610103432768701,
      "loss": 0.8624442100524903,
      "step": 3340
    },
    {
      "epoch": 0.9741203838325094,
      "grad_norm": 1.9134957790374756,
      "learning_rate": 0.00010580122920101934,
      "loss": 0.8966679573059082,
      "step": 3350
    },
    {
      "epoch": 0.9770282058738006,
      "grad_norm": 1.8334492444992065,
      "learning_rate": 0.00010550142407435169,
      "loss": 0.8644783020019531,
      "step": 3360
    },
    {
      "epoch": 0.9799360279150916,
      "grad_norm": 1.8988165855407715,
      "learning_rate": 0.00010520161894768401,
      "loss": 0.8418782234191895,
      "step": 3370
    },
    {
      "epoch": 0.9828438499563826,
      "grad_norm": 2.0219290256500244,
      "learning_rate": 0.00010490181382101633,
      "loss": 0.9593500137329102,
      "step": 3380
    },
    {
      "epoch": 0.9857516719976738,
      "grad_norm": 1.9475761651992798,
      "learning_rate": 0.00010460200869434868,
      "loss": 0.9046772956848145,
      "step": 3390
    },
    {
      "epoch": 0.9886594940389648,
      "grad_norm": 2.156895637512207,
      "learning_rate": 0.00010430220356768101,
      "loss": 0.9402565956115723,
      "step": 3400
    },
    {
      "epoch": 0.9915673160802558,
      "grad_norm": 1.8304574489593506,
      "learning_rate": 0.00010400239844101334,
      "loss": 0.9041817665100098,
      "step": 3410
    },
    {
      "epoch": 0.994475138121547,
      "grad_norm": 2.0752060413360596,
      "learning_rate": 0.00010370259331434568,
      "loss": 0.8676543235778809,
      "step": 3420
    },
    {
      "epoch": 0.997382960162838,
      "grad_norm": 1.845677375793457,
      "learning_rate": 0.00010340278818767802,
      "loss": 0.8220011711120605,
      "step": 3430
    },
    {
      "epoch": 1.0002907822041291,
      "grad_norm": 1.7879046201705933,
      "learning_rate": 0.00010310298306101034,
      "loss": 0.9136262893676758,
      "step": 3440
    },
    {
      "epoch": 1.0031986042454202,
      "grad_norm": 1.9771437644958496,
      "learning_rate": 0.00010280317793434269,
      "loss": 0.7697383880615234,
      "step": 3450
    },
    {
      "epoch": 1.0061064262867112,
      "grad_norm": 2.23533296585083,
      "learning_rate": 0.00010250337280767501,
      "loss": 0.8118080139160156,
      "step": 3460
    },
    {
      "epoch": 1.0090142483280022,
      "grad_norm": 1.9873441457748413,
      "learning_rate": 0.00010220356768100734,
      "loss": 0.7812850475311279,
      "step": 3470
    },
    {
      "epoch": 1.0119220703692935,
      "grad_norm": 1.8505840301513672,
      "learning_rate": 0.00010190376255433969,
      "loss": 0.8191473960876465,
      "step": 3480
    },
    {
      "epoch": 1.0148298924105845,
      "grad_norm": 2.0484778881073,
      "learning_rate": 0.00010160395742767201,
      "loss": 0.8035045623779297,
      "step": 3490
    },
    {
      "epoch": 1.0177377144518756,
      "grad_norm": 1.9655097723007202,
      "learning_rate": 0.00010130415230100436,
      "loss": 0.779402208328247,
      "step": 3500
    },
    {
      "epoch": 1.0177377144518756,
      "eval_loss": 0.9158970713615417,
      "eval_runtime": 474.9541,
      "eval_samples_per_second": 7.241,
      "eval_steps_per_second": 7.241,
      "step": 3500
    },
    {
      "epoch": 1.0206455364931666,
      "grad_norm": 2.0883100032806396,
      "learning_rate": 0.0001010043471743367,
      "loss": 0.8522077560424804,
      "step": 3510
    },
    {
      "epoch": 1.0235533585344576,
      "grad_norm": 2.098511219024658,
      "learning_rate": 0.00010070454204766901,
      "loss": 0.8504983901977539,
      "step": 3520
    },
    {
      "epoch": 1.0264611805757489,
      "grad_norm": 2.101414442062378,
      "learning_rate": 0.00010040473692100136,
      "loss": 0.7838150978088378,
      "step": 3530
    },
    {
      "epoch": 1.02936900261704,
      "grad_norm": 2.2732677459716797,
      "learning_rate": 0.00010010493179433368,
      "loss": 0.7565544605255127,
      "step": 3540
    },
    {
      "epoch": 1.032276824658331,
      "grad_norm": 1.695768117904663,
      "learning_rate": 9.980512666766603e-05,
      "loss": 0.7418520450592041,
      "step": 3550
    },
    {
      "epoch": 1.035184646699622,
      "grad_norm": 2.0946264266967773,
      "learning_rate": 9.950532154099835e-05,
      "loss": 0.820562744140625,
      "step": 3560
    },
    {
      "epoch": 1.038092468740913,
      "grad_norm": 2.103559732437134,
      "learning_rate": 9.920551641433069e-05,
      "loss": 0.769464635848999,
      "step": 3570
    },
    {
      "epoch": 1.041000290782204,
      "grad_norm": 1.9463759660720825,
      "learning_rate": 9.890571128766302e-05,
      "loss": 0.7375322818756104,
      "step": 3580
    },
    {
      "epoch": 1.0439081128234953,
      "grad_norm": 2.0933845043182373,
      "learning_rate": 9.860590616099536e-05,
      "loss": 0.7640562534332276,
      "step": 3590
    },
    {
      "epoch": 1.0468159348647863,
      "grad_norm": 1.9825164079666138,
      "learning_rate": 9.830610103432769e-05,
      "loss": 0.7955950736999512,
      "step": 3600
    },
    {
      "epoch": 1.0497237569060773,
      "grad_norm": 1.9863218069076538,
      "learning_rate": 9.800629590766003e-05,
      "loss": 0.7707761764526367,
      "step": 3610
    },
    {
      "epoch": 1.0526315789473684,
      "grad_norm": 1.9356553554534912,
      "learning_rate": 9.770649078099236e-05,
      "loss": 0.7836559772491455,
      "step": 3620
    },
    {
      "epoch": 1.0555394009886594,
      "grad_norm": 2.0146119594573975,
      "learning_rate": 9.74066856543247e-05,
      "loss": 0.7523280143737793,
      "step": 3630
    },
    {
      "epoch": 1.0584472230299506,
      "grad_norm": 2.056215763092041,
      "learning_rate": 9.710688052765703e-05,
      "loss": 0.8155484199523926,
      "step": 3640
    },
    {
      "epoch": 1.0613550450712417,
      "grad_norm": 1.8928337097167969,
      "learning_rate": 9.680707540098936e-05,
      "loss": 0.8006336212158203,
      "step": 3650
    },
    {
      "epoch": 1.0642628671125327,
      "grad_norm": 1.931458592414856,
      "learning_rate": 9.65072702743217e-05,
      "loss": 0.7413979530334472,
      "step": 3660
    },
    {
      "epoch": 1.0671706891538237,
      "grad_norm": 2.162804126739502,
      "learning_rate": 9.620746514765403e-05,
      "loss": 0.7883041381835938,
      "step": 3670
    },
    {
      "epoch": 1.0700785111951148,
      "grad_norm": 2.1737864017486572,
      "learning_rate": 9.590766002098637e-05,
      "loss": 0.7595384597778321,
      "step": 3680
    },
    {
      "epoch": 1.072986333236406,
      "grad_norm": 2.1503849029541016,
      "learning_rate": 9.560785489431869e-05,
      "loss": 0.8216732025146485,
      "step": 3690
    },
    {
      "epoch": 1.075894155277697,
      "grad_norm": 1.9446245431900024,
      "learning_rate": 9.530804976765104e-05,
      "loss": 0.8016197204589843,
      "step": 3700
    },
    {
      "epoch": 1.078801977318988,
      "grad_norm": 1.8525793552398682,
      "learning_rate": 9.500824464098337e-05,
      "loss": 0.8090981483459473,
      "step": 3710
    },
    {
      "epoch": 1.0817097993602791,
      "grad_norm": 1.8631248474121094,
      "learning_rate": 9.470843951431569e-05,
      "loss": 0.7200882434844971,
      "step": 3720
    },
    {
      "epoch": 1.0846176214015701,
      "grad_norm": 2.06455135345459,
      "learning_rate": 9.440863438764803e-05,
      "loss": 0.7492884635925293,
      "step": 3730
    },
    {
      "epoch": 1.0875254434428614,
      "grad_norm": 2.1832897663116455,
      "learning_rate": 9.410882926098037e-05,
      "loss": 0.8133129119873047,
      "step": 3740
    },
    {
      "epoch": 1.0904332654841524,
      "grad_norm": 1.9299908876419067,
      "learning_rate": 9.380902413431271e-05,
      "loss": 0.7218931674957275,
      "step": 3750
    },
    {
      "epoch": 1.0933410875254435,
      "grad_norm": 1.8345733880996704,
      "learning_rate": 9.350921900764503e-05,
      "loss": 0.8038671493530274,
      "step": 3760
    },
    {
      "epoch": 1.0962489095667345,
      "grad_norm": 2.200580596923828,
      "learning_rate": 9.320941388097736e-05,
      "loss": 0.758363676071167,
      "step": 3770
    },
    {
      "epoch": 1.0991567316080255,
      "grad_norm": 1.9002389907836914,
      "learning_rate": 9.290960875430971e-05,
      "loss": 0.7319043636322021,
      "step": 3780
    },
    {
      "epoch": 1.1020645536493165,
      "grad_norm": 2.178980827331543,
      "learning_rate": 9.260980362764203e-05,
      "loss": 0.8144195556640625,
      "step": 3790
    },
    {
      "epoch": 1.1049723756906078,
      "grad_norm": 1.777524709701538,
      "learning_rate": 9.230999850097437e-05,
      "loss": 0.7428917407989502,
      "step": 3800
    },
    {
      "epoch": 1.1078801977318988,
      "grad_norm": 2.269186019897461,
      "learning_rate": 9.20101933743067e-05,
      "loss": 0.7581167221069336,
      "step": 3810
    },
    {
      "epoch": 1.1107880197731899,
      "grad_norm": 2.0214948654174805,
      "learning_rate": 9.171038824763904e-05,
      "loss": 0.786471939086914,
      "step": 3820
    },
    {
      "epoch": 1.113695841814481,
      "grad_norm": 1.8476779460906982,
      "learning_rate": 9.141058312097137e-05,
      "loss": 0.7075447559356689,
      "step": 3830
    },
    {
      "epoch": 1.1166036638557721,
      "grad_norm": 1.7384737730026245,
      "learning_rate": 9.11107779943037e-05,
      "loss": 0.7290010452270508,
      "step": 3840
    },
    {
      "epoch": 1.1195114858970632,
      "grad_norm": 2.1066110134124756,
      "learning_rate": 9.081097286763604e-05,
      "loss": 0.7164917469024659,
      "step": 3850
    },
    {
      "epoch": 1.1224193079383542,
      "grad_norm": 2.2718801498413086,
      "learning_rate": 9.051116774096837e-05,
      "loss": 0.7832975864410401,
      "step": 3860
    },
    {
      "epoch": 1.1253271299796452,
      "grad_norm": 2.0165557861328125,
      "learning_rate": 9.021136261430071e-05,
      "loss": 0.840385913848877,
      "step": 3870
    },
    {
      "epoch": 1.1282349520209363,
      "grad_norm": 1.9852454662322998,
      "learning_rate": 8.991155748763304e-05,
      "loss": 0.8062166213989258,
      "step": 3880
    },
    {
      "epoch": 1.1311427740622273,
      "grad_norm": 1.8891923427581787,
      "learning_rate": 8.961175236096538e-05,
      "loss": 0.860920524597168,
      "step": 3890
    },
    {
      "epoch": 1.1340505961035185,
      "grad_norm": 2.2840018272399902,
      "learning_rate": 8.931194723429771e-05,
      "loss": 0.8350888252258301,
      "step": 3900
    },
    {
      "epoch": 1.1369584181448096,
      "grad_norm": 1.9931566715240479,
      "learning_rate": 8.901214210763005e-05,
      "loss": 0.7368578910827637,
      "step": 3910
    },
    {
      "epoch": 1.1398662401861006,
      "grad_norm": 2.0133721828460693,
      "learning_rate": 8.871233698096237e-05,
      "loss": 0.8287955284118652,
      "step": 3920
    },
    {
      "epoch": 1.1427740622273916,
      "grad_norm": 2.1664857864379883,
      "learning_rate": 8.841253185429472e-05,
      "loss": 0.7541079044342041,
      "step": 3930
    },
    {
      "epoch": 1.1456818842686827,
      "grad_norm": 2.1204898357391357,
      "learning_rate": 8.811272672762705e-05,
      "loss": 0.7905386447906494,
      "step": 3940
    },
    {
      "epoch": 1.148589706309974,
      "grad_norm": 1.8248870372772217,
      "learning_rate": 8.781292160095938e-05,
      "loss": 0.7721247673034668,
      "step": 3950
    },
    {
      "epoch": 1.151497528351265,
      "grad_norm": 1.8345229625701904,
      "learning_rate": 8.75131164742917e-05,
      "loss": 0.7757219314575196,
      "step": 3960
    },
    {
      "epoch": 1.154405350392556,
      "grad_norm": 1.6888972520828247,
      "learning_rate": 8.721331134762405e-05,
      "loss": 0.7593977451324463,
      "step": 3970
    },
    {
      "epoch": 1.157313172433847,
      "grad_norm": 1.9662666320800781,
      "learning_rate": 8.691350622095639e-05,
      "loss": 0.7545000076293945,
      "step": 3980
    },
    {
      "epoch": 1.160220994475138,
      "grad_norm": 1.987597107887268,
      "learning_rate": 8.661370109428871e-05,
      "loss": 0.7677037239074707,
      "step": 3990
    },
    {
      "epoch": 1.163128816516429,
      "grad_norm": 2.0462353229522705,
      "learning_rate": 8.631389596762104e-05,
      "loss": 0.7688228130340576,
      "step": 4000
    },
    {
      "epoch": 1.163128816516429,
      "eval_loss": 0.902101457118988,
      "eval_runtime": 471.9941,
      "eval_samples_per_second": 7.286,
      "eval_steps_per_second": 7.286,
      "step": 4000
    },
    {
      "epoch": 1.1660366385577203,
      "grad_norm": 2.0174319744110107,
      "learning_rate": 8.601409084095339e-05,
      "loss": 0.6873229026794434,
      "step": 4010
    },
    {
      "epoch": 1.1689444605990114,
      "grad_norm": 2.1310856342315674,
      "learning_rate": 8.571428571428571e-05,
      "loss": 0.7570897579193115,
      "step": 4020
    },
    {
      "epoch": 1.1718522826403024,
      "grad_norm": 2.121331214904785,
      "learning_rate": 8.541448058761805e-05,
      "loss": 0.8037228584289551,
      "step": 4030
    },
    {
      "epoch": 1.1747601046815934,
      "grad_norm": 1.866264820098877,
      "learning_rate": 8.511467546095038e-05,
      "loss": 0.7070183277130127,
      "step": 4040
    },
    {
      "epoch": 1.1776679267228847,
      "grad_norm": 2.1696619987487793,
      "learning_rate": 8.481487033428273e-05,
      "loss": 0.7907981872558594,
      "step": 4050
    },
    {
      "epoch": 1.1805757487641757,
      "grad_norm": 1.9860618114471436,
      "learning_rate": 8.451506520761505e-05,
      "loss": 0.801030445098877,
      "step": 4060
    },
    {
      "epoch": 1.1834835708054667,
      "grad_norm": 1.92996084690094,
      "learning_rate": 8.421526008094739e-05,
      "loss": 0.7623518943786621,
      "step": 4070
    },
    {
      "epoch": 1.1863913928467578,
      "grad_norm": 2.483771800994873,
      "learning_rate": 8.391545495427972e-05,
      "loss": 0.7155436992645263,
      "step": 4080
    },
    {
      "epoch": 1.1892992148880488,
      "grad_norm": 2.0799410343170166,
      "learning_rate": 8.361564982761205e-05,
      "loss": 0.7853653430938721,
      "step": 4090
    },
    {
      "epoch": 1.1922070369293398,
      "grad_norm": 1.9911246299743652,
      "learning_rate": 8.331584470094439e-05,
      "loss": 0.7899494171142578,
      "step": 4100
    },
    {
      "epoch": 1.195114858970631,
      "grad_norm": 2.0691795349121094,
      "learning_rate": 8.301603957427672e-05,
      "loss": 0.7937345027923584,
      "step": 4110
    },
    {
      "epoch": 1.198022681011922,
      "grad_norm": 1.9724209308624268,
      "learning_rate": 8.271623444760906e-05,
      "loss": 0.7867683887481689,
      "step": 4120
    },
    {
      "epoch": 1.2009305030532131,
      "grad_norm": 2.0218544006347656,
      "learning_rate": 8.241642932094139e-05,
      "loss": 0.7292540550231934,
      "step": 4130
    },
    {
      "epoch": 1.2038383250945042,
      "grad_norm": 1.8983079195022583,
      "learning_rate": 8.211662419427373e-05,
      "loss": 0.7957502841949463,
      "step": 4140
    },
    {
      "epoch": 1.2067461471357952,
      "grad_norm": 2.0544803142547607,
      "learning_rate": 8.181681906760606e-05,
      "loss": 0.8023088455200196,
      "step": 4150
    },
    {
      "epoch": 1.2096539691770865,
      "grad_norm": 2.2147371768951416,
      "learning_rate": 8.15170139409384e-05,
      "loss": 0.7521897315979004,
      "step": 4160
    },
    {
      "epoch": 1.2125617912183775,
      "grad_norm": 2.114715814590454,
      "learning_rate": 8.121720881427073e-05,
      "loss": 0.7361032485961914,
      "step": 4170
    },
    {
      "epoch": 1.2154696132596685,
      "grad_norm": 2.0806241035461426,
      "learning_rate": 8.091740368760306e-05,
      "loss": 0.810362434387207,
      "step": 4180
    },
    {
      "epoch": 1.2183774353009595,
      "grad_norm": 2.1156203746795654,
      "learning_rate": 8.061759856093539e-05,
      "loss": 0.7738988876342774,
      "step": 4190
    },
    {
      "epoch": 1.2212852573422506,
      "grad_norm": 2.063788652420044,
      "learning_rate": 8.031779343426773e-05,
      "loss": 0.7684555053710938,
      "step": 4200
    },
    {
      "epoch": 1.2241930793835416,
      "grad_norm": 1.8935747146606445,
      "learning_rate": 8.001798830760007e-05,
      "loss": 0.7432861328125,
      "step": 4210
    },
    {
      "epoch": 1.2271009014248329,
      "grad_norm": 1.9250372648239136,
      "learning_rate": 7.971818318093239e-05,
      "loss": 0.752553653717041,
      "step": 4220
    },
    {
      "epoch": 1.2300087234661239,
      "grad_norm": 1.8770952224731445,
      "learning_rate": 7.941837805426474e-05,
      "loss": 0.7726155281066894,
      "step": 4230
    },
    {
      "epoch": 1.232916545507415,
      "grad_norm": 1.8295116424560547,
      "learning_rate": 7.911857292759707e-05,
      "loss": 0.7889208793640137,
      "step": 4240
    },
    {
      "epoch": 1.235824367548706,
      "grad_norm": 1.932578682899475,
      "learning_rate": 7.88187678009294e-05,
      "loss": 0.8311320304870605,
      "step": 4250
    },
    {
      "epoch": 1.2387321895899972,
      "grad_norm": 1.9207606315612793,
      "learning_rate": 7.851896267426173e-05,
      "loss": 0.7485177516937256,
      "step": 4260
    },
    {
      "epoch": 1.2416400116312882,
      "grad_norm": 1.9031519889831543,
      "learning_rate": 7.821915754759408e-05,
      "loss": 0.7838140010833741,
      "step": 4270
    },
    {
      "epoch": 1.2445478336725793,
      "grad_norm": 1.8704465627670288,
      "learning_rate": 7.791935242092641e-05,
      "loss": 0.7774901390075684,
      "step": 4280
    },
    {
      "epoch": 1.2474556557138703,
      "grad_norm": 1.819907784461975,
      "learning_rate": 7.761954729425873e-05,
      "loss": 0.750859260559082,
      "step": 4290
    },
    {
      "epoch": 1.2503634777551613,
      "grad_norm": 1.9019205570220947,
      "learning_rate": 7.731974216759107e-05,
      "loss": 0.780170202255249,
      "step": 4300
    },
    {
      "epoch": 1.2532712997964524,
      "grad_norm": 2.030393362045288,
      "learning_rate": 7.701993704092341e-05,
      "loss": 0.8301087379455566,
      "step": 4310
    },
    {
      "epoch": 1.2561791218377436,
      "grad_norm": 1.842457890510559,
      "learning_rate": 7.672013191425573e-05,
      "loss": 0.7692018985748291,
      "step": 4320
    },
    {
      "epoch": 1.2590869438790346,
      "grad_norm": 1.8982383012771606,
      "learning_rate": 7.642032678758807e-05,
      "loss": 0.781231689453125,
      "step": 4330
    },
    {
      "epoch": 1.2619947659203257,
      "grad_norm": 1.8713147640228271,
      "learning_rate": 7.61205216609204e-05,
      "loss": 0.762873363494873,
      "step": 4340
    },
    {
      "epoch": 1.2649025879616167,
      "grad_norm": 2.0454752445220947,
      "learning_rate": 7.582071653425275e-05,
      "loss": 0.7406270980834961,
      "step": 4350
    },
    {
      "epoch": 1.267810410002908,
      "grad_norm": 2.4180965423583984,
      "learning_rate": 7.552091140758507e-05,
      "loss": 0.7520180702209472,
      "step": 4360
    },
    {
      "epoch": 1.270718232044199,
      "grad_norm": 1.986022710800171,
      "learning_rate": 7.52211062809174e-05,
      "loss": 0.8284387588500977,
      "step": 4370
    },
    {
      "epoch": 1.27362605408549,
      "grad_norm": 1.760955810546875,
      "learning_rate": 7.492130115424974e-05,
      "loss": 0.7012954711914062,
      "step": 4380
    },
    {
      "epoch": 1.276533876126781,
      "grad_norm": 1.8714261054992676,
      "learning_rate": 7.462149602758208e-05,
      "loss": 0.7935136795043946,
      "step": 4390
    },
    {
      "epoch": 1.279441698168072,
      "grad_norm": 1.9436787366867065,
      "learning_rate": 7.432169090091441e-05,
      "loss": 0.7934539318084717,
      "step": 4400
    },
    {
      "epoch": 1.282349520209363,
      "grad_norm": 1.9582066535949707,
      "learning_rate": 7.402188577424674e-05,
      "loss": 0.7850748538970947,
      "step": 4410
    },
    {
      "epoch": 1.2852573422506541,
      "grad_norm": 1.9535675048828125,
      "learning_rate": 7.372208064757908e-05,
      "loss": 0.7332793712615967,
      "step": 4420
    },
    {
      "epoch": 1.2881651642919454,
      "grad_norm": 1.7450060844421387,
      "learning_rate": 7.342227552091141e-05,
      "loss": 0.7462188720703125,
      "step": 4430
    },
    {
      "epoch": 1.2910729863332364,
      "grad_norm": 1.8309375047683716,
      "learning_rate": 7.312247039424375e-05,
      "loss": 0.7612934112548828,
      "step": 4440
    },
    {
      "epoch": 1.2939808083745274,
      "grad_norm": 2.1091737747192383,
      "learning_rate": 7.282266526757608e-05,
      "loss": 0.7570091247558594,
      "step": 4450
    },
    {
      "epoch": 1.2968886304158185,
      "grad_norm": 1.933371901512146,
      "learning_rate": 7.252286014090842e-05,
      "loss": 0.7872342586517334,
      "step": 4460
    },
    {
      "epoch": 1.2997964524571097,
      "grad_norm": 1.857756495475769,
      "learning_rate": 7.222305501424075e-05,
      "loss": 0.7867274761199952,
      "step": 4470
    },
    {
      "epoch": 1.3027042744984008,
      "grad_norm": 1.8640364408493042,
      "learning_rate": 7.192324988757309e-05,
      "loss": 0.7677565574645996,
      "step": 4480
    },
    {
      "epoch": 1.3056120965396918,
      "grad_norm": 1.8762586116790771,
      "learning_rate": 7.162344476090541e-05,
      "loss": 0.7633658885955811,
      "step": 4490
    },
    {
      "epoch": 1.3085199185809828,
      "grad_norm": 2.0363333225250244,
      "learning_rate": 7.132363963423776e-05,
      "loss": 0.7173254013061523,
      "step": 4500
    },
    {
      "epoch": 1.3085199185809828,
      "eval_loss": 0.8845488429069519,
      "eval_runtime": 473.8247,
      "eval_samples_per_second": 7.258,
      "eval_steps_per_second": 7.258,
      "step": 4500
    },
    {
      "epoch": 1.3114277406222739,
      "grad_norm": 2.0255188941955566,
      "learning_rate": 7.102383450757009e-05,
      "loss": 0.7590040683746337,
      "step": 4510
    },
    {
      "epoch": 1.3143355626635649,
      "grad_norm": 2.052431583404541,
      "learning_rate": 7.072402938090241e-05,
      "loss": 0.7589908599853515,
      "step": 4520
    },
    {
      "epoch": 1.3172433847048561,
      "grad_norm": 1.9324688911437988,
      "learning_rate": 7.042422425423475e-05,
      "loss": 0.7061916828155518,
      "step": 4530
    },
    {
      "epoch": 1.3201512067461472,
      "grad_norm": 1.881585717201233,
      "learning_rate": 7.012441912756709e-05,
      "loss": 0.7603434562683106,
      "step": 4540
    },
    {
      "epoch": 1.3230590287874382,
      "grad_norm": 2.1515471935272217,
      "learning_rate": 6.982461400089941e-05,
      "loss": 0.7740952491760253,
      "step": 4550
    },
    {
      "epoch": 1.3259668508287292,
      "grad_norm": 1.9391210079193115,
      "learning_rate": 6.952480887423175e-05,
      "loss": 0.7603271007537842,
      "step": 4560
    },
    {
      "epoch": 1.3288746728700205,
      "grad_norm": 1.9450794458389282,
      "learning_rate": 6.922500374756408e-05,
      "loss": 0.8049670219421386,
      "step": 4570
    },
    {
      "epoch": 1.3317824949113115,
      "grad_norm": 2.054507255554199,
      "learning_rate": 6.892519862089643e-05,
      "loss": 0.7841194152832032,
      "step": 4580
    },
    {
      "epoch": 1.3346903169526025,
      "grad_norm": 1.9239962100982666,
      "learning_rate": 6.862539349422875e-05,
      "loss": 0.7888126850128174,
      "step": 4590
    },
    {
      "epoch": 1.3375981389938936,
      "grad_norm": 2.1011009216308594,
      "learning_rate": 6.832558836756109e-05,
      "loss": 0.7314376354217529,
      "step": 4600
    },
    {
      "epoch": 1.3405059610351846,
      "grad_norm": 2.109041213989258,
      "learning_rate": 6.802578324089342e-05,
      "loss": 0.8058440208435058,
      "step": 4610
    },
    {
      "epoch": 1.3434137830764756,
      "grad_norm": 1.9356318712234497,
      "learning_rate": 6.772597811422576e-05,
      "loss": 0.7847652435302734,
      "step": 4620
    },
    {
      "epoch": 1.3463216051177667,
      "grad_norm": 2.092618942260742,
      "learning_rate": 6.742617298755809e-05,
      "loss": 0.7178999423980713,
      "step": 4630
    },
    {
      "epoch": 1.349229427159058,
      "grad_norm": 1.8008778095245361,
      "learning_rate": 6.712636786089042e-05,
      "loss": 0.7678854465484619,
      "step": 4640
    },
    {
      "epoch": 1.352137249200349,
      "grad_norm": 2.20794415473938,
      "learning_rate": 6.682656273422276e-05,
      "loss": 0.7475598335266114,
      "step": 4650
    },
    {
      "epoch": 1.35504507124164,
      "grad_norm": 1.785436749458313,
      "learning_rate": 6.65267576075551e-05,
      "loss": 0.7328590393066406,
      "step": 4660
    },
    {
      "epoch": 1.357952893282931,
      "grad_norm": 1.9333852529525757,
      "learning_rate": 6.622695248088743e-05,
      "loss": 0.7973165035247802,
      "step": 4670
    },
    {
      "epoch": 1.3608607153242223,
      "grad_norm": 2.149812698364258,
      "learning_rate": 6.592714735421976e-05,
      "loss": 0.7523110389709473,
      "step": 4680
    },
    {
      "epoch": 1.3637685373655133,
      "grad_norm": 2.0981409549713135,
      "learning_rate": 6.56273422275521e-05,
      "loss": 0.7891485691070557,
      "step": 4690
    },
    {
      "epoch": 1.3666763594068043,
      "grad_norm": 2.0596706867218018,
      "learning_rate": 6.532753710088443e-05,
      "loss": 0.7680598735809326,
      "step": 4700
    },
    {
      "epoch": 1.3695841814480954,
      "grad_norm": 2.0713438987731934,
      "learning_rate": 6.502773197421677e-05,
      "loss": 0.7212025165557862,
      "step": 4710
    },
    {
      "epoch": 1.3724920034893864,
      "grad_norm": 1.933114767074585,
      "learning_rate": 6.472792684754909e-05,
      "loss": 0.8231717109680176,
      "step": 4720
    },
    {
      "epoch": 1.3753998255306774,
      "grad_norm": 1.9748258590698242,
      "learning_rate": 6.442812172088144e-05,
      "loss": 0.710457706451416,
      "step": 4730
    },
    {
      "epoch": 1.3783076475719687,
      "grad_norm": 1.99483323097229,
      "learning_rate": 6.412831659421377e-05,
      "loss": 0.7033157348632812,
      "step": 4740
    },
    {
      "epoch": 1.3812154696132597,
      "grad_norm": 2.1739165782928467,
      "learning_rate": 6.382851146754609e-05,
      "loss": 0.7652640819549561,
      "step": 4750
    },
    {
      "epoch": 1.3841232916545507,
      "grad_norm": 1.9803380966186523,
      "learning_rate": 6.352870634087842e-05,
      "loss": 0.7610398769378662,
      "step": 4760
    },
    {
      "epoch": 1.3870311136958418,
      "grad_norm": 1.8965667486190796,
      "learning_rate": 6.322890121421077e-05,
      "loss": 0.8127297401428223,
      "step": 4770
    },
    {
      "epoch": 1.389938935737133,
      "grad_norm": 2.0516648292541504,
      "learning_rate": 6.292909608754311e-05,
      "loss": 0.7877964973449707,
      "step": 4780
    },
    {
      "epoch": 1.392846757778424,
      "grad_norm": 1.9544559717178345,
      "learning_rate": 6.262929096087543e-05,
      "loss": 0.7798377513885498,
      "step": 4790
    },
    {
      "epoch": 1.395754579819715,
      "grad_norm": 1.7984205484390259,
      "learning_rate": 6.232948583420776e-05,
      "loss": 0.7166555404663086,
      "step": 4800
    },
    {
      "epoch": 1.398662401861006,
      "grad_norm": 1.914433479309082,
      "learning_rate": 6.202968070754011e-05,
      "loss": 0.7964422225952148,
      "step": 4810
    },
    {
      "epoch": 1.4015702239022971,
      "grad_norm": 2.067776679992676,
      "learning_rate": 6.172987558087243e-05,
      "loss": 0.7551724910736084,
      "step": 4820
    },
    {
      "epoch": 1.4044780459435882,
      "grad_norm": 1.9109352827072144,
      "learning_rate": 6.143007045420477e-05,
      "loss": 0.7385217666625976,
      "step": 4830
    },
    {
      "epoch": 1.4073858679848792,
      "grad_norm": 1.9591761827468872,
      "learning_rate": 6.11302653275371e-05,
      "loss": 0.721226167678833,
      "step": 4840
    },
    {
      "epoch": 1.4102936900261704,
      "grad_norm": 2.115058183670044,
      "learning_rate": 6.0830460200869435e-05,
      "loss": 0.6777582168579102,
      "step": 4850
    },
    {
      "epoch": 1.4132015120674615,
      "grad_norm": 2.06552791595459,
      "learning_rate": 6.053065507420177e-05,
      "loss": 0.726964282989502,
      "step": 4860
    },
    {
      "epoch": 1.4161093341087525,
      "grad_norm": 2.1735804080963135,
      "learning_rate": 6.0230849947534104e-05,
      "loss": 0.7624166011810303,
      "step": 4870
    },
    {
      "epoch": 1.4190171561500435,
      "grad_norm": 2.2036550045013428,
      "learning_rate": 5.9931044820866446e-05,
      "loss": 0.7692136287689209,
      "step": 4880
    },
    {
      "epoch": 1.4219249781913348,
      "grad_norm": 1.8746955394744873,
      "learning_rate": 5.9631239694198773e-05,
      "loss": 0.7410698890686035,
      "step": 4890
    },
    {
      "epoch": 1.4248328002326258,
      "grad_norm": 1.9036871194839478,
      "learning_rate": 5.933143456753111e-05,
      "loss": 0.7848501205444336,
      "step": 4900
    },
    {
      "epoch": 1.4277406222739168,
      "grad_norm": 1.8983385562896729,
      "learning_rate": 5.903162944086344e-05,
      "loss": 0.7688620090484619,
      "step": 4910
    },
    {
      "epoch": 1.4306484443152079,
      "grad_norm": 2.1584537029266357,
      "learning_rate": 5.873182431419577e-05,
      "loss": 0.7612662315368652,
      "step": 4920
    },
    {
      "epoch": 1.433556266356499,
      "grad_norm": 2.033599376678467,
      "learning_rate": 5.843201918752811e-05,
      "loss": 0.7598844528198242,
      "step": 4930
    },
    {
      "epoch": 1.43646408839779,
      "grad_norm": 1.9845435619354248,
      "learning_rate": 5.8132214060860446e-05,
      "loss": 0.7194857597351074,
      "step": 4940
    },
    {
      "epoch": 1.4393719104390812,
      "grad_norm": 2.1257405281066895,
      "learning_rate": 5.7832408934192774e-05,
      "loss": 0.7137130737304688,
      "step": 4950
    },
    {
      "epoch": 1.4422797324803722,
      "grad_norm": 1.8075517416000366,
      "learning_rate": 5.753260380752511e-05,
      "loss": 0.7500784873962403,
      "step": 4960
    },
    {
      "epoch": 1.4451875545216633,
      "grad_norm": 2.136146068572998,
      "learning_rate": 5.723279868085745e-05,
      "loss": 0.7785260677337646,
      "step": 4970
    },
    {
      "epoch": 1.4480953765629543,
      "grad_norm": 1.9265216588974,
      "learning_rate": 5.6932993554189784e-05,
      "loss": 0.6893830299377441,
      "step": 4980
    },
    {
      "epoch": 1.4510031986042455,
      "grad_norm": 2.246067762374878,
      "learning_rate": 5.663318842752211e-05,
      "loss": 0.7811415672302247,
      "step": 4990
    },
    {
      "epoch": 1.4539110206455366,
      "grad_norm": 2.068582534790039,
      "learning_rate": 5.6333383300854446e-05,
      "loss": 0.7605808258056641,
      "step": 5000
    },
    {
      "epoch": 1.4539110206455366,
      "eval_loss": 0.8719142079353333,
      "eval_runtime": 472.3067,
      "eval_samples_per_second": 7.281,
      "eval_steps_per_second": 7.281,
      "step": 5000
    },
    {
      "epoch": 1.4568188426868276,
      "grad_norm": 1.819368600845337,
      "learning_rate": 5.603357817418679e-05,
      "loss": 0.6737122535705566,
      "step": 5010
    },
    {
      "epoch": 1.4597266647281186,
      "grad_norm": 4.519766330718994,
      "learning_rate": 5.573377304751911e-05,
      "loss": 0.723225736618042,
      "step": 5020
    },
    {
      "epoch": 1.4626344867694097,
      "grad_norm": 2.129854202270508,
      "learning_rate": 5.543396792085145e-05,
      "loss": 0.744614839553833,
      "step": 5030
    },
    {
      "epoch": 1.4655423088107007,
      "grad_norm": 1.8777213096618652,
      "learning_rate": 5.5134162794183784e-05,
      "loss": 0.7135731220245362,
      "step": 5040
    },
    {
      "epoch": 1.4684501308519917,
      "grad_norm": 1.9724328517913818,
      "learning_rate": 5.483435766751611e-05,
      "loss": 0.754638957977295,
      "step": 5050
    },
    {
      "epoch": 1.471357952893283,
      "grad_norm": 1.8722237348556519,
      "learning_rate": 5.4534552540848446e-05,
      "loss": 0.7721257209777832,
      "step": 5060
    },
    {
      "epoch": 1.474265774934574,
      "grad_norm": 2.2810893058776855,
      "learning_rate": 5.423474741418079e-05,
      "loss": 0.7189100742340088,
      "step": 5070
    },
    {
      "epoch": 1.477173596975865,
      "grad_norm": 2.171797513961792,
      "learning_rate": 5.393494228751312e-05,
      "loss": 0.7469087600708008,
      "step": 5080
    },
    {
      "epoch": 1.4800814190171563,
      "grad_norm": 1.837860107421875,
      "learning_rate": 5.363513716084545e-05,
      "loss": 0.743229341506958,
      "step": 5090
    },
    {
      "epoch": 1.4829892410584473,
      "grad_norm": 1.96957528591156,
      "learning_rate": 5.3335332034177784e-05,
      "loss": 0.7477756977081299,
      "step": 5100
    },
    {
      "epoch": 1.4858970630997383,
      "grad_norm": 2.0596463680267334,
      "learning_rate": 5.3035526907510126e-05,
      "loss": 0.7668298721313477,
      "step": 5110
    },
    {
      "epoch": 1.4888048851410294,
      "grad_norm": 1.9528030157089233,
      "learning_rate": 5.2735721780842453e-05,
      "loss": 0.7141449451446533,
      "step": 5120
    },
    {
      "epoch": 1.4917127071823204,
      "grad_norm": 1.8499971628189087,
      "learning_rate": 5.243591665417479e-05,
      "loss": 0.7491943359375,
      "step": 5130
    },
    {
      "epoch": 1.4946205292236114,
      "grad_norm": 2.016645908355713,
      "learning_rate": 5.213611152750712e-05,
      "loss": 0.7747302055358887,
      "step": 5140
    },
    {
      "epoch": 1.4975283512649025,
      "grad_norm": 1.8727567195892334,
      "learning_rate": 5.183630640083945e-05,
      "loss": 0.7703737735748291,
      "step": 5150
    },
    {
      "epoch": 1.5004361733061935,
      "grad_norm": 1.9545795917510986,
      "learning_rate": 5.153650127417179e-05,
      "loss": 0.7513375282287598,
      "step": 5160
    },
    {
      "epoch": 1.5033439953474848,
      "grad_norm": 2.101614236831665,
      "learning_rate": 5.1236696147504126e-05,
      "loss": 0.7481746196746826,
      "step": 5170
    },
    {
      "epoch": 1.5062518173887758,
      "grad_norm": 1.8322865962982178,
      "learning_rate": 5.093689102083646e-05,
      "loss": 0.6998108386993408,
      "step": 5180
    },
    {
      "epoch": 1.509159639430067,
      "grad_norm": 2.0042383670806885,
      "learning_rate": 5.063708589416879e-05,
      "loss": 0.7009212017059326,
      "step": 5190
    },
    {
      "epoch": 1.512067461471358,
      "grad_norm": 2.0147664546966553,
      "learning_rate": 5.033728076750113e-05,
      "loss": 0.6757394313812256,
      "step": 5200
    },
    {
      "epoch": 1.514975283512649,
      "grad_norm": 2.185133218765259,
      "learning_rate": 5.0037475640833464e-05,
      "loss": 0.8208953857421875,
      "step": 5210
    },
    {
      "epoch": 1.5178831055539401,
      "grad_norm": 1.9892646074295044,
      "learning_rate": 4.97376705141658e-05,
      "loss": 0.7156825065612793,
      "step": 5220
    },
    {
      "epoch": 1.5207909275952312,
      "grad_norm": 2.1808860301971436,
      "learning_rate": 4.9437865387498126e-05,
      "loss": 0.7173275470733642,
      "step": 5230
    },
    {
      "epoch": 1.5236987496365222,
      "grad_norm": 2.2999000549316406,
      "learning_rate": 4.913806026083047e-05,
      "loss": 0.7130680084228516,
      "step": 5240
    },
    {
      "epoch": 1.5266065716778132,
      "grad_norm": 1.9804385900497437,
      "learning_rate": 4.8838255134162795e-05,
      "loss": 0.7353660106658936,
      "step": 5250
    },
    {
      "epoch": 1.5295143937191042,
      "grad_norm": 2.0175087451934814,
      "learning_rate": 4.853845000749513e-05,
      "loss": 0.6823455333709717,
      "step": 5260
    },
    {
      "epoch": 1.5324222157603955,
      "grad_norm": 1.9505001306533813,
      "learning_rate": 4.8238644880827464e-05,
      "loss": 0.7360805034637451,
      "step": 5270
    },
    {
      "epoch": 1.5353300378016865,
      "grad_norm": 2.100215435028076,
      "learning_rate": 4.79388397541598e-05,
      "loss": 0.761193037033081,
      "step": 5280
    },
    {
      "epoch": 1.5382378598429776,
      "grad_norm": 1.9561179876327515,
      "learning_rate": 4.763903462749213e-05,
      "loss": 0.8232732772827148,
      "step": 5290
    },
    {
      "epoch": 1.5411456818842688,
      "grad_norm": 2.07002592086792,
      "learning_rate": 4.733922950082447e-05,
      "loss": 0.7684964656829834,
      "step": 5300
    },
    {
      "epoch": 1.5440535039255598,
      "grad_norm": 2.175922393798828,
      "learning_rate": 4.7039424374156795e-05,
      "loss": 0.743954849243164,
      "step": 5310
    },
    {
      "epoch": 1.5469613259668509,
      "grad_norm": 1.8928070068359375,
      "learning_rate": 4.673961924748914e-05,
      "loss": 0.8028405189514161,
      "step": 5320
    },
    {
      "epoch": 1.549869148008142,
      "grad_norm": 1.8495947122573853,
      "learning_rate": 4.6439814120821464e-05,
      "loss": 0.7481747150421143,
      "step": 5330
    },
    {
      "epoch": 1.552776970049433,
      "grad_norm": 2.0875566005706787,
      "learning_rate": 4.6140008994153806e-05,
      "loss": 0.7654068470001221,
      "step": 5340
    },
    {
      "epoch": 1.555684792090724,
      "grad_norm": 2.132359027862549,
      "learning_rate": 4.5840203867486133e-05,
      "loss": 0.7879802703857421,
      "step": 5350
    },
    {
      "epoch": 1.558592614132015,
      "grad_norm": 1.9833087921142578,
      "learning_rate": 4.554039874081847e-05,
      "loss": 0.7630073547363281,
      "step": 5360
    },
    {
      "epoch": 1.561500436173306,
      "grad_norm": 2.261916399002075,
      "learning_rate": 4.52405936141508e-05,
      "loss": 0.7673455238342285,
      "step": 5370
    },
    {
      "epoch": 1.5644082582145973,
      "grad_norm": 1.9294761419296265,
      "learning_rate": 4.494078848748314e-05,
      "loss": 0.7796091556549072,
      "step": 5380
    },
    {
      "epoch": 1.5673160802558883,
      "grad_norm": 1.8864835500717163,
      "learning_rate": 4.464098336081548e-05,
      "loss": 0.7843995094299316,
      "step": 5390
    },
    {
      "epoch": 1.5702239022971796,
      "grad_norm": 1.6363269090652466,
      "learning_rate": 4.4341178234147806e-05,
      "loss": 0.7239855766296387,
      "step": 5400
    },
    {
      "epoch": 1.5731317243384706,
      "grad_norm": 1.910172462463379,
      "learning_rate": 4.404137310748014e-05,
      "loss": 0.684738302230835,
      "step": 5410
    },
    {
      "epoch": 1.5760395463797616,
      "grad_norm": 1.936583161354065,
      "learning_rate": 4.3741567980812475e-05,
      "loss": 0.7819816112518311,
      "step": 5420
    },
    {
      "epoch": 1.5789473684210527,
      "grad_norm": 2.261993169784546,
      "learning_rate": 4.344176285414481e-05,
      "loss": 0.7665531158447265,
      "step": 5430
    },
    {
      "epoch": 1.5818551904623437,
      "grad_norm": 2.1516878604888916,
      "learning_rate": 4.314195772747714e-05,
      "loss": 0.7250272274017334,
      "step": 5440
    },
    {
      "epoch": 1.5847630125036347,
      "grad_norm": 1.9359904527664185,
      "learning_rate": 4.284215260080948e-05,
      "loss": 0.7476921081542969,
      "step": 5450
    },
    {
      "epoch": 1.5876708345449257,
      "grad_norm": 1.9610286951065063,
      "learning_rate": 4.2542347474141806e-05,
      "loss": 0.747294282913208,
      "step": 5460
    },
    {
      "epoch": 1.5905786565862168,
      "grad_norm": 1.8940902948379517,
      "learning_rate": 4.224254234747415e-05,
      "loss": 0.7050137519836426,
      "step": 5470
    },
    {
      "epoch": 1.593486478627508,
      "grad_norm": 2.1721606254577637,
      "learning_rate": 4.1942737220806475e-05,
      "loss": 0.7259575366973877,
      "step": 5480
    },
    {
      "epoch": 1.596394300668799,
      "grad_norm": 2.0317630767822266,
      "learning_rate": 4.164293209413881e-05,
      "loss": 0.6914929389953614,
      "step": 5490
    },
    {
      "epoch": 1.59930212271009,
      "grad_norm": 2.04887056350708,
      "learning_rate": 4.1343126967471144e-05,
      "loss": 0.7758480548858643,
      "step": 5500
    },
    {
      "epoch": 1.59930212271009,
      "eval_loss": 0.8598825931549072,
      "eval_runtime": 474.1318,
      "eval_samples_per_second": 7.253,
      "eval_steps_per_second": 7.253,
      "step": 5500
    },
    {
      "epoch": 1.6022099447513813,
      "grad_norm": 1.7526671886444092,
      "learning_rate": 4.104332184080348e-05,
      "loss": 0.6790880680084228,
      "step": 5510
    },
    {
      "epoch": 1.6051177667926724,
      "grad_norm": 2.0832862854003906,
      "learning_rate": 4.074351671413581e-05,
      "loss": 0.7566723346710205,
      "step": 5520
    },
    {
      "epoch": 1.6080255888339634,
      "grad_norm": 1.8792216777801514,
      "learning_rate": 4.044371158746815e-05,
      "loss": 0.6554183959960938,
      "step": 5530
    },
    {
      "epoch": 1.6109334108752544,
      "grad_norm": 2.0853688716888428,
      "learning_rate": 4.014390646080048e-05,
      "loss": 0.7421646118164062,
      "step": 5540
    },
    {
      "epoch": 1.6138412329165455,
      "grad_norm": 2.5147030353546143,
      "learning_rate": 3.984410133413282e-05,
      "loss": 0.7599525451660156,
      "step": 5550
    },
    {
      "epoch": 1.6167490549578365,
      "grad_norm": 2.236278772354126,
      "learning_rate": 3.954429620746515e-05,
      "loss": 0.8087862014770508,
      "step": 5560
    },
    {
      "epoch": 1.6196568769991275,
      "grad_norm": 1.9631987810134888,
      "learning_rate": 3.9244491080797486e-05,
      "loss": 0.7550980091094971,
      "step": 5570
    },
    {
      "epoch": 1.6225646990404188,
      "grad_norm": 1.8814702033996582,
      "learning_rate": 3.894468595412982e-05,
      "loss": 0.7769564628601074,
      "step": 5580
    },
    {
      "epoch": 1.6254725210817098,
      "grad_norm": 1.9273751974105835,
      "learning_rate": 3.864488082746215e-05,
      "loss": 0.7157449722290039,
      "step": 5590
    },
    {
      "epoch": 1.6283803431230008,
      "grad_norm": 1.9550398588180542,
      "learning_rate": 3.834507570079449e-05,
      "loss": 0.793599796295166,
      "step": 5600
    },
    {
      "epoch": 1.631288165164292,
      "grad_norm": 1.9033679962158203,
      "learning_rate": 3.804527057412682e-05,
      "loss": 0.7436941146850586,
      "step": 5610
    },
    {
      "epoch": 1.6341959872055831,
      "grad_norm": 1.9171406030654907,
      "learning_rate": 3.774546544745916e-05,
      "loss": 0.6863879203796387,
      "step": 5620
    },
    {
      "epoch": 1.6371038092468742,
      "grad_norm": 1.7775182723999023,
      "learning_rate": 3.7445660320791486e-05,
      "loss": 0.7105512142181396,
      "step": 5630
    },
    {
      "epoch": 1.6400116312881652,
      "grad_norm": 2.081458568572998,
      "learning_rate": 3.714585519412382e-05,
      "loss": 0.679707384109497,
      "step": 5640
    },
    {
      "epoch": 1.6429194533294562,
      "grad_norm": 2.0358259677886963,
      "learning_rate": 3.6846050067456155e-05,
      "loss": 0.7309046745300293,
      "step": 5650
    },
    {
      "epoch": 1.6458272753707472,
      "grad_norm": 2.1495471000671387,
      "learning_rate": 3.654624494078849e-05,
      "loss": 0.678877878189087,
      "step": 5660
    },
    {
      "epoch": 1.6487350974120383,
      "grad_norm": 2.19714617729187,
      "learning_rate": 3.6246439814120824e-05,
      "loss": 0.724186372756958,
      "step": 5670
    },
    {
      "epoch": 1.6516429194533293,
      "grad_norm": 1.930059552192688,
      "learning_rate": 3.594663468745316e-05,
      "loss": 0.7309486389160156,
      "step": 5680
    },
    {
      "epoch": 1.6545507414946206,
      "grad_norm": 2.045048236846924,
      "learning_rate": 3.5646829560785486e-05,
      "loss": 0.7321044921875,
      "step": 5690
    },
    {
      "epoch": 1.6574585635359116,
      "grad_norm": 1.8372198343276978,
      "learning_rate": 3.534702443411783e-05,
      "loss": 0.690158462524414,
      "step": 5700
    },
    {
      "epoch": 1.6603663855772028,
      "grad_norm": 2.2258548736572266,
      "learning_rate": 3.5047219307450155e-05,
      "loss": 0.6976329803466796,
      "step": 5710
    },
    {
      "epoch": 1.6632742076184939,
      "grad_norm": 1.861602544784546,
      "learning_rate": 3.4747414180782496e-05,
      "loss": 0.7170249462127686,
      "step": 5720
    },
    {
      "epoch": 1.666182029659785,
      "grad_norm": 1.9709389209747314,
      "learning_rate": 3.4447609054114824e-05,
      "loss": 0.7029520511627197,
      "step": 5730
    },
    {
      "epoch": 1.669089851701076,
      "grad_norm": 1.8129233121871948,
      "learning_rate": 3.414780392744716e-05,
      "loss": 0.7850728988647461,
      "step": 5740
    },
    {
      "epoch": 1.671997673742367,
      "grad_norm": 2.106326103210449,
      "learning_rate": 3.384799880077949e-05,
      "loss": 0.8082990646362305,
      "step": 5750
    },
    {
      "epoch": 1.674905495783658,
      "grad_norm": 1.839580774307251,
      "learning_rate": 3.354819367411183e-05,
      "loss": 0.7232970237731934,
      "step": 5760
    },
    {
      "epoch": 1.677813317824949,
      "grad_norm": 2.027674674987793,
      "learning_rate": 3.324838854744416e-05,
      "loss": 0.7808068752288818,
      "step": 5770
    },
    {
      "epoch": 1.68072113986624,
      "grad_norm": 2.2386841773986816,
      "learning_rate": 3.29485834207765e-05,
      "loss": 0.711140775680542,
      "step": 5780
    },
    {
      "epoch": 1.6836289619075313,
      "grad_norm": 2.136626958847046,
      "learning_rate": 3.264877829410883e-05,
      "loss": 0.7360716342926026,
      "step": 5790
    },
    {
      "epoch": 1.6865367839488223,
      "grad_norm": 2.048578977584839,
      "learning_rate": 3.2348973167441166e-05,
      "loss": 0.710478401184082,
      "step": 5800
    },
    {
      "epoch": 1.6894446059901134,
      "grad_norm": 1.9906225204467773,
      "learning_rate": 3.20491680407735e-05,
      "loss": 0.7086396217346191,
      "step": 5810
    },
    {
      "epoch": 1.6923524280314046,
      "grad_norm": 1.964224934577942,
      "learning_rate": 3.1749362914105835e-05,
      "loss": 0.7486394882202149,
      "step": 5820
    },
    {
      "epoch": 1.6952602500726957,
      "grad_norm": 1.887143850326538,
      "learning_rate": 3.144955778743817e-05,
      "loss": 0.6863440990447998,
      "step": 5830
    },
    {
      "epoch": 1.6981680721139867,
      "grad_norm": 2.00508713722229,
      "learning_rate": 3.11497526607705e-05,
      "loss": 0.6853577613830566,
      "step": 5840
    },
    {
      "epoch": 1.7010758941552777,
      "grad_norm": 2.0328657627105713,
      "learning_rate": 3.084994753410284e-05,
      "loss": 0.7252396583557129,
      "step": 5850
    },
    {
      "epoch": 1.7039837161965687,
      "grad_norm": 2.1302683353424072,
      "learning_rate": 3.0550142407435166e-05,
      "loss": 0.7472237586975098,
      "step": 5860
    },
    {
      "epoch": 1.7068915382378598,
      "grad_norm": 2.106820821762085,
      "learning_rate": 3.0250337280767504e-05,
      "loss": 0.723507022857666,
      "step": 5870
    },
    {
      "epoch": 1.7097993602791508,
      "grad_norm": 2.0230047702789307,
      "learning_rate": 2.9950532154099835e-05,
      "loss": 0.6797329425811768,
      "step": 5880
    },
    {
      "epoch": 1.7127071823204418,
      "grad_norm": 2.121976613998413,
      "learning_rate": 2.965072702743217e-05,
      "loss": 0.7394673347473144,
      "step": 5890
    },
    {
      "epoch": 1.715615004361733,
      "grad_norm": 1.8224197626113892,
      "learning_rate": 2.9350921900764504e-05,
      "loss": 0.758949613571167,
      "step": 5900
    },
    {
      "epoch": 1.7185228264030241,
      "grad_norm": 1.832812786102295,
      "learning_rate": 2.905111677409684e-05,
      "loss": 0.724758243560791,
      "step": 5910
    },
    {
      "epoch": 1.7214306484443154,
      "grad_norm": 2.112184524536133,
      "learning_rate": 2.8751311647429173e-05,
      "loss": 0.7687956809997558,
      "step": 5920
    },
    {
      "epoch": 1.7243384704856064,
      "grad_norm": 2.5346004962921143,
      "learning_rate": 2.8451506520761507e-05,
      "loss": 0.721607780456543,
      "step": 5930
    },
    {
      "epoch": 1.7272462925268974,
      "grad_norm": 1.9310683012008667,
      "learning_rate": 2.815170139409384e-05,
      "loss": 0.750163459777832,
      "step": 5940
    },
    {
      "epoch": 1.7301541145681885,
      "grad_norm": 2.023061752319336,
      "learning_rate": 2.7851896267426176e-05,
      "loss": 0.7398929119110107,
      "step": 5950
    },
    {
      "epoch": 1.7330619366094795,
      "grad_norm": 1.9423258304595947,
      "learning_rate": 2.7552091140758507e-05,
      "loss": 0.7022686004638672,
      "step": 5960
    },
    {
      "epoch": 1.7359697586507705,
      "grad_norm": 1.9257018566131592,
      "learning_rate": 2.7252286014090845e-05,
      "loss": 0.760784387588501,
      "step": 5970
    },
    {
      "epoch": 1.7388775806920616,
      "grad_norm": 1.9650042057037354,
      "learning_rate": 2.6952480887423176e-05,
      "loss": 0.7613511562347413,
      "step": 5980
    },
    {
      "epoch": 1.7417854027333526,
      "grad_norm": 2.08854341506958,
      "learning_rate": 2.6652675760755508e-05,
      "loss": 0.7318148612976074,
      "step": 5990
    },
    {
      "epoch": 1.7446932247746438,
      "grad_norm": 2.0090866088867188,
      "learning_rate": 2.6352870634087845e-05,
      "loss": 0.7362863540649414,
      "step": 6000
    },
    {
      "epoch": 1.7446932247746438,
      "eval_loss": 0.8498985171318054,
      "eval_runtime": 471.5916,
      "eval_samples_per_second": 7.292,
      "eval_steps_per_second": 7.292,
      "step": 6000
    },
    {
      "epoch": 1.7476010468159349,
      "grad_norm": 1.9278331995010376,
      "learning_rate": 2.6053065507420177e-05,
      "loss": 0.6806503295898437,
      "step": 6010
    },
    {
      "epoch": 1.750508868857226,
      "grad_norm": 2.056682825088501,
      "learning_rate": 2.5753260380752514e-05,
      "loss": 0.722940731048584,
      "step": 6020
    },
    {
      "epoch": 1.7534166908985171,
      "grad_norm": 1.9063704013824463,
      "learning_rate": 2.5453455254084846e-05,
      "loss": 0.6798623085021973,
      "step": 6030
    },
    {
      "epoch": 1.7563245129398082,
      "grad_norm": 1.8881698846817017,
      "learning_rate": 2.5153650127417177e-05,
      "loss": 0.7303658485412597,
      "step": 6040
    },
    {
      "epoch": 1.7592323349810992,
      "grad_norm": 2.0086960792541504,
      "learning_rate": 2.4853845000749515e-05,
      "loss": 0.7225898265838623,
      "step": 6050
    },
    {
      "epoch": 1.7621401570223902,
      "grad_norm": 2.1029181480407715,
      "learning_rate": 2.4554039874081846e-05,
      "loss": 0.7572491645812989,
      "step": 6060
    },
    {
      "epoch": 1.7650479790636813,
      "grad_norm": 1.9289077520370483,
      "learning_rate": 2.425423474741418e-05,
      "loss": 0.780693531036377,
      "step": 6070
    },
    {
      "epoch": 1.7679558011049723,
      "grad_norm": 2.0310537815093994,
      "learning_rate": 2.3954429620746515e-05,
      "loss": 0.7033182144165039,
      "step": 6080
    },
    {
      "epoch": 1.7708636231462633,
      "grad_norm": 2.0626399517059326,
      "learning_rate": 2.365462449407885e-05,
      "loss": 0.7750870227813721,
      "step": 6090
    },
    {
      "epoch": 1.7737714451875544,
      "grad_norm": 1.9674657583236694,
      "learning_rate": 2.3354819367411184e-05,
      "loss": 0.7491857051849365,
      "step": 6100
    },
    {
      "epoch": 1.7766792672288456,
      "grad_norm": 2.028435707092285,
      "learning_rate": 2.3055014240743518e-05,
      "loss": 0.7668484687805176,
      "step": 6110
    },
    {
      "epoch": 1.7795870892701366,
      "grad_norm": 1.8845252990722656,
      "learning_rate": 2.2755209114075853e-05,
      "loss": 0.7411411285400391,
      "step": 6120
    },
    {
      "epoch": 1.782494911311428,
      "grad_norm": 2.013805627822876,
      "learning_rate": 2.2455403987408187e-05,
      "loss": 0.7249200344085693,
      "step": 6130
    },
    {
      "epoch": 1.785402733352719,
      "grad_norm": 2.006671667098999,
      "learning_rate": 2.2155598860740522e-05,
      "loss": 0.7829705715179444,
      "step": 6140
    },
    {
      "epoch": 1.78831055539401,
      "grad_norm": 2.0205607414245605,
      "learning_rate": 2.1855793734072856e-05,
      "loss": 0.7822526454925537,
      "step": 6150
    },
    {
      "epoch": 1.791218377435301,
      "grad_norm": 1.9972530603408813,
      "learning_rate": 2.1555988607405187e-05,
      "loss": 0.7459287166595459,
      "step": 6160
    },
    {
      "epoch": 1.794126199476592,
      "grad_norm": 2.1279802322387695,
      "learning_rate": 2.1256183480737522e-05,
      "loss": 0.7397714614868164,
      "step": 6170
    },
    {
      "epoch": 1.797034021517883,
      "grad_norm": 2.1933655738830566,
      "learning_rate": 2.0956378354069856e-05,
      "loss": 0.6955758571624756,
      "step": 6180
    },
    {
      "epoch": 1.799941843559174,
      "grad_norm": 1.9316877126693726,
      "learning_rate": 2.065657322740219e-05,
      "loss": 0.727911376953125,
      "step": 6190
    },
    {
      "epoch": 1.8028496656004651,
      "grad_norm": 1.8654370307922363,
      "learning_rate": 2.0356768100734525e-05,
      "loss": 0.6596095561981201,
      "step": 6200
    },
    {
      "epoch": 1.8057574876417564,
      "grad_norm": 2.046983003616333,
      "learning_rate": 2.0056962974066856e-05,
      "loss": 0.7094531059265137,
      "step": 6210
    },
    {
      "epoch": 1.8086653096830474,
      "grad_norm": 2.1500179767608643,
      "learning_rate": 1.975715784739919e-05,
      "loss": 0.7008066654205323,
      "step": 6220
    },
    {
      "epoch": 1.8115731317243384,
      "grad_norm": 2.3866703510284424,
      "learning_rate": 1.9457352720731525e-05,
      "loss": 0.7228167533874512,
      "step": 6230
    },
    {
      "epoch": 1.8144809537656297,
      "grad_norm": 2.0172855854034424,
      "learning_rate": 1.915754759406386e-05,
      "loss": 0.6767440795898437,
      "step": 6240
    },
    {
      "epoch": 1.8173887758069207,
      "grad_norm": 1.8794561624526978,
      "learning_rate": 1.8857742467396194e-05,
      "loss": 0.7008272647857666,
      "step": 6250
    },
    {
      "epoch": 1.8202965978482117,
      "grad_norm": 2.0600626468658447,
      "learning_rate": 1.8557937340728526e-05,
      "loss": 0.7082679748535157,
      "step": 6260
    },
    {
      "epoch": 1.8232044198895028,
      "grad_norm": 1.869964599609375,
      "learning_rate": 1.825813221406086e-05,
      "loss": 0.7198378562927246,
      "step": 6270
    },
    {
      "epoch": 1.8261122419307938,
      "grad_norm": 1.9929425716400146,
      "learning_rate": 1.7958327087393195e-05,
      "loss": 0.7699549674987793,
      "step": 6280
    },
    {
      "epoch": 1.8290200639720848,
      "grad_norm": 1.9782027006149292,
      "learning_rate": 1.765852196072553e-05,
      "loss": 0.6976008892059327,
      "step": 6290
    },
    {
      "epoch": 1.8319278860133759,
      "grad_norm": 2.1247029304504395,
      "learning_rate": 1.7358716834057864e-05,
      "loss": 0.690690565109253,
      "step": 6300
    },
    {
      "epoch": 1.834835708054667,
      "grad_norm": 1.8903204202651978,
      "learning_rate": 1.7058911707390195e-05,
      "loss": 0.7616622447967529,
      "step": 6310
    },
    {
      "epoch": 1.8377435300959581,
      "grad_norm": 2.0401735305786133,
      "learning_rate": 1.675910658072253e-05,
      "loss": 0.6766151428222656,
      "step": 6320
    },
    {
      "epoch": 1.8406513521372492,
      "grad_norm": 1.9508399963378906,
      "learning_rate": 1.6459301454054864e-05,
      "loss": 0.7218796730041503,
      "step": 6330
    },
    {
      "epoch": 1.8435591741785404,
      "grad_norm": 2.1602697372436523,
      "learning_rate": 1.6159496327387198e-05,
      "loss": 0.753945779800415,
      "step": 6340
    },
    {
      "epoch": 1.8464669962198315,
      "grad_norm": 2.0386970043182373,
      "learning_rate": 1.5859691200719533e-05,
      "loss": 0.7185985088348389,
      "step": 6350
    },
    {
      "epoch": 1.8493748182611225,
      "grad_norm": 1.8012441396713257,
      "learning_rate": 1.5559886074051867e-05,
      "loss": 0.777646541595459,
      "step": 6360
    },
    {
      "epoch": 1.8522826403024135,
      "grad_norm": 1.8146647214889526,
      "learning_rate": 1.52600809473842e-05,
      "loss": 0.7361874580383301,
      "step": 6370
    },
    {
      "epoch": 1.8551904623437045,
      "grad_norm": 2.180405616760254,
      "learning_rate": 1.4960275820716534e-05,
      "loss": 0.6907386779785156,
      "step": 6380
    },
    {
      "epoch": 1.8580982843849956,
      "grad_norm": 2.1181304454803467,
      "learning_rate": 1.4660470694048869e-05,
      "loss": 0.6983653545379639,
      "step": 6390
    },
    {
      "epoch": 1.8610061064262866,
      "grad_norm": 1.8451800346374512,
      "learning_rate": 1.4360665567381203e-05,
      "loss": 0.6979443073272705,
      "step": 6400
    },
    {
      "epoch": 1.8639139284675776,
      "grad_norm": 2.0219805240631104,
      "learning_rate": 1.4060860440713536e-05,
      "loss": 0.6582244396209717,
      "step": 6410
    },
    {
      "epoch": 1.866821750508869,
      "grad_norm": 1.9723000526428223,
      "learning_rate": 1.376105531404587e-05,
      "loss": 0.7287851333618164,
      "step": 6420
    },
    {
      "epoch": 1.86972957255016,
      "grad_norm": 2.274547815322876,
      "learning_rate": 1.3461250187378205e-05,
      "loss": 0.7135519027709961,
      "step": 6430
    },
    {
      "epoch": 1.872637394591451,
      "grad_norm": 1.9583326578140259,
      "learning_rate": 1.316144506071054e-05,
      "loss": 0.6913717746734619,
      "step": 6440
    },
    {
      "epoch": 1.8755452166327422,
      "grad_norm": 2.0431573390960693,
      "learning_rate": 1.2861639934042874e-05,
      "loss": 0.7512425422668457,
      "step": 6450
    },
    {
      "epoch": 1.8784530386740332,
      "grad_norm": 2.096263885498047,
      "learning_rate": 1.2561834807375205e-05,
      "loss": 0.705587911605835,
      "step": 6460
    },
    {
      "epoch": 1.8813608607153243,
      "grad_norm": 1.940988302230835,
      "learning_rate": 1.226202968070754e-05,
      "loss": 0.7097304344177247,
      "step": 6470
    },
    {
      "epoch": 1.8842686827566153,
      "grad_norm": 1.9227349758148193,
      "learning_rate": 1.1962224554039874e-05,
      "loss": 0.7371804237365722,
      "step": 6480
    },
    {
      "epoch": 1.8871765047979063,
      "grad_norm": 1.936657428741455,
      "learning_rate": 1.1662419427372209e-05,
      "loss": 0.763831090927124,
      "step": 6490
    },
    {
      "epoch": 1.8900843268391974,
      "grad_norm": 2.0900590419769287,
      "learning_rate": 1.1362614300704542e-05,
      "loss": 0.7716608047485352,
      "step": 6500
    },
    {
      "epoch": 1.8900843268391974,
      "eval_loss": 0.8433617949485779,
      "eval_runtime": 473.6198,
      "eval_samples_per_second": 7.261,
      "eval_steps_per_second": 7.261,
      "step": 6500
    },
    {
      "epoch": 1.8929921488804884,
      "grad_norm": 2.003570318222046,
      "learning_rate": 1.1062809174036876e-05,
      "loss": 0.6903162956237793,
      "step": 6510
    },
    {
      "epoch": 1.8958999709217796,
      "grad_norm": 2.030358076095581,
      "learning_rate": 1.076300404736921e-05,
      "loss": 0.7362218856811523,
      "step": 6520
    },
    {
      "epoch": 1.8988077929630707,
      "grad_norm": 1.8056998252868652,
      "learning_rate": 1.0463198920701545e-05,
      "loss": 0.6997631072998047,
      "step": 6530
    },
    {
      "epoch": 1.9017156150043617,
      "grad_norm": 1.793885350227356,
      "learning_rate": 1.016339379403388e-05,
      "loss": 0.731913709640503,
      "step": 6540
    },
    {
      "epoch": 1.904623437045653,
      "grad_norm": 2.0713064670562744,
      "learning_rate": 9.863588667366213e-06,
      "loss": 0.7571732997894287,
      "step": 6550
    },
    {
      "epoch": 1.907531259086944,
      "grad_norm": 1.9770585298538208,
      "learning_rate": 9.563783540698547e-06,
      "loss": 0.7581112384796143,
      "step": 6560
    },
    {
      "epoch": 1.910439081128235,
      "grad_norm": 2.289889097213745,
      "learning_rate": 9.26397841403088e-06,
      "loss": 0.7515597343444824,
      "step": 6570
    },
    {
      "epoch": 1.913346903169526,
      "grad_norm": 2.222022294998169,
      "learning_rate": 8.964173287363214e-06,
      "loss": 0.7455884933471679,
      "step": 6580
    },
    {
      "epoch": 1.916254725210817,
      "grad_norm": 1.969591736793518,
      "learning_rate": 8.664368160695547e-06,
      "loss": 0.6826446533203125,
      "step": 6590
    },
    {
      "epoch": 1.919162547252108,
      "grad_norm": 2.064741611480713,
      "learning_rate": 8.364563034027882e-06,
      "loss": 0.6947153091430665,
      "step": 6600
    },
    {
      "epoch": 1.9220703692933991,
      "grad_norm": 2.0524702072143555,
      "learning_rate": 8.064757907360216e-06,
      "loss": 0.7242071151733398,
      "step": 6610
    },
    {
      "epoch": 1.9249781913346902,
      "grad_norm": 2.0459067821502686,
      "learning_rate": 7.76495278069255e-06,
      "loss": 0.7215473651885986,
      "step": 6620
    },
    {
      "epoch": 1.9278860133759814,
      "grad_norm": 2.034057140350342,
      "learning_rate": 7.465147654024884e-06,
      "loss": 0.7145359992980957,
      "step": 6630
    },
    {
      "epoch": 1.9307938354172725,
      "grad_norm": 2.039088487625122,
      "learning_rate": 7.165342527357217e-06,
      "loss": 0.7058275699615478,
      "step": 6640
    },
    {
      "epoch": 1.9337016574585635,
      "grad_norm": 2.0845980644226074,
      "learning_rate": 6.865537400689552e-06,
      "loss": 0.7172946453094482,
      "step": 6650
    },
    {
      "epoch": 1.9366094794998547,
      "grad_norm": 2.0260169506073,
      "learning_rate": 6.565732274021887e-06,
      "loss": 0.7668924808502198,
      "step": 6660
    },
    {
      "epoch": 1.9395173015411458,
      "grad_norm": 2.099771738052368,
      "learning_rate": 6.26592714735422e-06,
      "loss": 0.7463947772979737,
      "step": 6670
    },
    {
      "epoch": 1.9424251235824368,
      "grad_norm": 1.9962726831436157,
      "learning_rate": 5.966122020686554e-06,
      "loss": 0.698793363571167,
      "step": 6680
    },
    {
      "epoch": 1.9453329456237278,
      "grad_norm": 2.075279951095581,
      "learning_rate": 5.666316894018888e-06,
      "loss": 0.6867193222045899,
      "step": 6690
    },
    {
      "epoch": 1.9482407676650189,
      "grad_norm": 1.8623920679092407,
      "learning_rate": 5.366511767351222e-06,
      "loss": 0.7588799953460693,
      "step": 6700
    },
    {
      "epoch": 1.9511485897063099,
      "grad_norm": 2.0419015884399414,
      "learning_rate": 5.066706640683556e-06,
      "loss": 0.7772952079772949,
      "step": 6710
    },
    {
      "epoch": 1.954056411747601,
      "grad_norm": 1.9629226922988892,
      "learning_rate": 4.76690151401589e-06,
      "loss": 0.7375712394714355,
      "step": 6720
    },
    {
      "epoch": 1.9569642337888922,
      "grad_norm": 2.0185976028442383,
      "learning_rate": 4.467096387348224e-06,
      "loss": 0.759752893447876,
      "step": 6730
    },
    {
      "epoch": 1.9598720558301832,
      "grad_norm": 2.034822463989258,
      "learning_rate": 4.167291260680558e-06,
      "loss": 0.7086891174316406,
      "step": 6740
    },
    {
      "epoch": 1.9627798778714742,
      "grad_norm": 2.146571159362793,
      "learning_rate": 3.8674861340128915e-06,
      "loss": 0.7501019477844239,
      "step": 6750
    },
    {
      "epoch": 1.9656876999127655,
      "grad_norm": 2.0261270999908447,
      "learning_rate": 3.5676810073452256e-06,
      "loss": 0.749469804763794,
      "step": 6760
    },
    {
      "epoch": 1.9685955219540565,
      "grad_norm": 2.1559603214263916,
      "learning_rate": 3.2678758806775593e-06,
      "loss": 0.7909334659576416,
      "step": 6770
    },
    {
      "epoch": 1.9715033439953475,
      "grad_norm": 1.8334423303604126,
      "learning_rate": 2.9680707540098938e-06,
      "loss": 0.7327389717102051,
      "step": 6780
    },
    {
      "epoch": 1.9744111660366386,
      "grad_norm": 2.1545159816741943,
      "learning_rate": 2.668265627342228e-06,
      "loss": 0.7096083641052247,
      "step": 6790
    },
    {
      "epoch": 1.9773189880779296,
      "grad_norm": 1.9905986785888672,
      "learning_rate": 2.3684605006745615e-06,
      "loss": 0.7368163585662841,
      "step": 6800
    },
    {
      "epoch": 1.9802268101192206,
      "grad_norm": 2.1959192752838135,
      "learning_rate": 2.0686553740068956e-06,
      "loss": 0.6501263618469239,
      "step": 6810
    },
    {
      "epoch": 1.9831346321605117,
      "grad_norm": 1.9602024555206299,
      "learning_rate": 1.7688502473392297e-06,
      "loss": 0.7349401950836182,
      "step": 6820
    },
    {
      "epoch": 1.9860424542018027,
      "grad_norm": 1.9173307418823242,
      "learning_rate": 1.4690451206715635e-06,
      "loss": 0.7650456428527832,
      "step": 6830
    },
    {
      "epoch": 1.988950276243094,
      "grad_norm": 1.9146604537963867,
      "learning_rate": 1.1692399940038976e-06,
      "loss": 0.6636839866638183,
      "step": 6840
    },
    {
      "epoch": 1.991858098284385,
      "grad_norm": 1.845234990119934,
      "learning_rate": 8.694348673362316e-07,
      "loss": 0.6827308654785156,
      "step": 6850
    },
    {
      "epoch": 1.9947659203256762,
      "grad_norm": 1.9973350763320923,
      "learning_rate": 5.696297406685654e-07,
      "loss": 0.7055688381195069,
      "step": 6860
    },
    {
      "epoch": 1.9976737423669673,
      "grad_norm": 1.9779253005981445,
      "learning_rate": 2.698246140008994e-07,
      "loss": 0.7651626110076905,
      "step": 6870
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.8412191271781921,
      "eval_runtime": 473.3499,
      "eval_samples_per_second": 7.265,
      "eval_steps_per_second": 7.265,
      "step": 6878
    }
  ],
  "logging_steps": 10,
  "max_steps": 6878,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 3,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.8901151514e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}