{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 1875,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.016,
      "grad_norm": 2.508913278579712,
      "learning_rate": 3.6e-05,
      "loss": 2.2622838973999024,
      "step": 10
    },
    {
      "epoch": 0.032,
      "grad_norm": 1.3637374639511108,
      "learning_rate": 7.6e-05,
      "loss": 1.4729194641113281,
      "step": 20
    },
    {
      "epoch": 0.048,
      "grad_norm": 0.8274662494659424,
      "learning_rate": 0.000116,
      "loss": 0.5348126411437988,
      "step": 30
    },
    {
      "epoch": 0.064,
      "grad_norm": 0.24004890024662018,
      "learning_rate": 0.00015600000000000002,
      "loss": 0.24594340324401856,
      "step": 40
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.21569570899009705,
      "learning_rate": 0.000196,
      "loss": 0.14221376180648804,
      "step": 50
    },
    {
      "epoch": 0.096,
      "grad_norm": 0.25074923038482666,
      "learning_rate": 0.00019901369863013698,
      "loss": 0.1053991436958313,
      "step": 60
    },
    {
      "epoch": 0.112,
      "grad_norm": 0.17641063034534454,
      "learning_rate": 0.0001979178082191781,
      "loss": 0.10523500442504882,
      "step": 70
    },
    {
      "epoch": 0.128,
      "grad_norm": 0.3789774477481842,
      "learning_rate": 0.0001968219178082192,
      "loss": 0.10122023820877075,
      "step": 80
    },
    {
      "epoch": 0.144,
      "grad_norm": 0.17485778033733368,
      "learning_rate": 0.00019572602739726029,
      "loss": 0.09140864610671998,
      "step": 90
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.1414472460746765,
      "learning_rate": 0.00019463013698630137,
      "loss": 0.0925188422203064,
      "step": 100
    },
    {
      "epoch": 0.176,
      "grad_norm": 0.15549571812152863,
      "learning_rate": 0.00019353424657534248,
      "loss": 0.07405711412429809,
      "step": 110
    },
    {
      "epoch": 0.192,
      "grad_norm": 0.1726241111755371,
      "learning_rate": 0.00019243835616438357,
      "loss": 0.08338193893432617,
      "step": 120
    },
    {
      "epoch": 0.208,
      "grad_norm": 0.16814149916172028,
      "learning_rate": 0.00019134246575342468,
      "loss": 0.0827404260635376,
      "step": 130
    },
    {
      "epoch": 0.224,
      "grad_norm": 0.10934246331453323,
      "learning_rate": 0.00019024657534246576,
      "loss": 0.08371676802635193,
      "step": 140
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.15543144941329956,
      "learning_rate": 0.00018915068493150685,
      "loss": 0.07682002186775208,
      "step": 150
    },
    {
      "epoch": 0.256,
      "grad_norm": 0.1047728955745697,
      "learning_rate": 0.00018805479452054796,
      "loss": 0.08323028087615966,
      "step": 160
    },
    {
      "epoch": 0.272,
      "grad_norm": 0.15727756917476654,
      "learning_rate": 0.00018695890410958904,
      "loss": 0.0881002426147461,
      "step": 170
    },
    {
      "epoch": 0.288,
      "grad_norm": 0.10985921323299408,
      "learning_rate": 0.00018586301369863015,
      "loss": 0.0725629210472107,
      "step": 180
    },
    {
      "epoch": 0.304,
      "grad_norm": 0.14681079983711243,
      "learning_rate": 0.00018476712328767124,
      "loss": 0.07297256588935852,
      "step": 190
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.11168694496154785,
      "learning_rate": 0.00018367123287671232,
      "loss": 0.07204994559288025,
      "step": 200
    },
    {
      "epoch": 0.336,
      "grad_norm": 0.14472435414791107,
      "learning_rate": 0.00018257534246575343,
      "loss": 0.07219824194908142,
      "step": 210
    },
    {
      "epoch": 0.352,
      "grad_norm": 0.10257100313901901,
      "learning_rate": 0.00018147945205479452,
      "loss": 0.08176417350769043,
      "step": 220
    },
    {
      "epoch": 0.368,
      "grad_norm": 0.15164950489997864,
      "learning_rate": 0.00018038356164383563,
      "loss": 0.080460923910141,
      "step": 230
    },
    {
      "epoch": 0.384,
      "grad_norm": 0.17651614546775818,
      "learning_rate": 0.00017928767123287674,
      "loss": 0.0750627338886261,
      "step": 240
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.10908259451389313,
      "learning_rate": 0.0001781917808219178,
      "loss": 0.06641974449157714,
      "step": 250
    },
    {
      "epoch": 0.416,
      "grad_norm": 0.11685926467180252,
      "learning_rate": 0.0001770958904109589,
      "loss": 0.06228730082511902,
      "step": 260
    },
    {
      "epoch": 0.432,
      "grad_norm": 0.13672174513339996,
      "learning_rate": 0.00017600000000000002,
      "loss": 0.06748496294021607,
      "step": 270
    },
    {
      "epoch": 0.448,
      "grad_norm": 0.12294622510671616,
      "learning_rate": 0.0001749041095890411,
      "loss": 0.07605534791946411,
      "step": 280
    },
    {
      "epoch": 0.464,
      "grad_norm": 0.07749421894550323,
      "learning_rate": 0.00017380821917808222,
      "loss": 0.06066908836364746,
      "step": 290
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.15694420039653778,
      "learning_rate": 0.00017271232876712328,
      "loss": 0.06470143795013428,
      "step": 300
    },
    {
      "epoch": 0.496,
      "grad_norm": 0.09705078601837158,
      "learning_rate": 0.0001716164383561644,
      "loss": 0.05920176506042481,
      "step": 310
    },
    {
      "epoch": 0.512,
      "grad_norm": 0.09835303574800491,
      "learning_rate": 0.0001705205479452055,
      "loss": 0.08239805102348327,
      "step": 320
    },
    {
      "epoch": 0.528,
      "grad_norm": 0.0981152132153511,
      "learning_rate": 0.00016942465753424658,
      "loss": 0.05663549304008484,
      "step": 330
    },
    {
      "epoch": 0.544,
      "grad_norm": 0.07651517540216446,
      "learning_rate": 0.0001683287671232877,
      "loss": 0.06320589184761047,
      "step": 340
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.10856720060110092,
      "learning_rate": 0.00016723287671232878,
      "loss": 0.07581254243850707,
      "step": 350
    },
    {
      "epoch": 0.576,
      "grad_norm": 0.0863386020064354,
      "learning_rate": 0.00016613698630136986,
      "loss": 0.0790505051612854,
      "step": 360
    },
    {
      "epoch": 0.592,
      "grad_norm": 0.10928363353013992,
      "learning_rate": 0.00016504109589041098,
      "loss": 0.06397929787635803,
      "step": 370
    },
    {
      "epoch": 0.608,
      "grad_norm": 0.11172884702682495,
      "learning_rate": 0.00016394520547945206,
      "loss": 0.05513489246368408,
      "step": 380
    },
    {
      "epoch": 0.624,
      "grad_norm": 0.09518434852361679,
      "learning_rate": 0.00016284931506849317,
      "loss": 0.07740641236305237,
      "step": 390
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.10455268621444702,
      "learning_rate": 0.00016175342465753426,
      "loss": 0.07365262508392334,
      "step": 400
    },
    {
      "epoch": 0.656,
      "grad_norm": 0.0962410569190979,
      "learning_rate": 0.00016065753424657534,
      "loss": 0.08518975973129272,
      "step": 410
    },
    {
      "epoch": 0.672,
      "grad_norm": 0.12104412168264389,
      "learning_rate": 0.00015956164383561645,
      "loss": 0.06871490478515625,
      "step": 420
    },
    {
      "epoch": 0.688,
      "grad_norm": 0.10041218250989914,
      "learning_rate": 0.00015846575342465754,
      "loss": 0.051221036911010744,
      "step": 430
    },
    {
      "epoch": 0.704,
      "grad_norm": 0.08319935947656631,
      "learning_rate": 0.00015736986301369865,
      "loss": 0.06872759461402893,
      "step": 440
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.078139528632164,
      "learning_rate": 0.00015627397260273973,
      "loss": 0.07016033530235291,
      "step": 450
    },
    {
      "epoch": 0.736,
      "grad_norm": 0.10938999056816101,
      "learning_rate": 0.00015517808219178082,
      "loss": 0.06500827074050904,
      "step": 460
    },
    {
      "epoch": 0.752,
      "grad_norm": 0.07604733109474182,
      "learning_rate": 0.00015408219178082193,
      "loss": 0.06083506941795349,
      "step": 470
    },
    {
      "epoch": 0.768,
      "grad_norm": 0.0853394940495491,
      "learning_rate": 0.00015298630136986304,
      "loss": 0.059677237272262575,
      "step": 480
    },
    {
      "epoch": 0.784,
      "grad_norm": 0.09151621907949448,
      "learning_rate": 0.0001518904109589041,
      "loss": 0.05737585425376892,
      "step": 490
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.0884072557091713,
      "learning_rate": 0.0001507945205479452,
      "loss": 0.0566044270992279,
      "step": 500
    },
    {
      "epoch": 0.816,
      "grad_norm": 0.091744065284729,
      "learning_rate": 0.0001496986301369863,
      "loss": 0.06913858652114868,
      "step": 510
    },
    {
      "epoch": 0.832,
      "grad_norm": 0.07623863965272903,
      "learning_rate": 0.0001486027397260274,
      "loss": 0.06345137357711791,
      "step": 520
    },
    {
      "epoch": 0.848,
      "grad_norm": 0.07791073620319366,
      "learning_rate": 0.00014750684931506852,
      "loss": 0.060628962516784665,
      "step": 530
    },
    {
      "epoch": 0.864,
      "grad_norm": 0.08502475172281265,
      "learning_rate": 0.00014641095890410957,
      "loss": 0.068820059299469,
      "step": 540
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.09597698599100113,
      "learning_rate": 0.00014531506849315069,
      "loss": 0.0585732638835907,
      "step": 550
    },
    {
      "epoch": 0.896,
      "grad_norm": 0.09175119549036026,
      "learning_rate": 0.0001442191780821918,
      "loss": 0.067873615026474,
      "step": 560
    },
    {
      "epoch": 0.912,
      "grad_norm": 0.10440277308225632,
      "learning_rate": 0.00014312328767123288,
      "loss": 0.06306946873664857,
      "step": 570
    },
    {
      "epoch": 0.928,
      "grad_norm": 0.08166486769914627,
      "learning_rate": 0.000142027397260274,
      "loss": 0.06535319089889527,
      "step": 580
    },
    {
      "epoch": 0.944,
      "grad_norm": 0.09520258009433746,
      "learning_rate": 0.00014093150684931508,
      "loss": 0.06487776637077332,
      "step": 590
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.08356442302465439,
      "learning_rate": 0.00013983561643835616,
      "loss": 0.0673000991344452,
      "step": 600
    },
    {
      "epoch": 0.976,
      "grad_norm": 0.10857579857110977,
      "learning_rate": 0.00013873972602739727,
      "loss": 0.06675973534584045,
      "step": 610
    },
    {
      "epoch": 0.992,
      "grad_norm": 0.0846136212348938,
      "learning_rate": 0.00013764383561643836,
      "loss": 0.056840169429779056,
      "step": 620
    },
    {
      "epoch": 1.008,
      "grad_norm": 0.12099627405405045,
      "learning_rate": 0.00013654794520547947,
      "loss": 0.06133960485458374,
      "step": 630
    },
    {
      "epoch": 1.024,
      "grad_norm": 0.14235953986644745,
      "learning_rate": 0.00013545205479452055,
      "loss": 0.051949769258499146,
      "step": 640
    },
    {
      "epoch": 1.04,
      "grad_norm": 0.06599270552396774,
      "learning_rate": 0.00013435616438356164,
      "loss": 0.05395192503929138,
      "step": 650
    },
    {
      "epoch": 1.056,
      "grad_norm": 0.08540896326303482,
      "learning_rate": 0.00013326027397260275,
      "loss": 0.058018720149993895,
      "step": 660
    },
    {
      "epoch": 1.072,
      "grad_norm": 0.07638814300298691,
      "learning_rate": 0.00013216438356164384,
      "loss": 0.05708546638488769,
      "step": 670
    },
    {
      "epoch": 1.088,
      "grad_norm": 0.07762673497200012,
      "learning_rate": 0.00013106849315068495,
      "loss": 0.05458671450614929,
      "step": 680
    },
    {
      "epoch": 1.104,
      "grad_norm": 0.06784480065107346,
      "learning_rate": 0.00012997260273972603,
      "loss": 0.05251591801643372,
      "step": 690
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.08402290940284729,
      "learning_rate": 0.00012887671232876712,
      "loss": 0.0629364550113678,
      "step": 700
    },
    {
      "epoch": 1.1360000000000001,
      "grad_norm": 0.08088182657957077,
      "learning_rate": 0.00012778082191780823,
      "loss": 0.06972357630729675,
      "step": 710
    },
    {
      "epoch": 1.152,
      "grad_norm": 0.09672300517559052,
      "learning_rate": 0.0001266849315068493,
      "loss": 0.07236437201499939,
      "step": 720
    },
    {
      "epoch": 1.168,
      "grad_norm": 0.07926960289478302,
      "learning_rate": 0.00012558904109589042,
      "loss": 0.05548118352890015,
      "step": 730
    },
    {
      "epoch": 1.184,
      "grad_norm": 0.08126692473888397,
      "learning_rate": 0.0001244931506849315,
      "loss": 0.0625440776348114,
      "step": 740
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.06809371709823608,
      "learning_rate": 0.0001233972602739726,
      "loss": 0.06416532397270203,
      "step": 750
    },
    {
      "epoch": 1.216,
      "grad_norm": 0.11283153295516968,
      "learning_rate": 0.0001223013698630137,
      "loss": 0.06832035183906555,
      "step": 760
    },
    {
      "epoch": 1.232,
      "grad_norm": 0.08298429101705551,
      "learning_rate": 0.0001212054794520548,
      "loss": 0.05774534940719604,
      "step": 770
    },
    {
      "epoch": 1.248,
      "grad_norm": 0.07594408094882965,
      "learning_rate": 0.0001201095890410959,
      "loss": 0.05947027802467346,
      "step": 780
    },
    {
      "epoch": 1.264,
      "grad_norm": 0.07829966396093369,
      "learning_rate": 0.00011901369863013698,
      "loss": 0.05323997139930725,
      "step": 790
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.0940285176038742,
      "learning_rate": 0.00011791780821917808,
      "loss": 0.051990669965744016,
      "step": 800
    },
    {
      "epoch": 1.296,
      "grad_norm": 0.07158597558736801,
      "learning_rate": 0.00011682191780821918,
      "loss": 0.06842873692512512,
      "step": 810
    },
    {
      "epoch": 1.312,
      "grad_norm": 0.06882186979055405,
      "learning_rate": 0.00011572602739726028,
      "loss": 0.06891562342643738,
      "step": 820
    },
    {
      "epoch": 1.328,
      "grad_norm": 0.08038769662380219,
      "learning_rate": 0.00011463013698630139,
      "loss": 0.07961333990097046,
      "step": 830
    },
    {
      "epoch": 1.3439999999999999,
      "grad_norm": 0.07336015999317169,
      "learning_rate": 0.00011353424657534246,
      "loss": 0.0596350908279419,
      "step": 840
    },
    {
      "epoch": 1.3599999999999999,
      "grad_norm": 0.07672706991434097,
      "learning_rate": 0.00011243835616438356,
      "loss": 0.06021456122398376,
      "step": 850
    },
    {
      "epoch": 1.376,
      "grad_norm": 0.06943219900131226,
      "learning_rate": 0.00011134246575342466,
      "loss": 0.05736762285232544,
      "step": 860
    },
    {
      "epoch": 1.392,
      "grad_norm": 0.07571737468242645,
      "learning_rate": 0.00011024657534246577,
      "loss": 0.06433975100517272,
      "step": 870
    },
    {
      "epoch": 1.408,
      "grad_norm": 0.07972362637519836,
      "learning_rate": 0.00010915068493150687,
      "loss": 0.06327899098396302,
      "step": 880
    },
    {
      "epoch": 1.424,
      "grad_norm": 0.05659586563706398,
      "learning_rate": 0.00010805479452054794,
      "loss": 0.07444382905960083,
      "step": 890
    },
    {
      "epoch": 1.44,
      "grad_norm": 0.08906129002571106,
      "learning_rate": 0.00010695890410958904,
      "loss": 0.054902708530426024,
      "step": 900
    },
    {
      "epoch": 1.456,
      "grad_norm": 0.0920051857829094,
      "learning_rate": 0.00010586301369863015,
      "loss": 0.0726428508758545,
      "step": 910
    },
    {
      "epoch": 1.472,
      "grad_norm": 0.07324782013893127,
      "learning_rate": 0.00010476712328767125,
      "loss": 0.0653969943523407,
      "step": 920
    },
    {
      "epoch": 1.488,
      "grad_norm": 0.07842034846544266,
      "learning_rate": 0.00010367123287671234,
      "loss": 0.06544245481491089,
      "step": 930
    },
    {
      "epoch": 1.504,
      "grad_norm": 0.05455109104514122,
      "learning_rate": 0.00010257534246575343,
      "loss": 0.06340432167053223,
      "step": 940
    },
    {
      "epoch": 1.52,
      "grad_norm": 0.0697791799902916,
      "learning_rate": 0.00010147945205479453,
      "loss": 0.0673532783985138,
      "step": 950
    },
    {
      "epoch": 1.536,
      "grad_norm": 0.08127626776695251,
      "learning_rate": 0.00010038356164383562,
      "loss": 0.05687047243118286,
      "step": 960
    },
    {
      "epoch": 1.552,
      "grad_norm": 0.07406352460384369,
      "learning_rate": 9.928767123287672e-05,
      "loss": 0.05548548698425293,
      "step": 970
    },
    {
      "epoch": 1.568,
      "grad_norm": 0.058401789516210556,
      "learning_rate": 9.81917808219178e-05,
      "loss": 0.05551270842552185,
      "step": 980
    },
    {
      "epoch": 1.584,
      "grad_norm": 0.07948075234889984,
      "learning_rate": 9.709589041095892e-05,
      "loss": 0.0539792537689209,
      "step": 990
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.08617192506790161,
      "learning_rate": 9.6e-05,
      "loss": 0.07711206674575806,
      "step": 1000
    },
    {
      "epoch": 1.616,
      "grad_norm": 0.07934480905532837,
      "learning_rate": 9.49041095890411e-05,
      "loss": 0.05799928903579712,
      "step": 1010
    },
    {
      "epoch": 1.6320000000000001,
      "grad_norm": 0.07682377099990845,
      "learning_rate": 9.38082191780822e-05,
      "loss": 0.04488539695739746,
      "step": 1020
    },
    {
      "epoch": 1.6480000000000001,
      "grad_norm": 0.08497436344623566,
      "learning_rate": 9.27123287671233e-05,
      "loss": 0.06617907881736755,
      "step": 1030
    },
    {
      "epoch": 1.6640000000000001,
      "grad_norm": 0.07464807480573654,
      "learning_rate": 9.16164383561644e-05,
      "loss": 0.06436434388160706,
      "step": 1040
    },
    {
      "epoch": 1.6800000000000002,
      "grad_norm": 0.07073179632425308,
      "learning_rate": 9.052054794520548e-05,
      "loss": 0.05826765298843384,
      "step": 1050
    },
    {
      "epoch": 1.696,
      "grad_norm": 0.07814770191907883,
      "learning_rate": 8.942465753424658e-05,
      "loss": 0.058675730228424074,
      "step": 1060
    },
    {
      "epoch": 1.712,
      "grad_norm": 0.07397276908159256,
      "learning_rate": 8.832876712328768e-05,
      "loss": 0.05216291546821594,
      "step": 1070
    },
    {
      "epoch": 1.728,
      "grad_norm": 0.06203208118677139,
      "learning_rate": 8.723287671232877e-05,
      "loss": 0.04832034409046173,
      "step": 1080
    },
    {
      "epoch": 1.744,
      "grad_norm": 0.08247426897287369,
      "learning_rate": 8.613698630136987e-05,
      "loss": 0.06704681515693664,
      "step": 1090
    },
    {
      "epoch": 1.76,
      "grad_norm": 0.07352187484502792,
      "learning_rate": 8.504109589041096e-05,
      "loss": 0.056213170289993286,
      "step": 1100
    },
    {
      "epoch": 1.776,
      "grad_norm": 0.0742267519235611,
      "learning_rate": 8.394520547945205e-05,
      "loss": 0.05674695372581482,
      "step": 1110
    },
    {
      "epoch": 1.792,
      "grad_norm": 0.07203282415866852,
      "learning_rate": 8.284931506849315e-05,
      "loss": 0.05544196367263794,
      "step": 1120
    },
    {
      "epoch": 1.808,
      "grad_norm": 0.07500709593296051,
      "learning_rate": 8.175342465753425e-05,
      "loss": 0.050712913274765015,
      "step": 1130
    },
    {
      "epoch": 1.8239999999999998,
      "grad_norm": 0.06234560161828995,
      "learning_rate": 8.065753424657535e-05,
      "loss": 0.06268961429595947,
      "step": 1140
    },
    {
      "epoch": 1.8399999999999999,
      "grad_norm": 0.0685572475194931,
      "learning_rate": 7.956164383561645e-05,
      "loss": 0.050892168283462526,
      "step": 1150
    },
    {
      "epoch": 1.8559999999999999,
      "grad_norm": 0.06377062201499939,
      "learning_rate": 7.846575342465754e-05,
      "loss": 0.05338585376739502,
      "step": 1160
    },
    {
      "epoch": 1.8719999999999999,
      "grad_norm": 0.07672174274921417,
      "learning_rate": 7.736986301369863e-05,
      "loss": 0.0719197690486908,
      "step": 1170
    },
    {
      "epoch": 1.888,
      "grad_norm": 0.090825654566288,
      "learning_rate": 7.627397260273973e-05,
      "loss": 0.05965543985366821,
      "step": 1180
    },
    {
      "epoch": 1.904,
      "grad_norm": 0.07492175698280334,
      "learning_rate": 7.517808219178082e-05,
      "loss": 0.05505464673042297,
      "step": 1190
    },
    {
      "epoch": 1.92,
      "grad_norm": 0.06776276230812073,
      "learning_rate": 7.408219178082192e-05,
      "loss": 0.06357068419456482,
      "step": 1200
    },
    {
      "epoch": 1.936,
      "grad_norm": 0.08154194056987762,
      "learning_rate": 7.298630136986302e-05,
      "loss": 0.07490106225013733,
      "step": 1210
    },
    {
      "epoch": 1.952,
      "grad_norm": 0.06378313153982162,
      "learning_rate": 7.18904109589041e-05,
      "loss": 0.05937790870666504,
      "step": 1220
    },
    {
      "epoch": 1.968,
      "grad_norm": 0.07686297595500946,
      "learning_rate": 7.07945205479452e-05,
      "loss": 0.047175332903862,
      "step": 1230
    },
    {
      "epoch": 1.984,
      "grad_norm": 0.07107747346162796,
      "learning_rate": 6.969863013698631e-05,
      "loss": 0.05534272789955139,
      "step": 1240
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.07110369950532913,
      "learning_rate": 6.86027397260274e-05,
      "loss": 0.06306116580963135,
      "step": 1250
    },
    {
      "epoch": 2.016,
      "grad_norm": 0.08407752960920334,
      "learning_rate": 6.75068493150685e-05,
      "loss": 0.06708587408065796,
      "step": 1260
    },
    {
      "epoch": 2.032,
      "grad_norm": 0.06716394424438477,
      "learning_rate": 6.641095890410958e-05,
      "loss": 0.05874839425086975,
      "step": 1270
    },
    {
      "epoch": 2.048,
      "grad_norm": 0.07555590569972992,
      "learning_rate": 6.531506849315069e-05,
      "loss": 0.05490245819091797,
      "step": 1280
    },
    {
      "epoch": 2.064,
      "grad_norm": 0.07453346997499466,
      "learning_rate": 6.421917808219179e-05,
      "loss": 0.060266101360321046,
      "step": 1290
    },
    {
      "epoch": 2.08,
      "grad_norm": 0.07161426544189453,
      "learning_rate": 6.312328767123288e-05,
      "loss": 0.05789074897766113,
      "step": 1300
    },
    {
      "epoch": 2.096,
      "grad_norm": 0.06622769683599472,
      "learning_rate": 6.202739726027397e-05,
      "loss": 0.06246234774589539,
      "step": 1310
    },
    {
      "epoch": 2.112,
      "grad_norm": 0.06886615604162216,
      "learning_rate": 6.0931506849315065e-05,
      "loss": 0.06343585848808289,
      "step": 1320
    },
    {
      "epoch": 2.128,
      "grad_norm": 0.07828567922115326,
      "learning_rate": 5.983561643835617e-05,
      "loss": 0.05640849471092224,
      "step": 1330
    },
    {
      "epoch": 2.144,
      "grad_norm": 0.07572014629840851,
      "learning_rate": 5.873972602739727e-05,
      "loss": 0.05074018836021423,
      "step": 1340
    },
    {
      "epoch": 2.16,
      "grad_norm": 0.06873492151498795,
      "learning_rate": 5.764383561643836e-05,
      "loss": 0.054788839817047116,
      "step": 1350
    },
    {
      "epoch": 2.176,
      "grad_norm": 0.080296590924263,
      "learning_rate": 5.654794520547946e-05,
      "loss": 0.06817570328712463,
      "step": 1360
    },
    {
      "epoch": 2.192,
      "grad_norm": 0.07481079548597336,
      "learning_rate": 5.545205479452055e-05,
      "loss": 0.05946822166442871,
      "step": 1370
    },
    {
      "epoch": 2.208,
      "grad_norm": 0.06598034501075745,
      "learning_rate": 5.4356164383561646e-05,
      "loss": 0.06451416015625,
      "step": 1380
    },
    {
      "epoch": 2.224,
      "grad_norm": 0.06421754509210587,
      "learning_rate": 5.326027397260275e-05,
      "loss": 0.044728249311447144,
      "step": 1390
    },
    {
      "epoch": 2.24,
      "grad_norm": 0.0873740166425705,
      "learning_rate": 5.2164383561643835e-05,
      "loss": 0.06062799692153931,
      "step": 1400
    },
    {
      "epoch": 2.2560000000000002,
      "grad_norm": 0.0765632912516594,
      "learning_rate": 5.106849315068494e-05,
      "loss": 0.045326176285743716,
      "step": 1410
    },
    {
      "epoch": 2.2720000000000002,
      "grad_norm": 0.07045675814151764,
      "learning_rate": 4.997260273972603e-05,
      "loss": 0.055944430828094485,
      "step": 1420
    },
    {
      "epoch": 2.288,
      "grad_norm": 0.07185397297143936,
      "learning_rate": 4.887671232876713e-05,
      "loss": 0.0565622866153717,
      "step": 1430
    },
    {
      "epoch": 2.304,
      "grad_norm": 0.060445524752140045,
      "learning_rate": 4.778082191780822e-05,
      "loss": 0.046791556477546695,
      "step": 1440
    },
    {
      "epoch": 2.32,
      "grad_norm": 0.07421662658452988,
      "learning_rate": 4.668493150684932e-05,
      "loss": 0.05277963280677796,
      "step": 1450
    },
    {
      "epoch": 2.336,
      "grad_norm": 0.07298663258552551,
      "learning_rate": 4.558904109589041e-05,
      "loss": 0.05688644647598266,
      "step": 1460
    },
    {
      "epoch": 2.352,
      "grad_norm": 0.09492790699005127,
      "learning_rate": 4.4493150684931515e-05,
      "loss": 0.047478115558624266,
      "step": 1470
    },
    {
      "epoch": 2.368,
      "grad_norm": 0.06642141193151474,
      "learning_rate": 4.3397260273972606e-05,
      "loss": 0.06602519750595093,
      "step": 1480
    },
    {
      "epoch": 2.384,
      "grad_norm": 0.07402130216360092,
      "learning_rate": 4.2301369863013704e-05,
      "loss": 0.05580626130104065,
      "step": 1490
    },
    {
      "epoch": 2.4,
      "grad_norm": 0.06669177114963531,
      "learning_rate": 4.1205479452054795e-05,
      "loss": 0.05892255902290344,
      "step": 1500
    },
    {
      "epoch": 2.416,
      "grad_norm": 0.06874891370534897,
      "learning_rate": 4.0109589041095893e-05,
      "loss": 0.051643162965774536,
      "step": 1510
    },
    {
      "epoch": 2.432,
      "grad_norm": 0.0800771415233612,
      "learning_rate": 3.9013698630136985e-05,
      "loss": 0.05913302302360535,
      "step": 1520
    },
    {
      "epoch": 2.448,
      "grad_norm": 0.07515449076890945,
      "learning_rate": 3.791780821917808e-05,
      "loss": 0.05278569459915161,
      "step": 1530
    },
    {
      "epoch": 2.464,
      "grad_norm": 0.07199724018573761,
      "learning_rate": 3.682191780821918e-05,
      "loss": 0.06340099573135376,
      "step": 1540
    },
    {
      "epoch": 2.48,
      "grad_norm": 0.0678173080086708,
      "learning_rate": 3.572602739726028e-05,
      "loss": 0.05762805938720703,
      "step": 1550
    },
    {
      "epoch": 2.496,
      "grad_norm": 0.06555041670799255,
      "learning_rate": 3.463013698630137e-05,
      "loss": 0.05636816024780274,
      "step": 1560
    },
    {
      "epoch": 2.512,
      "grad_norm": 0.06559967249631882,
      "learning_rate": 3.353424657534247e-05,
      "loss": 0.06568785905838012,
      "step": 1570
    },
    {
      "epoch": 2.528,
      "grad_norm": 0.06496980041265488,
      "learning_rate": 3.2438356164383566e-05,
      "loss": 0.05101228356361389,
      "step": 1580
    },
    {
      "epoch": 2.544,
      "grad_norm": 0.07682781666517258,
      "learning_rate": 3.134246575342466e-05,
      "loss": 0.0528583824634552,
      "step": 1590
    },
    {
      "epoch": 2.56,
      "grad_norm": 0.07121063023805618,
      "learning_rate": 3.0246575342465755e-05,
      "loss": 0.058035969734191895,
      "step": 1600
    },
    {
      "epoch": 2.576,
      "grad_norm": 0.06801185756921768,
      "learning_rate": 2.915068493150685e-05,
      "loss": 0.06011275053024292,
      "step": 1610
    },
    {
      "epoch": 2.592,
      "grad_norm": 0.07581860572099686,
      "learning_rate": 2.8054794520547945e-05,
      "loss": 0.05654975771903992,
      "step": 1620
    },
    {
      "epoch": 2.608,
      "grad_norm": 0.0737839862704277,
      "learning_rate": 2.6958904109589046e-05,
      "loss": 0.056451690196990964,
      "step": 1630
    },
    {
      "epoch": 2.624,
      "grad_norm": 0.07139196991920471,
      "learning_rate": 2.586301369863014e-05,
      "loss": 0.04597268998622894,
      "step": 1640
    },
    {
      "epoch": 2.64,
      "grad_norm": 0.06762823462486267,
      "learning_rate": 2.4767123287671235e-05,
      "loss": 0.05820190906524658,
      "step": 1650
    },
    {
      "epoch": 2.656,
      "grad_norm": 0.0811057910323143,
      "learning_rate": 2.367123287671233e-05,
      "loss": 0.05836214423179627,
      "step": 1660
    },
    {
      "epoch": 2.672,
      "grad_norm": 0.06706267595291138,
      "learning_rate": 2.2575342465753428e-05,
      "loss": 0.06739939451217651,
      "step": 1670
    },
    {
      "epoch": 2.6879999999999997,
      "grad_norm": 0.08226553350687027,
      "learning_rate": 2.1479452054794523e-05,
      "loss": 0.0494617223739624,
      "step": 1680
    },
    {
      "epoch": 2.7039999999999997,
      "grad_norm": 0.06992533802986145,
      "learning_rate": 2.0383561643835617e-05,
      "loss": 0.04956637024879455,
      "step": 1690
    },
    {
      "epoch": 2.7199999999999998,
      "grad_norm": 0.07530827820301056,
      "learning_rate": 1.9287671232876715e-05,
      "loss": 0.05145506858825684,
      "step": 1700
    },
    {
      "epoch": 2.7359999999999998,
      "grad_norm": 0.08654113113880157,
      "learning_rate": 1.819178082191781e-05,
      "loss": 0.06389129757881165,
      "step": 1710
    },
    {
      "epoch": 2.752,
      "grad_norm": 0.07525806874036789,
      "learning_rate": 1.7095890410958905e-05,
      "loss": 0.05270478129386902,
      "step": 1720
    },
    {
      "epoch": 2.768,
      "grad_norm": 0.07091408967971802,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 0.04515470564365387,
      "step": 1730
    },
    {
      "epoch": 2.784,
      "grad_norm": 0.08608058840036392,
      "learning_rate": 1.4904109589041096e-05,
      "loss": 0.06389402747154235,
      "step": 1740
    },
    {
      "epoch": 2.8,
      "grad_norm": 0.08408579975366592,
      "learning_rate": 1.3808219178082194e-05,
      "loss": 0.0562017560005188,
      "step": 1750
    },
    {
      "epoch": 2.816,
      "grad_norm": 0.07602408528327942,
      "learning_rate": 1.2712328767123288e-05,
      "loss": 0.06580867767333984,
      "step": 1760
    },
    {
      "epoch": 2.832,
      "grad_norm": 0.07258498668670654,
      "learning_rate": 1.1616438356164385e-05,
      "loss": 0.06434884071350097,
      "step": 1770
    },
    {
      "epoch": 2.848,
      "grad_norm": 0.07658011466264725,
      "learning_rate": 1.0520547945205481e-05,
      "loss": 0.06852009296417236,
      "step": 1780
    },
    {
      "epoch": 2.864,
      "grad_norm": 0.08555035293102264,
      "learning_rate": 9.424657534246576e-06,
      "loss": 0.05706756114959717,
      "step": 1790
    },
    {
      "epoch": 2.88,
      "grad_norm": 0.07567881047725677,
      "learning_rate": 8.328767123287672e-06,
      "loss": 0.05250921845436096,
      "step": 1800
    },
    {
      "epoch": 2.896,
      "grad_norm": 0.07079999893903732,
      "learning_rate": 7.232876712328767e-06,
      "loss": 0.056053298711776736,
      "step": 1810
    },
    {
      "epoch": 2.912,
      "grad_norm": 0.06325879693031311,
      "learning_rate": 6.136986301369863e-06,
      "loss": 0.0492043673992157,
      "step": 1820
    },
    {
      "epoch": 2.928,
      "grad_norm": 0.08406194299459457,
      "learning_rate": 5.041095890410959e-06,
      "loss": 0.06466425061225892,
      "step": 1830
    },
    {
      "epoch": 2.944,
      "grad_norm": 0.07006578147411346,
      "learning_rate": 3.945205479452055e-06,
      "loss": 0.06320858597755433,
      "step": 1840
    },
    {
      "epoch": 2.96,
      "grad_norm": 0.06405450403690338,
      "learning_rate": 2.8493150684931506e-06,
      "loss": 0.05880612134933472,
      "step": 1850
    },
    {
      "epoch": 2.976,
      "grad_norm": 0.06825050711631775,
      "learning_rate": 1.7534246575342465e-06,
      "loss": 0.06181260943412781,
      "step": 1860
    },
    {
      "epoch": 2.992,
      "grad_norm": 0.07468807697296143,
      "learning_rate": 6.575342465753426e-07,
      "loss": 0.07073507905006408,
      "step": 1870
    }
  ],
  "logging_steps": 10,
  "max_steps": 1875,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.5924415770723533e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}