{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 625,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.016,
      "grad_norm": 2.508913278579712,
      "learning_rate": 3.6e-05,
      "loss": 2.2622838973999024,
      "step": 10
    },
    {
      "epoch": 0.032,
      "grad_norm": 1.3637374639511108,
      "learning_rate": 7.6e-05,
      "loss": 1.4729194641113281,
      "step": 20
    },
    {
      "epoch": 0.048,
      "grad_norm": 0.8274662494659424,
      "learning_rate": 0.000116,
      "loss": 0.5348126411437988,
      "step": 30
    },
    {
      "epoch": 0.064,
      "grad_norm": 0.24004890024662018,
      "learning_rate": 0.00015600000000000002,
      "loss": 0.24594340324401856,
      "step": 40
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.21569570899009705,
      "learning_rate": 0.000196,
      "loss": 0.14221376180648804,
      "step": 50
    },
    {
      "epoch": 0.096,
      "grad_norm": 0.25074923038482666,
      "learning_rate": 0.00019901369863013698,
      "loss": 0.1053991436958313,
      "step": 60
    },
    {
      "epoch": 0.112,
      "grad_norm": 0.17641063034534454,
      "learning_rate": 0.0001979178082191781,
      "loss": 0.10523500442504882,
      "step": 70
    },
    {
      "epoch": 0.128,
      "grad_norm": 0.3789774477481842,
      "learning_rate": 0.0001968219178082192,
      "loss": 0.10122023820877075,
      "step": 80
    },
    {
      "epoch": 0.144,
      "grad_norm": 0.17485778033733368,
      "learning_rate": 0.00019572602739726029,
      "loss": 0.09140864610671998,
      "step": 90
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.1414472460746765,
      "learning_rate": 0.00019463013698630137,
      "loss": 0.0925188422203064,
      "step": 100
    },
    {
      "epoch": 0.176,
      "grad_norm": 0.15549571812152863,
      "learning_rate": 0.00019353424657534248,
      "loss": 0.07405711412429809,
      "step": 110
    },
    {
      "epoch": 0.192,
      "grad_norm": 0.1726241111755371,
      "learning_rate": 0.00019243835616438357,
      "loss": 0.08338193893432617,
      "step": 120
    },
    {
      "epoch": 0.208,
      "grad_norm": 0.16814149916172028,
      "learning_rate": 0.00019134246575342468,
      "loss": 0.0827404260635376,
      "step": 130
    },
    {
      "epoch": 0.224,
      "grad_norm": 0.10934246331453323,
      "learning_rate": 0.00019024657534246576,
      "loss": 0.08371676802635193,
      "step": 140
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.15543144941329956,
      "learning_rate": 0.00018915068493150685,
      "loss": 0.07682002186775208,
      "step": 150
    },
    {
      "epoch": 0.256,
      "grad_norm": 0.1047728955745697,
      "learning_rate": 0.00018805479452054796,
      "loss": 0.08323028087615966,
      "step": 160
    },
    {
      "epoch": 0.272,
      "grad_norm": 0.15727756917476654,
      "learning_rate": 0.00018695890410958904,
      "loss": 0.0881002426147461,
      "step": 170
    },
    {
      "epoch": 0.288,
      "grad_norm": 0.10985921323299408,
      "learning_rate": 0.00018586301369863015,
      "loss": 0.0725629210472107,
      "step": 180
    },
    {
      "epoch": 0.304,
      "grad_norm": 0.14681079983711243,
      "learning_rate": 0.00018476712328767124,
      "loss": 0.07297256588935852,
      "step": 190
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.11168694496154785,
      "learning_rate": 0.00018367123287671232,
      "loss": 0.07204994559288025,
      "step": 200
    },
    {
      "epoch": 0.336,
      "grad_norm": 0.14472435414791107,
      "learning_rate": 0.00018257534246575343,
      "loss": 0.07219824194908142,
      "step": 210
    },
    {
      "epoch": 0.352,
      "grad_norm": 0.10257100313901901,
      "learning_rate": 0.00018147945205479452,
      "loss": 0.08176417350769043,
      "step": 220
    },
    {
      "epoch": 0.368,
      "grad_norm": 0.15164950489997864,
      "learning_rate": 0.00018038356164383563,
      "loss": 0.080460923910141,
      "step": 230
    },
    {
      "epoch": 0.384,
      "grad_norm": 0.17651614546775818,
      "learning_rate": 0.00017928767123287674,
      "loss": 0.0750627338886261,
      "step": 240
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.10908259451389313,
      "learning_rate": 0.0001781917808219178,
      "loss": 0.06641974449157714,
      "step": 250
    },
    {
      "epoch": 0.416,
      "grad_norm": 0.11685926467180252,
      "learning_rate": 0.0001770958904109589,
      "loss": 0.06228730082511902,
      "step": 260
    },
    {
      "epoch": 0.432,
      "grad_norm": 0.13672174513339996,
      "learning_rate": 0.00017600000000000002,
      "loss": 0.06748496294021607,
      "step": 270
    },
    {
      "epoch": 0.448,
      "grad_norm": 0.12294622510671616,
      "learning_rate": 0.0001749041095890411,
      "loss": 0.07605534791946411,
      "step": 280
    },
    {
      "epoch": 0.464,
      "grad_norm": 0.07749421894550323,
      "learning_rate": 0.00017380821917808222,
      "loss": 0.06066908836364746,
      "step": 290
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.15694420039653778,
      "learning_rate": 0.00017271232876712328,
      "loss": 0.06470143795013428,
      "step": 300
    },
    {
      "epoch": 0.496,
      "grad_norm": 0.09705078601837158,
      "learning_rate": 0.0001716164383561644,
      "loss": 0.05920176506042481,
      "step": 310
    },
    {
      "epoch": 0.512,
      "grad_norm": 0.09835303574800491,
      "learning_rate": 0.0001705205479452055,
      "loss": 0.08239805102348327,
      "step": 320
    },
    {
      "epoch": 0.528,
      "grad_norm": 0.0981152132153511,
      "learning_rate": 0.00016942465753424658,
      "loss": 0.05663549304008484,
      "step": 330
    },
    {
      "epoch": 0.544,
      "grad_norm": 0.07651517540216446,
      "learning_rate": 0.0001683287671232877,
      "loss": 0.06320589184761047,
      "step": 340
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.10856720060110092,
      "learning_rate": 0.00016723287671232878,
      "loss": 0.07581254243850707,
      "step": 350
    },
    {
      "epoch": 0.576,
      "grad_norm": 0.0863386020064354,
      "learning_rate": 0.00016613698630136986,
      "loss": 0.0790505051612854,
      "step": 360
    },
    {
      "epoch": 0.592,
      "grad_norm": 0.10928363353013992,
      "learning_rate": 0.00016504109589041098,
      "loss": 0.06397929787635803,
      "step": 370
    },
    {
      "epoch": 0.608,
      "grad_norm": 0.11172884702682495,
      "learning_rate": 0.00016394520547945206,
      "loss": 0.05513489246368408,
      "step": 380
    },
    {
      "epoch": 0.624,
      "grad_norm": 0.09518434852361679,
      "learning_rate": 0.00016284931506849317,
      "loss": 0.07740641236305237,
      "step": 390
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.10455268621444702,
      "learning_rate": 0.00016175342465753426,
      "loss": 0.07365262508392334,
      "step": 400
    },
    {
      "epoch": 0.656,
      "grad_norm": 0.0962410569190979,
      "learning_rate": 0.00016065753424657534,
      "loss": 0.08518975973129272,
      "step": 410
    },
    {
      "epoch": 0.672,
      "grad_norm": 0.12104412168264389,
      "learning_rate": 0.00015956164383561645,
      "loss": 0.06871490478515625,
      "step": 420
    },
    {
      "epoch": 0.688,
      "grad_norm": 0.10041218250989914,
      "learning_rate": 0.00015846575342465754,
      "loss": 0.051221036911010744,
      "step": 430
    },
    {
      "epoch": 0.704,
      "grad_norm": 0.08319935947656631,
      "learning_rate": 0.00015736986301369865,
      "loss": 0.06872759461402893,
      "step": 440
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.078139528632164,
      "learning_rate": 0.00015627397260273973,
      "loss": 0.07016033530235291,
      "step": 450
    },
    {
      "epoch": 0.736,
      "grad_norm": 0.10938999056816101,
      "learning_rate": 0.00015517808219178082,
      "loss": 0.06500827074050904,
      "step": 460
    },
    {
      "epoch": 0.752,
      "grad_norm": 0.07604733109474182,
      "learning_rate": 0.00015408219178082193,
      "loss": 0.06083506941795349,
      "step": 470
    },
    {
      "epoch": 0.768,
      "grad_norm": 0.0853394940495491,
      "learning_rate": 0.00015298630136986304,
      "loss": 0.059677237272262575,
      "step": 480
    },
    {
      "epoch": 0.784,
      "grad_norm": 0.09151621907949448,
      "learning_rate": 0.0001518904109589041,
      "loss": 0.05737585425376892,
      "step": 490
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.0884072557091713,
      "learning_rate": 0.0001507945205479452,
      "loss": 0.0566044270992279,
      "step": 500
    },
    {
      "epoch": 0.816,
      "grad_norm": 0.091744065284729,
      "learning_rate": 0.0001496986301369863,
      "loss": 0.06913858652114868,
      "step": 510
    },
    {
      "epoch": 0.832,
      "grad_norm": 0.07623863965272903,
      "learning_rate": 0.0001486027397260274,
      "loss": 0.06345137357711791,
      "step": 520
    },
    {
      "epoch": 0.848,
      "grad_norm": 0.07791073620319366,
      "learning_rate": 0.00014750684931506852,
      "loss": 0.060628962516784665,
      "step": 530
    },
    {
      "epoch": 0.864,
      "grad_norm": 0.08502475172281265,
      "learning_rate": 0.00014641095890410957,
      "loss": 0.068820059299469,
      "step": 540
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.09597698599100113,
      "learning_rate": 0.00014531506849315069,
      "loss": 0.0585732638835907,
      "step": 550
    },
    {
      "epoch": 0.896,
      "grad_norm": 0.09175119549036026,
      "learning_rate": 0.0001442191780821918,
      "loss": 0.067873615026474,
      "step": 560
    },
    {
      "epoch": 0.912,
      "grad_norm": 0.10440277308225632,
      "learning_rate": 0.00014312328767123288,
      "loss": 0.06306946873664857,
      "step": 570
    },
    {
      "epoch": 0.928,
      "grad_norm": 0.08166486769914627,
      "learning_rate": 0.000142027397260274,
      "loss": 0.06535319089889527,
      "step": 580
    },
    {
      "epoch": 0.944,
      "grad_norm": 0.09520258009433746,
      "learning_rate": 0.00014093150684931508,
      "loss": 0.06487776637077332,
      "step": 590
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.08356442302465439,
      "learning_rate": 0.00013983561643835616,
      "loss": 0.0673000991344452,
      "step": 600
    },
    {
      "epoch": 0.976,
      "grad_norm": 0.10857579857110977,
      "learning_rate": 0.00013873972602739727,
      "loss": 0.06675973534584045,
      "step": 610
    },
    {
      "epoch": 0.992,
      "grad_norm": 0.0846136212348938,
      "learning_rate": 0.00013764383561643836,
      "loss": 0.056840169429779056,
      "step": 620
    }
  ],
  "logging_steps": 10,
  "max_steps": 1875,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.315045223832781e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}