{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "grad_norm": 0.7171631455421448, "learning_rate": 0.00017999999999999998, "loss": 0.738, "step": 10 }, { "epoch": 0.16, "grad_norm": 0.6291916370391846, "learning_rate": 0.00038, "loss": 0.576, "step": 20 }, { "epoch": 0.24, "grad_norm": 0.711400032043457, "learning_rate": 0.00058, "loss": 0.5621, "step": 30 }, { "epoch": 0.32, "grad_norm": 0.7226306200027466, "learning_rate": 0.0007800000000000001, "loss": 0.5347, "step": 40 }, { "epoch": 0.4, "grad_norm": 0.7293037176132202, "learning_rate": 0.00098, "loss": 0.5287, "step": 50 }, { "epoch": 0.48, "grad_norm": 1.492329478263855, "learning_rate": 0.0009990133642141358, "loss": 0.5951, "step": 60 }, { "epoch": 0.56, "grad_norm": 0.9376721978187561, "learning_rate": 0.0009956077701257708, "loss": 0.5181, "step": 70 }, { "epoch": 0.64, "grad_norm": 0.8607097268104553, "learning_rate": 0.000989787624799672, "loss": 0.5919, "step": 80 }, { "epoch": 0.72, "grad_norm": 1.1687450408935547, "learning_rate": 0.0009815812833988292, "loss": 0.6227, "step": 90 }, { "epoch": 0.8, "grad_norm": 0.9768692851066589, "learning_rate": 0.0009710287263936483, "loss": 0.5733, "step": 100 }, { "epoch": 0.88, "grad_norm": 1.0083130598068237, "learning_rate": 0.0009581813647811198, "loss": 0.5704, "step": 110 }, { "epoch": 0.96, "grad_norm": 1.0848993062973022, "learning_rate": 0.0009431017896156073, "loss": 0.5714, "step": 120 }, { "epoch": 1.04, "grad_norm": 1.2620114088058472, "learning_rate": 0.0009258634670715238, "loss": 0.5629, "step": 130 }, { "epoch": 1.12, "grad_norm": 0.8126377463340759, "learning_rate": 0.0009065503805235138, "loss": 0.4443, "step": 140 }, { "epoch": 1.2, "grad_norm": 1.2117948532104492, "learning_rate": 0.0008852566213878947, "loss": 0.4317, "step": 150 }, { "epoch": 1.28, "grad_norm": 0.9739482402801514, "learning_rate": 0.0008620859307187339, "loss": 0.458, "step": 160 }, { "epoch": 1.3599999999999999, "grad_norm": 1.2082128524780273, "learning_rate": 0.0008371511937918616, "loss": 0.4182, "step": 170 }, { "epoch": 1.44, "grad_norm": 1.371351957321167, "learning_rate": 0.0008105738901391552, "loss": 0.489, "step": 180 }, { "epoch": 1.52, "grad_norm": 1.154181957244873, "learning_rate": 0.0007824835017124689, "loss": 0.4378, "step": 190 }, { "epoch": 1.6, "grad_norm": 1.1466692686080933, "learning_rate": 0.0007530168820605818, "loss": 0.456, "step": 200 }, { "epoch": 1.6800000000000002, "grad_norm": 1.0234856605529785, "learning_rate": 0.0007223175895924637, "loss": 0.4724, "step": 210 }, { "epoch": 1.76, "grad_norm": 1.0541609525680542, "learning_rate": 0.0006905351881751372, "loss": 0.393, "step": 220 }, { "epoch": 1.8399999999999999, "grad_norm": 1.285465121269226, "learning_rate": 0.0006578245184735513, "loss": 0.4404, "step": 230 }, { "epoch": 1.92, "grad_norm": 1.226035714149475, "learning_rate": 0.0006243449435824276, "loss": 0.4397, "step": 240 }, { "epoch": 2.0, "grad_norm": 1.3251399993896484, "learning_rate": 0.0005902595726252801, "loss": 0.4566, "step": 250 }, { "epoch": 2.08, "grad_norm": 1.7789580821990967, "learning_rate": 0.0005557344661031627, "loss": 0.2441, "step": 260 }, { "epoch": 2.16, "grad_norm": 1.0361311435699463, "learning_rate": 0.0005209378268645998, "loss": 0.254, "step": 270 }, { "epoch": 2.24, "grad_norm": 0.9719555377960205, "learning_rate": 0.00048603918063821566, "loss": 0.2456, "step": 280 }, { "epoch": 2.32, "grad_norm": 1.2225875854492188, "learning_rate": 0.0004512085501204253, "loss": 0.2154, "step": 290 }, { "epoch": 2.4, "grad_norm": 0.8809466361999512, "learning_rate": 0.0004166156266419489, "loss": 0.2113, "step": 300 }, { "epoch": 2.48, "grad_norm": 1.0200402736663818, "learning_rate": 0.000382428943448705, "loss": 0.2338, "step": 310 }, { "epoch": 2.56, "grad_norm": 1.0294510126113892, "learning_rate": 0.00034881505462477783, "loss": 0.225, "step": 320 }, { "epoch": 2.64, "grad_norm": 1.0166362524032593, "learning_rate": 0.00031593772365766105, "loss": 0.2494, "step": 330 }, { "epoch": 2.7199999999999998, "grad_norm": 1.0922179222106934, "learning_rate": 0.0002839571255990088, "loss": 0.2047, "step": 340 }, { "epoch": 2.8, "grad_norm": 1.2643201351165771, "learning_rate": 0.0002530290667078846, "loss": 0.212, "step": 350 }, { "epoch": 2.88, "grad_norm": 0.8040714859962463, "learning_rate": 0.000223304225378328, "loss": 0.2274, "step": 360 }, { "epoch": 2.96, "grad_norm": 0.9333784580230713, "learning_rate": 0.00019492741804936621, "loss": 0.2093, "step": 370 }, { "epoch": 3.04, "grad_norm": 0.5495390892028809, "learning_rate": 0.0001680368936738792, "loss": 0.1401, "step": 380 }, { "epoch": 3.12, "grad_norm": 0.7462188601493835, "learning_rate": 0.00014276366018359842, "loss": 0.0869, "step": 390 }, { "epoch": 3.2, "grad_norm": 0.5287328958511353, "learning_rate": 0.00011923084623163172, "loss": 0.0684, "step": 400 }, { "epoch": 3.2800000000000002, "grad_norm": 0.952439546585083, "learning_rate": 9.755310132204298e-05, "loss": 0.0752, "step": 410 }, { "epoch": 3.36, "grad_norm": 0.6919301152229309, "learning_rate": 7.783603724899258e-05, "loss": 0.066, "step": 420 }, { "epoch": 3.44, "grad_norm": 0.6896870732307434, "learning_rate": 6.0175713566691824e-05, "loss": 0.0645, "step": 430 }, { "epoch": 3.52, "grad_norm": 0.8325856328010559, "learning_rate": 4.465816959691149e-05, "loss": 0.0787, "step": 440 }, { "epoch": 3.6, "grad_norm": 0.7463304400444031, "learning_rate": 3.1359005254054274e-05, "loss": 0.072, "step": 450 }, { "epoch": 3.68, "grad_norm": 0.7797716856002808, "learning_rate": 2.0343012729971243e-05, "loss": 0.0833, "step": 460 }, { "epoch": 3.76, "grad_norm": 1.173261284828186, "learning_rate": 1.166386083291604e-05, "loss": 0.0653, "step": 470 }, { "epoch": 3.84, "grad_norm": 0.7136673331260681, "learning_rate": 5.363833518505834e-06, "loss": 0.06, "step": 480 }, { "epoch": 3.92, "grad_norm": 0.7013105750083923, "learning_rate": 1.4736238865398766e-06, "loss": 0.0738, "step": 490 }, { "epoch": 4.0, "grad_norm": 0.6790759563446045, "learning_rate": 1.2184647302626584e-08, "loss": 0.075, "step": 500 }, { "epoch": 4.0, "step": 500, "total_flos": 5591743584010240.0, "train_loss": 0.3324708139896393, "train_runtime": 676.5273, "train_samples_per_second": 5.913, "train_steps_per_second": 0.739 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5591743584010240.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }