{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 963, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03115264797507788, "grad_norm": 4.738249253201136, "learning_rate": 1.0309278350515464e-06, "loss": 2.1741, "step": 10 }, { "epoch": 0.06230529595015576, "grad_norm": 3.2401014424140167, "learning_rate": 2.061855670103093e-06, "loss": 2.1229, "step": 20 }, { "epoch": 0.09345794392523364, "grad_norm": 2.8415277325627266, "learning_rate": 3.0927835051546395e-06, "loss": 1.9732, "step": 30 }, { "epoch": 0.12461059190031153, "grad_norm": 2.736037961913757, "learning_rate": 4.123711340206186e-06, "loss": 1.9484, "step": 40 }, { "epoch": 0.1557632398753894, "grad_norm": 2.4003619066518302, "learning_rate": 5.154639175257732e-06, "loss": 1.8748, "step": 50 }, { "epoch": 0.18691588785046728, "grad_norm": 2.6391276979901694, "learning_rate": 6.185567010309279e-06, "loss": 1.8787, "step": 60 }, { "epoch": 0.21806853582554517, "grad_norm": 2.4942487849758694, "learning_rate": 7.216494845360825e-06, "loss": 1.8368, "step": 70 }, { "epoch": 0.24922118380062305, "grad_norm": 2.822084088437886, "learning_rate": 8.247422680412371e-06, "loss": 1.8894, "step": 80 }, { "epoch": 0.2803738317757009, "grad_norm": 2.6875520332918503, "learning_rate": 9.278350515463918e-06, "loss": 1.8245, "step": 90 }, { "epoch": 0.3115264797507788, "grad_norm": 2.6263381435665116, "learning_rate": 9.999703897419048e-06, "loss": 1.7866, "step": 100 }, { "epoch": 0.3426791277258567, "grad_norm": 2.7744012568631478, "learning_rate": 9.99444082710777e-06, "loss": 1.8159, "step": 110 }, { "epoch": 0.37383177570093457, "grad_norm": 2.6658024247528034, "learning_rate": 9.982605671302293e-06, "loss": 1.8021, "step": 120 }, { "epoch": 0.40498442367601245, "grad_norm": 2.625120262587981, "learning_rate": 9.9642140036491e-06, "loss": 1.7743, "step": 130 }, { "epoch": 0.43613707165109034, "grad_norm": 2.5142261111844966, "learning_rate": 9.93929002537839e-06, "loss": 1.7878, "step": 140 }, { "epoch": 0.4672897196261682, "grad_norm": 2.854232906840411, "learning_rate": 9.90786653345818e-06, "loss": 1.7857, "step": 150 }, { "epoch": 0.4984423676012461, "grad_norm": 2.8406719564272014, "learning_rate": 9.869984877437413e-06, "loss": 1.8149, "step": 160 }, { "epoch": 0.5295950155763239, "grad_norm": 2.700981263790333, "learning_rate": 9.82569490503491e-06, "loss": 1.7803, "step": 170 }, { "epoch": 0.5607476635514018, "grad_norm": 2.530124037261927, "learning_rate": 9.775054896545755e-06, "loss": 1.7613, "step": 180 }, { "epoch": 0.5919003115264797, "grad_norm": 2.2222205932023793, "learning_rate": 9.718131488151399e-06, "loss": 1.7738, "step": 190 }, { "epoch": 0.6230529595015576, "grad_norm": 2.5500930798593413, "learning_rate": 9.654999584234444e-06, "loss": 1.7506, "step": 200 }, { "epoch": 0.6542056074766355, "grad_norm": 2.3248583237139, "learning_rate": 9.585742258813447e-06, "loss": 1.7656, "step": 210 }, { "epoch": 0.6853582554517134, "grad_norm": 2.5583210039836257, "learning_rate": 9.51045064622747e-06, "loss": 1.7601, "step": 220 }, { "epoch": 0.7165109034267912, "grad_norm": 2.5253607291187095, "learning_rate": 9.429223821214213e-06, "loss": 1.7587, "step": 230 }, { "epoch": 0.7476635514018691, "grad_norm": 3.599519999833458, "learning_rate": 9.34216866853954e-06, "loss": 1.7371, "step": 240 }, { "epoch": 0.778816199376947, "grad_norm": 3.112140263611012, "learning_rate": 9.249399742349928e-06, "loss": 1.7399, "step": 250 }, { "epoch": 0.8099688473520249, "grad_norm": 2.460048723197972, "learning_rate": 9.151039115432946e-06, "loss": 1.7792, "step": 260 }, { "epoch": 0.8411214953271028, "grad_norm": 2.7818108539377695, "learning_rate": 9.047216218584105e-06, "loss": 1.7368, "step": 270 }, { "epoch": 0.8722741433021807, "grad_norm": 2.6593659104855902, "learning_rate": 8.93806767029143e-06, "loss": 1.7264, "step": 280 }, { "epoch": 0.9034267912772586, "grad_norm": 2.5183656701737074, "learning_rate": 8.823737096961916e-06, "loss": 1.6964, "step": 290 }, { "epoch": 0.9345794392523364, "grad_norm": 2.6798445502428265, "learning_rate": 8.704374943926386e-06, "loss": 1.765, "step": 300 }, { "epoch": 0.9657320872274143, "grad_norm": 2.5879778784700394, "learning_rate": 8.580138277471476e-06, "loss": 1.7533, "step": 310 }, { "epoch": 0.9968847352024922, "grad_norm": 3.013949460294022, "learning_rate": 8.45119057815922e-06, "loss": 1.7242, "step": 320 }, { "epoch": 1.02803738317757, "grad_norm": 2.4197649218374995, "learning_rate": 8.317701525706226e-06, "loss": 1.5794, "step": 330 }, { "epoch": 1.0591900311526479, "grad_norm": 2.9627624008639906, "learning_rate": 8.179846775705504e-06, "loss": 1.4717, "step": 340 }, { "epoch": 1.0903426791277258, "grad_norm": 2.455280291962598, "learning_rate": 8.03780772848477e-06, "loss": 1.5367, "step": 350 }, { "epoch": 1.1214953271028036, "grad_norm": 2.658888202364778, "learning_rate": 7.891771290405351e-06, "loss": 1.5353, "step": 360 }, { "epoch": 1.1526479750778815, "grad_norm": 2.8313334702049024, "learning_rate": 7.741929627915814e-06, "loss": 1.5069, "step": 370 }, { "epoch": 1.1838006230529594, "grad_norm": 2.5138362094593383, "learning_rate": 7.588479914683954e-06, "loss": 1.5002, "step": 380 }, { "epoch": 1.2149532710280373, "grad_norm": 2.653704835678763, "learning_rate": 7.431624072139884e-06, "loss": 1.5774, "step": 390 }, { "epoch": 1.2461059190031152, "grad_norm": 2.7594654733051924, "learning_rate": 7.271568503771632e-06, "loss": 1.5323, "step": 400 }, { "epoch": 1.277258566978193, "grad_norm": 2.6291361296605604, "learning_rate": 7.108523823522891e-06, "loss": 1.5193, "step": 410 }, { "epoch": 1.308411214953271, "grad_norm": 2.4716101194608635, "learning_rate": 6.942704578650312e-06, "loss": 1.5163, "step": 420 }, { "epoch": 1.3395638629283488, "grad_norm": 2.6790323258843354, "learning_rate": 6.774328967405035e-06, "loss": 1.5731, "step": 430 }, { "epoch": 1.3707165109034267, "grad_norm": 2.4220245385790635, "learning_rate": 6.603618551909935e-06, "loss": 1.4769, "step": 440 }, { "epoch": 1.4018691588785046, "grad_norm": 2.9692192247657476, "learning_rate": 6.430797966610436e-06, "loss": 1.538, "step": 450 }, { "epoch": 1.4330218068535825, "grad_norm": 2.681036765738111, "learning_rate": 6.256094622682493e-06, "loss": 1.5295, "step": 460 }, { "epoch": 1.4641744548286604, "grad_norm": 2.5464332369331233, "learning_rate": 6.079738408786753e-06, "loss": 1.5375, "step": 470 }, { "epoch": 1.4953271028037383, "grad_norm": 2.918141623936573, "learning_rate": 5.9019613885626235e-06, "loss": 1.483, "step": 480 }, { "epoch": 1.5264797507788161, "grad_norm": 2.6056089657043024, "learning_rate": 5.722997495260348e-06, "loss": 1.5378, "step": 490 }, { "epoch": 1.557632398753894, "grad_norm": 2.682345830391196, "learning_rate": 5.543082223912875e-06, "loss": 1.5105, "step": 500 }, { "epoch": 1.588785046728972, "grad_norm": 2.418071341611168, "learning_rate": 5.362452321452636e-06, "loss": 1.5186, "step": 510 }, { "epoch": 1.6199376947040498, "grad_norm": 2.7761981641005105, "learning_rate": 5.181345475180941e-06, "loss": 1.4844, "step": 520 }, { "epoch": 1.6510903426791277, "grad_norm": 2.630104175186077, "learning_rate": 5e-06, "loss": 1.5189, "step": 530 }, { "epoch": 1.6822429906542056, "grad_norm": 2.7160409762576845, "learning_rate": 4.8186545248190604e-06, "loss": 1.5063, "step": 540 }, { "epoch": 1.7133956386292835, "grad_norm": 2.7941182969523237, "learning_rate": 4.637547678547366e-06, "loss": 1.5104, "step": 550 }, { "epoch": 1.7445482866043613, "grad_norm": 2.7635210894959186, "learning_rate": 4.4569177760871255e-06, "loss": 1.4964, "step": 560 }, { "epoch": 1.7757009345794392, "grad_norm": 2.7040452036881253, "learning_rate": 4.277002504739653e-06, "loss": 1.5406, "step": 570 }, { "epoch": 1.8068535825545171, "grad_norm": 3.0310876611935678, "learning_rate": 4.098038611437377e-06, "loss": 1.5634, "step": 580 }, { "epoch": 1.838006230529595, "grad_norm": 2.8756337522810007, "learning_rate": 3.920261591213249e-06, "loss": 1.4982, "step": 590 }, { "epoch": 1.8691588785046729, "grad_norm": 2.931182866108726, "learning_rate": 3.7439053773175092e-06, "loss": 1.525, "step": 600 }, { "epoch": 1.9003115264797508, "grad_norm": 2.618724233974035, "learning_rate": 3.569202033389565e-06, "loss": 1.5296, "step": 610 }, { "epoch": 1.9314641744548287, "grad_norm": 2.4527018980750133, "learning_rate": 3.3963814480900665e-06, "loss": 1.4938, "step": 620 }, { "epoch": 1.9626168224299065, "grad_norm": 2.5593961662264753, "learning_rate": 3.225671032594966e-06, "loss": 1.5196, "step": 630 }, { "epoch": 1.9937694704049844, "grad_norm": 2.6422408984177164, "learning_rate": 3.0572954213496897e-06, "loss": 1.5159, "step": 640 }, { "epoch": 2.0249221183800623, "grad_norm": 2.2409404747264996, "learning_rate": 2.8914761764771093e-06, "loss": 1.368, "step": 650 }, { "epoch": 2.05607476635514, "grad_norm": 2.832245112153312, "learning_rate": 2.728431496228369e-06, "loss": 1.3305, "step": 660 }, { "epoch": 2.087227414330218, "grad_norm": 2.6890920333725554, "learning_rate": 2.5683759278601174e-06, "loss": 1.3275, "step": 670 }, { "epoch": 2.1183800623052957, "grad_norm": 2.7348466649941816, "learning_rate": 2.4115200853160475e-06, "loss": 1.3551, "step": 680 }, { "epoch": 2.149532710280374, "grad_norm": 2.7443347323397407, "learning_rate": 2.258070372084188e-06, "loss": 1.3282, "step": 690 }, { "epoch": 2.1806853582554515, "grad_norm": 2.58000682159267, "learning_rate": 2.108228709594649e-06, "loss": 1.3291, "step": 700 }, { "epoch": 2.2118380062305296, "grad_norm": 2.857457848907294, "learning_rate": 1.962192271515232e-06, "loss": 1.3544, "step": 710 }, { "epoch": 2.2429906542056073, "grad_norm": 2.9475901657988777, "learning_rate": 1.820153224294498e-06, "loss": 1.3479, "step": 720 }, { "epoch": 2.2741433021806854, "grad_norm": 2.2731328518925906, "learning_rate": 1.6822984742937764e-06, "loss": 1.3707, "step": 730 }, { "epoch": 2.305295950155763, "grad_norm": 2.7512861025748805, "learning_rate": 1.548809421840779e-06, "loss": 1.3778, "step": 740 }, { "epoch": 2.336448598130841, "grad_norm": 2.6460831192309184, "learning_rate": 1.4198617225285244e-06, "loss": 1.3371, "step": 750 }, { "epoch": 2.367601246105919, "grad_norm": 2.6430950689993016, "learning_rate": 1.2956250560736143e-06, "loss": 1.3529, "step": 760 }, { "epoch": 2.398753894080997, "grad_norm": 2.754383193984707, "learning_rate": 1.1762629030380867e-06, "loss": 1.3617, "step": 770 }, { "epoch": 2.4299065420560746, "grad_norm": 2.3375327059468627, "learning_rate": 1.061932329708572e-06, "loss": 1.3712, "step": 780 }, { "epoch": 2.4610591900311527, "grad_norm": 2.9641705209236675, "learning_rate": 9.527837814158963e-07, "loss": 1.3363, "step": 790 }, { "epoch": 2.4922118380062304, "grad_norm": 2.8810838394628457, "learning_rate": 8.489608845670527e-07, "loss": 1.3548, "step": 800 }, { "epoch": 2.5233644859813085, "grad_norm": 2.895341139217939, "learning_rate": 7.506002576500732e-07, "loss": 1.3483, "step": 810 }, { "epoch": 2.554517133956386, "grad_norm": 3.25688215356644, "learning_rate": 6.578313314604612e-07, "loss": 1.3509, "step": 820 }, { "epoch": 2.5856697819314642, "grad_norm": 2.265660300440991, "learning_rate": 5.707761787857879e-07, "loss": 1.3071, "step": 830 }, { "epoch": 2.616822429906542, "grad_norm": 2.866761226251221, "learning_rate": 4.895493537725326e-07, "loss": 1.3367, "step": 840 }, { "epoch": 2.64797507788162, "grad_norm": 2.623512078489181, "learning_rate": 4.1425774118655505e-07, "loss": 1.3577, "step": 850 }, { "epoch": 2.6791277258566977, "grad_norm": 2.5389678031150305, "learning_rate": 3.4500041576555733e-07, "loss": 1.3349, "step": 860 }, { "epoch": 2.710280373831776, "grad_norm": 2.7619056373906736, "learning_rate": 2.818685118486025e-07, "loss": 1.36, "step": 870 }, { "epoch": 2.7414330218068534, "grad_norm": 2.6263120768496533, "learning_rate": 2.2494510345424657e-07, "loss": 1.3853, "step": 880 }, { "epoch": 2.7725856697819315, "grad_norm": 2.60206393215365, "learning_rate": 1.7430509496508985e-07, "loss": 1.3795, "step": 890 }, { "epoch": 2.803738317757009, "grad_norm": 2.755154917816144, "learning_rate": 1.3001512256258841e-07, "loss": 1.3213, "step": 900 }, { "epoch": 2.8348909657320873, "grad_norm": 2.57799358687677, "learning_rate": 9.213346654182054e-08, "loss": 1.3557, "step": 910 }, { "epoch": 2.866043613707165, "grad_norm": 2.6277993338233108, "learning_rate": 6.070997462161055e-08, "loss": 1.3675, "step": 920 }, { "epoch": 2.897196261682243, "grad_norm": 2.3161497816289978, "learning_rate": 3.578599635090163e-08, "loss": 1.3401, "step": 930 }, { "epoch": 2.9283489096573208, "grad_norm": 2.383092955101212, "learning_rate": 1.7394328697707407e-08, "loss": 1.3287, "step": 940 }, { "epoch": 2.959501557632399, "grad_norm": 2.5504821012557124, "learning_rate": 5.5591728922316235e-09, "loss": 1.3864, "step": 950 }, { "epoch": 2.9906542056074765, "grad_norm": 2.459519377766809, "learning_rate": 2.9610258095169596e-10, "loss": 1.3746, "step": 960 }, { "epoch": 3.0, "step": 963, "total_flos": 5654321823744.0, "train_loss": 1.5619763468903791, "train_runtime": 395.7248, "train_samples_per_second": 155.411, "train_steps_per_second": 2.434 } ], "logging_steps": 10, "max_steps": 963, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5654321823744.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }