{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 625, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": 2.508913278579712, "learning_rate": 3.6e-05, "loss": 2.2622838973999024, "step": 10 }, { "epoch": 0.032, "grad_norm": 1.3637374639511108, "learning_rate": 7.6e-05, "loss": 1.4729194641113281, "step": 20 }, { "epoch": 0.048, "grad_norm": 0.8274662494659424, "learning_rate": 0.000116, "loss": 0.5348126411437988, "step": 30 }, { "epoch": 0.064, "grad_norm": 0.24004890024662018, "learning_rate": 0.00015600000000000002, "loss": 0.24594340324401856, "step": 40 }, { "epoch": 0.08, "grad_norm": 0.21569570899009705, "learning_rate": 0.000196, "loss": 0.14221376180648804, "step": 50 }, { "epoch": 0.096, "grad_norm": 0.25074923038482666, "learning_rate": 0.00019901369863013698, "loss": 0.1053991436958313, "step": 60 }, { "epoch": 0.112, "grad_norm": 0.17641063034534454, "learning_rate": 0.0001979178082191781, "loss": 0.10523500442504882, "step": 70 }, { "epoch": 0.128, "grad_norm": 0.3789774477481842, "learning_rate": 0.0001968219178082192, "loss": 0.10122023820877075, "step": 80 }, { "epoch": 0.144, "grad_norm": 0.17485778033733368, "learning_rate": 0.00019572602739726029, "loss": 0.09140864610671998, "step": 90 }, { "epoch": 0.16, "grad_norm": 0.1414472460746765, "learning_rate": 0.00019463013698630137, "loss": 0.0925188422203064, "step": 100 }, { "epoch": 0.176, "grad_norm": 0.15549571812152863, "learning_rate": 0.00019353424657534248, "loss": 0.07405711412429809, "step": 110 }, { "epoch": 0.192, "grad_norm": 0.1726241111755371, "learning_rate": 0.00019243835616438357, "loss": 0.08338193893432617, "step": 120 }, { "epoch": 0.208, "grad_norm": 0.16814149916172028, "learning_rate": 0.00019134246575342468, "loss": 0.0827404260635376, "step": 130 }, { "epoch": 0.224, "grad_norm": 0.10934246331453323, "learning_rate": 0.00019024657534246576, "loss": 0.08371676802635193, "step": 140 }, { "epoch": 0.24, "grad_norm": 0.15543144941329956, "learning_rate": 0.00018915068493150685, "loss": 0.07682002186775208, "step": 150 }, { "epoch": 0.256, "grad_norm": 0.1047728955745697, "learning_rate": 0.00018805479452054796, "loss": 0.08323028087615966, "step": 160 }, { "epoch": 0.272, "grad_norm": 0.15727756917476654, "learning_rate": 0.00018695890410958904, "loss": 0.0881002426147461, "step": 170 }, { "epoch": 0.288, "grad_norm": 0.10985921323299408, "learning_rate": 0.00018586301369863015, "loss": 0.0725629210472107, "step": 180 }, { "epoch": 0.304, "grad_norm": 0.14681079983711243, "learning_rate": 0.00018476712328767124, "loss": 0.07297256588935852, "step": 190 }, { "epoch": 0.32, "grad_norm": 0.11168694496154785, "learning_rate": 0.00018367123287671232, "loss": 0.07204994559288025, "step": 200 }, { "epoch": 0.336, "grad_norm": 0.14472435414791107, "learning_rate": 0.00018257534246575343, "loss": 0.07219824194908142, "step": 210 }, { "epoch": 0.352, "grad_norm": 0.10257100313901901, "learning_rate": 0.00018147945205479452, "loss": 0.08176417350769043, "step": 220 }, { "epoch": 0.368, "grad_norm": 0.15164950489997864, "learning_rate": 0.00018038356164383563, "loss": 0.080460923910141, "step": 230 }, { "epoch": 0.384, "grad_norm": 0.17651614546775818, "learning_rate": 0.00017928767123287674, "loss": 0.0750627338886261, "step": 240 }, { "epoch": 0.4, "grad_norm": 0.10908259451389313, "learning_rate": 0.0001781917808219178, "loss": 0.06641974449157714, "step": 250 }, { "epoch": 0.416, "grad_norm": 0.11685926467180252, "learning_rate": 0.0001770958904109589, "loss": 0.06228730082511902, "step": 260 }, { "epoch": 0.432, "grad_norm": 0.13672174513339996, "learning_rate": 0.00017600000000000002, "loss": 0.06748496294021607, "step": 270 }, { "epoch": 0.448, "grad_norm": 0.12294622510671616, "learning_rate": 0.0001749041095890411, "loss": 0.07605534791946411, "step": 280 }, { "epoch": 0.464, "grad_norm": 0.07749421894550323, "learning_rate": 0.00017380821917808222, "loss": 0.06066908836364746, "step": 290 }, { "epoch": 0.48, "grad_norm": 0.15694420039653778, "learning_rate": 0.00017271232876712328, "loss": 0.06470143795013428, "step": 300 }, { "epoch": 0.496, "grad_norm": 0.09705078601837158, "learning_rate": 0.0001716164383561644, "loss": 0.05920176506042481, "step": 310 }, { "epoch": 0.512, "grad_norm": 0.09835303574800491, "learning_rate": 0.0001705205479452055, "loss": 0.08239805102348327, "step": 320 }, { "epoch": 0.528, "grad_norm": 0.0981152132153511, "learning_rate": 0.00016942465753424658, "loss": 0.05663549304008484, "step": 330 }, { "epoch": 0.544, "grad_norm": 0.07651517540216446, "learning_rate": 0.0001683287671232877, "loss": 0.06320589184761047, "step": 340 }, { "epoch": 0.56, "grad_norm": 0.10856720060110092, "learning_rate": 0.00016723287671232878, "loss": 0.07581254243850707, "step": 350 }, { "epoch": 0.576, "grad_norm": 0.0863386020064354, "learning_rate": 0.00016613698630136986, "loss": 0.0790505051612854, "step": 360 }, { "epoch": 0.592, "grad_norm": 0.10928363353013992, "learning_rate": 0.00016504109589041098, "loss": 0.06397929787635803, "step": 370 }, { "epoch": 0.608, "grad_norm": 0.11172884702682495, "learning_rate": 0.00016394520547945206, "loss": 0.05513489246368408, "step": 380 }, { "epoch": 0.624, "grad_norm": 0.09518434852361679, "learning_rate": 0.00016284931506849317, "loss": 0.07740641236305237, "step": 390 }, { "epoch": 0.64, "grad_norm": 0.10455268621444702, "learning_rate": 0.00016175342465753426, "loss": 0.07365262508392334, "step": 400 }, { "epoch": 0.656, "grad_norm": 0.0962410569190979, "learning_rate": 0.00016065753424657534, "loss": 0.08518975973129272, "step": 410 }, { "epoch": 0.672, "grad_norm": 0.12104412168264389, "learning_rate": 0.00015956164383561645, "loss": 0.06871490478515625, "step": 420 }, { "epoch": 0.688, "grad_norm": 0.10041218250989914, "learning_rate": 0.00015846575342465754, "loss": 0.051221036911010744, "step": 430 }, { "epoch": 0.704, "grad_norm": 0.08319935947656631, "learning_rate": 0.00015736986301369865, "loss": 0.06872759461402893, "step": 440 }, { "epoch": 0.72, "grad_norm": 0.078139528632164, "learning_rate": 0.00015627397260273973, "loss": 0.07016033530235291, "step": 450 }, { "epoch": 0.736, "grad_norm": 0.10938999056816101, "learning_rate": 0.00015517808219178082, "loss": 0.06500827074050904, "step": 460 }, { "epoch": 0.752, "grad_norm": 0.07604733109474182, "learning_rate": 0.00015408219178082193, "loss": 0.06083506941795349, "step": 470 }, { "epoch": 0.768, "grad_norm": 0.0853394940495491, "learning_rate": 0.00015298630136986304, "loss": 0.059677237272262575, "step": 480 }, { "epoch": 0.784, "grad_norm": 0.09151621907949448, "learning_rate": 0.0001518904109589041, "loss": 0.05737585425376892, "step": 490 }, { "epoch": 0.8, "grad_norm": 0.0884072557091713, "learning_rate": 0.0001507945205479452, "loss": 0.0566044270992279, "step": 500 }, { "epoch": 0.816, "grad_norm": 0.091744065284729, "learning_rate": 0.0001496986301369863, "loss": 0.06913858652114868, "step": 510 }, { "epoch": 0.832, "grad_norm": 0.07623863965272903, "learning_rate": 0.0001486027397260274, "loss": 0.06345137357711791, "step": 520 }, { "epoch": 0.848, "grad_norm": 0.07791073620319366, "learning_rate": 0.00014750684931506852, "loss": 0.060628962516784665, "step": 530 }, { "epoch": 0.864, "grad_norm": 0.08502475172281265, "learning_rate": 0.00014641095890410957, "loss": 0.068820059299469, "step": 540 }, { "epoch": 0.88, "grad_norm": 0.09597698599100113, "learning_rate": 0.00014531506849315069, "loss": 0.0585732638835907, "step": 550 }, { "epoch": 0.896, "grad_norm": 0.09175119549036026, "learning_rate": 0.0001442191780821918, "loss": 0.067873615026474, "step": 560 }, { "epoch": 0.912, "grad_norm": 0.10440277308225632, "learning_rate": 0.00014312328767123288, "loss": 0.06306946873664857, "step": 570 }, { "epoch": 0.928, "grad_norm": 0.08166486769914627, "learning_rate": 0.000142027397260274, "loss": 0.06535319089889527, "step": 580 }, { "epoch": 0.944, "grad_norm": 0.09520258009433746, "learning_rate": 0.00014093150684931508, "loss": 0.06487776637077332, "step": 590 }, { "epoch": 0.96, "grad_norm": 0.08356442302465439, "learning_rate": 0.00013983561643835616, "loss": 0.0673000991344452, "step": 600 }, { "epoch": 0.976, "grad_norm": 0.10857579857110977, "learning_rate": 0.00013873972602739727, "loss": 0.06675973534584045, "step": 610 }, { "epoch": 0.992, "grad_norm": 0.0846136212348938, "learning_rate": 0.00013764383561643836, "loss": 0.056840169429779056, "step": 620 } ], "logging_steps": 10, "max_steps": 1875, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.315045223832781e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }