{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1875, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": 2.508913278579712, "learning_rate": 3.6e-05, "loss": 2.2622838973999024, "step": 10 }, { "epoch": 0.032, "grad_norm": 1.3637374639511108, "learning_rate": 7.6e-05, "loss": 1.4729194641113281, "step": 20 }, { "epoch": 0.048, "grad_norm": 0.8274662494659424, "learning_rate": 0.000116, "loss": 0.5348126411437988, "step": 30 }, { "epoch": 0.064, "grad_norm": 0.24004890024662018, "learning_rate": 0.00015600000000000002, "loss": 0.24594340324401856, "step": 40 }, { "epoch": 0.08, "grad_norm": 0.21569570899009705, "learning_rate": 0.000196, "loss": 0.14221376180648804, "step": 50 }, { "epoch": 0.096, "grad_norm": 0.25074923038482666, "learning_rate": 0.00019901369863013698, "loss": 0.1053991436958313, "step": 60 }, { "epoch": 0.112, "grad_norm": 0.17641063034534454, "learning_rate": 0.0001979178082191781, "loss": 0.10523500442504882, "step": 70 }, { "epoch": 0.128, "grad_norm": 0.3789774477481842, "learning_rate": 0.0001968219178082192, "loss": 0.10122023820877075, "step": 80 }, { "epoch": 0.144, "grad_norm": 0.17485778033733368, "learning_rate": 0.00019572602739726029, "loss": 0.09140864610671998, "step": 90 }, { "epoch": 0.16, "grad_norm": 0.1414472460746765, "learning_rate": 0.00019463013698630137, "loss": 0.0925188422203064, "step": 100 }, { "epoch": 0.176, "grad_norm": 0.15549571812152863, "learning_rate": 0.00019353424657534248, "loss": 0.07405711412429809, "step": 110 }, { "epoch": 0.192, "grad_norm": 0.1726241111755371, "learning_rate": 0.00019243835616438357, "loss": 0.08338193893432617, "step": 120 }, { "epoch": 0.208, "grad_norm": 0.16814149916172028, "learning_rate": 0.00019134246575342468, "loss": 0.0827404260635376, "step": 130 }, { "epoch": 0.224, "grad_norm": 0.10934246331453323, "learning_rate": 0.00019024657534246576, "loss": 0.08371676802635193, "step": 140 }, { "epoch": 0.24, "grad_norm": 0.15543144941329956, "learning_rate": 0.00018915068493150685, "loss": 0.07682002186775208, "step": 150 }, { "epoch": 0.256, "grad_norm": 0.1047728955745697, "learning_rate": 0.00018805479452054796, "loss": 0.08323028087615966, "step": 160 }, { "epoch": 0.272, "grad_norm": 0.15727756917476654, "learning_rate": 0.00018695890410958904, "loss": 0.0881002426147461, "step": 170 }, { "epoch": 0.288, "grad_norm": 0.10985921323299408, "learning_rate": 0.00018586301369863015, "loss": 0.0725629210472107, "step": 180 }, { "epoch": 0.304, "grad_norm": 0.14681079983711243, "learning_rate": 0.00018476712328767124, "loss": 0.07297256588935852, "step": 190 }, { "epoch": 0.32, "grad_norm": 0.11168694496154785, "learning_rate": 0.00018367123287671232, "loss": 0.07204994559288025, "step": 200 }, { "epoch": 0.336, "grad_norm": 0.14472435414791107, "learning_rate": 0.00018257534246575343, "loss": 0.07219824194908142, "step": 210 }, { "epoch": 0.352, "grad_norm": 0.10257100313901901, "learning_rate": 0.00018147945205479452, "loss": 0.08176417350769043, "step": 220 }, { "epoch": 0.368, "grad_norm": 0.15164950489997864, "learning_rate": 0.00018038356164383563, "loss": 0.080460923910141, "step": 230 }, { "epoch": 0.384, "grad_norm": 0.17651614546775818, "learning_rate": 0.00017928767123287674, "loss": 0.0750627338886261, "step": 240 }, { "epoch": 0.4, "grad_norm": 0.10908259451389313, "learning_rate": 0.0001781917808219178, "loss": 0.06641974449157714, "step": 250 }, { "epoch": 0.416, "grad_norm": 0.11685926467180252, "learning_rate": 0.0001770958904109589, "loss": 0.06228730082511902, "step": 260 }, { "epoch": 0.432, "grad_norm": 0.13672174513339996, "learning_rate": 0.00017600000000000002, "loss": 0.06748496294021607, "step": 270 }, { "epoch": 0.448, "grad_norm": 0.12294622510671616, "learning_rate": 0.0001749041095890411, "loss": 0.07605534791946411, "step": 280 }, { "epoch": 0.464, "grad_norm": 0.07749421894550323, "learning_rate": 0.00017380821917808222, "loss": 0.06066908836364746, "step": 290 }, { "epoch": 0.48, "grad_norm": 0.15694420039653778, "learning_rate": 0.00017271232876712328, "loss": 0.06470143795013428, "step": 300 }, { "epoch": 0.496, "grad_norm": 0.09705078601837158, "learning_rate": 0.0001716164383561644, "loss": 0.05920176506042481, "step": 310 }, { "epoch": 0.512, "grad_norm": 0.09835303574800491, "learning_rate": 0.0001705205479452055, "loss": 0.08239805102348327, "step": 320 }, { "epoch": 0.528, "grad_norm": 0.0981152132153511, "learning_rate": 0.00016942465753424658, "loss": 0.05663549304008484, "step": 330 }, { "epoch": 0.544, "grad_norm": 0.07651517540216446, "learning_rate": 0.0001683287671232877, "loss": 0.06320589184761047, "step": 340 }, { "epoch": 0.56, "grad_norm": 0.10856720060110092, "learning_rate": 0.00016723287671232878, "loss": 0.07581254243850707, "step": 350 }, { "epoch": 0.576, "grad_norm": 0.0863386020064354, "learning_rate": 0.00016613698630136986, "loss": 0.0790505051612854, "step": 360 }, { "epoch": 0.592, "grad_norm": 0.10928363353013992, "learning_rate": 0.00016504109589041098, "loss": 0.06397929787635803, "step": 370 }, { "epoch": 0.608, "grad_norm": 0.11172884702682495, "learning_rate": 0.00016394520547945206, "loss": 0.05513489246368408, "step": 380 }, { "epoch": 0.624, "grad_norm": 0.09518434852361679, "learning_rate": 0.00016284931506849317, "loss": 0.07740641236305237, "step": 390 }, { "epoch": 0.64, "grad_norm": 0.10455268621444702, "learning_rate": 0.00016175342465753426, "loss": 0.07365262508392334, "step": 400 }, { "epoch": 0.656, "grad_norm": 0.0962410569190979, "learning_rate": 0.00016065753424657534, "loss": 0.08518975973129272, "step": 410 }, { "epoch": 0.672, "grad_norm": 0.12104412168264389, "learning_rate": 0.00015956164383561645, "loss": 0.06871490478515625, "step": 420 }, { "epoch": 0.688, "grad_norm": 0.10041218250989914, "learning_rate": 0.00015846575342465754, "loss": 0.051221036911010744, "step": 430 }, { "epoch": 0.704, "grad_norm": 0.08319935947656631, "learning_rate": 0.00015736986301369865, "loss": 0.06872759461402893, "step": 440 }, { "epoch": 0.72, "grad_norm": 0.078139528632164, "learning_rate": 0.00015627397260273973, "loss": 0.07016033530235291, "step": 450 }, { "epoch": 0.736, "grad_norm": 0.10938999056816101, "learning_rate": 0.00015517808219178082, "loss": 0.06500827074050904, "step": 460 }, { "epoch": 0.752, "grad_norm": 0.07604733109474182, "learning_rate": 0.00015408219178082193, "loss": 0.06083506941795349, "step": 470 }, { "epoch": 0.768, "grad_norm": 0.0853394940495491, "learning_rate": 0.00015298630136986304, "loss": 0.059677237272262575, "step": 480 }, { "epoch": 0.784, "grad_norm": 0.09151621907949448, "learning_rate": 0.0001518904109589041, "loss": 0.05737585425376892, "step": 490 }, { "epoch": 0.8, "grad_norm": 0.0884072557091713, "learning_rate": 0.0001507945205479452, "loss": 0.0566044270992279, "step": 500 }, { "epoch": 0.816, "grad_norm": 0.091744065284729, "learning_rate": 0.0001496986301369863, "loss": 0.06913858652114868, "step": 510 }, { "epoch": 0.832, "grad_norm": 0.07623863965272903, "learning_rate": 0.0001486027397260274, "loss": 0.06345137357711791, "step": 520 }, { "epoch": 0.848, "grad_norm": 0.07791073620319366, "learning_rate": 0.00014750684931506852, "loss": 0.060628962516784665, "step": 530 }, { "epoch": 0.864, "grad_norm": 0.08502475172281265, "learning_rate": 0.00014641095890410957, "loss": 0.068820059299469, "step": 540 }, { "epoch": 0.88, "grad_norm": 0.09597698599100113, "learning_rate": 0.00014531506849315069, "loss": 0.0585732638835907, "step": 550 }, { "epoch": 0.896, "grad_norm": 0.09175119549036026, "learning_rate": 0.0001442191780821918, "loss": 0.067873615026474, "step": 560 }, { "epoch": 0.912, "grad_norm": 0.10440277308225632, "learning_rate": 0.00014312328767123288, "loss": 0.06306946873664857, "step": 570 }, { "epoch": 0.928, "grad_norm": 0.08166486769914627, "learning_rate": 0.000142027397260274, "loss": 0.06535319089889527, "step": 580 }, { "epoch": 0.944, "grad_norm": 0.09520258009433746, "learning_rate": 0.00014093150684931508, "loss": 0.06487776637077332, "step": 590 }, { "epoch": 0.96, "grad_norm": 0.08356442302465439, "learning_rate": 0.00013983561643835616, "loss": 0.0673000991344452, "step": 600 }, { "epoch": 0.976, "grad_norm": 0.10857579857110977, "learning_rate": 0.00013873972602739727, "loss": 0.06675973534584045, "step": 610 }, { "epoch": 0.992, "grad_norm": 0.0846136212348938, "learning_rate": 0.00013764383561643836, "loss": 0.056840169429779056, "step": 620 }, { "epoch": 1.008, "grad_norm": 0.12099627405405045, "learning_rate": 0.00013654794520547947, "loss": 0.06133960485458374, "step": 630 }, { "epoch": 1.024, "grad_norm": 0.14235953986644745, "learning_rate": 0.00013545205479452055, "loss": 0.051949769258499146, "step": 640 }, { "epoch": 1.04, "grad_norm": 0.06599270552396774, "learning_rate": 0.00013435616438356164, "loss": 0.05395192503929138, "step": 650 }, { "epoch": 1.056, "grad_norm": 0.08540896326303482, "learning_rate": 0.00013326027397260275, "loss": 0.058018720149993895, "step": 660 }, { "epoch": 1.072, "grad_norm": 0.07638814300298691, "learning_rate": 0.00013216438356164384, "loss": 0.05708546638488769, "step": 670 }, { "epoch": 1.088, "grad_norm": 0.07762673497200012, "learning_rate": 0.00013106849315068495, "loss": 0.05458671450614929, "step": 680 }, { "epoch": 1.104, "grad_norm": 0.06784480065107346, "learning_rate": 0.00012997260273972603, "loss": 0.05251591801643372, "step": 690 }, { "epoch": 1.12, "grad_norm": 0.08402290940284729, "learning_rate": 0.00012887671232876712, "loss": 0.0629364550113678, "step": 700 }, { "epoch": 1.1360000000000001, "grad_norm": 0.08088182657957077, "learning_rate": 0.00012778082191780823, "loss": 0.06972357630729675, "step": 710 }, { "epoch": 1.152, "grad_norm": 0.09672300517559052, "learning_rate": 0.0001266849315068493, "loss": 0.07236437201499939, "step": 720 }, { "epoch": 1.168, "grad_norm": 0.07926960289478302, "learning_rate": 0.00012558904109589042, "loss": 0.05548118352890015, "step": 730 }, { "epoch": 1.184, "grad_norm": 0.08126692473888397, "learning_rate": 0.0001244931506849315, "loss": 0.0625440776348114, "step": 740 }, { "epoch": 1.2, "grad_norm": 0.06809371709823608, "learning_rate": 0.0001233972602739726, "loss": 0.06416532397270203, "step": 750 }, { "epoch": 1.216, "grad_norm": 0.11283153295516968, "learning_rate": 0.0001223013698630137, "loss": 0.06832035183906555, "step": 760 }, { "epoch": 1.232, "grad_norm": 0.08298429101705551, "learning_rate": 0.0001212054794520548, "loss": 0.05774534940719604, "step": 770 }, { "epoch": 1.248, "grad_norm": 0.07594408094882965, "learning_rate": 0.0001201095890410959, "loss": 0.05947027802467346, "step": 780 }, { "epoch": 1.264, "grad_norm": 0.07829966396093369, "learning_rate": 0.00011901369863013698, "loss": 0.05323997139930725, "step": 790 }, { "epoch": 1.28, "grad_norm": 0.0940285176038742, "learning_rate": 0.00011791780821917808, "loss": 0.051990669965744016, "step": 800 }, { "epoch": 1.296, "grad_norm": 0.07158597558736801, "learning_rate": 0.00011682191780821918, "loss": 0.06842873692512512, "step": 810 }, { "epoch": 1.312, "grad_norm": 0.06882186979055405, "learning_rate": 0.00011572602739726028, "loss": 0.06891562342643738, "step": 820 }, { "epoch": 1.328, "grad_norm": 0.08038769662380219, "learning_rate": 0.00011463013698630139, "loss": 0.07961333990097046, "step": 830 }, { "epoch": 1.3439999999999999, "grad_norm": 0.07336015999317169, "learning_rate": 0.00011353424657534246, "loss": 0.0596350908279419, "step": 840 }, { "epoch": 1.3599999999999999, "grad_norm": 0.07672706991434097, "learning_rate": 0.00011243835616438356, "loss": 0.06021456122398376, "step": 850 }, { "epoch": 1.376, "grad_norm": 0.06943219900131226, "learning_rate": 0.00011134246575342466, "loss": 0.05736762285232544, "step": 860 }, { "epoch": 1.392, "grad_norm": 0.07571737468242645, "learning_rate": 0.00011024657534246577, "loss": 0.06433975100517272, "step": 870 }, { "epoch": 1.408, "grad_norm": 0.07972362637519836, "learning_rate": 0.00010915068493150687, "loss": 0.06327899098396302, "step": 880 }, { "epoch": 1.424, "grad_norm": 0.05659586563706398, "learning_rate": 0.00010805479452054794, "loss": 0.07444382905960083, "step": 890 }, { "epoch": 1.44, "grad_norm": 0.08906129002571106, "learning_rate": 0.00010695890410958904, "loss": 0.054902708530426024, "step": 900 }, { "epoch": 1.456, "grad_norm": 0.0920051857829094, "learning_rate": 0.00010586301369863015, "loss": 0.0726428508758545, "step": 910 }, { "epoch": 1.472, "grad_norm": 0.07324782013893127, "learning_rate": 0.00010476712328767125, "loss": 0.0653969943523407, "step": 920 }, { "epoch": 1.488, "grad_norm": 0.07842034846544266, "learning_rate": 0.00010367123287671234, "loss": 0.06544245481491089, "step": 930 }, { "epoch": 1.504, "grad_norm": 0.05455109104514122, "learning_rate": 0.00010257534246575343, "loss": 0.06340432167053223, "step": 940 }, { "epoch": 1.52, "grad_norm": 0.0697791799902916, "learning_rate": 0.00010147945205479453, "loss": 0.0673532783985138, "step": 950 }, { "epoch": 1.536, "grad_norm": 0.08127626776695251, "learning_rate": 0.00010038356164383562, "loss": 0.05687047243118286, "step": 960 }, { "epoch": 1.552, "grad_norm": 0.07406352460384369, "learning_rate": 9.928767123287672e-05, "loss": 0.05548548698425293, "step": 970 }, { "epoch": 1.568, "grad_norm": 0.058401789516210556, "learning_rate": 9.81917808219178e-05, "loss": 0.05551270842552185, "step": 980 }, { "epoch": 1.584, "grad_norm": 0.07948075234889984, "learning_rate": 9.709589041095892e-05, "loss": 0.0539792537689209, "step": 990 }, { "epoch": 1.6, "grad_norm": 0.08617192506790161, "learning_rate": 9.6e-05, "loss": 0.07711206674575806, "step": 1000 }, { "epoch": 1.616, "grad_norm": 0.07934480905532837, "learning_rate": 9.49041095890411e-05, "loss": 0.05799928903579712, "step": 1010 }, { "epoch": 1.6320000000000001, "grad_norm": 0.07682377099990845, "learning_rate": 9.38082191780822e-05, "loss": 0.04488539695739746, "step": 1020 }, { "epoch": 1.6480000000000001, "grad_norm": 0.08497436344623566, "learning_rate": 9.27123287671233e-05, "loss": 0.06617907881736755, "step": 1030 }, { "epoch": 1.6640000000000001, "grad_norm": 0.07464807480573654, "learning_rate": 9.16164383561644e-05, "loss": 0.06436434388160706, "step": 1040 }, { "epoch": 1.6800000000000002, "grad_norm": 0.07073179632425308, "learning_rate": 9.052054794520548e-05, "loss": 0.05826765298843384, "step": 1050 }, { "epoch": 1.696, "grad_norm": 0.07814770191907883, "learning_rate": 8.942465753424658e-05, "loss": 0.058675730228424074, "step": 1060 }, { "epoch": 1.712, "grad_norm": 0.07397276908159256, "learning_rate": 8.832876712328768e-05, "loss": 0.05216291546821594, "step": 1070 }, { "epoch": 1.728, "grad_norm": 0.06203208118677139, "learning_rate": 8.723287671232877e-05, "loss": 0.04832034409046173, "step": 1080 }, { "epoch": 1.744, "grad_norm": 0.08247426897287369, "learning_rate": 8.613698630136987e-05, "loss": 0.06704681515693664, "step": 1090 }, { "epoch": 1.76, "grad_norm": 0.07352187484502792, "learning_rate": 8.504109589041096e-05, "loss": 0.056213170289993286, "step": 1100 }, { "epoch": 1.776, "grad_norm": 0.0742267519235611, "learning_rate": 8.394520547945205e-05, "loss": 0.05674695372581482, "step": 1110 }, { "epoch": 1.792, "grad_norm": 0.07203282415866852, "learning_rate": 8.284931506849315e-05, "loss": 0.05544196367263794, "step": 1120 }, { "epoch": 1.808, "grad_norm": 0.07500709593296051, "learning_rate": 8.175342465753425e-05, "loss": 0.050712913274765015, "step": 1130 }, { "epoch": 1.8239999999999998, "grad_norm": 0.06234560161828995, "learning_rate": 8.065753424657535e-05, "loss": 0.06268961429595947, "step": 1140 }, { "epoch": 1.8399999999999999, "grad_norm": 0.0685572475194931, "learning_rate": 7.956164383561645e-05, "loss": 0.050892168283462526, "step": 1150 }, { "epoch": 1.8559999999999999, "grad_norm": 0.06377062201499939, "learning_rate": 7.846575342465754e-05, "loss": 0.05338585376739502, "step": 1160 }, { "epoch": 1.8719999999999999, "grad_norm": 0.07672174274921417, "learning_rate": 7.736986301369863e-05, "loss": 0.0719197690486908, "step": 1170 }, { "epoch": 1.888, "grad_norm": 0.090825654566288, "learning_rate": 7.627397260273973e-05, "loss": 0.05965543985366821, "step": 1180 }, { "epoch": 1.904, "grad_norm": 0.07492175698280334, "learning_rate": 7.517808219178082e-05, "loss": 0.05505464673042297, "step": 1190 }, { "epoch": 1.92, "grad_norm": 0.06776276230812073, "learning_rate": 7.408219178082192e-05, "loss": 0.06357068419456482, "step": 1200 }, { "epoch": 1.936, "grad_norm": 0.08154194056987762, "learning_rate": 7.298630136986302e-05, "loss": 0.07490106225013733, "step": 1210 }, { "epoch": 1.952, "grad_norm": 0.06378313153982162, "learning_rate": 7.18904109589041e-05, "loss": 0.05937790870666504, "step": 1220 }, { "epoch": 1.968, "grad_norm": 0.07686297595500946, "learning_rate": 7.07945205479452e-05, "loss": 0.047175332903862, "step": 1230 }, { "epoch": 1.984, "grad_norm": 0.07107747346162796, "learning_rate": 6.969863013698631e-05, "loss": 0.05534272789955139, "step": 1240 }, { "epoch": 2.0, "grad_norm": 0.07110369950532913, "learning_rate": 6.86027397260274e-05, "loss": 0.06306116580963135, "step": 1250 }, { "epoch": 2.016, "grad_norm": 0.08407752960920334, "learning_rate": 6.75068493150685e-05, "loss": 0.06708587408065796, "step": 1260 }, { "epoch": 2.032, "grad_norm": 0.06716394424438477, "learning_rate": 6.641095890410958e-05, "loss": 0.05874839425086975, "step": 1270 }, { "epoch": 2.048, "grad_norm": 0.07555590569972992, "learning_rate": 6.531506849315069e-05, "loss": 0.05490245819091797, "step": 1280 }, { "epoch": 2.064, "grad_norm": 0.07453346997499466, "learning_rate": 6.421917808219179e-05, "loss": 0.060266101360321046, "step": 1290 }, { "epoch": 2.08, "grad_norm": 0.07161426544189453, "learning_rate": 6.312328767123288e-05, "loss": 0.05789074897766113, "step": 1300 }, { "epoch": 2.096, "grad_norm": 0.06622769683599472, "learning_rate": 6.202739726027397e-05, "loss": 0.06246234774589539, "step": 1310 }, { "epoch": 2.112, "grad_norm": 0.06886615604162216, "learning_rate": 6.0931506849315065e-05, "loss": 0.06343585848808289, "step": 1320 }, { "epoch": 2.128, "grad_norm": 0.07828567922115326, "learning_rate": 5.983561643835617e-05, "loss": 0.05640849471092224, "step": 1330 }, { "epoch": 2.144, "grad_norm": 0.07572014629840851, "learning_rate": 5.873972602739727e-05, "loss": 0.05074018836021423, "step": 1340 }, { "epoch": 2.16, "grad_norm": 0.06873492151498795, "learning_rate": 5.764383561643836e-05, "loss": 0.054788839817047116, "step": 1350 }, { "epoch": 2.176, "grad_norm": 0.080296590924263, "learning_rate": 5.654794520547946e-05, "loss": 0.06817570328712463, "step": 1360 }, { "epoch": 2.192, "grad_norm": 0.07481079548597336, "learning_rate": 5.545205479452055e-05, "loss": 0.05946822166442871, "step": 1370 }, { "epoch": 2.208, "grad_norm": 0.06598034501075745, "learning_rate": 5.4356164383561646e-05, "loss": 0.06451416015625, "step": 1380 }, { "epoch": 2.224, "grad_norm": 0.06421754509210587, "learning_rate": 5.326027397260275e-05, "loss": 0.044728249311447144, "step": 1390 }, { "epoch": 2.24, "grad_norm": 0.0873740166425705, "learning_rate": 5.2164383561643835e-05, "loss": 0.06062799692153931, "step": 1400 }, { "epoch": 2.2560000000000002, "grad_norm": 0.0765632912516594, "learning_rate": 5.106849315068494e-05, "loss": 0.045326176285743716, "step": 1410 }, { "epoch": 2.2720000000000002, "grad_norm": 0.07045675814151764, "learning_rate": 4.997260273972603e-05, "loss": 0.055944430828094485, "step": 1420 }, { "epoch": 2.288, "grad_norm": 0.07185397297143936, "learning_rate": 4.887671232876713e-05, "loss": 0.0565622866153717, "step": 1430 }, { "epoch": 2.304, "grad_norm": 0.060445524752140045, "learning_rate": 4.778082191780822e-05, "loss": 0.046791556477546695, "step": 1440 }, { "epoch": 2.32, "grad_norm": 0.07421662658452988, "learning_rate": 4.668493150684932e-05, "loss": 0.05277963280677796, "step": 1450 }, { "epoch": 2.336, "grad_norm": 0.07298663258552551, "learning_rate": 4.558904109589041e-05, "loss": 0.05688644647598266, "step": 1460 }, { "epoch": 2.352, "grad_norm": 0.09492790699005127, "learning_rate": 4.4493150684931515e-05, "loss": 0.047478115558624266, "step": 1470 }, { "epoch": 2.368, "grad_norm": 0.06642141193151474, "learning_rate": 4.3397260273972606e-05, "loss": 0.06602519750595093, "step": 1480 }, { "epoch": 2.384, "grad_norm": 0.07402130216360092, "learning_rate": 4.2301369863013704e-05, "loss": 0.05580626130104065, "step": 1490 }, { "epoch": 2.4, "grad_norm": 0.06669177114963531, "learning_rate": 4.1205479452054795e-05, "loss": 0.05892255902290344, "step": 1500 }, { "epoch": 2.416, "grad_norm": 0.06874891370534897, "learning_rate": 4.0109589041095893e-05, "loss": 0.051643162965774536, "step": 1510 }, { "epoch": 2.432, "grad_norm": 0.0800771415233612, "learning_rate": 3.9013698630136985e-05, "loss": 0.05913302302360535, "step": 1520 }, { "epoch": 2.448, "grad_norm": 0.07515449076890945, "learning_rate": 3.791780821917808e-05, "loss": 0.05278569459915161, "step": 1530 }, { "epoch": 2.464, "grad_norm": 0.07199724018573761, "learning_rate": 3.682191780821918e-05, "loss": 0.06340099573135376, "step": 1540 }, { "epoch": 2.48, "grad_norm": 0.0678173080086708, "learning_rate": 3.572602739726028e-05, "loss": 0.05762805938720703, "step": 1550 }, { "epoch": 2.496, "grad_norm": 0.06555041670799255, "learning_rate": 3.463013698630137e-05, "loss": 0.05636816024780274, "step": 1560 }, { "epoch": 2.512, "grad_norm": 0.06559967249631882, "learning_rate": 3.353424657534247e-05, "loss": 0.06568785905838012, "step": 1570 }, { "epoch": 2.528, "grad_norm": 0.06496980041265488, "learning_rate": 3.2438356164383566e-05, "loss": 0.05101228356361389, "step": 1580 }, { "epoch": 2.544, "grad_norm": 0.07682781666517258, "learning_rate": 3.134246575342466e-05, "loss": 0.0528583824634552, "step": 1590 }, { "epoch": 2.56, "grad_norm": 0.07121063023805618, "learning_rate": 3.0246575342465755e-05, "loss": 0.058035969734191895, "step": 1600 }, { "epoch": 2.576, "grad_norm": 0.06801185756921768, "learning_rate": 2.915068493150685e-05, "loss": 0.06011275053024292, "step": 1610 }, { "epoch": 2.592, "grad_norm": 0.07581860572099686, "learning_rate": 2.8054794520547945e-05, "loss": 0.05654975771903992, "step": 1620 }, { "epoch": 2.608, "grad_norm": 0.0737839862704277, "learning_rate": 2.6958904109589046e-05, "loss": 0.056451690196990964, "step": 1630 }, { "epoch": 2.624, "grad_norm": 0.07139196991920471, "learning_rate": 2.586301369863014e-05, "loss": 0.04597268998622894, "step": 1640 }, { "epoch": 2.64, "grad_norm": 0.06762823462486267, "learning_rate": 2.4767123287671235e-05, "loss": 0.05820190906524658, "step": 1650 }, { "epoch": 2.656, "grad_norm": 0.0811057910323143, "learning_rate": 2.367123287671233e-05, "loss": 0.05836214423179627, "step": 1660 }, { "epoch": 2.672, "grad_norm": 0.06706267595291138, "learning_rate": 2.2575342465753428e-05, "loss": 0.06739939451217651, "step": 1670 }, { "epoch": 2.6879999999999997, "grad_norm": 0.08226553350687027, "learning_rate": 2.1479452054794523e-05, "loss": 0.0494617223739624, "step": 1680 }, { "epoch": 2.7039999999999997, "grad_norm": 0.06992533802986145, "learning_rate": 2.0383561643835617e-05, "loss": 0.04956637024879455, "step": 1690 }, { "epoch": 2.7199999999999998, "grad_norm": 0.07530827820301056, "learning_rate": 1.9287671232876715e-05, "loss": 0.05145506858825684, "step": 1700 }, { "epoch": 2.7359999999999998, "grad_norm": 0.08654113113880157, "learning_rate": 1.819178082191781e-05, "loss": 0.06389129757881165, "step": 1710 }, { "epoch": 2.752, "grad_norm": 0.07525806874036789, "learning_rate": 1.7095890410958905e-05, "loss": 0.05270478129386902, "step": 1720 }, { "epoch": 2.768, "grad_norm": 0.07091408967971802, "learning_rate": 1.6000000000000003e-05, "loss": 0.04515470564365387, "step": 1730 }, { "epoch": 2.784, "grad_norm": 0.08608058840036392, "learning_rate": 1.4904109589041096e-05, "loss": 0.06389402747154235, "step": 1740 }, { "epoch": 2.8, "grad_norm": 0.08408579975366592, "learning_rate": 1.3808219178082194e-05, "loss": 0.0562017560005188, "step": 1750 }, { "epoch": 2.816, "grad_norm": 0.07602408528327942, "learning_rate": 1.2712328767123288e-05, "loss": 0.06580867767333984, "step": 1760 }, { "epoch": 2.832, "grad_norm": 0.07258498668670654, "learning_rate": 1.1616438356164385e-05, "loss": 0.06434884071350097, "step": 1770 }, { "epoch": 2.848, "grad_norm": 0.07658011466264725, "learning_rate": 1.0520547945205481e-05, "loss": 0.06852009296417236, "step": 1780 }, { "epoch": 2.864, "grad_norm": 0.08555035293102264, "learning_rate": 9.424657534246576e-06, "loss": 0.05706756114959717, "step": 1790 }, { "epoch": 2.88, "grad_norm": 0.07567881047725677, "learning_rate": 8.328767123287672e-06, "loss": 0.05250921845436096, "step": 1800 }, { "epoch": 2.896, "grad_norm": 0.07079999893903732, "learning_rate": 7.232876712328767e-06, "loss": 0.056053298711776736, "step": 1810 }, { "epoch": 2.912, "grad_norm": 0.06325879693031311, "learning_rate": 6.136986301369863e-06, "loss": 0.0492043673992157, "step": 1820 }, { "epoch": 2.928, "grad_norm": 0.08406194299459457, "learning_rate": 5.041095890410959e-06, "loss": 0.06466425061225892, "step": 1830 }, { "epoch": 2.944, "grad_norm": 0.07006578147411346, "learning_rate": 3.945205479452055e-06, "loss": 0.06320858597755433, "step": 1840 }, { "epoch": 2.96, "grad_norm": 0.06405450403690338, "learning_rate": 2.8493150684931506e-06, "loss": 0.05880612134933472, "step": 1850 }, { "epoch": 2.976, "grad_norm": 0.06825050711631775, "learning_rate": 1.7534246575342465e-06, "loss": 0.06181260943412781, "step": 1860 }, { "epoch": 2.992, "grad_norm": 0.07468807697296143, "learning_rate": 6.575342465753426e-07, "loss": 0.07073507905006408, "step": 1870 } ], "logging_steps": 10, "max_steps": 1875, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5924415770723533e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }