{ "best_global_step": 4000, "best_metric": 1.8740234375, "best_model_checkpoint": "/home/sagemaker-user/persistent_models/models/llama3-dialects-lora/checkpoint-4000", "epoch": 2.8758651685393257, "eval_steps": 200, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017977528089887642, "grad_norm": 1.749185562133789, "learning_rate": 7.199999999999999e-05, "loss": 3.7646, "step": 25 }, { "epoch": 0.035955056179775284, "grad_norm": 1.730239987373352, "learning_rate": 0.000147, "loss": 3.2187, "step": 50 }, { "epoch": 0.05393258426966292, "grad_norm": 1.582846760749817, "learning_rate": 0.00022199999999999998, "loss": 3.004, "step": 75 }, { "epoch": 0.07191011235955057, "grad_norm": 1.4977154731750488, "learning_rate": 0.00029699999999999996, "loss": 2.9559, "step": 100 }, { "epoch": 0.0898876404494382, "grad_norm": 1.3574167490005493, "learning_rate": 0.00029823226123250676, "loss": 2.8599, "step": 125 }, { "epoch": 0.10786516853932585, "grad_norm": 1.5711396932601929, "learning_rate": 0.0002963908666830346, "loss": 2.7605, "step": 150 }, { "epoch": 0.1258426966292135, "grad_norm": 1.2747020721435547, "learning_rate": 0.00029454947213356246, "loss": 2.7511, "step": 175 }, { "epoch": 0.14382022471910114, "grad_norm": 1.1830449104309082, "learning_rate": 0.00029270807758409033, "loss": 2.7047, "step": 200 }, { "epoch": 0.14382022471910114, "eval_loss": 2.690298557281494, "eval_runtime": 80.7044, "eval_samples_per_second": 14.522, "eval_steps_per_second": 1.821, "step": 200 }, { "epoch": 0.16179775280898875, "grad_norm": 1.2935799360275269, "learning_rate": 0.0002908666830346182, "loss": 2.6583, "step": 225 }, { "epoch": 0.1797752808988764, "grad_norm": 1.3620558977127075, "learning_rate": 0.0002890252884851461, "loss": 2.6213, "step": 250 }, { "epoch": 0.19775280898876405, "grad_norm": 1.17039155960083, "learning_rate": 0.0002871838939356739, "loss": 2.6112, "step": 275 }, { "epoch": 0.2157303370786517, "grad_norm": 1.202431559562683, "learning_rate": 0.0002853424993862018, "loss": 2.5672, "step": 300 }, { "epoch": 0.23370786516853934, "grad_norm": 1.3193095922470093, "learning_rate": 0.00028350110483672965, "loss": 2.5744, "step": 325 }, { "epoch": 0.251685393258427, "grad_norm": 1.1977546215057373, "learning_rate": 0.0002816597102872575, "loss": 2.5604, "step": 350 }, { "epoch": 0.2696629213483146, "grad_norm": 1.090685248374939, "learning_rate": 0.0002798183157377854, "loss": 2.4976, "step": 375 }, { "epoch": 0.2876404494382023, "grad_norm": 1.3060740232467651, "learning_rate": 0.0002779769211883133, "loss": 2.5319, "step": 400 }, { "epoch": 0.2876404494382023, "eval_loss": 2.5200583934783936, "eval_runtime": 80.7116, "eval_samples_per_second": 14.521, "eval_steps_per_second": 1.821, "step": 400 }, { "epoch": 0.3056179775280899, "grad_norm": 1.362787127494812, "learning_rate": 0.0002761355266388411, "loss": 2.4959, "step": 425 }, { "epoch": 0.3235955056179775, "grad_norm": 1.1826609373092651, "learning_rate": 0.00027429413208936897, "loss": 2.5202, "step": 450 }, { "epoch": 0.3415730337078652, "grad_norm": 1.202694058418274, "learning_rate": 0.00027245273753989684, "loss": 2.4595, "step": 475 }, { "epoch": 0.3595505617977528, "grad_norm": 1.2969545125961304, "learning_rate": 0.0002706113429904247, "loss": 2.4712, "step": 500 }, { "epoch": 0.3775280898876405, "grad_norm": 1.2445427179336548, "learning_rate": 0.0002687699484409526, "loss": 2.4647, "step": 525 }, { "epoch": 0.3955056179775281, "grad_norm": 1.250051736831665, "learning_rate": 0.00026692855389148047, "loss": 2.452, "step": 550 }, { "epoch": 0.4134831460674157, "grad_norm": 1.2168865203857422, "learning_rate": 0.00026508715934200834, "loss": 2.4104, "step": 575 }, { "epoch": 0.4314606741573034, "grad_norm": 1.2741878032684326, "learning_rate": 0.0002632457647925362, "loss": 2.4287, "step": 600 }, { "epoch": 0.4314606741573034, "eval_loss": 2.4175591468811035, "eval_runtime": 80.7302, "eval_samples_per_second": 14.517, "eval_steps_per_second": 1.821, "step": 600 }, { "epoch": 0.449438202247191, "grad_norm": 1.3188307285308838, "learning_rate": 0.0002614043702430641, "loss": 2.3745, "step": 625 }, { "epoch": 0.46741573033707867, "grad_norm": 1.2066679000854492, "learning_rate": 0.0002595629756935919, "loss": 2.3834, "step": 650 }, { "epoch": 0.4853932584269663, "grad_norm": 1.467005729675293, "learning_rate": 0.0002577215811441198, "loss": 2.3595, "step": 675 }, { "epoch": 0.503370786516854, "grad_norm": 1.283755898475647, "learning_rate": 0.00025588018659464766, "loss": 2.3891, "step": 700 }, { "epoch": 0.5213483146067416, "grad_norm": 1.325465202331543, "learning_rate": 0.00025403879204517554, "loss": 2.3776, "step": 725 }, { "epoch": 0.5393258426966292, "grad_norm": 1.2250348329544067, "learning_rate": 0.00025219739749570336, "loss": 2.3857, "step": 750 }, { "epoch": 0.5573033707865168, "grad_norm": 1.1220479011535645, "learning_rate": 0.00025035600294623123, "loss": 2.3507, "step": 775 }, { "epoch": 0.5752808988764045, "grad_norm": 1.2399861812591553, "learning_rate": 0.0002485146083967591, "loss": 2.3382, "step": 800 }, { "epoch": 0.5752808988764045, "eval_loss": 2.351914405822754, "eval_runtime": 80.7188, "eval_samples_per_second": 14.52, "eval_steps_per_second": 1.821, "step": 800 }, { "epoch": 0.5932584269662922, "grad_norm": 1.2778280973434448, "learning_rate": 0.000246673213847287, "loss": 2.3252, "step": 825 }, { "epoch": 0.6112359550561798, "grad_norm": 1.1694093942642212, "learning_rate": 0.00024483181929781485, "loss": 2.3263, "step": 850 }, { "epoch": 0.6292134831460674, "grad_norm": 1.2676703929901123, "learning_rate": 0.00024299042474834273, "loss": 2.2918, "step": 875 }, { "epoch": 0.647191011235955, "grad_norm": 1.2600834369659424, "learning_rate": 0.0002411490301988706, "loss": 2.3312, "step": 900 }, { "epoch": 0.6651685393258427, "grad_norm": 1.2385838031768799, "learning_rate": 0.00023930763564939848, "loss": 2.347, "step": 925 }, { "epoch": 0.6831460674157304, "grad_norm": 1.1745089292526245, "learning_rate": 0.00023746624109992633, "loss": 2.2975, "step": 950 }, { "epoch": 0.701123595505618, "grad_norm": 1.3536038398742676, "learning_rate": 0.0002356248465504542, "loss": 2.2881, "step": 975 }, { "epoch": 0.7191011235955056, "grad_norm": 1.3784751892089844, "learning_rate": 0.00023378345200098205, "loss": 2.268, "step": 1000 }, { "epoch": 0.7191011235955056, "eval_loss": 2.298362970352173, "eval_runtime": 80.7025, "eval_samples_per_second": 14.522, "eval_steps_per_second": 1.822, "step": 1000 }, { "epoch": 0.7370786516853932, "grad_norm": 1.1211934089660645, "learning_rate": 0.00023194205745150992, "loss": 2.2905, "step": 1025 }, { "epoch": 0.755056179775281, "grad_norm": 1.2473045587539673, "learning_rate": 0.00023010066290203777, "loss": 2.3067, "step": 1050 }, { "epoch": 0.7730337078651686, "grad_norm": 1.1718072891235352, "learning_rate": 0.00022825926835256564, "loss": 2.3023, "step": 1075 }, { "epoch": 0.7910112359550562, "grad_norm": 1.2194567918777466, "learning_rate": 0.00022641787380309352, "loss": 2.2565, "step": 1100 }, { "epoch": 0.8089887640449438, "grad_norm": 1.3996608257293701, "learning_rate": 0.0002245764792536214, "loss": 2.2864, "step": 1125 }, { "epoch": 0.8269662921348314, "grad_norm": 1.4146196842193604, "learning_rate": 0.00022273508470414927, "loss": 2.2751, "step": 1150 }, { "epoch": 0.8449438202247191, "grad_norm": 1.2465920448303223, "learning_rate": 0.00022089369015467712, "loss": 2.226, "step": 1175 }, { "epoch": 0.8629213483146068, "grad_norm": 1.2541664838790894, "learning_rate": 0.000219052295605205, "loss": 2.2658, "step": 1200 }, { "epoch": 0.8629213483146068, "eval_loss": 2.2391414642333984, "eval_runtime": 80.6942, "eval_samples_per_second": 14.524, "eval_steps_per_second": 1.822, "step": 1200 }, { "epoch": 0.8808988764044944, "grad_norm": 1.3151921033859253, "learning_rate": 0.00021721090105573286, "loss": 2.2105, "step": 1225 }, { "epoch": 0.898876404494382, "grad_norm": 1.1428577899932861, "learning_rate": 0.00021536950650626074, "loss": 2.2555, "step": 1250 }, { "epoch": 0.9168539325842696, "grad_norm": 1.2680996656417847, "learning_rate": 0.0002135281119567886, "loss": 2.2403, "step": 1275 }, { "epoch": 0.9348314606741573, "grad_norm": 1.1781823635101318, "learning_rate": 0.00021168671740731643, "loss": 2.2282, "step": 1300 }, { "epoch": 0.952808988764045, "grad_norm": 1.1358236074447632, "learning_rate": 0.0002098453228578443, "loss": 2.2282, "step": 1325 }, { "epoch": 0.9707865168539326, "grad_norm": 1.3253538608551025, "learning_rate": 0.00020800392830837218, "loss": 2.2043, "step": 1350 }, { "epoch": 0.9887640449438202, "grad_norm": 1.1950758695602417, "learning_rate": 0.00020616253375890006, "loss": 2.2338, "step": 1375 }, { "epoch": 1.0064719101123596, "grad_norm": 1.4950629472732544, "learning_rate": 0.00020432113920942793, "loss": 2.1445, "step": 1400 }, { "epoch": 1.0064719101123596, "eval_loss": 2.2137484550476074, "eval_runtime": 80.7063, "eval_samples_per_second": 14.522, "eval_steps_per_second": 1.821, "step": 1400 }, { "epoch": 1.024449438202247, "grad_norm": 1.2458395957946777, "learning_rate": 0.00020247974465995578, "loss": 1.9485, "step": 1425 }, { "epoch": 1.0424269662921348, "grad_norm": 1.3286699056625366, "learning_rate": 0.00020063835011048365, "loss": 1.9582, "step": 1450 }, { "epoch": 1.0604044943820226, "grad_norm": 1.3999990224838257, "learning_rate": 0.00019879695556101153, "loss": 1.9849, "step": 1475 }, { "epoch": 1.07838202247191, "grad_norm": 1.2548496723175049, "learning_rate": 0.0001969555610115394, "loss": 1.97, "step": 1500 }, { "epoch": 1.0963595505617978, "grad_norm": 1.3132121562957764, "learning_rate": 0.00019511416646206728, "loss": 1.9875, "step": 1525 }, { "epoch": 1.1143370786516853, "grad_norm": 1.1348389387130737, "learning_rate": 0.00019327277191259512, "loss": 1.9779, "step": 1550 }, { "epoch": 1.132314606741573, "grad_norm": 1.1579432487487793, "learning_rate": 0.00019143137736312297, "loss": 1.9779, "step": 1575 }, { "epoch": 1.1502921348314608, "grad_norm": 1.1249704360961914, "learning_rate": 0.00018958998281365085, "loss": 1.9173, "step": 1600 }, { "epoch": 1.1502921348314608, "eval_loss": 2.1703741550445557, "eval_runtime": 80.6853, "eval_samples_per_second": 14.526, "eval_steps_per_second": 1.822, "step": 1600 }, { "epoch": 1.1682696629213483, "grad_norm": 1.227824330329895, "learning_rate": 0.00018774858826417872, "loss": 1.9645, "step": 1625 }, { "epoch": 1.186247191011236, "grad_norm": 1.2425934076309204, "learning_rate": 0.00018590719371470657, "loss": 1.9705, "step": 1650 }, { "epoch": 1.2042247191011235, "grad_norm": 1.2888400554656982, "learning_rate": 0.00018406579916523444, "loss": 1.9725, "step": 1675 }, { "epoch": 1.2222022471910112, "grad_norm": 1.6009498834609985, "learning_rate": 0.00018222440461576232, "loss": 1.9468, "step": 1700 }, { "epoch": 1.240179775280899, "grad_norm": 1.4453681707382202, "learning_rate": 0.0001803830100662902, "loss": 1.9557, "step": 1725 }, { "epoch": 1.2581573033707865, "grad_norm": 1.4247924089431763, "learning_rate": 0.00017854161551681807, "loss": 1.9733, "step": 1750 }, { "epoch": 1.2761348314606742, "grad_norm": 1.508577823638916, "learning_rate": 0.00017670022096734594, "loss": 1.9471, "step": 1775 }, { "epoch": 1.294112359550562, "grad_norm": 1.1020046472549438, "learning_rate": 0.0001748588264178738, "loss": 1.9526, "step": 1800 }, { "epoch": 1.294112359550562, "eval_loss": 2.132683515548706, "eval_runtime": 80.6992, "eval_samples_per_second": 14.523, "eval_steps_per_second": 1.822, "step": 1800 }, { "epoch": 1.3120898876404494, "grad_norm": 1.4377837181091309, "learning_rate": 0.00017301743186840166, "loss": 1.8673, "step": 1825 }, { "epoch": 1.3300674157303372, "grad_norm": 1.3954998254776, "learning_rate": 0.00017117603731892954, "loss": 1.9195, "step": 1850 }, { "epoch": 1.3480449438202247, "grad_norm": 1.3105148077011108, "learning_rate": 0.00016933464276945739, "loss": 1.9165, "step": 1875 }, { "epoch": 1.3660224719101124, "grad_norm": 1.2619152069091797, "learning_rate": 0.00016749324821998523, "loss": 1.9237, "step": 1900 }, { "epoch": 1.384, "grad_norm": 1.4734113216400146, "learning_rate": 0.0001656518536705131, "loss": 1.9283, "step": 1925 }, { "epoch": 1.4019775280898876, "grad_norm": 1.3151576519012451, "learning_rate": 0.00016381045912104098, "loss": 1.9462, "step": 1950 }, { "epoch": 1.4199550561797754, "grad_norm": 1.8173805475234985, "learning_rate": 0.00016196906457156886, "loss": 1.9055, "step": 1975 }, { "epoch": 1.4379325842696629, "grad_norm": 1.3704513311386108, "learning_rate": 0.00016012767002209673, "loss": 1.9107, "step": 2000 }, { "epoch": 1.4379325842696629, "eval_loss": 2.099823474884033, "eval_runtime": 80.7256, "eval_samples_per_second": 14.518, "eval_steps_per_second": 1.821, "step": 2000 }, { "epoch": 1.4559101123595506, "grad_norm": 1.197521686553955, "learning_rate": 0.00015828627547262458, "loss": 1.9403, "step": 2025 }, { "epoch": 1.4738876404494383, "grad_norm": 1.3856151103973389, "learning_rate": 0.00015644488092315245, "loss": 1.9333, "step": 2050 }, { "epoch": 1.4918651685393258, "grad_norm": 1.1521707773208618, "learning_rate": 0.00015460348637368033, "loss": 1.8697, "step": 2075 }, { "epoch": 1.5098426966292133, "grad_norm": 1.2475628852844238, "learning_rate": 0.0001527620918242082, "loss": 1.8779, "step": 2100 }, { "epoch": 1.5278202247191013, "grad_norm": 1.406043529510498, "learning_rate": 0.00015092069727473608, "loss": 1.9047, "step": 2125 }, { "epoch": 1.5457977528089888, "grad_norm": 1.294491171836853, "learning_rate": 0.00014907930272526392, "loss": 1.8965, "step": 2150 }, { "epoch": 1.5637752808988763, "grad_norm": 1.2842826843261719, "learning_rate": 0.0001472379081757918, "loss": 1.8981, "step": 2175 }, { "epoch": 1.581752808988764, "grad_norm": 1.312206506729126, "learning_rate": 0.00014539651362631967, "loss": 1.9482, "step": 2200 }, { "epoch": 1.581752808988764, "eval_loss": 2.0673165321350098, "eval_runtime": 80.7193, "eval_samples_per_second": 14.519, "eval_steps_per_second": 1.821, "step": 2200 }, { "epoch": 1.5997303370786518, "grad_norm": 1.3602793216705322, "learning_rate": 0.00014355511907684752, "loss": 1.903, "step": 2225 }, { "epoch": 1.6177078651685393, "grad_norm": 1.4941192865371704, "learning_rate": 0.0001417137245273754, "loss": 1.8165, "step": 2250 }, { "epoch": 1.635685393258427, "grad_norm": 1.3320022821426392, "learning_rate": 0.00013987232997790324, "loss": 1.8722, "step": 2275 }, { "epoch": 1.6536629213483147, "grad_norm": 1.316677212715149, "learning_rate": 0.00013803093542843112, "loss": 1.8762, "step": 2300 }, { "epoch": 1.6716404494382022, "grad_norm": 1.226443886756897, "learning_rate": 0.000136189540878959, "loss": 1.898, "step": 2325 }, { "epoch": 1.6896179775280897, "grad_norm": 1.362231969833374, "learning_rate": 0.00013434814632948687, "loss": 1.8414, "step": 2350 }, { "epoch": 1.7075955056179777, "grad_norm": 1.4274637699127197, "learning_rate": 0.00013250675178001471, "loss": 1.873, "step": 2375 }, { "epoch": 1.7255730337078652, "grad_norm": 1.3610618114471436, "learning_rate": 0.0001306653572305426, "loss": 1.8512, "step": 2400 }, { "epoch": 1.7255730337078652, "eval_loss": 2.0208215713500977, "eval_runtime": 80.723, "eval_samples_per_second": 14.519, "eval_steps_per_second": 1.821, "step": 2400 }, { "epoch": 1.7435505617977527, "grad_norm": 1.4524530172348022, "learning_rate": 0.00012882396268107046, "loss": 1.8351, "step": 2425 }, { "epoch": 1.7615280898876404, "grad_norm": 1.3653873205184937, "learning_rate": 0.0001269825681315983, "loss": 1.8128, "step": 2450 }, { "epoch": 1.7795056179775282, "grad_norm": 1.3041210174560547, "learning_rate": 0.00012514117358212618, "loss": 1.8611, "step": 2475 }, { "epoch": 1.7974831460674157, "grad_norm": 1.6194740533828735, "learning_rate": 0.00012329977903265403, "loss": 1.8929, "step": 2500 }, { "epoch": 1.8154606741573034, "grad_norm": 1.4779677391052246, "learning_rate": 0.00012145838448318192, "loss": 1.8804, "step": 2525 }, { "epoch": 1.8334382022471911, "grad_norm": 1.6529386043548584, "learning_rate": 0.00011961698993370978, "loss": 1.8651, "step": 2550 }, { "epoch": 1.8514157303370786, "grad_norm": 1.3699911832809448, "learning_rate": 0.00011777559538423766, "loss": 1.8118, "step": 2575 }, { "epoch": 1.8693932584269661, "grad_norm": 1.4519306421279907, "learning_rate": 0.00011593420083476552, "loss": 1.8368, "step": 2600 }, { "epoch": 1.8693932584269661, "eval_loss": 1.9914958477020264, "eval_runtime": 80.6759, "eval_samples_per_second": 14.527, "eval_steps_per_second": 1.822, "step": 2600 }, { "epoch": 1.887370786516854, "grad_norm": 1.5025938749313354, "learning_rate": 0.00011409280628529339, "loss": 1.8106, "step": 2625 }, { "epoch": 1.9053483146067416, "grad_norm": 1.2333178520202637, "learning_rate": 0.00011225141173582124, "loss": 1.8241, "step": 2650 }, { "epoch": 1.923325842696629, "grad_norm": 1.4107869863510132, "learning_rate": 0.00011041001718634911, "loss": 1.8145, "step": 2675 }, { "epoch": 1.9413033707865168, "grad_norm": 1.5761250257492065, "learning_rate": 0.00010856862263687699, "loss": 1.809, "step": 2700 }, { "epoch": 1.9592808988764046, "grad_norm": 1.8295477628707886, "learning_rate": 0.00010672722808740485, "loss": 1.792, "step": 2725 }, { "epoch": 1.977258426966292, "grad_norm": 1.35286545753479, "learning_rate": 0.00010488583353793272, "loss": 1.8149, "step": 2750 }, { "epoch": 1.9952359550561798, "grad_norm": 1.4267395734786987, "learning_rate": 0.0001030444389884606, "loss": 1.7479, "step": 2775 }, { "epoch": 2.012943820224719, "grad_norm": 1.2338273525238037, "learning_rate": 0.00010120304443898845, "loss": 1.562, "step": 2800 }, { "epoch": 2.012943820224719, "eval_loss": 1.9848835468292236, "eval_runtime": 80.6992, "eval_samples_per_second": 14.523, "eval_steps_per_second": 1.822, "step": 2800 }, { "epoch": 2.0309213483146067, "grad_norm": 1.4810634851455688, "learning_rate": 9.936164988951632e-05, "loss": 1.5435, "step": 2825 }, { "epoch": 2.048898876404494, "grad_norm": 1.3950103521347046, "learning_rate": 9.752025534004418e-05, "loss": 1.5131, "step": 2850 }, { "epoch": 2.066876404494382, "grad_norm": 1.5464402437210083, "learning_rate": 9.567886079057206e-05, "loss": 1.5633, "step": 2875 }, { "epoch": 2.0848539325842697, "grad_norm": 1.507130742073059, "learning_rate": 9.383746624109993e-05, "loss": 1.5307, "step": 2900 }, { "epoch": 2.102831460674157, "grad_norm": 1.217025637626648, "learning_rate": 9.199607169162779e-05, "loss": 1.5411, "step": 2925 }, { "epoch": 2.120808988764045, "grad_norm": 1.4660253524780273, "learning_rate": 9.015467714215565e-05, "loss": 1.4794, "step": 2950 }, { "epoch": 2.1387865168539326, "grad_norm": 1.697140097618103, "learning_rate": 8.831328259268351e-05, "loss": 1.4896, "step": 2975 }, { "epoch": 2.15676404494382, "grad_norm": 1.5362123250961304, "learning_rate": 8.647188804321139e-05, "loss": 1.5059, "step": 3000 }, { "epoch": 2.15676404494382, "eval_loss": 1.964095950126648, "eval_runtime": 80.6844, "eval_samples_per_second": 14.526, "eval_steps_per_second": 1.822, "step": 3000 }, { "epoch": 2.1747415730337076, "grad_norm": 1.6470073461532593, "learning_rate": 8.463049349373925e-05, "loss": 1.5021, "step": 3025 }, { "epoch": 2.1927191011235956, "grad_norm": 1.3981434106826782, "learning_rate": 8.278909894426712e-05, "loss": 1.5012, "step": 3050 }, { "epoch": 2.210696629213483, "grad_norm": 1.741492748260498, "learning_rate": 8.094770439479497e-05, "loss": 1.5276, "step": 3075 }, { "epoch": 2.2286741573033706, "grad_norm": 1.391007661819458, "learning_rate": 7.910630984532285e-05, "loss": 1.4697, "step": 3100 }, { "epoch": 2.2466516853932585, "grad_norm": 1.4171358346939087, "learning_rate": 7.726491529585072e-05, "loss": 1.5107, "step": 3125 }, { "epoch": 2.264629213483146, "grad_norm": 1.7048368453979492, "learning_rate": 7.542352074637858e-05, "loss": 1.4998, "step": 3150 }, { "epoch": 2.2826067415730336, "grad_norm": 1.5188285112380981, "learning_rate": 7.358212619690646e-05, "loss": 1.5317, "step": 3175 }, { "epoch": 2.3005842696629215, "grad_norm": 1.5329570770263672, "learning_rate": 7.174073164743432e-05, "loss": 1.4975, "step": 3200 }, { "epoch": 2.3005842696629215, "eval_loss": 1.9450942277908325, "eval_runtime": 80.7207, "eval_samples_per_second": 14.519, "eval_steps_per_second": 1.821, "step": 3200 }, { "epoch": 2.318561797752809, "grad_norm": 1.334620714187622, "learning_rate": 6.989933709796219e-05, "loss": 1.4884, "step": 3225 }, { "epoch": 2.3365393258426965, "grad_norm": 1.7162734270095825, "learning_rate": 6.805794254849005e-05, "loss": 1.4997, "step": 3250 }, { "epoch": 2.3545168539325845, "grad_norm": 1.4038029909133911, "learning_rate": 6.621654799901791e-05, "loss": 1.502, "step": 3275 }, { "epoch": 2.372494382022472, "grad_norm": 1.4473646879196167, "learning_rate": 6.437515344954579e-05, "loss": 1.5225, "step": 3300 }, { "epoch": 2.3904719101123595, "grad_norm": 1.6588456630706787, "learning_rate": 6.253375890007365e-05, "loss": 1.5523, "step": 3325 }, { "epoch": 2.408449438202247, "grad_norm": 1.4811519384384155, "learning_rate": 6.0692364350601516e-05, "loss": 1.4859, "step": 3350 }, { "epoch": 2.426426966292135, "grad_norm": 1.5292789936065674, "learning_rate": 5.8850969801129384e-05, "loss": 1.5008, "step": 3375 }, { "epoch": 2.4444044943820225, "grad_norm": 1.627760410308838, "learning_rate": 5.7009575251657245e-05, "loss": 1.4626, "step": 3400 }, { "epoch": 2.4444044943820225, "eval_loss": 1.9264752864837646, "eval_runtime": 80.7267, "eval_samples_per_second": 14.518, "eval_steps_per_second": 1.821, "step": 3400 }, { "epoch": 2.46238202247191, "grad_norm": 1.6374777555465698, "learning_rate": 5.516818070218512e-05, "loss": 1.4543, "step": 3425 }, { "epoch": 2.480359550561798, "grad_norm": 1.584922432899475, "learning_rate": 5.332678615271299e-05, "loss": 1.5078, "step": 3450 }, { "epoch": 2.4983370786516854, "grad_norm": 1.6486908197402954, "learning_rate": 5.148539160324085e-05, "loss": 1.4539, "step": 3475 }, { "epoch": 2.516314606741573, "grad_norm": 1.4464045763015747, "learning_rate": 4.9643997053768716e-05, "loss": 1.4697, "step": 3500 }, { "epoch": 2.5342921348314604, "grad_norm": 1.488883376121521, "learning_rate": 4.7802602504296584e-05, "loss": 1.4807, "step": 3525 }, { "epoch": 2.5522696629213484, "grad_norm": 1.573306918144226, "learning_rate": 4.5961207954824445e-05, "loss": 1.4501, "step": 3550 }, { "epoch": 2.570247191011236, "grad_norm": 1.4624181985855103, "learning_rate": 4.411981340535232e-05, "loss": 1.5547, "step": 3575 }, { "epoch": 2.588224719101124, "grad_norm": 1.4811517000198364, "learning_rate": 4.227841885588019e-05, "loss": 1.5078, "step": 3600 }, { "epoch": 2.588224719101124, "eval_loss": 1.8996490240097046, "eval_runtime": 80.7148, "eval_samples_per_second": 14.52, "eval_steps_per_second": 1.821, "step": 3600 }, { "epoch": 2.6062022471910113, "grad_norm": 1.4766281843185425, "learning_rate": 4.043702430640805e-05, "loss": 1.4418, "step": 3625 }, { "epoch": 2.624179775280899, "grad_norm": 1.6454192399978638, "learning_rate": 3.8595629756935916e-05, "loss": 1.4239, "step": 3650 }, { "epoch": 2.6421573033707864, "grad_norm": 1.5528382062911987, "learning_rate": 3.6754235207463783e-05, "loss": 1.3851, "step": 3675 }, { "epoch": 2.6601348314606743, "grad_norm": 1.5566567182540894, "learning_rate": 3.4912840657991644e-05, "loss": 1.5234, "step": 3700 }, { "epoch": 2.678112359550562, "grad_norm": 1.4368932247161865, "learning_rate": 3.307144610851952e-05, "loss": 1.4561, "step": 3725 }, { "epoch": 2.6960898876404493, "grad_norm": 1.5049488544464111, "learning_rate": 3.123005155904738e-05, "loss": 1.5281, "step": 3750 }, { "epoch": 2.7140674157303373, "grad_norm": 1.473576307296753, "learning_rate": 2.9388657009575248e-05, "loss": 1.4552, "step": 3775 }, { "epoch": 2.732044943820225, "grad_norm": 1.4664705991744995, "learning_rate": 2.7547262460103116e-05, "loss": 1.4835, "step": 3800 }, { "epoch": 2.732044943820225, "eval_loss": 1.8854955434799194, "eval_runtime": 80.6975, "eval_samples_per_second": 14.523, "eval_steps_per_second": 1.822, "step": 3800 }, { "epoch": 2.7500224719101123, "grad_norm": 1.3552887439727783, "learning_rate": 2.5705867910630983e-05, "loss": 1.3847, "step": 3825 }, { "epoch": 2.768, "grad_norm": 1.8006858825683594, "learning_rate": 2.3864473361158848e-05, "loss": 1.439, "step": 3850 }, { "epoch": 2.7859775280898877, "grad_norm": 1.5043394565582275, "learning_rate": 2.202307881168672e-05, "loss": 1.4442, "step": 3875 }, { "epoch": 2.8039550561797753, "grad_norm": 1.5590105056762695, "learning_rate": 2.0181684262214583e-05, "loss": 1.5243, "step": 3900 }, { "epoch": 2.8219325842696628, "grad_norm": 1.430759310722351, "learning_rate": 1.8340289712742448e-05, "loss": 1.4409, "step": 3925 }, { "epoch": 2.8399101123595507, "grad_norm": 1.4432014226913452, "learning_rate": 1.6498895163270315e-05, "loss": 1.4827, "step": 3950 }, { "epoch": 2.857887640449438, "grad_norm": 1.6197547912597656, "learning_rate": 1.4657500613798183e-05, "loss": 1.491, "step": 3975 }, { "epoch": 2.8758651685393257, "grad_norm": 1.6343038082122803, "learning_rate": 1.2816106064326048e-05, "loss": 1.4473, "step": 4000 }, { "epoch": 2.8758651685393257, "eval_loss": 1.8740234375, "eval_runtime": 80.6815, "eval_samples_per_second": 14.526, "eval_steps_per_second": 1.822, "step": 4000 } ], "logging_steps": 25, "max_steps": 4173, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.4834499914343383e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }