{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 1570, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.064, "grad_norm": 17.60050094292921, "learning_rate": 5.732484076433121e-07, "loss": 0.1956, "step": 10 }, { "epoch": 0.128, "grad_norm": 0.360872577474102, "learning_rate": 1.210191082802548e-06, "loss": 0.0375, "step": 20 }, { "epoch": 0.192, "grad_norm": 0.19682313468384985, "learning_rate": 1.8471337579617835e-06, "loss": 0.0033, "step": 30 }, { "epoch": 0.256, "grad_norm": 0.1063560099577119, "learning_rate": 2.4840764331210194e-06, "loss": 0.0024, "step": 40 }, { "epoch": 0.32, "grad_norm": 0.0815568233065803, "learning_rate": 3.121019108280255e-06, "loss": 0.0023, "step": 50 }, { "epoch": 0.384, "grad_norm": 0.08743440250912729, "learning_rate": 3.757961783439491e-06, "loss": 0.0022, "step": 60 }, { "epoch": 0.448, "grad_norm": 0.08581063789526745, "learning_rate": 4.394904458598727e-06, "loss": 0.0024, "step": 70 }, { "epoch": 0.512, "grad_norm": 0.12211726984887894, "learning_rate": 5.031847133757962e-06, "loss": 0.0029, "step": 80 }, { "epoch": 0.576, "grad_norm": 0.07537531616308987, "learning_rate": 5.668789808917198e-06, "loss": 0.0021, "step": 90 }, { "epoch": 0.64, "grad_norm": 0.1340344984825586, "learning_rate": 6.305732484076433e-06, "loss": 0.0019, "step": 100 }, { "epoch": 0.704, "grad_norm": 0.08953134246511378, "learning_rate": 6.942675159235669e-06, "loss": 0.0018, "step": 110 }, { "epoch": 0.768, "grad_norm": 0.0858578822485225, "learning_rate": 7.579617834394906e-06, "loss": 0.002, "step": 120 }, { "epoch": 0.832, "grad_norm": 0.0655815960237088, "learning_rate": 8.21656050955414e-06, "loss": 0.0022, "step": 130 }, { "epoch": 0.896, "grad_norm": 0.06259879392044645, "learning_rate": 8.853503184713377e-06, "loss": 0.0021, "step": 140 }, { "epoch": 0.96, "grad_norm": 0.11115786959066185, "learning_rate": 9.490445859872613e-06, "loss": 0.0022, "step": 150 }, { "epoch": 1.0192, "grad_norm": 0.11253567844278162, "learning_rate": 9.985845718329796e-06, "loss": 0.0022, "step": 160 }, { "epoch": 1.0832, "grad_norm": 0.08722803511321106, "learning_rate": 9.91507430997877e-06, "loss": 0.0016, "step": 170 }, { "epoch": 1.1472, "grad_norm": 0.06678686196789735, "learning_rate": 9.844302901627743e-06, "loss": 0.0017, "step": 180 }, { "epoch": 1.2112, "grad_norm": 0.04730574861303533, "learning_rate": 9.773531493276717e-06, "loss": 0.0016, "step": 190 }, { "epoch": 1.2752, "grad_norm": 0.0531043878627717, "learning_rate": 9.70276008492569e-06, "loss": 0.0015, "step": 200 }, { "epoch": 1.3392, "grad_norm": 0.04150433810191339, "learning_rate": 9.631988676574666e-06, "loss": 0.0016, "step": 210 }, { "epoch": 1.4032, "grad_norm": 0.048744215851851264, "learning_rate": 9.561217268223637e-06, "loss": 0.0019, "step": 220 }, { "epoch": 1.4672, "grad_norm": 0.051141936058794614, "learning_rate": 9.490445859872613e-06, "loss": 0.0015, "step": 230 }, { "epoch": 1.5312000000000001, "grad_norm": 0.04473230852030802, "learning_rate": 9.419674451521586e-06, "loss": 0.0017, "step": 240 }, { "epoch": 1.5952, "grad_norm": 0.057517928281497044, "learning_rate": 9.34890304317056e-06, "loss": 0.0021, "step": 250 }, { "epoch": 1.6592, "grad_norm": 0.04626400515314615, "learning_rate": 9.278131634819534e-06, "loss": 0.0017, "step": 260 }, { "epoch": 1.7231999999999998, "grad_norm": 0.060866311940863976, "learning_rate": 9.207360226468509e-06, "loss": 0.0018, "step": 270 }, { "epoch": 1.7872, "grad_norm": 0.047260502110355984, "learning_rate": 9.13658881811748e-06, "loss": 0.0018, "step": 280 }, { "epoch": 1.8512, "grad_norm": 0.05629426497008717, "learning_rate": 9.065817409766456e-06, "loss": 0.002, "step": 290 }, { "epoch": 1.9152, "grad_norm": 0.03894980230831118, "learning_rate": 8.995046001415428e-06, "loss": 0.0018, "step": 300 }, { "epoch": 1.9792, "grad_norm": 0.0490808359112902, "learning_rate": 8.924274593064403e-06, "loss": 0.0023, "step": 310 }, { "epoch": 2.0384, "grad_norm": 0.04147961477820841, "learning_rate": 8.853503184713377e-06, "loss": 0.0014, "step": 320 }, { "epoch": 2.1024, "grad_norm": 0.06996184627519739, "learning_rate": 8.78273177636235e-06, "loss": 0.0011, "step": 330 }, { "epoch": 2.1664, "grad_norm": 0.059454727341288253, "learning_rate": 8.711960368011324e-06, "loss": 0.0012, "step": 340 }, { "epoch": 2.2304, "grad_norm": 0.04354879962710656, "learning_rate": 8.641188959660298e-06, "loss": 0.001, "step": 350 }, { "epoch": 2.2944, "grad_norm": 0.04462335944862259, "learning_rate": 8.570417551309271e-06, "loss": 0.0011, "step": 360 }, { "epoch": 2.3584, "grad_norm": 0.03976732177932052, "learning_rate": 8.499646142958245e-06, "loss": 0.0013, "step": 370 }, { "epoch": 2.4224, "grad_norm": 0.2689176723191193, "learning_rate": 8.42887473460722e-06, "loss": 0.0013, "step": 380 }, { "epoch": 2.4864, "grad_norm": 0.06879386397720086, "learning_rate": 8.358103326256194e-06, "loss": 0.0015, "step": 390 }, { "epoch": 2.5504, "grad_norm": 0.04660712249010195, "learning_rate": 8.287331917905167e-06, "loss": 0.0013, "step": 400 }, { "epoch": 2.6144, "grad_norm": 0.049288480673167614, "learning_rate": 8.21656050955414e-06, "loss": 0.0012, "step": 410 }, { "epoch": 2.6784, "grad_norm": 0.0736787873870253, "learning_rate": 8.145789101203114e-06, "loss": 0.0013, "step": 420 }, { "epoch": 2.7424, "grad_norm": 0.03501443136894185, "learning_rate": 8.075017692852088e-06, "loss": 0.0014, "step": 430 }, { "epoch": 2.8064, "grad_norm": 0.05662961561934753, "learning_rate": 8.004246284501063e-06, "loss": 0.0013, "step": 440 }, { "epoch": 2.8704, "grad_norm": 0.034163817502979116, "learning_rate": 7.933474876150035e-06, "loss": 0.0013, "step": 450 }, { "epoch": 2.9344, "grad_norm": 0.03627782201165022, "learning_rate": 7.86270346779901e-06, "loss": 0.0015, "step": 460 }, { "epoch": 2.9984, "grad_norm": 0.0378327369660673, "learning_rate": 7.791932059447984e-06, "loss": 0.0014, "step": 470 }, { "epoch": 3.0576, "grad_norm": 0.05647581306266388, "learning_rate": 7.721160651096958e-06, "loss": 0.0011, "step": 480 }, { "epoch": 3.1216, "grad_norm": 0.0359488394078568, "learning_rate": 7.650389242745931e-06, "loss": 0.0009, "step": 490 }, { "epoch": 3.1856, "grad_norm": 0.03876989103654088, "learning_rate": 7.579617834394906e-06, "loss": 0.0009, "step": 500 }, { "epoch": 3.2496, "grad_norm": 0.025465697683673762, "learning_rate": 7.5088464260438785e-06, "loss": 0.0009, "step": 510 }, { "epoch": 3.3136, "grad_norm": 0.03335723703353167, "learning_rate": 7.438075017692853e-06, "loss": 0.001, "step": 520 }, { "epoch": 3.3776, "grad_norm": 0.023575631149543714, "learning_rate": 7.367303609341826e-06, "loss": 0.0009, "step": 530 }, { "epoch": 3.4416, "grad_norm": 0.030623516564894863, "learning_rate": 7.2965322009908e-06, "loss": 0.0009, "step": 540 }, { "epoch": 3.5056000000000003, "grad_norm": 0.024108954363881076, "learning_rate": 7.2257607926397746e-06, "loss": 0.0012, "step": 550 }, { "epoch": 3.5696, "grad_norm": 0.05329738469797095, "learning_rate": 7.154989384288747e-06, "loss": 0.0011, "step": 560 }, { "epoch": 3.6336, "grad_norm": 0.05843959664676447, "learning_rate": 7.084217975937722e-06, "loss": 0.001, "step": 570 }, { "epoch": 3.6976, "grad_norm": 0.03573169633656267, "learning_rate": 7.013446567586696e-06, "loss": 0.001, "step": 580 }, { "epoch": 3.7616, "grad_norm": 0.03098931841718372, "learning_rate": 6.942675159235669e-06, "loss": 0.0011, "step": 590 }, { "epoch": 3.8256, "grad_norm": 0.029839006708716795, "learning_rate": 6.871903750884643e-06, "loss": 0.001, "step": 600 }, { "epoch": 3.8895999999999997, "grad_norm": 0.034632444699442086, "learning_rate": 6.801132342533617e-06, "loss": 0.0011, "step": 610 }, { "epoch": 3.9536, "grad_norm": 0.025514404182222564, "learning_rate": 6.730360934182591e-06, "loss": 0.0011, "step": 620 }, { "epoch": 4.0128, "grad_norm": 0.018837993447164435, "learning_rate": 6.659589525831564e-06, "loss": 0.0009, "step": 630 }, { "epoch": 4.0768, "grad_norm": 0.030434272679411154, "learning_rate": 6.588818117480539e-06, "loss": 0.0007, "step": 640 }, { "epoch": 4.1408, "grad_norm": 0.025073707651371353, "learning_rate": 6.518046709129512e-06, "loss": 0.0008, "step": 650 }, { "epoch": 4.2048, "grad_norm": 0.03899789663804376, "learning_rate": 6.447275300778486e-06, "loss": 0.0007, "step": 660 }, { "epoch": 4.2688, "grad_norm": 0.01589697038233231, "learning_rate": 6.37650389242746e-06, "loss": 0.0008, "step": 670 }, { "epoch": 4.3328, "grad_norm": 0.024560187315444824, "learning_rate": 6.305732484076433e-06, "loss": 0.0007, "step": 680 }, { "epoch": 4.3968, "grad_norm": 0.02518563194507897, "learning_rate": 6.2349610757254074e-06, "loss": 0.0008, "step": 690 }, { "epoch": 4.4608, "grad_norm": 0.02855642906111709, "learning_rate": 6.164189667374382e-06, "loss": 0.0008, "step": 700 }, { "epoch": 4.5248, "grad_norm": 0.03178443031646811, "learning_rate": 6.093418259023355e-06, "loss": 0.0009, "step": 710 }, { "epoch": 4.5888, "grad_norm": 0.021081075903526782, "learning_rate": 6.022646850672329e-06, "loss": 0.0008, "step": 720 }, { "epoch": 4.6528, "grad_norm": 0.018073305283650833, "learning_rate": 5.9518754423213035e-06, "loss": 0.0008, "step": 730 }, { "epoch": 4.7168, "grad_norm": 0.04811056774674395, "learning_rate": 5.881104033970276e-06, "loss": 0.0008, "step": 740 }, { "epoch": 4.7808, "grad_norm": 0.018757359542175032, "learning_rate": 5.810332625619251e-06, "loss": 0.0009, "step": 750 }, { "epoch": 4.8448, "grad_norm": 0.021213357394449274, "learning_rate": 5.7395612172682235e-06, "loss": 0.001, "step": 760 }, { "epoch": 4.9088, "grad_norm": 0.03275271950493163, "learning_rate": 5.668789808917198e-06, "loss": 0.0009, "step": 770 }, { "epoch": 4.9728, "grad_norm": 0.023316003525068154, "learning_rate": 5.598018400566172e-06, "loss": 0.0009, "step": 780 }, { "epoch": 5.032, "grad_norm": 0.014334840854956359, "learning_rate": 5.527246992215145e-06, "loss": 0.0007, "step": 790 }, { "epoch": 5.096, "grad_norm": 0.05422632736773663, "learning_rate": 5.4564755838641195e-06, "loss": 0.0005, "step": 800 }, { "epoch": 5.16, "grad_norm": 0.018449386679577014, "learning_rate": 5.385704175513093e-06, "loss": 0.0006, "step": 810 }, { "epoch": 5.224, "grad_norm": 0.02919022100324906, "learning_rate": 5.314932767162067e-06, "loss": 0.0006, "step": 820 }, { "epoch": 5.288, "grad_norm": 0.0305959616492891, "learning_rate": 5.24416135881104e-06, "loss": 0.0006, "step": 830 }, { "epoch": 5.352, "grad_norm": 0.022695467463977156, "learning_rate": 5.173389950460015e-06, "loss": 0.0006, "step": 840 }, { "epoch": 5.416, "grad_norm": 0.02483139320478547, "learning_rate": 5.102618542108988e-06, "loss": 0.0007, "step": 850 }, { "epoch": 5.48, "grad_norm": 0.01866673577908835, "learning_rate": 5.031847133757962e-06, "loss": 0.0007, "step": 860 }, { "epoch": 5.5440000000000005, "grad_norm": 0.02475309749140289, "learning_rate": 4.9610757254069355e-06, "loss": 0.0007, "step": 870 }, { "epoch": 5.608, "grad_norm": 0.02329874201328524, "learning_rate": 4.89030431705591e-06, "loss": 0.0007, "step": 880 }, { "epoch": 5.672, "grad_norm": 0.017899965001059195, "learning_rate": 4.819532908704884e-06, "loss": 0.0007, "step": 890 }, { "epoch": 5.736, "grad_norm": 0.018801411907769315, "learning_rate": 4.748761500353857e-06, "loss": 0.0007, "step": 900 }, { "epoch": 5.8, "grad_norm": 0.015720546212916624, "learning_rate": 4.677990092002832e-06, "loss": 0.0007, "step": 910 }, { "epoch": 5.864, "grad_norm": 0.03502870587760174, "learning_rate": 4.607218683651805e-06, "loss": 0.0008, "step": 920 }, { "epoch": 5.928, "grad_norm": 0.03386452230790969, "learning_rate": 4.536447275300779e-06, "loss": 0.0007, "step": 930 }, { "epoch": 5.992, "grad_norm": 0.03351146213175313, "learning_rate": 4.465675866949752e-06, "loss": 0.0007, "step": 940 }, { "epoch": 6.0512, "grad_norm": 0.011861931780566704, "learning_rate": 4.394904458598727e-06, "loss": 0.0006, "step": 950 }, { "epoch": 6.1152, "grad_norm": 0.02438378480663504, "learning_rate": 4.3241330502477004e-06, "loss": 0.0005, "step": 960 }, { "epoch": 6.1792, "grad_norm": 0.01716531081341717, "learning_rate": 4.253361641896674e-06, "loss": 0.0006, "step": 970 }, { "epoch": 6.2432, "grad_norm": 0.01938921457239796, "learning_rate": 4.1825902335456485e-06, "loss": 0.0006, "step": 980 }, { "epoch": 6.3072, "grad_norm": 0.01373221841561274, "learning_rate": 4.111818825194622e-06, "loss": 0.0005, "step": 990 }, { "epoch": 6.3712, "grad_norm": 0.0199597209044529, "learning_rate": 4.041047416843596e-06, "loss": 0.0006, "step": 1000 }, { "epoch": 6.4352, "grad_norm": 0.02424156176996985, "learning_rate": 3.970276008492569e-06, "loss": 0.0006, "step": 1010 }, { "epoch": 6.4992, "grad_norm": 0.025315655480968886, "learning_rate": 3.899504600141543e-06, "loss": 0.0006, "step": 1020 }, { "epoch": 6.5632, "grad_norm": 0.02920235438042742, "learning_rate": 3.8287331917905165e-06, "loss": 0.0006, "step": 1030 }, { "epoch": 6.6272, "grad_norm": 0.016555192151659806, "learning_rate": 3.757961783439491e-06, "loss": 0.0006, "step": 1040 }, { "epoch": 6.6912, "grad_norm": 0.013049019694846958, "learning_rate": 3.6871903750884645e-06, "loss": 0.0006, "step": 1050 }, { "epoch": 6.7552, "grad_norm": 0.02447313938635393, "learning_rate": 3.6164189667374385e-06, "loss": 0.0006, "step": 1060 }, { "epoch": 6.8192, "grad_norm": 0.011543889477958485, "learning_rate": 3.545647558386412e-06, "loss": 0.0006, "step": 1070 }, { "epoch": 6.8832, "grad_norm": 0.020945153083698518, "learning_rate": 3.474876150035386e-06, "loss": 0.0006, "step": 1080 }, { "epoch": 6.9472000000000005, "grad_norm": 0.052172806158250935, "learning_rate": 3.4041047416843597e-06, "loss": 0.0007, "step": 1090 }, { "epoch": 7.0064, "grad_norm": 0.04186589672688367, "learning_rate": 3.3333333333333333e-06, "loss": 0.0006, "step": 1100 }, { "epoch": 7.0704, "grad_norm": 0.023917480677921623, "learning_rate": 3.2625619249823078e-06, "loss": 0.0006, "step": 1110 }, { "epoch": 7.1344, "grad_norm": 0.021509274576004597, "learning_rate": 3.1917905166312814e-06, "loss": 0.0005, "step": 1120 }, { "epoch": 7.1984, "grad_norm": 0.014051823896192348, "learning_rate": 3.121019108280255e-06, "loss": 0.0005, "step": 1130 }, { "epoch": 7.2624, "grad_norm": 0.042071708786109156, "learning_rate": 3.050247699929229e-06, "loss": 0.0005, "step": 1140 }, { "epoch": 7.3264, "grad_norm": 0.015899068217884903, "learning_rate": 2.9794762915782026e-06, "loss": 0.0005, "step": 1150 }, { "epoch": 7.3904, "grad_norm": 0.01397192533675872, "learning_rate": 2.908704883227176e-06, "loss": 0.0005, "step": 1160 }, { "epoch": 7.4544, "grad_norm": 0.026963623776955922, "learning_rate": 2.83793347487615e-06, "loss": 0.0005, "step": 1170 }, { "epoch": 7.5184, "grad_norm": 0.039247352934989634, "learning_rate": 2.767162066525124e-06, "loss": 0.0005, "step": 1180 }, { "epoch": 7.5824, "grad_norm": 0.014656470038010723, "learning_rate": 2.696390658174098e-06, "loss": 0.0005, "step": 1190 }, { "epoch": 7.6464, "grad_norm": 0.01623246283065862, "learning_rate": 2.6256192498230714e-06, "loss": 0.0006, "step": 1200 }, { "epoch": 7.7104, "grad_norm": 0.01364971630536226, "learning_rate": 2.554847841472046e-06, "loss": 0.0005, "step": 1210 }, { "epoch": 7.7744, "grad_norm": 0.01843596725314251, "learning_rate": 2.4840764331210194e-06, "loss": 0.0006, "step": 1220 }, { "epoch": 7.8384, "grad_norm": 0.01324833994986271, "learning_rate": 2.413305024769993e-06, "loss": 0.0005, "step": 1230 }, { "epoch": 7.9024, "grad_norm": 0.010850155510887709, "learning_rate": 2.342533616418967e-06, "loss": 0.0005, "step": 1240 }, { "epoch": 7.9664, "grad_norm": 0.01975586454883896, "learning_rate": 2.2717622080679406e-06, "loss": 0.0006, "step": 1250 }, { "epoch": 8.0256, "grad_norm": 0.022062764393436206, "learning_rate": 2.2009907997169142e-06, "loss": 0.0006, "step": 1260 }, { "epoch": 8.0896, "grad_norm": 0.011242064989508685, "learning_rate": 2.1302193913658883e-06, "loss": 0.0004, "step": 1270 }, { "epoch": 8.1536, "grad_norm": 0.01149746346411442, "learning_rate": 2.0594479830148623e-06, "loss": 0.0004, "step": 1280 }, { "epoch": 8.2176, "grad_norm": 0.026515020603223683, "learning_rate": 1.988676574663836e-06, "loss": 0.0005, "step": 1290 }, { "epoch": 8.2816, "grad_norm": 0.013807929815403902, "learning_rate": 1.91790516631281e-06, "loss": 0.0005, "step": 1300 }, { "epoch": 8.3456, "grad_norm": 0.013074536666619924, "learning_rate": 1.8471337579617835e-06, "loss": 0.0005, "step": 1310 }, { "epoch": 8.4096, "grad_norm": 0.01259520749871116, "learning_rate": 1.7763623496107573e-06, "loss": 0.0005, "step": 1320 }, { "epoch": 8.4736, "grad_norm": 0.01040755614337679, "learning_rate": 1.7055909412597313e-06, "loss": 0.0005, "step": 1330 }, { "epoch": 8.5376, "grad_norm": 0.019423750818007783, "learning_rate": 1.634819532908705e-06, "loss": 0.0005, "step": 1340 }, { "epoch": 8.6016, "grad_norm": 0.02075041086847078, "learning_rate": 1.564048124557679e-06, "loss": 0.0006, "step": 1350 }, { "epoch": 8.6656, "grad_norm": 0.012939220630062887, "learning_rate": 1.4932767162066527e-06, "loss": 0.0005, "step": 1360 }, { "epoch": 8.7296, "grad_norm": 0.013042241021270056, "learning_rate": 1.4225053078556263e-06, "loss": 0.0005, "step": 1370 }, { "epoch": 8.7936, "grad_norm": 0.014428179836796827, "learning_rate": 1.3517338995046003e-06, "loss": 0.0005, "step": 1380 }, { "epoch": 8.8576, "grad_norm": 0.02246660360791134, "learning_rate": 1.280962491153574e-06, "loss": 0.0005, "step": 1390 }, { "epoch": 8.9216, "grad_norm": 0.024316300351936923, "learning_rate": 1.210191082802548e-06, "loss": 0.0005, "step": 1400 }, { "epoch": 8.9856, "grad_norm": 0.017404048203407522, "learning_rate": 1.1394196744515216e-06, "loss": 0.0005, "step": 1410 }, { "epoch": 9.0448, "grad_norm": 0.012844194995988387, "learning_rate": 1.0686482661004954e-06, "loss": 0.0005, "step": 1420 }, { "epoch": 9.1088, "grad_norm": 0.020036814618503134, "learning_rate": 9.978768577494694e-07, "loss": 0.0005, "step": 1430 }, { "epoch": 9.1728, "grad_norm": 0.00765325259778791, "learning_rate": 9.271054493984431e-07, "loss": 0.0004, "step": 1440 }, { "epoch": 9.2368, "grad_norm": 0.01507156630584454, "learning_rate": 8.563340410474169e-07, "loss": 0.0005, "step": 1450 }, { "epoch": 9.3008, "grad_norm": 0.012047267490532228, "learning_rate": 7.855626326963907e-07, "loss": 0.0005, "step": 1460 }, { "epoch": 9.3648, "grad_norm": 0.016105089807538026, "learning_rate": 7.147912243453645e-07, "loss": 0.0005, "step": 1470 }, { "epoch": 9.4288, "grad_norm": 0.014283181711916166, "learning_rate": 6.440198159943384e-07, "loss": 0.0005, "step": 1480 }, { "epoch": 9.4928, "grad_norm": 0.009125720569816429, "learning_rate": 5.732484076433121e-07, "loss": 0.0005, "step": 1490 }, { "epoch": 9.556799999999999, "grad_norm": 0.011829314376872695, "learning_rate": 5.024769992922859e-07, "loss": 0.0005, "step": 1500 }, { "epoch": 9.6208, "grad_norm": 0.012106395568977475, "learning_rate": 4.317055909412598e-07, "loss": 0.0004, "step": 1510 }, { "epoch": 9.6848, "grad_norm": 0.011903236971213026, "learning_rate": 3.6093418259023354e-07, "loss": 0.0005, "step": 1520 }, { "epoch": 9.7488, "grad_norm": 0.013644455361646967, "learning_rate": 2.901627742392074e-07, "loss": 0.0005, "step": 1530 }, { "epoch": 9.8128, "grad_norm": 0.014986000780857893, "learning_rate": 2.1939136588818118e-07, "loss": 0.0005, "step": 1540 }, { "epoch": 9.8768, "grad_norm": 0.017539322056519816, "learning_rate": 1.4861995753715502e-07, "loss": 0.0005, "step": 1550 }, { "epoch": 9.9408, "grad_norm": 0.016233742810560718, "learning_rate": 7.784854918612881e-08, "loss": 0.0004, "step": 1560 }, { "epoch": 10.0, "grad_norm": 0.0117183501383805, "learning_rate": 7.077140835102619e-09, "loss": 0.0005, "step": 1570 }, { "epoch": 10.0, "step": 1570, "total_flos": 538322966937600.0, "train_loss": 0.0024459090680002595, "train_runtime": 33809.8122, "train_samples_per_second": 5.915, "train_steps_per_second": 0.046 } ], "logging_steps": 10, "max_steps": 1570, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 538322966937600.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }