{ "best_metric": 0.325932115316391, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.2876404494382023, "eval_steps": 25, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014382022471910113, "grad_norm": 0.2460157573223114, "learning_rate": 2.9999999999999997e-05, "loss": 0.3715, "step": 1 }, { "epoch": 0.0014382022471910113, "eval_loss": 0.6134927868843079, "eval_runtime": 7.1753, "eval_samples_per_second": 6.968, "eval_steps_per_second": 6.968, "step": 1 }, { "epoch": 0.0028764044943820227, "grad_norm": 0.2635881006717682, "learning_rate": 5.9999999999999995e-05, "loss": 0.3798, "step": 2 }, { "epoch": 0.004314606741573033, "grad_norm": 0.2805251479148865, "learning_rate": 8.999999999999999e-05, "loss": 0.4453, "step": 3 }, { "epoch": 0.005752808988764045, "grad_norm": 0.308766633272171, "learning_rate": 0.00011999999999999999, "loss": 0.4679, "step": 4 }, { "epoch": 0.0071910112359550565, "grad_norm": 0.33456265926361084, "learning_rate": 0.00015, "loss": 0.4702, "step": 5 }, { "epoch": 0.008629213483146067, "grad_norm": 0.2737188935279846, "learning_rate": 0.00017999999999999998, "loss": 0.4228, "step": 6 }, { "epoch": 0.010067415730337079, "grad_norm": 0.25884535908699036, "learning_rate": 0.00020999999999999998, "loss": 0.4558, "step": 7 }, { "epoch": 0.01150561797752809, "grad_norm": 0.36323437094688416, "learning_rate": 0.00023999999999999998, "loss": 0.4488, "step": 8 }, { "epoch": 0.012943820224719101, "grad_norm": 0.38317617774009705, "learning_rate": 0.00027, "loss": 0.4029, "step": 9 }, { "epoch": 0.014382022471910113, "grad_norm": 0.29558563232421875, "learning_rate": 0.0003, "loss": 0.4254, "step": 10 }, { "epoch": 0.015820224719101123, "grad_norm": 0.20489034056663513, "learning_rate": 0.0002999794957488703, "loss": 0.409, "step": 11 }, { "epoch": 0.017258426966292133, "grad_norm": 0.18313074111938477, "learning_rate": 0.0002999179886011389, "loss": 0.3964, "step": 12 }, { "epoch": 0.018696629213483147, "grad_norm": 0.1963389664888382, "learning_rate": 0.0002998154953722457, "loss": 0.3476, "step": 13 }, { "epoch": 0.020134831460674157, "grad_norm": 0.18995118141174316, "learning_rate": 0.00029967204408281613, "loss": 0.3894, "step": 14 }, { "epoch": 0.021573033707865168, "grad_norm": 0.18869389593601227, "learning_rate": 0.00029948767395100045, "loss": 0.3947, "step": 15 }, { "epoch": 0.02301123595505618, "grad_norm": 0.18127363920211792, "learning_rate": 0.0002992624353817517, "loss": 0.374, "step": 16 }, { "epoch": 0.02444943820224719, "grad_norm": 0.21948474645614624, "learning_rate": 0.0002989963899530457, "loss": 0.3968, "step": 17 }, { "epoch": 0.025887640449438202, "grad_norm": 0.18607376515865326, "learning_rate": 0.00029868961039904624, "loss": 0.4043, "step": 18 }, { "epoch": 0.027325842696629212, "grad_norm": 0.17366155982017517, "learning_rate": 0.00029834218059022024, "loss": 0.3499, "step": 19 }, { "epoch": 0.028764044943820226, "grad_norm": 0.18901684880256653, "learning_rate": 0.00029795419551040833, "loss": 0.3942, "step": 20 }, { "epoch": 0.030202247191011236, "grad_norm": 0.1909410059452057, "learning_rate": 0.00029752576123085736, "loss": 0.3601, "step": 21 }, { "epoch": 0.031640449438202246, "grad_norm": 0.20346027612686157, "learning_rate": 0.0002970569948812214, "loss": 0.3527, "step": 22 }, { "epoch": 0.03307865168539326, "grad_norm": 0.1950492113828659, "learning_rate": 0.0002965480246175399, "loss": 0.4546, "step": 23 }, { "epoch": 0.03451685393258427, "grad_norm": 0.1822466254234314, "learning_rate": 0.0002959989895872009, "loss": 0.3361, "step": 24 }, { "epoch": 0.035955056179775284, "grad_norm": 0.20171159505844116, "learning_rate": 0.0002954100398908995, "loss": 0.3685, "step": 25 }, { "epoch": 0.035955056179775284, "eval_loss": 0.3542916774749756, "eval_runtime": 7.3398, "eval_samples_per_second": 6.812, "eval_steps_per_second": 6.812, "step": 25 }, { "epoch": 0.037393258426966294, "grad_norm": 0.20236289501190186, "learning_rate": 0.0002947813365416023, "loss": 0.3725, "step": 26 }, { "epoch": 0.038831460674157305, "grad_norm": 0.20255300402641296, "learning_rate": 0.0002941130514205272, "loss": 0.3873, "step": 27 }, { "epoch": 0.040269662921348315, "grad_norm": 0.20497362315654755, "learning_rate": 0.0002934053672301536, "loss": 0.3685, "step": 28 }, { "epoch": 0.041707865168539325, "grad_norm": 0.18622219562530518, "learning_rate": 0.00029265847744427303, "loss": 0.3764, "step": 29 }, { "epoch": 0.043146067415730335, "grad_norm": 0.22534993290901184, "learning_rate": 0.00029187258625509513, "loss": 0.4209, "step": 30 }, { "epoch": 0.044584269662921346, "grad_norm": 0.2199658304452896, "learning_rate": 0.00029104790851742417, "loss": 0.4027, "step": 31 }, { "epoch": 0.04602247191011236, "grad_norm": 0.19549360871315002, "learning_rate": 0.0002901846696899191, "loss": 0.372, "step": 32 }, { "epoch": 0.04746067415730337, "grad_norm": 0.22447125613689423, "learning_rate": 0.00028928310577345606, "loss": 0.3894, "step": 33 }, { "epoch": 0.04889887640449438, "grad_norm": 0.23258201777935028, "learning_rate": 0.0002883434632466077, "loss": 0.4255, "step": 34 }, { "epoch": 0.050337078651685394, "grad_norm": 0.1966194063425064, "learning_rate": 0.00028736599899825856, "loss": 0.3194, "step": 35 }, { "epoch": 0.051775280898876404, "grad_norm": 0.20393122732639313, "learning_rate": 0.00028635098025737434, "loss": 0.3292, "step": 36 }, { "epoch": 0.053213483146067414, "grad_norm": 0.21718700230121613, "learning_rate": 0.00028529868451994384, "loss": 0.3747, "step": 37 }, { "epoch": 0.054651685393258424, "grad_norm": 0.2381013184785843, "learning_rate": 0.0002842093994731145, "loss": 0.3792, "step": 38 }, { "epoch": 0.05608988764044944, "grad_norm": 0.2573167383670807, "learning_rate": 0.00028308342291654174, "loss": 0.3838, "step": 39 }, { "epoch": 0.05752808988764045, "grad_norm": 0.2560511827468872, "learning_rate": 0.00028192106268097334, "loss": 0.4291, "step": 40 }, { "epoch": 0.05896629213483146, "grad_norm": 0.2690950334072113, "learning_rate": 0.00028072263654409154, "loss": 0.4018, "step": 41 }, { "epoch": 0.06040449438202247, "grad_norm": 0.26220467686653137, "learning_rate": 0.0002794884721436361, "loss": 0.3727, "step": 42 }, { "epoch": 0.06184269662921348, "grad_norm": 0.27452510595321655, "learning_rate": 0.00027821890688783083, "loss": 0.361, "step": 43 }, { "epoch": 0.06328089887640449, "grad_norm": 0.2721319794654846, "learning_rate": 0.0002769142878631403, "loss": 0.3902, "step": 44 }, { "epoch": 0.06471910112359551, "grad_norm": 0.2733122706413269, "learning_rate": 0.00027557497173937923, "loss": 0.383, "step": 45 }, { "epoch": 0.06615730337078651, "grad_norm": 0.304383248090744, "learning_rate": 0.000274201324672203, "loss": 0.3961, "step": 46 }, { "epoch": 0.06759550561797753, "grad_norm": 0.2619877755641937, "learning_rate": 0.00027279372220300385, "loss": 0.2951, "step": 47 }, { "epoch": 0.06903370786516853, "grad_norm": 0.3123132288455963, "learning_rate": 0.0002713525491562421, "loss": 0.4074, "step": 48 }, { "epoch": 0.07047191011235955, "grad_norm": 0.32356810569763184, "learning_rate": 0.00026987819953423867, "loss": 0.3643, "step": 49 }, { "epoch": 0.07191011235955057, "grad_norm": 0.42503225803375244, "learning_rate": 0.00026837107640945905, "loss": 0.4686, "step": 50 }, { "epoch": 0.07191011235955057, "eval_loss": 0.358296275138855, "eval_runtime": 7.3369, "eval_samples_per_second": 6.815, "eval_steps_per_second": 6.815, "step": 50 }, { "epoch": 0.07334831460674157, "grad_norm": 0.24504470825195312, "learning_rate": 0.0002668315918143169, "loss": 0.2738, "step": 51 }, { "epoch": 0.07478651685393259, "grad_norm": 0.20398622751235962, "learning_rate": 0.00026526016662852886, "loss": 0.3004, "step": 52 }, { "epoch": 0.07622471910112359, "grad_norm": 0.1633155792951584, "learning_rate": 0.00026365723046405023, "loss": 0.2947, "step": 53 }, { "epoch": 0.07766292134831461, "grad_norm": 0.15001870691776276, "learning_rate": 0.0002620232215476231, "loss": 0.3256, "step": 54 }, { "epoch": 0.07910112359550561, "grad_norm": 0.15909050405025482, "learning_rate": 0.0002603585866009697, "loss": 0.345, "step": 55 }, { "epoch": 0.08053932584269663, "grad_norm": 0.16075675189495087, "learning_rate": 0.00025866378071866334, "loss": 0.3026, "step": 56 }, { "epoch": 0.08197752808988765, "grad_norm": 0.16503892838954926, "learning_rate": 0.00025693926724370956, "loss": 0.3458, "step": 57 }, { "epoch": 0.08341573033707865, "grad_norm": 0.1700170785188675, "learning_rate": 0.00025518551764087326, "loss": 0.3086, "step": 58 }, { "epoch": 0.08485393258426967, "grad_norm": 0.17736107110977173, "learning_rate": 0.00025340301136778483, "loss": 0.3636, "step": 59 }, { "epoch": 0.08629213483146067, "grad_norm": 0.18760119378566742, "learning_rate": 0.00025159223574386114, "loss": 0.3795, "step": 60 }, { "epoch": 0.08773033707865169, "grad_norm": 0.19076241552829742, "learning_rate": 0.0002497536858170772, "loss": 0.3814, "step": 61 }, { "epoch": 0.08916853932584269, "grad_norm": 0.19734014570713043, "learning_rate": 0.00024788786422862526, "loss": 0.3976, "step": 62 }, { "epoch": 0.09060674157303371, "grad_norm": 0.16381360590457916, "learning_rate": 0.00024599528107549745, "loss": 0.3401, "step": 63 }, { "epoch": 0.09204494382022473, "grad_norm": 0.1795213371515274, "learning_rate": 0.00024407645377103054, "loss": 0.3616, "step": 64 }, { "epoch": 0.09348314606741573, "grad_norm": 0.18468116223812103, "learning_rate": 0.00024213190690345018, "loss": 0.3528, "step": 65 }, { "epoch": 0.09492134831460675, "grad_norm": 0.18842492997646332, "learning_rate": 0.00024016217209245374, "loss": 0.405, "step": 66 }, { "epoch": 0.09635955056179775, "grad_norm": 0.2019750028848648, "learning_rate": 0.00023816778784387094, "loss": 0.4458, "step": 67 }, { "epoch": 0.09779775280898877, "grad_norm": 0.18534868955612183, "learning_rate": 0.0002361492994024415, "loss": 0.3452, "step": 68 }, { "epoch": 0.09923595505617977, "grad_norm": 0.20960098505020142, "learning_rate": 0.0002341072586027509, "loss": 0.3636, "step": 69 }, { "epoch": 0.10067415730337079, "grad_norm": 0.19326113164424896, "learning_rate": 0.00023204222371836405, "loss": 0.361, "step": 70 }, { "epoch": 0.1021123595505618, "grad_norm": 0.20372538268566132, "learning_rate": 0.00022995475930919905, "loss": 0.4194, "step": 71 }, { "epoch": 0.10355056179775281, "grad_norm": 0.19035255908966064, "learning_rate": 0.00022784543606718227, "loss": 0.3849, "step": 72 }, { "epoch": 0.10498876404494382, "grad_norm": 0.19900982081890106, "learning_rate": 0.00022571483066022657, "loss": 0.3896, "step": 73 }, { "epoch": 0.10642696629213483, "grad_norm": 0.18629737198352814, "learning_rate": 0.0002235635255745762, "loss": 0.3694, "step": 74 }, { "epoch": 0.10786516853932585, "grad_norm": 0.18903249502182007, "learning_rate": 0.00022139210895556104, "loss": 0.3426, "step": 75 }, { "epoch": 0.10786516853932585, "eval_loss": 0.3397490382194519, "eval_runtime": 7.3379, "eval_samples_per_second": 6.814, "eval_steps_per_second": 6.814, "step": 75 }, { "epoch": 0.10930337078651685, "grad_norm": 0.19008047878742218, "learning_rate": 0.00021920117444680317, "loss": 0.3548, "step": 76 }, { "epoch": 0.11074157303370787, "grad_norm": 0.20341919362545013, "learning_rate": 0.00021699132102792097, "loss": 0.3996, "step": 77 }, { "epoch": 0.11217977528089888, "grad_norm": 0.2073621153831482, "learning_rate": 0.0002147631528507739, "loss": 0.3767, "step": 78 }, { "epoch": 0.11361797752808989, "grad_norm": 0.2078581303358078, "learning_rate": 0.00021251727907429355, "loss": 0.3898, "step": 79 }, { "epoch": 0.1150561797752809, "grad_norm": 0.20783011615276337, "learning_rate": 0.0002102543136979454, "loss": 0.3451, "step": 80 }, { "epoch": 0.1164943820224719, "grad_norm": 0.2205270677804947, "learning_rate": 0.0002079748753938678, "loss": 0.4023, "step": 81 }, { "epoch": 0.11793258426966292, "grad_norm": 0.20485907793045044, "learning_rate": 0.0002056795873377331, "loss": 0.3301, "step": 82 }, { "epoch": 0.11937078651685393, "grad_norm": 0.2091403603553772, "learning_rate": 0.00020336907703837748, "loss": 0.3565, "step": 83 }, { "epoch": 0.12080898876404494, "grad_norm": 0.2295161336660385, "learning_rate": 0.00020104397616624645, "loss": 0.3882, "step": 84 }, { "epoch": 0.12224719101123596, "grad_norm": 0.21091902256011963, "learning_rate": 0.00019870492038070252, "loss": 0.3423, "step": 85 }, { "epoch": 0.12368539325842696, "grad_norm": 0.22040095925331116, "learning_rate": 0.0001963525491562421, "loss": 0.3695, "step": 86 }, { "epoch": 0.12512359550561797, "grad_norm": 0.21162955462932587, "learning_rate": 0.0001939875056076697, "loss": 0.3529, "step": 87 }, { "epoch": 0.12656179775280899, "grad_norm": 0.24340693652629852, "learning_rate": 0.00019161043631427666, "loss": 0.3945, "step": 88 }, { "epoch": 0.128, "grad_norm": 0.228141188621521, "learning_rate": 0.00018922199114307294, "loss": 0.3394, "step": 89 }, { "epoch": 0.12943820224719102, "grad_norm": 0.25084877014160156, "learning_rate": 0.00018682282307111987, "loss": 0.3794, "step": 90 }, { "epoch": 0.130876404494382, "grad_norm": 0.2509770393371582, "learning_rate": 0.00018441358800701273, "loss": 0.3936, "step": 91 }, { "epoch": 0.13231460674157303, "grad_norm": 0.2724292576313019, "learning_rate": 0.00018199494461156203, "loss": 0.4054, "step": 92 }, { "epoch": 0.13375280898876404, "grad_norm": 0.2628544569015503, "learning_rate": 0.000179567554117722, "loss": 0.41, "step": 93 }, { "epoch": 0.13519101123595506, "grad_norm": 0.2531772553920746, "learning_rate": 0.00017713208014981648, "loss": 0.3279, "step": 94 }, { "epoch": 0.13662921348314608, "grad_norm": 0.26911845803260803, "learning_rate": 0.00017468918854211007, "loss": 0.3622, "step": 95 }, { "epoch": 0.13806741573033707, "grad_norm": 0.26984256505966187, "learning_rate": 0.00017223954715677627, "loss": 0.3243, "step": 96 }, { "epoch": 0.13950561797752808, "grad_norm": 0.27123814821243286, "learning_rate": 0.00016978382570131034, "loss": 0.3311, "step": 97 }, { "epoch": 0.1409438202247191, "grad_norm": 0.3036772906780243, "learning_rate": 0.00016732269554543794, "loss": 0.3514, "step": 98 }, { "epoch": 0.14238202247191012, "grad_norm": 0.3050975203514099, "learning_rate": 0.00016485682953756942, "loss": 0.3491, "step": 99 }, { "epoch": 0.14382022471910114, "grad_norm": 0.3501208424568176, "learning_rate": 0.00016238690182084986, "loss": 0.3431, "step": 100 }, { "epoch": 0.14382022471910114, "eval_loss": 0.3474767804145813, "eval_runtime": 7.3384, "eval_samples_per_second": 6.813, "eval_steps_per_second": 6.813, "step": 100 }, { "epoch": 0.14525842696629213, "grad_norm": 0.2047216147184372, "learning_rate": 0.0001599135876488549, "loss": 0.2478, "step": 101 }, { "epoch": 0.14669662921348314, "grad_norm": 0.22239995002746582, "learning_rate": 0.00015743756320098332, "loss": 0.3139, "step": 102 }, { "epoch": 0.14813483146067416, "grad_norm": 0.18037539720535278, "learning_rate": 0.0001549595053975962, "loss": 0.2951, "step": 103 }, { "epoch": 0.14957303370786518, "grad_norm": 0.1970859169960022, "learning_rate": 0.00015248009171495378, "loss": 0.3285, "step": 104 }, { "epoch": 0.15101123595505617, "grad_norm": 0.17473134398460388, "learning_rate": 0.00015, "loss": 0.3446, "step": 105 }, { "epoch": 0.15244943820224718, "grad_norm": 0.14631038904190063, "learning_rate": 0.00014751990828504622, "loss": 0.3021, "step": 106 }, { "epoch": 0.1538876404494382, "grad_norm": 0.16100949048995972, "learning_rate": 0.00014504049460240375, "loss": 0.3642, "step": 107 }, { "epoch": 0.15532584269662922, "grad_norm": 0.14648860692977905, "learning_rate": 0.00014256243679901663, "loss": 0.2866, "step": 108 }, { "epoch": 0.15676404494382024, "grad_norm": 0.17564325034618378, "learning_rate": 0.00014008641235114508, "loss": 0.3579, "step": 109 }, { "epoch": 0.15820224719101122, "grad_norm": 0.17189237475395203, "learning_rate": 0.00013761309817915014, "loss": 0.3292, "step": 110 }, { "epoch": 0.15964044943820224, "grad_norm": 0.16522134840488434, "learning_rate": 0.00013514317046243058, "loss": 0.3429, "step": 111 }, { "epoch": 0.16107865168539326, "grad_norm": 0.1639784872531891, "learning_rate": 0.00013267730445456208, "loss": 0.3127, "step": 112 }, { "epoch": 0.16251685393258428, "grad_norm": 0.1672915667295456, "learning_rate": 0.00013021617429868963, "loss": 0.3543, "step": 113 }, { "epoch": 0.1639550561797753, "grad_norm": 0.16919255256652832, "learning_rate": 0.00012776045284322368, "loss": 0.3759, "step": 114 }, { "epoch": 0.16539325842696628, "grad_norm": 0.16773313283920288, "learning_rate": 0.00012531081145788987, "loss": 0.3501, "step": 115 }, { "epoch": 0.1668314606741573, "grad_norm": 0.1864137202501297, "learning_rate": 0.00012286791985018355, "loss": 0.3724, "step": 116 }, { "epoch": 0.16826966292134832, "grad_norm": 0.18314597010612488, "learning_rate": 0.00012043244588227796, "loss": 0.3763, "step": 117 }, { "epoch": 0.16970786516853933, "grad_norm": 0.19960667192935944, "learning_rate": 0.00011800505538843798, "loss": 0.3528, "step": 118 }, { "epoch": 0.17114606741573032, "grad_norm": 0.18872325122356415, "learning_rate": 0.00011558641199298727, "loss": 0.3591, "step": 119 }, { "epoch": 0.17258426966292134, "grad_norm": 0.20653153955936432, "learning_rate": 0.00011317717692888012, "loss": 0.4643, "step": 120 }, { "epoch": 0.17402247191011236, "grad_norm": 0.18767227232456207, "learning_rate": 0.00011077800885692702, "loss": 0.3649, "step": 121 }, { "epoch": 0.17546067415730338, "grad_norm": 0.17155469954013824, "learning_rate": 0.00010838956368572334, "loss": 0.3181, "step": 122 }, { "epoch": 0.1768988764044944, "grad_norm": 0.1725505292415619, "learning_rate": 0.0001060124943923303, "loss": 0.3498, "step": 123 }, { "epoch": 0.17833707865168538, "grad_norm": 0.20052678883075714, "learning_rate": 0.0001036474508437579, "loss": 0.4044, "step": 124 }, { "epoch": 0.1797752808988764, "grad_norm": 0.18122945725917816, "learning_rate": 0.00010129507961929748, "loss": 0.3538, "step": 125 }, { "epoch": 0.1797752808988764, "eval_loss": 0.33110183477401733, "eval_runtime": 7.3408, "eval_samples_per_second": 6.811, "eval_steps_per_second": 6.811, "step": 125 }, { "epoch": 0.18121348314606742, "grad_norm": 0.1839105784893036, "learning_rate": 9.895602383375353e-05, "loss": 0.3226, "step": 126 }, { "epoch": 0.18265168539325843, "grad_norm": 0.18478575348854065, "learning_rate": 9.663092296162251e-05, "loss": 0.3455, "step": 127 }, { "epoch": 0.18408988764044945, "grad_norm": 0.19540473818778992, "learning_rate": 9.432041266226686e-05, "loss": 0.3677, "step": 128 }, { "epoch": 0.18552808988764044, "grad_norm": 0.19050459563732147, "learning_rate": 9.202512460613219e-05, "loss": 0.3518, "step": 129 }, { "epoch": 0.18696629213483146, "grad_norm": 0.18181604146957397, "learning_rate": 8.97456863020546e-05, "loss": 0.2808, "step": 130 }, { "epoch": 0.18840449438202247, "grad_norm": 0.2084735631942749, "learning_rate": 8.748272092570646e-05, "loss": 0.3091, "step": 131 }, { "epoch": 0.1898426966292135, "grad_norm": 0.20461665093898773, "learning_rate": 8.523684714922608e-05, "loss": 0.3514, "step": 132 }, { "epoch": 0.19128089887640448, "grad_norm": 0.2227044254541397, "learning_rate": 8.300867897207903e-05, "loss": 0.329, "step": 133 }, { "epoch": 0.1927191011235955, "grad_norm": 0.20998114347457886, "learning_rate": 8.079882555319684e-05, "loss": 0.3346, "step": 134 }, { "epoch": 0.19415730337078652, "grad_norm": 0.2123170793056488, "learning_rate": 7.860789104443896e-05, "loss": 0.3527, "step": 135 }, { "epoch": 0.19559550561797753, "grad_norm": 0.21765443682670593, "learning_rate": 7.643647442542382e-05, "loss": 0.3296, "step": 136 }, { "epoch": 0.19703370786516855, "grad_norm": 0.23418861627578735, "learning_rate": 7.428516933977347e-05, "loss": 0.3508, "step": 137 }, { "epoch": 0.19847191011235954, "grad_norm": 0.23621684312820435, "learning_rate": 7.215456393281776e-05, "loss": 0.3344, "step": 138 }, { "epoch": 0.19991011235955056, "grad_norm": 0.23977969586849213, "learning_rate": 7.004524069080096e-05, "loss": 0.3696, "step": 139 }, { "epoch": 0.20134831460674157, "grad_norm": 0.23609517514705658, "learning_rate": 6.795777628163599e-05, "loss": 0.3606, "step": 140 }, { "epoch": 0.2027865168539326, "grad_norm": 0.2567392587661743, "learning_rate": 6.58927413972491e-05, "loss": 0.3665, "step": 141 }, { "epoch": 0.2042247191011236, "grad_norm": 0.24417299032211304, "learning_rate": 6.385070059755846e-05, "loss": 0.3254, "step": 142 }, { "epoch": 0.2056629213483146, "grad_norm": 0.2223217934370041, "learning_rate": 6.183221215612904e-05, "loss": 0.3126, "step": 143 }, { "epoch": 0.20710112359550562, "grad_norm": 0.24945496022701263, "learning_rate": 5.983782790754623e-05, "loss": 0.3164, "step": 144 }, { "epoch": 0.20853932584269663, "grad_norm": 0.26994356513023376, "learning_rate": 5.786809309654982e-05, "loss": 0.317, "step": 145 }, { "epoch": 0.20997752808988765, "grad_norm": 0.23470191657543182, "learning_rate": 5.592354622896944e-05, "loss": 0.3044, "step": 146 }, { "epoch": 0.21141573033707864, "grad_norm": 0.2303595095872879, "learning_rate": 5.40047189245025e-05, "loss": 0.2865, "step": 147 }, { "epoch": 0.21285393258426966, "grad_norm": 0.2797142267227173, "learning_rate": 5.211213577137469e-05, "loss": 0.3366, "step": 148 }, { "epoch": 0.21429213483146067, "grad_norm": 0.2945963740348816, "learning_rate": 5.024631418292274e-05, "loss": 0.3129, "step": 149 }, { "epoch": 0.2157303370786517, "grad_norm": 0.37612637877464294, "learning_rate": 4.840776425613886e-05, "loss": 0.3869, "step": 150 }, { "epoch": 0.2157303370786517, "eval_loss": 0.32923999428749084, "eval_runtime": 7.3393, "eval_samples_per_second": 6.813, "eval_steps_per_second": 6.813, "step": 150 }, { "epoch": 0.2171685393258427, "grad_norm": 0.1282893419265747, "learning_rate": 4.659698863221513e-05, "loss": 0.2459, "step": 151 }, { "epoch": 0.2186067415730337, "grad_norm": 0.14603035151958466, "learning_rate": 4.481448235912671e-05, "loss": 0.2796, "step": 152 }, { "epoch": 0.22004494382022471, "grad_norm": 0.14428257942199707, "learning_rate": 4.306073275629044e-05, "loss": 0.291, "step": 153 }, { "epoch": 0.22148314606741573, "grad_norm": 0.14837384223937988, "learning_rate": 4.133621928133665e-05, "loss": 0.3257, "step": 154 }, { "epoch": 0.22292134831460675, "grad_norm": 0.15859010815620422, "learning_rate": 3.964141339903026e-05, "loss": 0.3246, "step": 155 }, { "epoch": 0.22435955056179777, "grad_norm": 0.15000857412815094, "learning_rate": 3.797677845237696e-05, "loss": 0.3206, "step": 156 }, { "epoch": 0.22579775280898876, "grad_norm": 0.15084688365459442, "learning_rate": 3.634276953594982e-05, "loss": 0.3086, "step": 157 }, { "epoch": 0.22723595505617977, "grad_norm": 0.14977791905403137, "learning_rate": 3.473983337147118e-05, "loss": 0.3171, "step": 158 }, { "epoch": 0.2286741573033708, "grad_norm": 0.154226154088974, "learning_rate": 3.316840818568315e-05, "loss": 0.3541, "step": 159 }, { "epoch": 0.2301123595505618, "grad_norm": 0.1479630470275879, "learning_rate": 3.162892359054098e-05, "loss": 0.316, "step": 160 }, { "epoch": 0.2315505617977528, "grad_norm": 0.1592627912759781, "learning_rate": 3.0121800465761293e-05, "loss": 0.3356, "step": 161 }, { "epoch": 0.2329887640449438, "grad_norm": 0.16222472488880157, "learning_rate": 2.8647450843757897e-05, "loss": 0.3523, "step": 162 }, { "epoch": 0.23442696629213483, "grad_norm": 0.15601494908332825, "learning_rate": 2.7206277796996144e-05, "loss": 0.3282, "step": 163 }, { "epoch": 0.23586516853932585, "grad_norm": 0.15167421102523804, "learning_rate": 2.5798675327796993e-05, "loss": 0.3228, "step": 164 }, { "epoch": 0.23730337078651687, "grad_norm": 0.1610068380832672, "learning_rate": 2.4425028260620715e-05, "loss": 0.3536, "step": 165 }, { "epoch": 0.23874157303370785, "grad_norm": 0.17391842603683472, "learning_rate": 2.3085712136859668e-05, "loss": 0.3552, "step": 166 }, { "epoch": 0.24017977528089887, "grad_norm": 0.16592349112033844, "learning_rate": 2.178109311216913e-05, "loss": 0.3578, "step": 167 }, { "epoch": 0.2416179775280899, "grad_norm": 0.17902176082134247, "learning_rate": 2.0511527856363912e-05, "loss": 0.391, "step": 168 }, { "epoch": 0.2430561797752809, "grad_norm": 0.1601848155260086, "learning_rate": 1.927736345590839e-05, "loss": 0.3089, "step": 169 }, { "epoch": 0.24449438202247192, "grad_norm": 0.17364348471164703, "learning_rate": 1.8078937319026654e-05, "loss": 0.3645, "step": 170 }, { "epoch": 0.2459325842696629, "grad_norm": 0.1794712394475937, "learning_rate": 1.6916577083458228e-05, "loss": 0.3725, "step": 171 }, { "epoch": 0.24737078651685393, "grad_norm": 0.1813696026802063, "learning_rate": 1.579060052688548e-05, "loss": 0.3553, "step": 172 }, { "epoch": 0.24880898876404495, "grad_norm": 0.19984084367752075, "learning_rate": 1.4701315480056164e-05, "loss": 0.3278, "step": 173 }, { "epoch": 0.25024719101123594, "grad_norm": 0.17033952474594116, "learning_rate": 1.3649019742625623e-05, "loss": 0.324, "step": 174 }, { "epoch": 0.251685393258427, "grad_norm": 0.1826409548521042, "learning_rate": 1.2634001001741373e-05, "loss": 0.3577, "step": 175 }, { "epoch": 0.251685393258427, "eval_loss": 0.32624199986457825, "eval_runtime": 7.3358, "eval_samples_per_second": 6.816, "eval_steps_per_second": 6.816, "step": 175 }, { "epoch": 0.25312359550561797, "grad_norm": 0.17394211888313293, "learning_rate": 1.1656536753392287e-05, "loss": 0.3341, "step": 176 }, { "epoch": 0.254561797752809, "grad_norm": 0.18117299675941467, "learning_rate": 1.0716894226543953e-05, "loss": 0.3256, "step": 177 }, { "epoch": 0.256, "grad_norm": 0.17311780154705048, "learning_rate": 9.815330310080887e-06, "loss": 0.2887, "step": 178 }, { "epoch": 0.257438202247191, "grad_norm": 0.1852751523256302, "learning_rate": 8.952091482575824e-06, "loss": 0.3404, "step": 179 }, { "epoch": 0.25887640449438204, "grad_norm": 0.19263628125190735, "learning_rate": 8.127413744904804e-06, "loss": 0.3538, "step": 180 }, { "epoch": 0.26031460674157303, "grad_norm": 0.20452255010604858, "learning_rate": 7.34152255572697e-06, "loss": 0.3715, "step": 181 }, { "epoch": 0.261752808988764, "grad_norm": 0.2183782458305359, "learning_rate": 6.594632769846353e-06, "loss": 0.3805, "step": 182 }, { "epoch": 0.26319101123595506, "grad_norm": 0.20077747106552124, "learning_rate": 5.886948579472778e-06, "loss": 0.359, "step": 183 }, { "epoch": 0.26462921348314605, "grad_norm": 0.19744691252708435, "learning_rate": 5.218663458397715e-06, "loss": 0.3203, "step": 184 }, { "epoch": 0.2660674157303371, "grad_norm": 0.2061202973127365, "learning_rate": 4.589960109100444e-06, "loss": 0.3546, "step": 185 }, { "epoch": 0.2675056179775281, "grad_norm": 0.20638707280158997, "learning_rate": 4.001010412799138e-06, "loss": 0.337, "step": 186 }, { "epoch": 0.2689438202247191, "grad_norm": 0.21961137652397156, "learning_rate": 3.451975382460109e-06, "loss": 0.3902, "step": 187 }, { "epoch": 0.2703820224719101, "grad_norm": 0.22892671823501587, "learning_rate": 2.9430051187785962e-06, "loss": 0.3423, "step": 188 }, { "epoch": 0.2718202247191011, "grad_norm": 0.21468259394168854, "learning_rate": 2.4742387691426445e-06, "loss": 0.3358, "step": 189 }, { "epoch": 0.27325842696629216, "grad_norm": 0.20298202335834503, "learning_rate": 2.0458044895916513e-06, "loss": 0.2882, "step": 190 }, { "epoch": 0.27469662921348315, "grad_norm": 0.23409554362297058, "learning_rate": 1.6578194097797258e-06, "loss": 0.3673, "step": 191 }, { "epoch": 0.27613483146067413, "grad_norm": 0.21903392672538757, "learning_rate": 1.3103896009537207e-06, "loss": 0.3052, "step": 192 }, { "epoch": 0.2775730337078652, "grad_norm": 0.23991554975509644, "learning_rate": 1.0036100469542786e-06, "loss": 0.3609, "step": 193 }, { "epoch": 0.27901123595505617, "grad_norm": 0.23952968418598175, "learning_rate": 7.375646182482875e-07, "loss": 0.3325, "step": 194 }, { "epoch": 0.2804494382022472, "grad_norm": 0.2451406568288803, "learning_rate": 5.123260489995229e-07, "loss": 0.3273, "step": 195 }, { "epoch": 0.2818876404494382, "grad_norm": 0.26098620891571045, "learning_rate": 3.2795591718381975e-07, "loss": 0.3254, "step": 196 }, { "epoch": 0.2833258426966292, "grad_norm": 0.28347116708755493, "learning_rate": 1.8450462775428942e-07, "loss": 0.3497, "step": 197 }, { "epoch": 0.28476404494382024, "grad_norm": 0.28191664814949036, "learning_rate": 8.201139886109264e-08, "loss": 0.3397, "step": 198 }, { "epoch": 0.2862022471910112, "grad_norm": 0.3383502960205078, "learning_rate": 2.0504251129649374e-08, "loss": 0.3807, "step": 199 }, { "epoch": 0.2876404494382023, "grad_norm": 0.39992961287498474, "learning_rate": 0.0, "loss": 0.4089, "step": 200 }, { "epoch": 0.2876404494382023, "eval_loss": 0.325932115316391, "eval_runtime": 7.335, "eval_samples_per_second": 6.817, "eval_steps_per_second": 6.817, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.159973069093601e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }