{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.315955766192733, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00105318588730911, "grad_norm": 1.3188265562057495, "learning_rate": 0.0, "loss": 12.2439, "num_input_tokens_seen": 2363056, "step": 1 }, { "epoch": 0.00210637177461822, "grad_norm": 1.1604911088943481, "learning_rate": 8.333333333333334e-06, "loss": 13.6505, "num_input_tokens_seen": 4914924, "step": 2 }, { "epoch": 0.00315955766192733, "grad_norm": 1.0786586999893188, "learning_rate": 1.6666666666666667e-05, "loss": 12.7472, "num_input_tokens_seen": 7350666, "step": 3 }, { "epoch": 0.00421274354923644, "grad_norm": 0.9630117416381836, "learning_rate": 2.5e-05, "loss": 12.9927, "num_input_tokens_seen": 9911760, "step": 4 }, { "epoch": 0.0052659294365455505, "grad_norm": 1.2464433908462524, "learning_rate": 3.3333333333333335e-05, "loss": 13.0706, "num_input_tokens_seen": 12460110, "step": 5 }, { "epoch": 0.00631911532385466, "grad_norm": 1.4381663799285889, "learning_rate": 4.1666666666666665e-05, "loss": 13.5137, "num_input_tokens_seen": 14941884, "step": 6 }, { "epoch": 0.00737230121116377, "grad_norm": 0.9651690125465393, "learning_rate": 5e-05, "loss": 12.9703, "num_input_tokens_seen": 17514914, "step": 7 }, { "epoch": 0.00842548709847288, "grad_norm": 1.1015037298202515, "learning_rate": 5.833333333333333e-05, "loss": 10.8888, "num_input_tokens_seen": 20142298, "step": 8 }, { "epoch": 0.009478672985781991, "grad_norm": 1.195784091949463, "learning_rate": 6.666666666666667e-05, "loss": 12.1476, "num_input_tokens_seen": 22656144, "step": 9 }, { "epoch": 0.010531858873091101, "grad_norm": 1.1983633041381836, "learning_rate": 7.5e-05, "loss": 13.4884, "num_input_tokens_seen": 25175402, "step": 10 }, { "epoch": 0.01158504476040021, "grad_norm": 1.2181470394134521, "learning_rate": 8.333333333333333e-05, "loss": 11.6243, "num_input_tokens_seen": 27603302, "step": 11 }, { "epoch": 0.01263823064770932, "grad_norm": 1.1232670545578003, "learning_rate": 9.166666666666667e-05, "loss": 12.1416, "num_input_tokens_seen": 30063438, "step": 12 }, { "epoch": 0.01369141653501843, "grad_norm": 1.4114834070205688, "learning_rate": 0.0001, "loss": 12.2753, "num_input_tokens_seen": 32496926, "step": 13 }, { "epoch": 0.01474460242232754, "grad_norm": 1.1449567079544067, "learning_rate": 0.00010833333333333334, "loss": 12.4263, "num_input_tokens_seen": 34824490, "step": 14 }, { "epoch": 0.01579778830963665, "grad_norm": 1.307713270187378, "learning_rate": 0.00011666666666666667, "loss": 11.6708, "num_input_tokens_seen": 37474392, "step": 15 }, { "epoch": 0.01685097419694576, "grad_norm": 0.8677361011505127, "learning_rate": 0.000125, "loss": 12.8925, "num_input_tokens_seen": 40079890, "step": 16 }, { "epoch": 0.01790416008425487, "grad_norm": 1.3970224857330322, "learning_rate": 0.00013333333333333334, "loss": 12.7623, "num_input_tokens_seen": 42483202, "step": 17 }, { "epoch": 0.018957345971563982, "grad_norm": 1.0275459289550781, "learning_rate": 0.00014166666666666668, "loss": 13.5152, "num_input_tokens_seen": 44983536, "step": 18 }, { "epoch": 0.020010531858873092, "grad_norm": 1.1338393688201904, "learning_rate": 0.00015, "loss": 13.4871, "num_input_tokens_seen": 47556326, "step": 19 }, { "epoch": 0.021063717746182202, "grad_norm": 1.1492358446121216, "learning_rate": 0.00015833333333333332, "loss": 12.854, "num_input_tokens_seen": 49984930, "step": 20 }, { "epoch": 0.022116903633491312, "grad_norm": 1.051015019416809, "learning_rate": 0.00016666666666666666, "loss": 11.8565, "num_input_tokens_seen": 52475706, "step": 21 }, { "epoch": 0.02317008952080042, "grad_norm": 1.0687230825424194, "learning_rate": 0.000175, "loss": 12.3539, "num_input_tokens_seen": 54931676, "step": 22 }, { "epoch": 0.02422327540810953, "grad_norm": 1.1236428022384644, "learning_rate": 0.00018333333333333334, "loss": 11.7098, "num_input_tokens_seen": 57311266, "step": 23 }, { "epoch": 0.02527646129541864, "grad_norm": 1.2922334671020508, "learning_rate": 0.00019166666666666667, "loss": 12.4618, "num_input_tokens_seen": 59782354, "step": 24 }, { "epoch": 0.02632964718272775, "grad_norm": 0.9868028163909912, "learning_rate": 0.0002, "loss": 13.3621, "num_input_tokens_seen": 62186446, "step": 25 }, { "epoch": 0.02738283307003686, "grad_norm": 1.0244539976119995, "learning_rate": 0.00020833333333333335, "loss": 11.9551, "num_input_tokens_seen": 64698904, "step": 26 }, { "epoch": 0.02843601895734597, "grad_norm": 1.2274116277694702, "learning_rate": 0.00021666666666666668, "loss": 12.3889, "num_input_tokens_seen": 67157778, "step": 27 }, { "epoch": 0.02948920484465508, "grad_norm": 1.8787362575531006, "learning_rate": 0.00022500000000000002, "loss": 13.1491, "num_input_tokens_seen": 69536532, "step": 28 }, { "epoch": 0.030542390731964193, "grad_norm": 1.6268733739852905, "learning_rate": 0.00023333333333333333, "loss": 11.4937, "num_input_tokens_seen": 72073830, "step": 29 }, { "epoch": 0.0315955766192733, "grad_norm": 1.0921963453292847, "learning_rate": 0.00024166666666666667, "loss": 13.3898, "num_input_tokens_seen": 74649418, "step": 30 }, { "epoch": 0.03264876250658241, "grad_norm": 0.9047552347183228, "learning_rate": 0.00025, "loss": 13.247, "num_input_tokens_seen": 77194468, "step": 31 }, { "epoch": 0.03370194839389152, "grad_norm": 1.0892783403396606, "learning_rate": 0.00025833333333333334, "loss": 12.8638, "num_input_tokens_seen": 79567360, "step": 32 }, { "epoch": 0.03475513428120063, "grad_norm": 1.2737656831741333, "learning_rate": 0.0002666666666666667, "loss": 11.8823, "num_input_tokens_seen": 81976622, "step": 33 }, { "epoch": 0.03580832016850974, "grad_norm": 1.589759349822998, "learning_rate": 0.000275, "loss": 11.2952, "num_input_tokens_seen": 84456254, "step": 34 }, { "epoch": 0.03686150605581885, "grad_norm": 1.317963719367981, "learning_rate": 0.00028333333333333335, "loss": 13.3181, "num_input_tokens_seen": 86912082, "step": 35 }, { "epoch": 0.037914691943127965, "grad_norm": 1.2985906600952148, "learning_rate": 0.0002916666666666667, "loss": 12.4856, "num_input_tokens_seen": 89296692, "step": 36 }, { "epoch": 0.03896787783043707, "grad_norm": 1.2449119091033936, "learning_rate": 0.0003, "loss": 12.9048, "num_input_tokens_seen": 91786022, "step": 37 }, { "epoch": 0.040021063717746184, "grad_norm": 1.4544183015823364, "learning_rate": 0.00030833333333333337, "loss": 12.4393, "num_input_tokens_seen": 94451030, "step": 38 }, { "epoch": 0.04107424960505529, "grad_norm": 0.9716716408729553, "learning_rate": 0.00031666666666666665, "loss": 13.424, "num_input_tokens_seen": 96817518, "step": 39 }, { "epoch": 0.042127435492364404, "grad_norm": 0.980323076248169, "learning_rate": 0.00032500000000000004, "loss": 11.7307, "num_input_tokens_seen": 99234416, "step": 40 }, { "epoch": 0.04318062137967351, "grad_norm": 1.7067333459854126, "learning_rate": 0.0003333333333333333, "loss": 12.8208, "num_input_tokens_seen": 101710538, "step": 41 }, { "epoch": 0.044233807266982623, "grad_norm": 1.6978358030319214, "learning_rate": 0.00034166666666666666, "loss": 13.5936, "num_input_tokens_seen": 104066772, "step": 42 }, { "epoch": 0.04528699315429173, "grad_norm": 1.54063880443573, "learning_rate": 0.00035, "loss": 12.9598, "num_input_tokens_seen": 106478524, "step": 43 }, { "epoch": 0.04634017904160084, "grad_norm": 1.4603615999221802, "learning_rate": 0.00035833333333333333, "loss": 11.7817, "num_input_tokens_seen": 108918022, "step": 44 }, { "epoch": 0.04739336492890995, "grad_norm": 1.2922568321228027, "learning_rate": 0.00036666666666666667, "loss": 13.9764, "num_input_tokens_seen": 111331584, "step": 45 }, { "epoch": 0.04844655081621906, "grad_norm": 0.7605544924736023, "learning_rate": 0.000375, "loss": 12.7903, "num_input_tokens_seen": 113804550, "step": 46 }, { "epoch": 0.049499736703528176, "grad_norm": 1.6951416730880737, "learning_rate": 0.00038333333333333334, "loss": 14.0389, "num_input_tokens_seen": 116406598, "step": 47 }, { "epoch": 0.05055292259083728, "grad_norm": 1.4952235221862793, "learning_rate": 0.0003916666666666667, "loss": 11.5827, "num_input_tokens_seen": 118834762, "step": 48 }, { "epoch": 0.051606108478146395, "grad_norm": 1.1292210817337036, "learning_rate": 0.0004, "loss": 12.5723, "num_input_tokens_seen": 121241054, "step": 49 }, { "epoch": 0.0526592943654555, "grad_norm": 1.3782143592834473, "learning_rate": 0.00040833333333333336, "loss": 11.5477, "num_input_tokens_seen": 123714340, "step": 50 }, { "epoch": 0.053712480252764615, "grad_norm": 1.3731465339660645, "learning_rate": 0.0004166666666666667, "loss": 11.4992, "num_input_tokens_seen": 126399394, "step": 51 }, { "epoch": 0.05476566614007372, "grad_norm": 1.0577400922775269, "learning_rate": 0.000425, "loss": 12.9262, "num_input_tokens_seen": 128815712, "step": 52 }, { "epoch": 0.055818852027382834, "grad_norm": 0.9413532018661499, "learning_rate": 0.00043333333333333337, "loss": 11.2172, "num_input_tokens_seen": 131243982, "step": 53 }, { "epoch": 0.05687203791469194, "grad_norm": 1.6401008367538452, "learning_rate": 0.00044166666666666665, "loss": 13.5043, "num_input_tokens_seen": 133695752, "step": 54 }, { "epoch": 0.057925223802001054, "grad_norm": 1.1288220882415771, "learning_rate": 0.00045000000000000004, "loss": 13.8469, "num_input_tokens_seen": 136206586, "step": 55 }, { "epoch": 0.05897840968931016, "grad_norm": 1.6782962083816528, "learning_rate": 0.0004583333333333333, "loss": 14.2535, "num_input_tokens_seen": 138588134, "step": 56 }, { "epoch": 0.06003159557661927, "grad_norm": 1.3746291399002075, "learning_rate": 0.00046666666666666666, "loss": 11.6482, "num_input_tokens_seen": 141117190, "step": 57 }, { "epoch": 0.061084781463928386, "grad_norm": 1.0187997817993164, "learning_rate": 0.000475, "loss": 13.7658, "num_input_tokens_seen": 143491738, "step": 58 }, { "epoch": 0.06213796735123749, "grad_norm": 1.1370724439620972, "learning_rate": 0.00048333333333333334, "loss": 13.1967, "num_input_tokens_seen": 145991144, "step": 59 }, { "epoch": 0.0631911532385466, "grad_norm": 1.401911973953247, "learning_rate": 0.0004916666666666666, "loss": 11.8345, "num_input_tokens_seen": 148486356, "step": 60 }, { "epoch": 0.06424433912585571, "grad_norm": 1.3002967834472656, "learning_rate": 0.0005, "loss": 12.5778, "num_input_tokens_seen": 150891696, "step": 61 }, { "epoch": 0.06529752501316483, "grad_norm": 1.8764727115631104, "learning_rate": 0.0004999785818956435, "loss": 13.3194, "num_input_tokens_seen": 153414658, "step": 62 }, { "epoch": 0.06635071090047394, "grad_norm": 1.4269951581954956, "learning_rate": 0.0004999143312524562, "loss": 11.347, "num_input_tokens_seen": 155793700, "step": 63 }, { "epoch": 0.06740389678778304, "grad_norm": 1.2144259214401245, "learning_rate": 0.0004998072590794548, "loss": 12.3637, "num_input_tokens_seen": 158248338, "step": 64 }, { "epoch": 0.06845708267509215, "grad_norm": 1.1703037023544312, "learning_rate": 0.000499657383722905, "loss": 12.3153, "num_input_tokens_seen": 160737224, "step": 65 }, { "epoch": 0.06951026856240126, "grad_norm": 0.9227648973464966, "learning_rate": 0.0004994647308631777, "loss": 12.247, "num_input_tokens_seen": 163317772, "step": 66 }, { "epoch": 0.07056345444971038, "grad_norm": 1.1137187480926514, "learning_rate": 0.0004992293335103487, "loss": 12.8889, "num_input_tokens_seen": 165797524, "step": 67 }, { "epoch": 0.07161664033701948, "grad_norm": 1.0727897882461548, "learning_rate": 0.0004989512319985422, "loss": 11.9518, "num_input_tokens_seen": 168202674, "step": 68 }, { "epoch": 0.07266982622432859, "grad_norm": 1.1321420669555664, "learning_rate": 0.000498630473979021, "loss": 11.6875, "num_input_tokens_seen": 170618878, "step": 69 }, { "epoch": 0.0737230121116377, "grad_norm": 1.3855392932891846, "learning_rate": 0.0004982671144120202, "loss": 11.4171, "num_input_tokens_seen": 173023976, "step": 70 }, { "epoch": 0.07477619799894682, "grad_norm": 0.8411357998847961, "learning_rate": 0.0004978612155573311, "loss": 13.1131, "num_input_tokens_seen": 175423500, "step": 71 }, { "epoch": 0.07582938388625593, "grad_norm": 1.355546236038208, "learning_rate": 0.0004974128469636329, "loss": 11.2159, "num_input_tokens_seen": 177865696, "step": 72 }, { "epoch": 0.07688256977356503, "grad_norm": 1.486307144165039, "learning_rate": 0.000496922085456576, "loss": 12.4092, "num_input_tokens_seen": 180215372, "step": 73 }, { "epoch": 0.07793575566087414, "grad_norm": 0.969342827796936, "learning_rate": 0.0004963890151256181, "loss": 12.625, "num_input_tokens_seen": 182660290, "step": 74 }, { "epoch": 0.07898894154818326, "grad_norm": 1.078611135482788, "learning_rate": 0.000495813727309616, "loss": 13.0958, "num_input_tokens_seen": 185180996, "step": 75 }, { "epoch": 0.08004212743549237, "grad_norm": 1.5402764081954956, "learning_rate": 0.0004951963205811756, "loss": 11.3179, "num_input_tokens_seen": 187704644, "step": 76 }, { "epoch": 0.08109531332280147, "grad_norm": 1.763607382774353, "learning_rate": 0.0004945369007297615, "loss": 12.3761, "num_input_tokens_seen": 190082764, "step": 77 }, { "epoch": 0.08214849921011058, "grad_norm": 2.118359327316284, "learning_rate": 0.0004938355807435702, "loss": 12.8066, "num_input_tokens_seen": 192368804, "step": 78 }, { "epoch": 0.0832016850974197, "grad_norm": 1.365455150604248, "learning_rate": 0.0004930924807901711, "loss": 11.9209, "num_input_tokens_seen": 194857438, "step": 79 }, { "epoch": 0.08425487098472881, "grad_norm": 1.2654019594192505, "learning_rate": 0.0004923077281959159, "loss": 12.3324, "num_input_tokens_seen": 197152948, "step": 80 }, { "epoch": 0.08530805687203792, "grad_norm": 1.1840094327926636, "learning_rate": 0.0004914814574241215, "loss": 12.4565, "num_input_tokens_seen": 199500708, "step": 81 }, { "epoch": 0.08636124275934702, "grad_norm": 1.177915096282959, "learning_rate": 0.0004906138100520309, "loss": 11.576, "num_input_tokens_seen": 201991874, "step": 82 }, { "epoch": 0.08741442864665613, "grad_norm": 0.7854015827178955, "learning_rate": 0.0004897049347465549, "loss": 12.9953, "num_input_tokens_seen": 204251674, "step": 83 }, { "epoch": 0.08846761453396525, "grad_norm": 0.937538206577301, "learning_rate": 0.0004887549872387981, "loss": 12.4283, "num_input_tokens_seen": 206656880, "step": 84 }, { "epoch": 0.08952080042127436, "grad_norm": 0.8664125800132751, "learning_rate": 0.0004877641302973755, "loss": 13.17, "num_input_tokens_seen": 208918254, "step": 85 }, { "epoch": 0.09057398630858346, "grad_norm": 1.2139054536819458, "learning_rate": 0.0004867325337005232, "loss": 12.2815, "num_input_tokens_seen": 211434142, "step": 86 }, { "epoch": 0.09162717219589257, "grad_norm": 1.1984803676605225, "learning_rate": 0.00048566037420700735, "loss": 12.6045, "num_input_tokens_seen": 214000898, "step": 87 }, { "epoch": 0.09268035808320169, "grad_norm": 1.8322398662567139, "learning_rate": 0.0004845478355258377, "loss": 12.4396, "num_input_tokens_seen": 216309292, "step": 88 }, { "epoch": 0.0937335439705108, "grad_norm": 1.363206386566162, "learning_rate": 0.0004833951082847898, "loss": 12.4456, "num_input_tokens_seen": 218756170, "step": 89 }, { "epoch": 0.0947867298578199, "grad_norm": 1.314345121383667, "learning_rate": 0.00048220238999774226, "loss": 12.2991, "num_input_tokens_seen": 221193636, "step": 90 }, { "epoch": 0.09583991574512901, "grad_norm": 1.163728952407837, "learning_rate": 0.0004809698850308334, "loss": 13.0735, "num_input_tokens_seen": 223524870, "step": 91 }, { "epoch": 0.09689310163243813, "grad_norm": 0.991970956325531, "learning_rate": 0.00047969780456744436, "loss": 10.4714, "num_input_tokens_seen": 226066666, "step": 92 }, { "epoch": 0.09794628751974724, "grad_norm": 1.0605396032333374, "learning_rate": 0.0004783863665720137, "loss": 12.0203, "num_input_tokens_seen": 228525260, "step": 93 }, { "epoch": 0.09899947340705635, "grad_norm": 1.0144538879394531, "learning_rate": 0.000477035795752691, "loss": 12.1258, "num_input_tokens_seen": 231156516, "step": 94 }, { "epoch": 0.10005265929436545, "grad_norm": 0.8651906251907349, "learning_rate": 0.0004756463235228331, "loss": 13.4279, "num_input_tokens_seen": 233846560, "step": 95 }, { "epoch": 0.10110584518167456, "grad_norm": 1.388136625289917, "learning_rate": 0.0004742181879613535, "loss": 11.5721, "num_input_tokens_seen": 236170696, "step": 96 }, { "epoch": 0.10215903106898368, "grad_norm": 1.3920540809631348, "learning_rate": 0.00047275163377192886, "loss": 12.037, "num_input_tokens_seen": 238549390, "step": 97 }, { "epoch": 0.10321221695629279, "grad_norm": 0.9544283747673035, "learning_rate": 0.0004712469122410695, "loss": 13.2605, "num_input_tokens_seen": 240990068, "step": 98 }, { "epoch": 0.10426540284360189, "grad_norm": 1.0330355167388916, "learning_rate": 0.00046970428119506353, "loss": 12.4961, "num_input_tokens_seen": 243394892, "step": 99 }, { "epoch": 0.105318588730911, "grad_norm": 0.9636231064796448, "learning_rate": 0.0004681240049557991, "loss": 11.0471, "num_input_tokens_seen": 245927990, "step": 100 }, { "epoch": 0.10637177461822012, "grad_norm": 1.4202877283096313, "learning_rate": 0.0004665063542954746, "loss": 9.9291, "num_input_tokens_seen": 248369144, "step": 101 }, { "epoch": 0.10742496050552923, "grad_norm": 1.065412998199463, "learning_rate": 0.00046485160639020293, "loss": 12.6054, "num_input_tokens_seen": 250819650, "step": 102 }, { "epoch": 0.10847814639283834, "grad_norm": 0.7211883068084717, "learning_rate": 0.0004631600447725189, "loss": 14.1816, "num_input_tokens_seen": 253219186, "step": 103 }, { "epoch": 0.10953133228014744, "grad_norm": 0.8114559650421143, "learning_rate": 0.0004614319592827978, "loss": 13.6071, "num_input_tokens_seen": 255671766, "step": 104 }, { "epoch": 0.11058451816745656, "grad_norm": 0.8580276966094971, "learning_rate": 0.0004596676460195918, "loss": 12.8117, "num_input_tokens_seen": 258276528, "step": 105 }, { "epoch": 0.11163770405476567, "grad_norm": 1.31855309009552, "learning_rate": 0.000457867407288896, "loss": 12.6043, "num_input_tokens_seen": 260625306, "step": 106 }, { "epoch": 0.11269088994207478, "grad_norm": 0.7763041257858276, "learning_rate": 0.0004560315515523492, "loss": 11.8848, "num_input_tokens_seen": 263021648, "step": 107 }, { "epoch": 0.11374407582938388, "grad_norm": 0.9121378064155579, "learning_rate": 0.00045416039337438087, "loss": 12.6511, "num_input_tokens_seen": 265467338, "step": 108 }, { "epoch": 0.114797261716693, "grad_norm": 0.7179497480392456, "learning_rate": 0.000452254253368312, "loss": 12.6821, "num_input_tokens_seen": 267981590, "step": 109 }, { "epoch": 0.11585044760400211, "grad_norm": 0.9322866797447205, "learning_rate": 0.0004503134581414198, "loss": 11.4214, "num_input_tokens_seen": 270308658, "step": 110 }, { "epoch": 0.11690363349131122, "grad_norm": 0.8432686924934387, "learning_rate": 0.0004483383402389753, "loss": 12.6013, "num_input_tokens_seen": 272795368, "step": 111 }, { "epoch": 0.11795681937862032, "grad_norm": 0.9262714982032776, "learning_rate": 0.00044632923808726293, "loss": 12.9433, "num_input_tokens_seen": 275169288, "step": 112 }, { "epoch": 0.11901000526592943, "grad_norm": 0.7486256957054138, "learning_rate": 0.00044428649593559365, "loss": 13.8824, "num_input_tokens_seen": 277598550, "step": 113 }, { "epoch": 0.12006319115323855, "grad_norm": 0.5905117392539978, "learning_rate": 0.0004422104637973191, "loss": 13.0405, "num_input_tokens_seen": 280073876, "step": 114 }, { "epoch": 0.12111637704054766, "grad_norm": 1.0221725702285767, "learning_rate": 0.0004401014973898586, "loss": 12.4056, "num_input_tokens_seen": 282587368, "step": 115 }, { "epoch": 0.12216956292785677, "grad_norm": 1.0212949514389038, "learning_rate": 0.00043795995807374916, "loss": 12.4775, "num_input_tokens_seen": 285113802, "step": 116 }, { "epoch": 0.12322274881516587, "grad_norm": 1.2418874502182007, "learning_rate": 0.00043578621279072793, "loss": 10.0364, "num_input_tokens_seen": 287411358, "step": 117 }, { "epoch": 0.12427593470247499, "grad_norm": 0.9875852465629578, "learning_rate": 0.0004335806340008587, "loss": 11.6843, "num_input_tokens_seen": 289841892, "step": 118 }, { "epoch": 0.1253291205897841, "grad_norm": 1.0161254405975342, "learning_rate": 0.0004313435996187126, "loss": 11.4954, "num_input_tokens_seen": 292269164, "step": 119 }, { "epoch": 0.1263823064770932, "grad_norm": 0.7848489284515381, "learning_rate": 0.00042907549294861504, "loss": 13.5343, "num_input_tokens_seen": 294553234, "step": 120 }, { "epoch": 0.12743549236440233, "grad_norm": 0.8620059490203857, "learning_rate": 0.0004267767026189673, "loss": 12.5524, "num_input_tokens_seen": 297047120, "step": 121 }, { "epoch": 0.12848867825171142, "grad_norm": 0.9458215832710266, "learning_rate": 0.00042444762251565854, "loss": 10.9325, "num_input_tokens_seen": 299619262, "step": 122 }, { "epoch": 0.12954186413902052, "grad_norm": 0.6834012866020203, "learning_rate": 0.0004220886517145741, "loss": 12.1203, "num_input_tokens_seen": 302193484, "step": 123 }, { "epoch": 0.13059505002632965, "grad_norm": 0.8515812158584595, "learning_rate": 0.0004197001944132168, "loss": 13.1872, "num_input_tokens_seen": 304771644, "step": 124 }, { "epoch": 0.13164823591363875, "grad_norm": 0.7977530360221863, "learning_rate": 0.00041728265986144944, "loss": 13.1917, "num_input_tokens_seen": 307154770, "step": 125 }, { "epoch": 0.13270142180094788, "grad_norm": 1.0813367366790771, "learning_rate": 0.0004148364622913718, "loss": 12.1914, "num_input_tokens_seen": 309623990, "step": 126 }, { "epoch": 0.13375460768825698, "grad_norm": 0.823809027671814, "learning_rate": 0.00041236202084634466, "loss": 11.1064, "num_input_tokens_seen": 312189822, "step": 127 }, { "epoch": 0.13480779357556608, "grad_norm": 0.6433836817741394, "learning_rate": 0.00040985975950917115, "loss": 12.066, "num_input_tokens_seen": 314481274, "step": 128 }, { "epoch": 0.1358609794628752, "grad_norm": 0.7915118932723999, "learning_rate": 0.0004073301070294496, "loss": 12.0873, "num_input_tokens_seen": 316975100, "step": 129 }, { "epoch": 0.1369141653501843, "grad_norm": 0.7420702576637268, "learning_rate": 0.0004047734968501098, "loss": 12.9072, "num_input_tokens_seen": 319629918, "step": 130 }, { "epoch": 0.13796735123749343, "grad_norm": 1.0302780866622925, "learning_rate": 0.0004021903670331444, "loss": 10.6858, "num_input_tokens_seen": 322045726, "step": 131 }, { "epoch": 0.13902053712480253, "grad_norm": 0.9391334652900696, "learning_rate": 0.00039958116018454974, "loss": 12.0805, "num_input_tokens_seen": 324422858, "step": 132 }, { "epoch": 0.14007372301211163, "grad_norm": 0.834057629108429, "learning_rate": 0.000396946323378487, "loss": 11.8058, "num_input_tokens_seen": 326675990, "step": 133 }, { "epoch": 0.14112690889942076, "grad_norm": 0.8652845025062561, "learning_rate": 0.0003942863080806787, "loss": 11.8927, "num_input_tokens_seen": 329185696, "step": 134 }, { "epoch": 0.14218009478672985, "grad_norm": 0.6714434027671814, "learning_rate": 0.0003916015700710523, "loss": 12.5655, "num_input_tokens_seen": 331756854, "step": 135 }, { "epoch": 0.14323328067403895, "grad_norm": 0.7775278091430664, "learning_rate": 0.0003888925693656447, "loss": 11.3614, "num_input_tokens_seen": 334321854, "step": 136 }, { "epoch": 0.14428646656134808, "grad_norm": 0.6510213613510132, "learning_rate": 0.00038615977013778093, "loss": 13.0796, "num_input_tokens_seen": 336876626, "step": 137 }, { "epoch": 0.14533965244865718, "grad_norm": 0.8339363932609558, "learning_rate": 0.00038340364063854, "loss": 10.8552, "num_input_tokens_seen": 339348464, "step": 138 }, { "epoch": 0.1463928383359663, "grad_norm": 0.7218100428581238, "learning_rate": 0.0003806246531165231, "loss": 13.2681, "num_input_tokens_seen": 341823736, "step": 139 }, { "epoch": 0.1474460242232754, "grad_norm": 0.6435384154319763, "learning_rate": 0.0003778232837369358, "loss": 10.819, "num_input_tokens_seen": 344380626, "step": 140 }, { "epoch": 0.1484992101105845, "grad_norm": 0.8053638339042664, "learning_rate": 0.0003750000125, "loss": 13.6279, "num_input_tokens_seen": 346692052, "step": 141 }, { "epoch": 0.14955239599789363, "grad_norm": 0.9220738410949707, "learning_rate": 0.00037215532315870774, "loss": 13.3008, "num_input_tokens_seen": 349112532, "step": 142 }, { "epoch": 0.15060558188520273, "grad_norm": 0.8803617358207703, "learning_rate": 0.00036928970313593307, "loss": 12.9177, "num_input_tokens_seen": 351452022, "step": 143 }, { "epoch": 0.15165876777251186, "grad_norm": 0.7481730580329895, "learning_rate": 0.00036640364344091487, "loss": 11.0708, "num_input_tokens_seen": 353797086, "step": 144 }, { "epoch": 0.15271195365982096, "grad_norm": 0.7051401734352112, "learning_rate": 0.0003634976385851242, "loss": 12.2188, "num_input_tokens_seen": 356162002, "step": 145 }, { "epoch": 0.15376513954713006, "grad_norm": 0.7327920794487, "learning_rate": 0.0003605721864975331, "loss": 12.7737, "num_input_tokens_seen": 358794470, "step": 146 }, { "epoch": 0.15481832543443919, "grad_norm": 0.6063867211341858, "learning_rate": 0.0003576277884392964, "loss": 12.7589, "num_input_tokens_seen": 361060330, "step": 147 }, { "epoch": 0.15587151132174829, "grad_norm": 0.7477701902389526, "learning_rate": 0.0003546649489178636, "loss": 11.6398, "num_input_tokens_seen": 363503210, "step": 148 }, { "epoch": 0.1569246972090574, "grad_norm": 0.6183655858039856, "learning_rate": 0.000351684175600534, "loss": 12.0902, "num_input_tokens_seen": 365979336, "step": 149 }, { "epoch": 0.1579778830963665, "grad_norm": 0.8521149754524231, "learning_rate": 0.0003486859792274704, "loss": 12.2331, "num_input_tokens_seen": 368459936, "step": 150 }, { "epoch": 0.1590310689836756, "grad_norm": 0.6342733502388, "learning_rate": 0.00034567087352418665, "loss": 11.2938, "num_input_tokens_seen": 370972514, "step": 151 }, { "epoch": 0.16008425487098474, "grad_norm": 0.7194718718528748, "learning_rate": 0.00034263937511352314, "loss": 12.5729, "num_input_tokens_seen": 373538886, "step": 152 }, { "epoch": 0.16113744075829384, "grad_norm": 0.6463214755058289, "learning_rate": 0.00033959200342712626, "loss": 11.4804, "num_input_tokens_seen": 375997348, "step": 153 }, { "epoch": 0.16219062664560294, "grad_norm": 0.614433765411377, "learning_rate": 0.0003365292806164468, "loss": 13.3548, "num_input_tokens_seen": 378368452, "step": 154 }, { "epoch": 0.16324381253291206, "grad_norm": 0.7338926196098328, "learning_rate": 0.0003334517314632712, "loss": 12.9609, "num_input_tokens_seen": 380908798, "step": 155 }, { "epoch": 0.16429699842022116, "grad_norm": 0.6998718976974487, "learning_rate": 0.0003303598832898038, "loss": 12.9804, "num_input_tokens_seen": 383363952, "step": 156 }, { "epoch": 0.1653501843075303, "grad_norm": 0.8167688250541687, "learning_rate": 0.00032725426586831203, "loss": 11.7623, "num_input_tokens_seen": 385820504, "step": 157 }, { "epoch": 0.1664033701948394, "grad_norm": 0.7050719261169434, "learning_rate": 0.0003241354113303533, "loss": 13.1215, "num_input_tokens_seen": 388280752, "step": 158 }, { "epoch": 0.1674565560821485, "grad_norm": 0.5963127017021179, "learning_rate": 0.0003210038540755971, "loss": 12.8576, "num_input_tokens_seen": 390877896, "step": 159 }, { "epoch": 0.16850974196945762, "grad_norm": 0.6776026487350464, "learning_rate": 0.0003178601306802573, "loss": 12.7107, "num_input_tokens_seen": 393477216, "step": 160 }, { "epoch": 0.16956292785676672, "grad_norm": 0.45999521017074585, "learning_rate": 0.00031470477980515406, "loss": 10.5883, "num_input_tokens_seen": 396021712, "step": 161 }, { "epoch": 0.17061611374407584, "grad_norm": 0.7179697751998901, "learning_rate": 0.00031153834210341595, "loss": 11.4379, "num_input_tokens_seen": 398528988, "step": 162 }, { "epoch": 0.17166929963138494, "grad_norm": 0.5782619118690491, "learning_rate": 0.00030836136012784226, "loss": 12.687, "num_input_tokens_seen": 401033514, "step": 163 }, { "epoch": 0.17272248551869404, "grad_norm": 0.6894981861114502, "learning_rate": 0.00030517437823793947, "loss": 12.1689, "num_input_tokens_seen": 403501710, "step": 164 }, { "epoch": 0.17377567140600317, "grad_norm": 0.5236096978187561, "learning_rate": 0.00030197794250664753, "loss": 11.87, "num_input_tokens_seen": 405903642, "step": 165 }, { "epoch": 0.17482885729331227, "grad_norm": 0.5555451512336731, "learning_rate": 0.000298772600626774, "loss": 12.8906, "num_input_tokens_seen": 408463312, "step": 166 }, { "epoch": 0.17588204318062137, "grad_norm": 0.64606773853302, "learning_rate": 0.0002955589018171488, "loss": 12.867, "num_input_tokens_seen": 410857116, "step": 167 }, { "epoch": 0.1769352290679305, "grad_norm": 0.5008332133293152, "learning_rate": 0.0002923373967285185, "loss": 11.5597, "num_input_tokens_seen": 413293324, "step": 168 }, { "epoch": 0.1779884149552396, "grad_norm": 0.7233253121376038, "learning_rate": 0.00028910863734919615, "loss": 12.722, "num_input_tokens_seen": 415708708, "step": 169 }, { "epoch": 0.17904160084254872, "grad_norm": 0.44205302000045776, "learning_rate": 0.0002858731769104793, "loss": 13.1558, "num_input_tokens_seen": 418090970, "step": 170 }, { "epoch": 0.18009478672985782, "grad_norm": 0.8143755197525024, "learning_rate": 0.0002826315697918581, "loss": 10.9906, "num_input_tokens_seen": 420536658, "step": 171 }, { "epoch": 0.18114797261716692, "grad_norm": 0.7114960551261902, "learning_rate": 0.0002793843714260245, "loss": 13.044, "num_input_tokens_seen": 423068800, "step": 172 }, { "epoch": 0.18220115850447605, "grad_norm": 0.6115074753761292, "learning_rate": 0.0002761321382037018, "loss": 12.9103, "num_input_tokens_seen": 425683216, "step": 173 }, { "epoch": 0.18325434439178515, "grad_norm": 0.7895613312721252, "learning_rate": 0.00027287542737831016, "loss": 13.4783, "num_input_tokens_seen": 428019392, "step": 174 }, { "epoch": 0.18430753027909427, "grad_norm": 0.6167088150978088, "learning_rate": 0.00026961479697048385, "loss": 12.899, "num_input_tokens_seen": 430450162, "step": 175 }, { "epoch": 0.18536071616640337, "grad_norm": 0.5713215470314026, "learning_rate": 0.00026635080567245756, "loss": 13.2827, "num_input_tokens_seen": 432988892, "step": 176 }, { "epoch": 0.18641390205371247, "grad_norm": 0.5876232385635376, "learning_rate": 0.00026308401275233707, "loss": 11.3902, "num_input_tokens_seen": 435403104, "step": 177 }, { "epoch": 0.1874670879410216, "grad_norm": 0.49001753330230713, "learning_rate": 0.00025981497795827174, "loss": 12.9562, "num_input_tokens_seen": 437894208, "step": 178 }, { "epoch": 0.1885202738283307, "grad_norm": 0.6338276267051697, "learning_rate": 0.0002565442614225446, "loss": 12.074, "num_input_tokens_seen": 440361302, "step": 179 }, { "epoch": 0.1895734597156398, "grad_norm": 0.6284471154212952, "learning_rate": 0.0002532724235655962, "loss": 11.4969, "num_input_tokens_seen": 442772076, "step": 180 }, { "epoch": 0.19062664560294892, "grad_norm": 0.47973552346229553, "learning_rate": 0.000250000025, "loss": 12.1314, "num_input_tokens_seen": 445142352, "step": 181 }, { "epoch": 0.19167983149025802, "grad_norm": 0.6293119192123413, "learning_rate": 0.00024672762643440383, "loss": 11.6049, "num_input_tokens_seen": 447635106, "step": 182 }, { "epoch": 0.19273301737756715, "grad_norm": 0.5318001508712769, "learning_rate": 0.00024345578857745548, "loss": 12.5502, "num_input_tokens_seen": 449969556, "step": 183 }, { "epoch": 0.19378620326487625, "grad_norm": 0.7586023807525635, "learning_rate": 0.00024018507204172831, "loss": 11.5975, "num_input_tokens_seen": 452367082, "step": 184 }, { "epoch": 0.19483938915218535, "grad_norm": 0.7116125226020813, "learning_rate": 0.00023691603724766298, "loss": 12.029, "num_input_tokens_seen": 454829434, "step": 185 }, { "epoch": 0.19589257503949448, "grad_norm": 0.4598511755466461, "learning_rate": 0.00023364924432754246, "loss": 12.2994, "num_input_tokens_seen": 457344632, "step": 186 }, { "epoch": 0.19694576092680358, "grad_norm": 0.6228759288787842, "learning_rate": 0.0002303852530295162, "loss": 12.8659, "num_input_tokens_seen": 459728304, "step": 187 }, { "epoch": 0.1979989468141127, "grad_norm": 0.48603731393814087, "learning_rate": 0.0002271246226216899, "loss": 12.9379, "num_input_tokens_seen": 462371360, "step": 188 }, { "epoch": 0.1990521327014218, "grad_norm": 0.900370180606842, "learning_rate": 0.00022386791179629828, "loss": 11.9699, "num_input_tokens_seen": 464826406, "step": 189 }, { "epoch": 0.2001053185887309, "grad_norm": 0.6346839070320129, "learning_rate": 0.0002206156785739756, "loss": 11.3938, "num_input_tokens_seen": 467463708, "step": 190 }, { "epoch": 0.20115850447604003, "grad_norm": 0.6781347393989563, "learning_rate": 0.00021736848020814198, "loss": 12.586, "num_input_tokens_seen": 469992746, "step": 191 }, { "epoch": 0.20221169036334913, "grad_norm": 0.5637754201889038, "learning_rate": 0.00021412687308952077, "loss": 11.6584, "num_input_tokens_seen": 472498546, "step": 192 }, { "epoch": 0.20326487625065823, "grad_norm": 0.8815531134605408, "learning_rate": 0.00021089141265080388, "loss": 11.8585, "num_input_tokens_seen": 474859284, "step": 193 }, { "epoch": 0.20431806213796735, "grad_norm": 0.46758437156677246, "learning_rate": 0.00020766265327148146, "loss": 11.9465, "num_input_tokens_seen": 477239588, "step": 194 }, { "epoch": 0.20537124802527645, "grad_norm": 0.5134908556938171, "learning_rate": 0.00020444114818285127, "loss": 13.2315, "num_input_tokens_seen": 479559702, "step": 195 }, { "epoch": 0.20642443391258558, "grad_norm": 0.5208679437637329, "learning_rate": 0.00020122744937322602, "loss": 11.7135, "num_input_tokens_seen": 482139806, "step": 196 }, { "epoch": 0.20747761979989468, "grad_norm": 0.6516356468200684, "learning_rate": 0.0001980221074933525, "loss": 12.4686, "num_input_tokens_seen": 484606370, "step": 197 }, { "epoch": 0.20853080568720378, "grad_norm": 0.6271363496780396, "learning_rate": 0.00019482567176206064, "loss": 12.2299, "num_input_tokens_seen": 487088894, "step": 198 }, { "epoch": 0.2095839915745129, "grad_norm": 0.5966213345527649, "learning_rate": 0.00019163868987215785, "loss": 13.3167, "num_input_tokens_seen": 489502024, "step": 199 }, { "epoch": 0.210637177461822, "grad_norm": 0.5066636204719543, "learning_rate": 0.0001884617078965841, "loss": 12.0035, "num_input_tokens_seen": 491847958, "step": 200 }, { "epoch": 0.21169036334913113, "grad_norm": 0.5119277834892273, "learning_rate": 0.00018529527019484594, "loss": 12.021, "num_input_tokens_seen": 494483540, "step": 201 }, { "epoch": 0.21274354923644023, "grad_norm": 0.6364587545394897, "learning_rate": 0.00018213991931974273, "loss": 12.1343, "num_input_tokens_seen": 496784334, "step": 202 }, { "epoch": 0.21379673512374933, "grad_norm": 0.6135685443878174, "learning_rate": 0.00017899619592440298, "loss": 12.9802, "num_input_tokens_seen": 499122570, "step": 203 }, { "epoch": 0.21484992101105846, "grad_norm": 0.6455655694007874, "learning_rate": 0.00017586463866964668, "loss": 13.0798, "num_input_tokens_seen": 501601060, "step": 204 }, { "epoch": 0.21590310689836756, "grad_norm": 0.5825475454330444, "learning_rate": 0.00017274578413168805, "loss": 11.63, "num_input_tokens_seen": 504016506, "step": 205 }, { "epoch": 0.21695629278567669, "grad_norm": 0.6855435967445374, "learning_rate": 0.0001696401667101963, "loss": 12.1283, "num_input_tokens_seen": 506604150, "step": 206 }, { "epoch": 0.21800947867298578, "grad_norm": 0.5899667143821716, "learning_rate": 0.00016654831853672876, "loss": 12.7442, "num_input_tokens_seen": 509053728, "step": 207 }, { "epoch": 0.21906266456029488, "grad_norm": 0.7350984811782837, "learning_rate": 0.00016347076938355316, "loss": 12.8581, "num_input_tokens_seen": 511567478, "step": 208 }, { "epoch": 0.220115850447604, "grad_norm": 0.504421591758728, "learning_rate": 0.0001604080465728737, "loss": 11.274, "num_input_tokens_seen": 514136400, "step": 209 }, { "epoch": 0.2211690363349131, "grad_norm": 0.5333490371704102, "learning_rate": 0.00015736067488647686, "loss": 11.9151, "num_input_tokens_seen": 516530356, "step": 210 }, { "epoch": 0.2222222222222222, "grad_norm": 0.9190953969955444, "learning_rate": 0.00015432917647581338, "loss": 13.1548, "num_input_tokens_seen": 519028524, "step": 211 }, { "epoch": 0.22327540810953134, "grad_norm": 0.5969712138175964, "learning_rate": 0.00015131407077252965, "loss": 13.2013, "num_input_tokens_seen": 521436334, "step": 212 }, { "epoch": 0.22432859399684044, "grad_norm": 0.6031188368797302, "learning_rate": 0.0001483158743994661, "loss": 10.9069, "num_input_tokens_seen": 524024802, "step": 213 }, { "epoch": 0.22538177988414956, "grad_norm": 0.4547601640224457, "learning_rate": 0.0001453351010821365, "loss": 12.0999, "num_input_tokens_seen": 526580534, "step": 214 }, { "epoch": 0.22643496577145866, "grad_norm": 0.5760593414306641, "learning_rate": 0.0001423722615607036, "loss": 11.99, "num_input_tokens_seen": 529062886, "step": 215 }, { "epoch": 0.22748815165876776, "grad_norm": 0.44997331500053406, "learning_rate": 0.000139427863502467, "loss": 12.4474, "num_input_tokens_seen": 531535650, "step": 216 }, { "epoch": 0.2285413375460769, "grad_norm": 0.6070464849472046, "learning_rate": 0.00013650241141487582, "loss": 11.5504, "num_input_tokens_seen": 534061814, "step": 217 }, { "epoch": 0.229594523433386, "grad_norm": 0.6833432912826538, "learning_rate": 0.00013359640655908516, "loss": 13.798, "num_input_tokens_seen": 536687100, "step": 218 }, { "epoch": 0.23064770932069512, "grad_norm": 0.8076875805854797, "learning_rate": 0.0001307103468640669, "loss": 11.5622, "num_input_tokens_seen": 539150044, "step": 219 }, { "epoch": 0.23170089520800422, "grad_norm": 0.7293125987052917, "learning_rate": 0.0001278447268412924, "loss": 11.2022, "num_input_tokens_seen": 541648322, "step": 220 }, { "epoch": 0.23275408109531331, "grad_norm": 0.6351690292358398, "learning_rate": 0.00012500003750000004, "loss": 12.3972, "num_input_tokens_seen": 544168352, "step": 221 }, { "epoch": 0.23380726698262244, "grad_norm": 0.5785694718360901, "learning_rate": 0.00012217676626306417, "loss": 11.8149, "num_input_tokens_seen": 546741482, "step": 222 }, { "epoch": 0.23486045286993154, "grad_norm": 0.674688994884491, "learning_rate": 0.00011937539688347693, "loss": 12.0269, "num_input_tokens_seen": 549104356, "step": 223 }, { "epoch": 0.23591363875724064, "grad_norm": 0.5755442976951599, "learning_rate": 0.00011659640936146005, "loss": 13.1861, "num_input_tokens_seen": 551429474, "step": 224 }, { "epoch": 0.23696682464454977, "grad_norm": 0.553242027759552, "learning_rate": 0.00011384027986221911, "loss": 12.3476, "num_input_tokens_seen": 553859868, "step": 225 }, { "epoch": 0.23802001053185887, "grad_norm": 0.5550587773323059, "learning_rate": 0.00011110748063435535, "loss": 10.3826, "num_input_tokens_seen": 556346556, "step": 226 }, { "epoch": 0.239073196419168, "grad_norm": 0.4513484835624695, "learning_rate": 0.00010839847992894778, "loss": 13.194, "num_input_tokens_seen": 558924540, "step": 227 }, { "epoch": 0.2401263823064771, "grad_norm": 0.6309003829956055, "learning_rate": 0.00010571374191932138, "loss": 11.8884, "num_input_tokens_seen": 561327442, "step": 228 }, { "epoch": 0.2411795681937862, "grad_norm": 0.7048976421356201, "learning_rate": 0.00010305372662151306, "loss": 11.7711, "num_input_tokens_seen": 563837514, "step": 229 }, { "epoch": 0.24223275408109532, "grad_norm": 0.4465177655220032, "learning_rate": 0.00010041888981545026, "loss": 12.5637, "num_input_tokens_seen": 566302658, "step": 230 }, { "epoch": 0.24328593996840442, "grad_norm": 0.5073559880256653, "learning_rate": 9.780968296685557e-05, "loss": 11.6272, "num_input_tokens_seen": 568934814, "step": 231 }, { "epoch": 0.24433912585571355, "grad_norm": 0.6577916145324707, "learning_rate": 9.522655314989022e-05, "loss": 11.5156, "num_input_tokens_seen": 571237934, "step": 232 }, { "epoch": 0.24539231174302265, "grad_norm": 0.6333816647529602, "learning_rate": 9.266994297055047e-05, "loss": 13.5238, "num_input_tokens_seen": 573556058, "step": 233 }, { "epoch": 0.24644549763033174, "grad_norm": 0.48986560106277466, "learning_rate": 9.014029049082889e-05, "loss": 12.1345, "num_input_tokens_seen": 575930124, "step": 234 }, { "epoch": 0.24749868351764087, "grad_norm": 0.680926501750946, "learning_rate": 8.763802915365534e-05, "loss": 13.2642, "num_input_tokens_seen": 578430022, "step": 235 }, { "epoch": 0.24855186940494997, "grad_norm": 0.5158939361572266, "learning_rate": 8.516358770862817e-05, "loss": 13.2069, "num_input_tokens_seen": 580816748, "step": 236 }, { "epoch": 0.24960505529225907, "grad_norm": 0.7031413316726685, "learning_rate": 8.271739013855068e-05, "loss": 11.3934, "num_input_tokens_seen": 583366398, "step": 237 }, { "epoch": 0.2506582411795682, "grad_norm": 0.7286622524261475, "learning_rate": 8.02998555867832e-05, "loss": 11.138, "num_input_tokens_seen": 585825914, "step": 238 }, { "epoch": 0.2517114270668773, "grad_norm": 0.6047478318214417, "learning_rate": 7.791139828542587e-05, "loss": 11.4407, "num_input_tokens_seen": 588320922, "step": 239 }, { "epoch": 0.2527646129541864, "grad_norm": 0.5910817980766296, "learning_rate": 7.55524274843415e-05, "loss": 13.5983, "num_input_tokens_seen": 590762520, "step": 240 }, { "epoch": 0.2538177988414955, "grad_norm": 0.5623341202735901, "learning_rate": 7.322334738103267e-05, "loss": 11.1451, "num_input_tokens_seen": 593119346, "step": 241 }, { "epoch": 0.25487098472880465, "grad_norm": 0.40974873304367065, "learning_rate": 7.092455705138504e-05, "loss": 12.4794, "num_input_tokens_seen": 595480902, "step": 242 }, { "epoch": 0.2559241706161137, "grad_norm": 0.663348376750946, "learning_rate": 6.865645038128743e-05, "loss": 11.9646, "num_input_tokens_seen": 597986692, "step": 243 }, { "epoch": 0.25697735650342285, "grad_norm": 0.6817061901092529, "learning_rate": 6.64194159991414e-05, "loss": 11.9893, "num_input_tokens_seen": 600640014, "step": 244 }, { "epoch": 0.258030542390732, "grad_norm": 0.46102288365364075, "learning_rate": 6.421383720927206e-05, "loss": 13.107, "num_input_tokens_seen": 602922284, "step": 245 }, { "epoch": 0.25908372827804105, "grad_norm": 0.4497967064380646, "learning_rate": 6.204009192625087e-05, "loss": 13.4184, "num_input_tokens_seen": 605350038, "step": 246 }, { "epoch": 0.2601369141653502, "grad_norm": 0.6371114253997803, "learning_rate": 5.989855261014141e-05, "loss": 13.6668, "num_input_tokens_seen": 607725012, "step": 247 }, { "epoch": 0.2611901000526593, "grad_norm": 0.5277857780456543, "learning_rate": 5.778958620268094e-05, "loss": 13.765, "num_input_tokens_seen": 610117954, "step": 248 }, { "epoch": 0.26224328593996843, "grad_norm": 0.4079309403896332, "learning_rate": 5.5713554064406314e-05, "loss": 12.2847, "num_input_tokens_seen": 612557846, "step": 249 }, { "epoch": 0.2632964718272775, "grad_norm": 0.5939152240753174, "learning_rate": 5.3670811912737094e-05, "loss": 11.2133, "num_input_tokens_seen": 614861376, "step": 250 }, { "epoch": 0.2643496577145866, "grad_norm": 0.5120065212249756, "learning_rate": 5.166170976102475e-05, "loss": 12.9469, "num_input_tokens_seen": 617294518, "step": 251 }, { "epoch": 0.26540284360189575, "grad_norm": 0.6367613673210144, "learning_rate": 4.968659185858018e-05, "loss": 12.7285, "num_input_tokens_seen": 619635220, "step": 252 }, { "epoch": 0.2664560294892048, "grad_norm": 0.5774930119514465, "learning_rate": 4.774579663168803e-05, "loss": 11.6266, "num_input_tokens_seen": 622075956, "step": 253 }, { "epoch": 0.26750921537651395, "grad_norm": 0.7033908367156982, "learning_rate": 4.583965662561915e-05, "loss": 11.8729, "num_input_tokens_seen": 624607758, "step": 254 }, { "epoch": 0.2685624012638231, "grad_norm": 0.7179914712905884, "learning_rate": 4.396849844765079e-05, "loss": 13.1789, "num_input_tokens_seen": 627172634, "step": 255 }, { "epoch": 0.26961558715113215, "grad_norm": 0.41594576835632324, "learning_rate": 4.213264271110397e-05, "loss": 12.8395, "num_input_tokens_seen": 629690912, "step": 256 }, { "epoch": 0.2706687730384413, "grad_norm": 0.5446677207946777, "learning_rate": 4.0332403980408214e-05, "loss": 12.8879, "num_input_tokens_seen": 632255410, "step": 257 }, { "epoch": 0.2717219589257504, "grad_norm": 0.6709144711494446, "learning_rate": 3.856809071720225e-05, "loss": 11.9906, "num_input_tokens_seen": 634582544, "step": 258 }, { "epoch": 0.2727751448130595, "grad_norm": 0.6252522468566895, "learning_rate": 3.684000522748107e-05, "loss": 12.4937, "num_input_tokens_seen": 637108470, "step": 259 }, { "epoch": 0.2738283307003686, "grad_norm": 0.46408626437187195, "learning_rate": 3.514844360979712e-05, "loss": 12.4264, "num_input_tokens_seen": 639426634, "step": 260 }, { "epoch": 0.27488151658767773, "grad_norm": 0.5272302627563477, "learning_rate": 3.349369570452542e-05, "loss": 13.5761, "num_input_tokens_seen": 641900154, "step": 261 }, { "epoch": 0.27593470247498686, "grad_norm": 0.5684494972229004, "learning_rate": 3.1876045044200884e-05, "loss": 12.417, "num_input_tokens_seen": 644205144, "step": 262 }, { "epoch": 0.27698788836229593, "grad_norm": 0.5754216909408569, "learning_rate": 3.0295768804936502e-05, "loss": 13.1275, "num_input_tokens_seen": 646496932, "step": 263 }, { "epoch": 0.27804107424960506, "grad_norm": 0.5298753380775452, "learning_rate": 2.87531377589305e-05, "loss": 11.4534, "num_input_tokens_seen": 649096904, "step": 264 }, { "epoch": 0.2790942601369142, "grad_norm": 0.49052122235298157, "learning_rate": 2.724841622807116e-05, "loss": 12.8376, "num_input_tokens_seen": 651600900, "step": 265 }, { "epoch": 0.28014744602422326, "grad_norm": 0.5886843204498291, "learning_rate": 2.578186203864648e-05, "loss": 13.108, "num_input_tokens_seen": 654189842, "step": 266 }, { "epoch": 0.2812006319115324, "grad_norm": 0.5260895490646362, "learning_rate": 2.435372647716701e-05, "loss": 12.8051, "num_input_tokens_seen": 656636650, "step": 267 }, { "epoch": 0.2822538177988415, "grad_norm": 0.7550118565559387, "learning_rate": 2.2964254247309006e-05, "loss": 9.8832, "num_input_tokens_seen": 659070838, "step": 268 }, { "epoch": 0.2833070036861506, "grad_norm": 0.592755138874054, "learning_rate": 2.1613683427986202e-05, "loss": 11.5407, "num_input_tokens_seen": 661668122, "step": 269 }, { "epoch": 0.2843601895734597, "grad_norm": 0.5145466923713684, "learning_rate": 2.0302245432555708e-05, "loss": 11.0979, "num_input_tokens_seen": 664055676, "step": 270 }, { "epoch": 0.28541337546076884, "grad_norm": 0.5453233122825623, "learning_rate": 1.9030164969166632e-05, "loss": 12.9023, "num_input_tokens_seen": 666712162, "step": 271 }, { "epoch": 0.2864665613480779, "grad_norm": 0.8424168825149536, "learning_rate": 1.7797660002257764e-05, "loss": 10.2984, "num_input_tokens_seen": 669145358, "step": 272 }, { "epoch": 0.28751974723538704, "grad_norm": 0.6945912837982178, "learning_rate": 1.6604941715210256e-05, "loss": 12.3293, "num_input_tokens_seen": 671496934, "step": 273 }, { "epoch": 0.28857293312269616, "grad_norm": 0.6020376086235046, "learning_rate": 1.545221447416239e-05, "loss": 11.6243, "num_input_tokens_seen": 673956840, "step": 274 }, { "epoch": 0.2896261190100053, "grad_norm": 0.5085281133651733, "learning_rate": 1.4339675792992671e-05, "loss": 12.3864, "num_input_tokens_seen": 676522688, "step": 275 }, { "epoch": 0.29067930489731436, "grad_norm": 0.4224667549133301, "learning_rate": 1.3267516299476845e-05, "loss": 13.0538, "num_input_tokens_seen": 678987128, "step": 276 }, { "epoch": 0.2917324907846235, "grad_norm": 0.45536261796951294, "learning_rate": 1.2235919702624524e-05, "loss": 12.5957, "num_input_tokens_seen": 681376426, "step": 277 }, { "epoch": 0.2927856766719326, "grad_norm": 0.6411481499671936, "learning_rate": 1.1245062761201955e-05, "loss": 11.8706, "num_input_tokens_seen": 683908570, "step": 278 }, { "epoch": 0.2938388625592417, "grad_norm": 0.6085736155509949, "learning_rate": 1.0295115253445109e-05, "loss": 12.9511, "num_input_tokens_seen": 686435460, "step": 279 }, { "epoch": 0.2948920484465508, "grad_norm": 0.7676545977592468, "learning_rate": 9.38623994796912e-06, "loss": 12.9862, "num_input_tokens_seen": 689001010, "step": 280 }, { "epoch": 0.29594523433385994, "grad_norm": 0.5978413820266724, "learning_rate": 8.518592575878607e-06, "loss": 10.6406, "num_input_tokens_seen": 691555714, "step": 281 }, { "epoch": 0.296998420221169, "grad_norm": 0.5144767165184021, "learning_rate": 7.692321804084169e-06, "loss": 12.415, "num_input_tokens_seen": 694038390, "step": 282 }, { "epoch": 0.29805160610847814, "grad_norm": 0.4338608384132385, "learning_rate": 6.907569209828871e-06, "loss": 13.8655, "num_input_tokens_seen": 696690606, "step": 283 }, { "epoch": 0.29910479199578727, "grad_norm": 0.4250311851501465, "learning_rate": 6.1644692564298475e-06, "loss": 12.749, "num_input_tokens_seen": 698974368, "step": 284 }, { "epoch": 0.3001579778830964, "grad_norm": 0.57984459400177, "learning_rate": 5.463149270238596e-06, "loss": 13.3178, "num_input_tokens_seen": 701514324, "step": 285 }, { "epoch": 0.30121116377040547, "grad_norm": 0.7126944661140442, "learning_rate": 4.803729418824403e-06, "loss": 14.3177, "num_input_tokens_seen": 703856054, "step": 286 }, { "epoch": 0.3022643496577146, "grad_norm": 0.5229653120040894, "learning_rate": 4.1863226903840625e-06, "loss": 12.8253, "num_input_tokens_seen": 706380374, "step": 287 }, { "epoch": 0.3033175355450237, "grad_norm": 0.676238477230072, "learning_rate": 3.6110348743820393e-06, "loss": 12.9148, "num_input_tokens_seen": 708733546, "step": 288 }, { "epoch": 0.3043707214323328, "grad_norm": 0.5809754729270935, "learning_rate": 3.0779645434241003e-06, "loss": 13.8872, "num_input_tokens_seen": 711304704, "step": 289 }, { "epoch": 0.3054239073196419, "grad_norm": 0.5316653251647949, "learning_rate": 2.58720303636711e-06, "loss": 13.8184, "num_input_tokens_seen": 713915084, "step": 290 }, { "epoch": 0.30647709320695105, "grad_norm": 0.4242398142814636, "learning_rate": 2.1388344426689387e-06, "loss": 13.0624, "num_input_tokens_seen": 716304858, "step": 291 }, { "epoch": 0.3075302790942601, "grad_norm": 0.6927266716957092, "learning_rate": 1.7329355879798507e-06, "loss": 11.8993, "num_input_tokens_seen": 718782282, "step": 292 }, { "epoch": 0.30858346498156924, "grad_norm": 0.42387041449546814, "learning_rate": 1.3695760209790617e-06, "loss": 12.7312, "num_input_tokens_seen": 721256712, "step": 293 }, { "epoch": 0.30963665086887837, "grad_norm": 0.5390251874923706, "learning_rate": 1.048818001457775e-06, "loss": 12.1773, "num_input_tokens_seen": 723712914, "step": 294 }, { "epoch": 0.31068983675618744, "grad_norm": 0.632005512714386, "learning_rate": 7.707164896513524e-07, "loss": 12.2577, "num_input_tokens_seen": 726014718, "step": 295 }, { "epoch": 0.31174302264349657, "grad_norm": 0.623199462890625, "learning_rate": 5.353191368222112e-07, "loss": 12.2437, "num_input_tokens_seen": 728471242, "step": 296 }, { "epoch": 0.3127962085308057, "grad_norm": 0.5572063326835632, "learning_rate": 3.4266627709491055e-07, "loss": 11.9195, "num_input_tokens_seen": 730842942, "step": 297 }, { "epoch": 0.3138493944181148, "grad_norm": 0.6717016696929932, "learning_rate": 1.927909205451808e-07, "loss": 12.485, "num_input_tokens_seen": 733445910, "step": 298 }, { "epoch": 0.3149025803054239, "grad_norm": 0.46403953433036804, "learning_rate": 8.571874754380943e-08, "loss": 12.583, "num_input_tokens_seen": 735908532, "step": 299 }, { "epoch": 0.315955766192733, "grad_norm": 0.5197071433067322, "learning_rate": 2.1468104356439287e-08, "loss": 12.1468, "num_input_tokens_seen": 738472692, "step": 300 }, { "epoch": 0.315955766192733, "num_input_tokens_seen": 738472692, "step": 300, "total_flos": 4.204509531637547e+18, "train_loss": 12.393445908228557, "train_runtime": 50692.0245, "train_samples_per_second": 0.284, "train_steps_per_second": 0.006 } ], "logging_steps": 1.0, "max_steps": 300, "num_input_tokens_seen": 738472692, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.204509531637547e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }