{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.315955766192733, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00105318588730911, "grad_norm": 1.3147555589675903, "learning_rate": 0.0, "loss": 13.5907, "num_input_tokens_seen": 2363056, "step": 1 }, { "epoch": 0.00210637177461822, "grad_norm": 1.1668343544006348, "learning_rate": 5e-06, "loss": 15.1869, "num_input_tokens_seen": 4914924, "step": 2 }, { "epoch": 0.00315955766192733, "grad_norm": 1.078931212425232, "learning_rate": 1e-05, "loss": 14.1278, "num_input_tokens_seen": 7350666, "step": 3 }, { "epoch": 0.00421274354923644, "grad_norm": 0.9630680084228516, "learning_rate": 1.5e-05, "loss": 14.4448, "num_input_tokens_seen": 9911760, "step": 4 }, { "epoch": 0.0052659294365455505, "grad_norm": 1.2464635372161865, "learning_rate": 2e-05, "loss": 14.539, "num_input_tokens_seen": 12460110, "step": 5 }, { "epoch": 0.00631911532385466, "grad_norm": 1.4388000965118408, "learning_rate": 2.5e-05, "loss": 14.9989, "num_input_tokens_seen": 14941884, "step": 6 }, { "epoch": 0.00737230121116377, "grad_norm": 0.9652966856956482, "learning_rate": 3e-05, "loss": 14.4027, "num_input_tokens_seen": 17514914, "step": 7 }, { "epoch": 0.00842548709847288, "grad_norm": 1.1013789176940918, "learning_rate": 3.5000000000000004e-05, "loss": 12.0805, "num_input_tokens_seen": 20142298, "step": 8 }, { "epoch": 0.009478672985781991, "grad_norm": 1.1957684755325317, "learning_rate": 4e-05, "loss": 13.4646, "num_input_tokens_seen": 22656144, "step": 9 }, { "epoch": 0.010531858873091101, "grad_norm": 1.1999417543411255, "learning_rate": 4.4999999999999996e-05, "loss": 14.9708, "num_input_tokens_seen": 25175402, "step": 10 }, { "epoch": 0.01158504476040021, "grad_norm": 1.2181422710418701, "learning_rate": 5e-05, "loss": 12.8572, "num_input_tokens_seen": 27603302, "step": 11 }, { "epoch": 0.01263823064770932, "grad_norm": 1.1232861280441284, "learning_rate": 5.5e-05, "loss": 13.4139, "num_input_tokens_seen": 30063438, "step": 12 }, { "epoch": 0.01369141653501843, "grad_norm": 1.4094243049621582, "learning_rate": 6e-05, "loss": 13.6033, "num_input_tokens_seen": 32496926, "step": 13 }, { "epoch": 0.01474460242232754, "grad_norm": 1.1449795961380005, "learning_rate": 6.500000000000001e-05, "loss": 13.7514, "num_input_tokens_seen": 34824490, "step": 14 }, { "epoch": 0.01579778830963665, "grad_norm": 1.305707335472107, "learning_rate": 7.000000000000001e-05, "loss": 12.9037, "num_input_tokens_seen": 37474392, "step": 15 }, { "epoch": 0.01685097419694576, "grad_norm": 0.8636656403541565, "learning_rate": 7.5e-05, "loss": 14.264, "num_input_tokens_seen": 40079890, "step": 16 }, { "epoch": 0.01790416008425487, "grad_norm": 1.3949730396270752, "learning_rate": 8e-05, "loss": 14.129, "num_input_tokens_seen": 42483202, "step": 17 }, { "epoch": 0.018957345971563982, "grad_norm": 1.0275214910507202, "learning_rate": 8.5e-05, "loss": 14.9399, "num_input_tokens_seen": 44983536, "step": 18 }, { "epoch": 0.020010531858873092, "grad_norm": 1.1338497400283813, "learning_rate": 8.999999999999999e-05, "loss": 14.9256, "num_input_tokens_seen": 47556326, "step": 19 }, { "epoch": 0.021063717746182202, "grad_norm": 1.1364272832870483, "learning_rate": 9.5e-05, "loss": 14.2354, "num_input_tokens_seen": 49984930, "step": 20 }, { "epoch": 0.022116903633491312, "grad_norm": 1.0504145622253418, "learning_rate": 0.0001, "loss": 13.1098, "num_input_tokens_seen": 52475706, "step": 21 }, { "epoch": 0.02317008952080042, "grad_norm": 1.0692085027694702, "learning_rate": 0.000105, "loss": 13.6485, "num_input_tokens_seen": 54931676, "step": 22 }, { "epoch": 0.02422327540810953, "grad_norm": 1.125333309173584, "learning_rate": 0.00011, "loss": 12.9832, "num_input_tokens_seen": 57311266, "step": 23 }, { "epoch": 0.02527646129541864, "grad_norm": 1.2768787145614624, "learning_rate": 0.000115, "loss": 13.8192, "num_input_tokens_seen": 59782354, "step": 24 }, { "epoch": 0.02632964718272775, "grad_norm": 1.0284156799316406, "learning_rate": 0.00012, "loss": 14.8322, "num_input_tokens_seen": 62186446, "step": 25 }, { "epoch": 0.02738283307003686, "grad_norm": 1.0253136157989502, "learning_rate": 0.000125, "loss": 13.353, "num_input_tokens_seen": 64698904, "step": 26 }, { "epoch": 0.02843601895734597, "grad_norm": 1.2211846113204956, "learning_rate": 0.00013000000000000002, "loss": 13.8297, "num_input_tokens_seen": 67157778, "step": 27 }, { "epoch": 0.02948920484465508, "grad_norm": 1.8428621292114258, "learning_rate": 0.000135, "loss": 14.5567, "num_input_tokens_seen": 69536532, "step": 28 }, { "epoch": 0.030542390731964193, "grad_norm": 1.5557100772857666, "learning_rate": 0.00014000000000000001, "loss": 12.739, "num_input_tokens_seen": 72073830, "step": 29 }, { "epoch": 0.0315955766192733, "grad_norm": 1.0637712478637695, "learning_rate": 0.000145, "loss": 14.8383, "num_input_tokens_seen": 74649418, "step": 30 }, { "epoch": 0.03264876250658241, "grad_norm": 0.9268069863319397, "learning_rate": 0.00015, "loss": 14.7157, "num_input_tokens_seen": 77194468, "step": 31 }, { "epoch": 0.03370194839389152, "grad_norm": 1.081074595451355, "learning_rate": 0.000155, "loss": 14.3469, "num_input_tokens_seen": 79567360, "step": 32 }, { "epoch": 0.03475513428120063, "grad_norm": 1.1562591791152954, "learning_rate": 0.00016, "loss": 13.2922, "num_input_tokens_seen": 81976622, "step": 33 }, { "epoch": 0.03580832016850974, "grad_norm": 1.40042245388031, "learning_rate": 0.000165, "loss": 12.6052, "num_input_tokens_seen": 84456254, "step": 34 }, { "epoch": 0.03686150605581885, "grad_norm": 1.1846578121185303, "learning_rate": 0.00017, "loss": 14.8395, "num_input_tokens_seen": 86912082, "step": 35 }, { "epoch": 0.037914691943127965, "grad_norm": 1.1414552927017212, "learning_rate": 0.000175, "loss": 13.8918, "num_input_tokens_seen": 89296692, "step": 36 }, { "epoch": 0.03896787783043707, "grad_norm": 1.0319195985794067, "learning_rate": 0.00017999999999999998, "loss": 14.3268, "num_input_tokens_seen": 91786022, "step": 37 }, { "epoch": 0.040021063717746184, "grad_norm": 1.2396275997161865, "learning_rate": 0.000185, "loss": 13.6008, "num_input_tokens_seen": 94451030, "step": 38 }, { "epoch": 0.04107424960505529, "grad_norm": 0.9682689309120178, "learning_rate": 0.00019, "loss": 14.997, "num_input_tokens_seen": 96817518, "step": 39 }, { "epoch": 0.042127435492364404, "grad_norm": 0.8734373450279236, "learning_rate": 0.00019500000000000002, "loss": 13.0396, "num_input_tokens_seen": 99234416, "step": 40 }, { "epoch": 0.04318062137967351, "grad_norm": 1.5965349674224854, "learning_rate": 0.0002, "loss": 14.213, "num_input_tokens_seen": 101710538, "step": 41 }, { "epoch": 0.044233807266982623, "grad_norm": 1.6115487813949585, "learning_rate": 0.000205, "loss": 15.0594, "num_input_tokens_seen": 104066772, "step": 42 }, { "epoch": 0.04528699315429173, "grad_norm": 1.5525461435317993, "learning_rate": 0.00021, "loss": 14.3995, "num_input_tokens_seen": 106478524, "step": 43 }, { "epoch": 0.04634017904160084, "grad_norm": 1.5180985927581787, "learning_rate": 0.000215, "loss": 13.0277, "num_input_tokens_seen": 108918022, "step": 44 }, { "epoch": 0.04739336492890995, "grad_norm": 1.388283371925354, "learning_rate": 0.00022, "loss": 15.5197, "num_input_tokens_seen": 111331584, "step": 45 }, { "epoch": 0.04844655081621906, "grad_norm": 0.7749722003936768, "learning_rate": 0.00022500000000000002, "loss": 14.1346, "num_input_tokens_seen": 113804550, "step": 46 }, { "epoch": 0.049499736703528176, "grad_norm": 1.8673762083053589, "learning_rate": 0.00023, "loss": 15.5638, "num_input_tokens_seen": 116406598, "step": 47 }, { "epoch": 0.05055292259083728, "grad_norm": 1.6054580211639404, "learning_rate": 0.000235, "loss": 12.8775, "num_input_tokens_seen": 118834762, "step": 48 }, { "epoch": 0.051606108478146395, "grad_norm": 1.256948709487915, "learning_rate": 0.00024, "loss": 13.8707, "num_input_tokens_seen": 121241054, "step": 49 }, { "epoch": 0.0526592943654555, "grad_norm": 1.5192618370056152, "learning_rate": 0.000245, "loss": 12.771, "num_input_tokens_seen": 123714340, "step": 50 }, { "epoch": 0.053712480252764615, "grad_norm": 1.45964515209198, "learning_rate": 0.00025, "loss": 12.7207, "num_input_tokens_seen": 126399394, "step": 51 }, { "epoch": 0.05476566614007372, "grad_norm": 1.0744571685791016, "learning_rate": 0.000255, "loss": 14.2132, "num_input_tokens_seen": 128815712, "step": 52 }, { "epoch": 0.055818852027382834, "grad_norm": 0.9800078868865967, "learning_rate": 0.00026000000000000003, "loss": 12.3437, "num_input_tokens_seen": 131243982, "step": 53 }, { "epoch": 0.05687203791469194, "grad_norm": 1.7293643951416016, "learning_rate": 0.00026500000000000004, "loss": 15.0475, "num_input_tokens_seen": 133695752, "step": 54 }, { "epoch": 0.057925223802001054, "grad_norm": 1.134101152420044, "learning_rate": 0.00027, "loss": 15.3197, "num_input_tokens_seen": 136206586, "step": 55 }, { "epoch": 0.05897840968931016, "grad_norm": 1.7910157442092896, "learning_rate": 0.000275, "loss": 15.8412, "num_input_tokens_seen": 138588134, "step": 56 }, { "epoch": 0.06003159557661927, "grad_norm": 1.3835411071777344, "learning_rate": 0.00028000000000000003, "loss": 12.9363, "num_input_tokens_seen": 141117190, "step": 57 }, { "epoch": 0.061084781463928386, "grad_norm": 1.0232059955596924, "learning_rate": 0.000285, "loss": 15.1391, "num_input_tokens_seen": 143491738, "step": 58 }, { "epoch": 0.06213796735123749, "grad_norm": 1.1751936674118042, "learning_rate": 0.00029, "loss": 14.5678, "num_input_tokens_seen": 145991144, "step": 59 }, { "epoch": 0.0631911532385466, "grad_norm": 1.3430085182189941, "learning_rate": 0.000295, "loss": 13.1367, "num_input_tokens_seen": 148486356, "step": 60 }, { "epoch": 0.06424433912585571, "grad_norm": 1.3005839586257935, "learning_rate": 0.0003, "loss": 13.769, "num_input_tokens_seen": 150891696, "step": 61 }, { "epoch": 0.06529752501316483, "grad_norm": 2.0061893463134766, "learning_rate": 0.000305, "loss": 14.7883, "num_input_tokens_seen": 153414658, "step": 62 }, { "epoch": 0.06635071090047394, "grad_norm": 1.4493314027786255, "learning_rate": 0.00031, "loss": 12.5738, "num_input_tokens_seen": 155793700, "step": 63 }, { "epoch": 0.06740389678778304, "grad_norm": 1.2789517641067505, "learning_rate": 0.000315, "loss": 13.7151, "num_input_tokens_seen": 158248338, "step": 64 }, { "epoch": 0.06845708267509215, "grad_norm": 1.1854947805404663, "learning_rate": 0.00032, "loss": 13.5257, "num_input_tokens_seen": 160737224, "step": 65 }, { "epoch": 0.06951026856240126, "grad_norm": 0.983009934425354, "learning_rate": 0.00032500000000000004, "loss": 13.5613, "num_input_tokens_seen": 163317772, "step": 66 }, { "epoch": 0.07056345444971038, "grad_norm": 1.1745814085006714, "learning_rate": 0.00033, "loss": 14.2875, "num_input_tokens_seen": 165797524, "step": 67 }, { "epoch": 0.07161664033701948, "grad_norm": 1.0742534399032593, "learning_rate": 0.000335, "loss": 13.1399, "num_input_tokens_seen": 168202674, "step": 68 }, { "epoch": 0.07266982622432859, "grad_norm": 1.2481011152267456, "learning_rate": 0.00034, "loss": 12.9279, "num_input_tokens_seen": 170618878, "step": 69 }, { "epoch": 0.0737230121116377, "grad_norm": 1.4287097454071045, "learning_rate": 0.000345, "loss": 12.6019, "num_input_tokens_seen": 173023976, "step": 70 }, { "epoch": 0.07477619799894682, "grad_norm": 0.9344131350517273, "learning_rate": 0.00035, "loss": 14.5578, "num_input_tokens_seen": 175423500, "step": 71 }, { "epoch": 0.07582938388625593, "grad_norm": 1.383571982383728, "learning_rate": 0.000355, "loss": 12.3683, "num_input_tokens_seen": 177865696, "step": 72 }, { "epoch": 0.07688256977356503, "grad_norm": 1.5278881788253784, "learning_rate": 0.00035999999999999997, "loss": 13.7148, "num_input_tokens_seen": 180215372, "step": 73 }, { "epoch": 0.07793575566087414, "grad_norm": 0.9833160042762756, "learning_rate": 0.000365, "loss": 13.9058, "num_input_tokens_seen": 182660290, "step": 74 }, { "epoch": 0.07898894154818326, "grad_norm": 1.1421438455581665, "learning_rate": 0.00037, "loss": 14.5135, "num_input_tokens_seen": 185180996, "step": 75 }, { "epoch": 0.08004212743549237, "grad_norm": 1.5786970853805542, "learning_rate": 0.000375, "loss": 12.5171, "num_input_tokens_seen": 187704644, "step": 76 }, { "epoch": 0.08109531332280147, "grad_norm": 1.9583278894424438, "learning_rate": 0.00038, "loss": 13.6645, "num_input_tokens_seen": 190082764, "step": 77 }, { "epoch": 0.08214849921011058, "grad_norm": 2.4497997760772705, "learning_rate": 0.00038500000000000003, "loss": 14.2632, "num_input_tokens_seen": 192368804, "step": 78 }, { "epoch": 0.0832016850974197, "grad_norm": 1.5003597736358643, "learning_rate": 0.00039000000000000005, "loss": 13.2001, "num_input_tokens_seen": 194857438, "step": 79 }, { "epoch": 0.08425487098472881, "grad_norm": 1.39485502243042, "learning_rate": 0.000395, "loss": 13.5808, "num_input_tokens_seen": 197152948, "step": 80 }, { "epoch": 0.08530805687203792, "grad_norm": 1.288568139076233, "learning_rate": 0.0004, "loss": 13.7794, "num_input_tokens_seen": 199500708, "step": 81 }, { "epoch": 0.08636124275934702, "grad_norm": 1.300167202949524, "learning_rate": 0.00040500000000000003, "loss": 12.8507, "num_input_tokens_seen": 201991874, "step": 82 }, { "epoch": 0.08741442864665613, "grad_norm": 0.8545141816139221, "learning_rate": 0.00041, "loss": 14.328, "num_input_tokens_seen": 204251674, "step": 83 }, { "epoch": 0.08846761453396525, "grad_norm": 1.0194705724716187, "learning_rate": 0.000415, "loss": 13.6767, "num_input_tokens_seen": 206656880, "step": 84 }, { "epoch": 0.08952080042127436, "grad_norm": 0.9372128844261169, "learning_rate": 0.00042, "loss": 14.5693, "num_input_tokens_seen": 208918254, "step": 85 }, { "epoch": 0.09057398630858346, "grad_norm": 1.2995604276657104, "learning_rate": 0.000425, "loss": 13.5958, "num_input_tokens_seen": 211434142, "step": 86 }, { "epoch": 0.09162717219589257, "grad_norm": 1.2741903066635132, "learning_rate": 0.00043, "loss": 13.8848, "num_input_tokens_seen": 214000898, "step": 87 }, { "epoch": 0.09268035808320169, "grad_norm": 2.0325863361358643, "learning_rate": 0.000435, "loss": 13.8518, "num_input_tokens_seen": 216309292, "step": 88 }, { "epoch": 0.0937335439705108, "grad_norm": 1.449713110923767, "learning_rate": 0.00044, "loss": 13.7997, "num_input_tokens_seen": 218756170, "step": 89 }, { "epoch": 0.0947867298578199, "grad_norm": 1.5175477266311646, "learning_rate": 0.00044500000000000003, "loss": 13.5718, "num_input_tokens_seen": 221193636, "step": 90 }, { "epoch": 0.09583991574512901, "grad_norm": 1.2801398038864136, "learning_rate": 0.00045000000000000004, "loss": 14.4654, "num_input_tokens_seen": 223524870, "step": 91 }, { "epoch": 0.09689310163243813, "grad_norm": 1.0959213972091675, "learning_rate": 0.000455, "loss": 11.545, "num_input_tokens_seen": 226066666, "step": 92 }, { "epoch": 0.09794628751974724, "grad_norm": 1.1696399450302124, "learning_rate": 0.00046, "loss": 13.3462, "num_input_tokens_seen": 228525260, "step": 93 }, { "epoch": 0.09899947340705635, "grad_norm": 1.1905299425125122, "learning_rate": 0.000465, "loss": 13.4698, "num_input_tokens_seen": 231156516, "step": 94 }, { "epoch": 0.10005265929436545, "grad_norm": 0.9483806490898132, "learning_rate": 0.00047, "loss": 14.8114, "num_input_tokens_seen": 233846560, "step": 95 }, { "epoch": 0.10110584518167456, "grad_norm": 1.5830718278884888, "learning_rate": 0.000475, "loss": 12.9401, "num_input_tokens_seen": 236170696, "step": 96 }, { "epoch": 0.10215903106898368, "grad_norm": 1.66010582447052, "learning_rate": 0.00048, "loss": 13.3604, "num_input_tokens_seen": 238549390, "step": 97 }, { "epoch": 0.10321221695629279, "grad_norm": 1.1955476999282837, "learning_rate": 0.00048499999999999997, "loss": 14.7142, "num_input_tokens_seen": 240990068, "step": 98 }, { "epoch": 0.10426540284360189, "grad_norm": 1.2605278491973877, "learning_rate": 0.00049, "loss": 13.8786, "num_input_tokens_seen": 243394892, "step": 99 }, { "epoch": 0.105318588730911, "grad_norm": 1.1509404182434082, "learning_rate": 0.000495, "loss": 12.2201, "num_input_tokens_seen": 245927990, "step": 100 }, { "epoch": 0.10637177461822012, "grad_norm": 1.566206693649292, "learning_rate": 0.0005, "loss": 10.9783, "num_input_tokens_seen": 248369144, "step": 101 }, { "epoch": 0.10742496050552923, "grad_norm": 1.2243906259536743, "learning_rate": 0.0004999922894119685, "loss": 13.9885, "num_input_tokens_seen": 250819650, "step": 102 }, { "epoch": 0.10847814639283834, "grad_norm": 0.7832080125808716, "learning_rate": 0.0004999691581234994, "loss": 15.5307, "num_input_tokens_seen": 253219186, "step": 103 }, { "epoch": 0.10953133228014744, "grad_norm": 0.8975496888160706, "learning_rate": 0.0004999306075614394, "loss": 15.138, "num_input_tokens_seen": 255671766, "step": 104 }, { "epoch": 0.11058451816745656, "grad_norm": 0.9028383493423462, "learning_rate": 0.0004998766401037688, "loss": 14.13, "num_input_tokens_seen": 258276528, "step": 105 }, { "epoch": 0.11163770405476567, "grad_norm": 1.4537795782089233, "learning_rate": 0.0004998072590794548, "loss": 13.9269, "num_input_tokens_seen": 260625306, "step": 106 }, { "epoch": 0.11269088994207478, "grad_norm": 0.916918158531189, "learning_rate": 0.0004997224687682457, "loss": 13.1447, "num_input_tokens_seen": 263021648, "step": 107 }, { "epoch": 0.11374407582938388, "grad_norm": 1.0510584115982056, "learning_rate": 0.000499622274400407, "loss": 14.1054, "num_input_tokens_seen": 265467338, "step": 108 }, { "epoch": 0.114797261716693, "grad_norm": 0.8123775720596313, "learning_rate": 0.0004995066821563998, "loss": 14.0404, "num_input_tokens_seen": 267981590, "step": 109 }, { "epoch": 0.11585044760400211, "grad_norm": 1.1095675230026245, "learning_rate": 0.0004993756991664976, "loss": 12.6619, "num_input_tokens_seen": 270308658, "step": 110 }, { "epoch": 0.11690363349131122, "grad_norm": 0.9915770888328552, "learning_rate": 0.0004992293335103487, "loss": 13.9319, "num_input_tokens_seen": 272795368, "step": 111 }, { "epoch": 0.11795681937862032, "grad_norm": 1.075748324394226, "learning_rate": 0.0004990675942164759, "loss": 14.3729, "num_input_tokens_seen": 275169288, "step": 112 }, { "epoch": 0.11901000526592943, "grad_norm": 0.8833720684051514, "learning_rate": 0.0004988904912617209, "loss": 15.3386, "num_input_tokens_seen": 277598550, "step": 113 }, { "epoch": 0.12006319115323855, "grad_norm": 0.6812732815742493, "learning_rate": 0.000498698035570628, "loss": 14.4552, "num_input_tokens_seen": 280073876, "step": 114 }, { "epoch": 0.12111637704054766, "grad_norm": 1.2348415851593018, "learning_rate": 0.0004984902390147711, "loss": 13.8133, "num_input_tokens_seen": 282587368, "step": 115 }, { "epoch": 0.12216956292785677, "grad_norm": 1.1452962160110474, "learning_rate": 0.0004982671144120202, "loss": 13.8322, "num_input_tokens_seen": 285113802, "step": 116 }, { "epoch": 0.12322274881516587, "grad_norm": 1.5170722007751465, "learning_rate": 0.000498028675525752, "loss": 11.1536, "num_input_tokens_seen": 287411358, "step": 117 }, { "epoch": 0.12427593470247499, "grad_norm": 1.1934428215026855, "learning_rate": 0.000497774937064, "loss": 12.9464, "num_input_tokens_seen": 289841892, "step": 118 }, { "epoch": 0.1253291205897841, "grad_norm": 1.2093623876571655, "learning_rate": 0.0004975059146785479, "loss": 12.8343, "num_input_tokens_seen": 292269164, "step": 119 }, { "epoch": 0.1263823064770932, "grad_norm": 0.9261972308158875, "learning_rate": 0.0004972216249639638, "loss": 14.9848, "num_input_tokens_seen": 294553234, "step": 120 }, { "epoch": 0.12743549236440233, "grad_norm": 0.9841050505638123, "learning_rate": 0.000496922085456576, "loss": 13.9842, "num_input_tokens_seen": 297047120, "step": 121 }, { "epoch": 0.12848867825171142, "grad_norm": 1.0842055082321167, "learning_rate": 0.0004966073146333924, "loss": 12.092, "num_input_tokens_seen": 299619262, "step": 122 }, { "epoch": 0.12954186413902052, "grad_norm": 0.7394796013832092, "learning_rate": 0.0004962773319109604, "loss": 13.4262, "num_input_tokens_seen": 302193484, "step": 123 }, { "epoch": 0.13059505002632965, "grad_norm": 0.9171662330627441, "learning_rate": 0.0004959321576441683, "loss": 14.5414, "num_input_tokens_seen": 304771644, "step": 124 }, { "epoch": 0.13164823591363875, "grad_norm": 0.8625826239585876, "learning_rate": 0.0004955718131249909, "loss": 14.7163, "num_input_tokens_seen": 307154770, "step": 125 }, { "epoch": 0.13270142180094788, "grad_norm": 1.2097444534301758, "learning_rate": 0.0004951963205811756, "loss": 13.6051, "num_input_tokens_seen": 309623990, "step": 126 }, { "epoch": 0.13375460768825698, "grad_norm": 0.9684086441993713, "learning_rate": 0.0004948057031748712, "loss": 12.3401, "num_input_tokens_seen": 312189822, "step": 127 }, { "epoch": 0.13480779357556608, "grad_norm": 0.7330049872398376, "learning_rate": 0.0004943999850011993, "loss": 13.428, "num_input_tokens_seen": 314481274, "step": 128 }, { "epoch": 0.1358609794628752, "grad_norm": 0.8198027014732361, "learning_rate": 0.0004939791910867678, "loss": 13.366, "num_input_tokens_seen": 316975100, "step": 129 }, { "epoch": 0.1369141653501843, "grad_norm": 0.7812517285346985, "learning_rate": 0.0004935433473881276, "loss": 14.3236, "num_input_tokens_seen": 319629918, "step": 130 }, { "epoch": 0.13796735123749343, "grad_norm": 0.9930380582809448, "learning_rate": 0.0004930924807901711, "loss": 11.7665, "num_input_tokens_seen": 322045726, "step": 131 }, { "epoch": 0.13902053712480253, "grad_norm": 1.002437949180603, "learning_rate": 0.0004926266191044738, "loss": 13.4716, "num_input_tokens_seen": 324422858, "step": 132 }, { "epoch": 0.14007372301211163, "grad_norm": 0.9191110134124756, "learning_rate": 0.0004921457910675788, "loss": 13.1584, "num_input_tokens_seen": 326675990, "step": 133 }, { "epoch": 0.14112690889942076, "grad_norm": 0.9338858723640442, "learning_rate": 0.0004916500263392243, "loss": 13.192, "num_input_tokens_seen": 329185696, "step": 134 }, { "epoch": 0.14218009478672985, "grad_norm": 0.7161638140678406, "learning_rate": 0.000491139355500514, "loss": 13.9349, "num_input_tokens_seen": 331756854, "step": 135 }, { "epoch": 0.14323328067403895, "grad_norm": 0.8185762166976929, "learning_rate": 0.0004906138100520309, "loss": 12.6261, "num_input_tokens_seen": 334321854, "step": 136 }, { "epoch": 0.14428646656134808, "grad_norm": 0.6805868744850159, "learning_rate": 0.0004900734224118936, "loss": 14.4888, "num_input_tokens_seen": 336876626, "step": 137 }, { "epoch": 0.14533965244865718, "grad_norm": 0.9425003528594971, "learning_rate": 0.0004895182259137573, "loss": 12.113, "num_input_tokens_seen": 339348464, "step": 138 }, { "epoch": 0.1463928383359663, "grad_norm": 0.8056529760360718, "learning_rate": 0.0004889482548047572, "loss": 14.649, "num_input_tokens_seen": 341823736, "step": 139 }, { "epoch": 0.1474460242232754, "grad_norm": 0.7355366945266724, "learning_rate": 0.0004883635442433959, "loss": 12.0118, "num_input_tokens_seen": 344380626, "step": 140 }, { "epoch": 0.1484992101105845, "grad_norm": 0.8561354279518127, "learning_rate": 0.0004877641302973755, "loss": 15.144, "num_input_tokens_seen": 346692052, "step": 141 }, { "epoch": 0.14955239599789363, "grad_norm": 0.9879862666130066, "learning_rate": 0.00048715004994137124, "loss": 14.6901, "num_input_tokens_seen": 349112532, "step": 142 }, { "epoch": 0.15060558188520273, "grad_norm": 0.9572948217391968, "learning_rate": 0.0004865213410547524, "loss": 14.3239, "num_input_tokens_seen": 351452022, "step": 143 }, { "epoch": 0.15165876777251186, "grad_norm": 0.8329271078109741, "learning_rate": 0.0004858780424192443, "loss": 12.3414, "num_input_tokens_seen": 353797086, "step": 144 }, { "epoch": 0.15271195365982096, "grad_norm": 0.7736046314239502, "learning_rate": 0.0004852201937165372, "loss": 13.5231, "num_input_tokens_seen": 356162002, "step": 145 }, { "epoch": 0.15376513954713006, "grad_norm": 0.7838310599327087, "learning_rate": 0.0004845478355258377, "loss": 14.2139, "num_input_tokens_seen": 358794470, "step": 146 }, { "epoch": 0.15481832543443919, "grad_norm": 0.6623799204826355, "learning_rate": 0.00048386100932136614, "loss": 14.1852, "num_input_tokens_seen": 361060330, "step": 147 }, { "epoch": 0.15587151132174829, "grad_norm": 0.8017021417617798, "learning_rate": 0.00048315975746979797, "loss": 12.8818, "num_input_tokens_seen": 363503210, "step": 148 }, { "epoch": 0.1569246972090574, "grad_norm": 0.6893861889839172, "learning_rate": 0.0004824441232276507, "loss": 13.4499, "num_input_tokens_seen": 365979336, "step": 149 }, { "epoch": 0.1579778830963665, "grad_norm": 0.9454759955406189, "learning_rate": 0.0004817141507386153, "loss": 13.5658, "num_input_tokens_seen": 368459936, "step": 150 }, { "epoch": 0.1590310689836756, "grad_norm": 0.6542503237724304, "learning_rate": 0.0004809698850308334, "loss": 12.5393, "num_input_tokens_seen": 370972514, "step": 151 }, { "epoch": 0.16008425487098474, "grad_norm": 0.7843099236488342, "learning_rate": 0.0004802113720141196, "loss": 13.8876, "num_input_tokens_seen": 373538886, "step": 152 }, { "epoch": 0.16113744075829384, "grad_norm": 0.681400716304779, "learning_rate": 0.00047943865847712965, "loss": 12.6614, "num_input_tokens_seen": 375997348, "step": 153 }, { "epoch": 0.16219062664560294, "grad_norm": 0.6228338479995728, "learning_rate": 0.0004786517920844744, "loss": 14.8045, "num_input_tokens_seen": 378368452, "step": 154 }, { "epoch": 0.16324381253291206, "grad_norm": 0.696509599685669, "learning_rate": 0.00047785082137377936, "loss": 14.3078, "num_input_tokens_seen": 380908798, "step": 155 }, { "epoch": 0.16429699842022116, "grad_norm": 0.746185839176178, "learning_rate": 0.000477035795752691, "loss": 14.4622, "num_input_tokens_seen": 383363952, "step": 156 }, { "epoch": 0.1653501843075303, "grad_norm": 0.8552024364471436, "learning_rate": 0.0004762067654958286, "loss": 13.1467, "num_input_tokens_seen": 385820504, "step": 157 }, { "epoch": 0.1664033701948394, "grad_norm": 0.7352308034896851, "learning_rate": 0.0004753637817416835, "loss": 14.5263, "num_input_tokens_seen": 388280752, "step": 158 }, { "epoch": 0.1674565560821485, "grad_norm": 0.624199628829956, "learning_rate": 0.0004745068964894645, "loss": 14.1771, "num_input_tokens_seen": 390877896, "step": 159 }, { "epoch": 0.16850974196945762, "grad_norm": 0.7346677184104919, "learning_rate": 0.00047363616259589025, "loss": 14.1435, "num_input_tokens_seen": 393477216, "step": 160 }, { "epoch": 0.16956292785676672, "grad_norm": 0.500455915927887, "learning_rate": 0.00047275163377192886, "loss": 11.8482, "num_input_tokens_seen": 396021712, "step": 161 }, { "epoch": 0.17061611374407584, "grad_norm": 0.7637024521827698, "learning_rate": 0.0004718533645794847, "loss": 12.7018, "num_input_tokens_seen": 398528988, "step": 162 }, { "epoch": 0.17166929963138494, "grad_norm": 0.6212698221206665, "learning_rate": 0.0004709414104280326, "loss": 14.0043, "num_input_tokens_seen": 401033514, "step": 163 }, { "epoch": 0.17272248551869404, "grad_norm": 0.7096490859985352, "learning_rate": 0.00047001582757120054, "loss": 13.5022, "num_input_tokens_seen": 403501710, "step": 164 }, { "epoch": 0.17377567140600317, "grad_norm": 0.5144787430763245, "learning_rate": 0.00046907667310329887, "loss": 13.0917, "num_input_tokens_seen": 405903642, "step": 165 }, { "epoch": 0.17482885729331227, "grad_norm": 0.5807265043258667, "learning_rate": 0.0004681240049557991, "loss": 14.2594, "num_input_tokens_seen": 408463312, "step": 166 }, { "epoch": 0.17588204318062137, "grad_norm": 0.6672924160957336, "learning_rate": 0.00046715788189375995, "loss": 14.18, "num_input_tokens_seen": 410857116, "step": 167 }, { "epoch": 0.1769352290679305, "grad_norm": 0.5318564176559448, "learning_rate": 0.0004661783635122028, "loss": 12.7704, "num_input_tokens_seen": 413293324, "step": 168 }, { "epoch": 0.1779884149552396, "grad_norm": 0.794619083404541, "learning_rate": 0.0004651855102324352, "loss": 14.1618, "num_input_tokens_seen": 415708708, "step": 169 }, { "epoch": 0.17904160084254872, "grad_norm": 0.45533284544944763, "learning_rate": 0.0004641793832983245, "loss": 14.5507, "num_input_tokens_seen": 418090970, "step": 170 }, { "epoch": 0.18009478672985782, "grad_norm": 0.8352651000022888, "learning_rate": 0.0004631600447725189, "loss": 12.2149, "num_input_tokens_seen": 420536658, "step": 171 }, { "epoch": 0.18114797261716692, "grad_norm": 0.720396876335144, "learning_rate": 0.0004621275575326206, "loss": 14.3939, "num_input_tokens_seen": 423068800, "step": 172 }, { "epoch": 0.18220115850447605, "grad_norm": 0.6031028032302856, "learning_rate": 0.00046108198526730563, "loss": 14.285, "num_input_tokens_seen": 425683216, "step": 173 }, { "epoch": 0.18325434439178515, "grad_norm": 0.7864506840705872, "learning_rate": 0.0004600233924723966, "loss": 14.9537, "num_input_tokens_seen": 428019392, "step": 174 }, { "epoch": 0.18430753027909427, "grad_norm": 0.6037179827690125, "learning_rate": 0.0004589518444468836, "loss": 14.3359, "num_input_tokens_seen": 430450162, "step": 175 }, { "epoch": 0.18536071616640337, "grad_norm": 0.5774262547492981, "learning_rate": 0.000457867407288896, "loss": 14.7474, "num_input_tokens_seen": 432988892, "step": 176 }, { "epoch": 0.18641390205371247, "grad_norm": 0.5661394000053406, "learning_rate": 0.0004567701478916261, "loss": 12.7355, "num_input_tokens_seen": 435403104, "step": 177 }, { "epoch": 0.1874670879410216, "grad_norm": 0.48973754048347473, "learning_rate": 0.00045566013393920205, "loss": 14.4007, "num_input_tokens_seen": 437894208, "step": 178 }, { "epoch": 0.1885202738283307, "grad_norm": 0.6543258428573608, "learning_rate": 0.0004545374339025129, "loss": 13.388, "num_input_tokens_seen": 440361302, "step": 179 }, { "epoch": 0.1895734597156398, "grad_norm": 0.6435189247131348, "learning_rate": 0.0004534021170349856, "loss": 12.7891, "num_input_tokens_seen": 442772076, "step": 180 }, { "epoch": 0.19062664560294892, "grad_norm": 0.5193156599998474, "learning_rate": 0.000452254253368312, "loss": 13.4092, "num_input_tokens_seen": 445142352, "step": 181 }, { "epoch": 0.19167983149025802, "grad_norm": 0.6403075456619263, "learning_rate": 0.0004510939137081302, "loss": 12.8269, "num_input_tokens_seen": 447635106, "step": 182 }, { "epoch": 0.19273301737756715, "grad_norm": 0.5330055356025696, "learning_rate": 0.00044992116962965623, "loss": 13.845, "num_input_tokens_seen": 449969556, "step": 183 }, { "epoch": 0.19378620326487625, "grad_norm": 0.7812511920928955, "learning_rate": 0.00044873609347326866, "loss": 12.8747, "num_input_tokens_seen": 452367082, "step": 184 }, { "epoch": 0.19483938915218535, "grad_norm": 0.734370231628418, "learning_rate": 0.0004475387583400473, "loss": 13.4237, "num_input_tokens_seen": 454829434, "step": 185 }, { "epoch": 0.19589257503949448, "grad_norm": 0.4542326331138611, "learning_rate": 0.00044632923808726293, "loss": 13.5973, "num_input_tokens_seen": 457344632, "step": 186 }, { "epoch": 0.19694576092680358, "grad_norm": 0.630257248878479, "learning_rate": 0.0004451076073238223, "loss": 14.2557, "num_input_tokens_seen": 459728304, "step": 187 }, { "epoch": 0.1979989468141127, "grad_norm": 0.5169203281402588, "learning_rate": 0.0004438739414056651, "loss": 14.2164, "num_input_tokens_seen": 462371360, "step": 188 }, { "epoch": 0.1990521327014218, "grad_norm": 0.8144217729568481, "learning_rate": 0.0004426283164311162, "loss": 13.2628, "num_input_tokens_seen": 464826406, "step": 189 }, { "epoch": 0.2001053185887309, "grad_norm": 0.643693745136261, "learning_rate": 0.00044137080923619174, "loss": 12.6582, "num_input_tokens_seen": 467463708, "step": 190 }, { "epoch": 0.20115850447604003, "grad_norm": 0.7169672250747681, "learning_rate": 0.0004401014973898586, "loss": 13.8927, "num_input_tokens_seen": 469992746, "step": 191 }, { "epoch": 0.20221169036334913, "grad_norm": 0.5725452303886414, "learning_rate": 0.0004388204591892506, "loss": 12.8936, "num_input_tokens_seen": 472498546, "step": 192 }, { "epoch": 0.20326487625065823, "grad_norm": 0.885722279548645, "learning_rate": 0.00043752777365483816, "loss": 13.1909, "num_input_tokens_seen": 474859284, "step": 193 }, { "epoch": 0.20431806213796735, "grad_norm": 0.5468750596046448, "learning_rate": 0.0004362235205255541, "loss": 13.3254, "num_input_tokens_seen": 477239588, "step": 194 }, { "epoch": 0.20537124802527645, "grad_norm": 0.5252361297607422, "learning_rate": 0.0004349077802538751, "loss": 14.707, "num_input_tokens_seen": 479559702, "step": 195 }, { "epoch": 0.20642443391258558, "grad_norm": 0.5459883213043213, "learning_rate": 0.0004335806340008587, "loss": 12.9516, "num_input_tokens_seen": 482139806, "step": 196 }, { "epoch": 0.20747761979989468, "grad_norm": 0.6774821877479553, "learning_rate": 0.00043224216363113723, "loss": 13.8214, "num_input_tokens_seen": 484606370, "step": 197 }, { "epoch": 0.20853080568720378, "grad_norm": 0.6609899997711182, "learning_rate": 0.0004308924517078678, "loss": 13.5105, "num_input_tokens_seen": 487088894, "step": 198 }, { "epoch": 0.2095839915745129, "grad_norm": 0.6434972286224365, "learning_rate": 0.00042953158148763975, "loss": 14.704, "num_input_tokens_seen": 489502024, "step": 199 }, { "epoch": 0.210637177461822, "grad_norm": 0.530325710773468, "learning_rate": 0.0004281596369153384, "loss": 13.314, "num_input_tokens_seen": 491847958, "step": 200 }, { "epoch": 0.21169036334913113, "grad_norm": 0.5580898523330688, "learning_rate": 0.0004267767026189673, "loss": 13.3497, "num_input_tokens_seen": 494483540, "step": 201 }, { "epoch": 0.21274354923644023, "grad_norm": 0.7523868083953857, "learning_rate": 0.00042538286390442833, "loss": 13.4598, "num_input_tokens_seen": 496784334, "step": 202 }, { "epoch": 0.21379673512374933, "grad_norm": 0.6943124532699585, "learning_rate": 0.00042397820675025866, "loss": 14.4622, "num_input_tokens_seen": 499122570, "step": 203 }, { "epoch": 0.21484992101105846, "grad_norm": 0.703617513179779, "learning_rate": 0.0004225628178023283, "loss": 14.4754, "num_input_tokens_seen": 501601060, "step": 204 }, { "epoch": 0.21590310689836756, "grad_norm": 0.6258761286735535, "learning_rate": 0.00042113678436849454, "loss": 12.8945, "num_input_tokens_seen": 504016506, "step": 205 }, { "epoch": 0.21695629278567669, "grad_norm": 0.7714695334434509, "learning_rate": 0.0004197001944132168, "loss": 13.6326, "num_input_tokens_seen": 506604150, "step": 206 }, { "epoch": 0.21800947867298578, "grad_norm": 0.6725168228149414, "learning_rate": 0.0004182531365521305, "loss": 14.247, "num_input_tokens_seen": 509053728, "step": 207 }, { "epoch": 0.21906266456029488, "grad_norm": 0.767383337020874, "learning_rate": 0.0004167957000465808, "loss": 14.331, "num_input_tokens_seen": 511567478, "step": 208 }, { "epoch": 0.220115850447604, "grad_norm": 0.505607545375824, "learning_rate": 0.00041532797479811636, "loss": 12.4713, "num_input_tokens_seen": 514136400, "step": 209 }, { "epoch": 0.2211690363349131, "grad_norm": 0.5330973863601685, "learning_rate": 0.00041385005134294417, "loss": 13.1837, "num_input_tokens_seen": 516530356, "step": 210 }, { "epoch": 0.2222222222222222, "grad_norm": 0.9914455413818359, "learning_rate": 0.00041236202084634466, "loss": 14.6703, "num_input_tokens_seen": 519028524, "step": 211 }, { "epoch": 0.22327540810953134, "grad_norm": 0.6133384108543396, "learning_rate": 0.0004108639750970481, "loss": 14.6338, "num_input_tokens_seen": 521436334, "step": 212 }, { "epoch": 0.22432859399684044, "grad_norm": 0.5959540009498596, "learning_rate": 0.00040935600650157265, "loss": 12.1111, "num_input_tokens_seen": 524024802, "step": 213 }, { "epoch": 0.22538177988414956, "grad_norm": 0.45092490315437317, "learning_rate": 0.00040783820807852457, "loss": 13.4522, "num_input_tokens_seen": 526580534, "step": 214 }, { "epoch": 0.22643496577145866, "grad_norm": 0.5904210805892944, "learning_rate": 0.00040631067345285994, "loss": 13.2396, "num_input_tokens_seen": 529062886, "step": 215 }, { "epoch": 0.22748815165876776, "grad_norm": 0.46232736110687256, "learning_rate": 0.0004047734968501098, "loss": 13.8601, "num_input_tokens_seen": 531535650, "step": 216 }, { "epoch": 0.2285413375460769, "grad_norm": 0.617415189743042, "learning_rate": 0.0004032267730905678, "loss": 12.8285, "num_input_tokens_seen": 534061814, "step": 217 }, { "epoch": 0.229594523433386, "grad_norm": 0.7234218120574951, "learning_rate": 0.00040167059758344114, "loss": 15.3652, "num_input_tokens_seen": 536687100, "step": 218 }, { "epoch": 0.23064770932069512, "grad_norm": 0.8561363220214844, "learning_rate": 0.00040010506632096537, "loss": 12.8718, "num_input_tokens_seen": 539150044, "step": 219 }, { "epoch": 0.23170089520800422, "grad_norm": 0.7568314671516418, "learning_rate": 0.0003985302758724831, "loss": 12.4404, "num_input_tokens_seen": 541648322, "step": 220 }, { "epoch": 0.23275408109531331, "grad_norm": 0.6514911651611328, "learning_rate": 0.000396946323378487, "loss": 13.7264, "num_input_tokens_seen": 544168352, "step": 221 }, { "epoch": 0.23380726698262244, "grad_norm": 0.6163381338119507, "learning_rate": 0.0003953533065446281, "loss": 13.1166, "num_input_tokens_seen": 546741482, "step": 222 }, { "epoch": 0.23486045286993154, "grad_norm": 0.696421205997467, "learning_rate": 0.00039375132363568836, "loss": 13.3474, "num_input_tokens_seen": 549104356, "step": 223 }, { "epoch": 0.23591363875724064, "grad_norm": 0.6050801873207092, "learning_rate": 0.00039214047346951974, "loss": 14.6743, "num_input_tokens_seen": 551429474, "step": 224 }, { "epoch": 0.23696682464454977, "grad_norm": 0.6175264120101929, "learning_rate": 0.00039052085541094823, "loss": 13.7552, "num_input_tokens_seen": 553859868, "step": 225 }, { "epoch": 0.23802001053185887, "grad_norm": 0.5892110466957092, "learning_rate": 0.0003888925693656447, "loss": 11.5144, "num_input_tokens_seen": 556346556, "step": 226 }, { "epoch": 0.239073196419168, "grad_norm": 0.4362712502479553, "learning_rate": 0.00038725571577396254, "loss": 14.626, "num_input_tokens_seen": 558924540, "step": 227 }, { "epoch": 0.2401263823064771, "grad_norm": 0.6579407453536987, "learning_rate": 0.0003856103956047413, "loss": 13.2753, "num_input_tokens_seen": 561327442, "step": 228 }, { "epoch": 0.2411795681937862, "grad_norm": 0.7273943424224854, "learning_rate": 0.0003839567103490793, "loss": 13.0622, "num_input_tokens_seen": 563837514, "step": 229 }, { "epoch": 0.24223275408109532, "grad_norm": 0.4669589698314667, "learning_rate": 0.0003822947620140726, "loss": 13.8885, "num_input_tokens_seen": 566302658, "step": 230 }, { "epoch": 0.24328593996840442, "grad_norm": 0.5003737211227417, "learning_rate": 0.0003806246531165231, "loss": 12.9119, "num_input_tokens_seen": 568934814, "step": 231 }, { "epoch": 0.24433912585571355, "grad_norm": 0.6652275323867798, "learning_rate": 0.0003789464866766144, "loss": 12.8361, "num_input_tokens_seen": 571237934, "step": 232 }, { "epoch": 0.24539231174302265, "grad_norm": 0.6242262125015259, "learning_rate": 0.0003772603662115575, "loss": 15.0241, "num_input_tokens_seen": 573556058, "step": 233 }, { "epoch": 0.24644549763033174, "grad_norm": 0.48296427726745605, "learning_rate": 0.0003755663957292048, "loss": 13.4312, "num_input_tokens_seen": 575930124, "step": 234 }, { "epoch": 0.24749868351764087, "grad_norm": 0.6832424998283386, "learning_rate": 0.00037386467972163516, "loss": 14.7431, "num_input_tokens_seen": 578430022, "step": 235 }, { "epoch": 0.24855186940494997, "grad_norm": 0.5334222912788391, "learning_rate": 0.00037215532315870774, "loss": 14.7028, "num_input_tokens_seen": 580816748, "step": 236 }, { "epoch": 0.24960505529225907, "grad_norm": 0.7042843699455261, "learning_rate": 0.00037043843148158696, "loss": 12.5905, "num_input_tokens_seen": 583366398, "step": 237 }, { "epoch": 0.2506582411795682, "grad_norm": 0.7521989941596985, "learning_rate": 0.0003687141105962389, "loss": 12.383, "num_input_tokens_seen": 585825914, "step": 238 }, { "epoch": 0.2517114270668773, "grad_norm": 0.5999695658683777, "learning_rate": 0.000366982466866898, "loss": 12.7159, "num_input_tokens_seen": 588320922, "step": 239 }, { "epoch": 0.2527646129541864, "grad_norm": 0.5892810225486755, "learning_rate": 0.00036524360710950624, "loss": 15.1348, "num_input_tokens_seen": 590762520, "step": 240 }, { "epoch": 0.2538177988414955, "grad_norm": 0.5765227675437927, "learning_rate": 0.0003634976385851242, "loss": 12.3584, "num_input_tokens_seen": 593119346, "step": 241 }, { "epoch": 0.25487098472880465, "grad_norm": 0.4210558533668518, "learning_rate": 0.00036174466899331484, "loss": 13.7542, "num_input_tokens_seen": 595480902, "step": 242 }, { "epoch": 0.2559241706161137, "grad_norm": 0.6763030290603638, "learning_rate": 0.0003599848064654995, "loss": 13.2929, "num_input_tokens_seen": 597986692, "step": 243 }, { "epoch": 0.25697735650342285, "grad_norm": 0.7130559086799622, "learning_rate": 0.000358218159558289, "loss": 13.3078, "num_input_tokens_seen": 600640014, "step": 244 }, { "epoch": 0.258030542390732, "grad_norm": 0.4660380184650421, "learning_rate": 0.0003564448372467859, "loss": 14.6048, "num_input_tokens_seen": 602922284, "step": 245 }, { "epoch": 0.25908372827804105, "grad_norm": 0.44570890069007874, "learning_rate": 0.0003546649489178636, "loss": 14.8523, "num_input_tokens_seen": 605350038, "step": 246 }, { "epoch": 0.2601369141653502, "grad_norm": 0.6495239734649658, "learning_rate": 0.00035287860436341824, "loss": 15.1324, "num_input_tokens_seen": 607725012, "step": 247 }, { "epoch": 0.2611901000526593, "grad_norm": 0.5707351565361023, "learning_rate": 0.0003510859137735964, "loss": 15.3348, "num_input_tokens_seen": 610117954, "step": 248 }, { "epoch": 0.26224328593996843, "grad_norm": 0.4367353618144989, "learning_rate": 0.00034928698772999787, "loss": 13.6328, "num_input_tokens_seen": 612557846, "step": 249 }, { "epoch": 0.2632964718272775, "grad_norm": 0.6344426274299622, "learning_rate": 0.0003474819371988549, "loss": 12.4955, "num_input_tokens_seen": 614861376, "step": 250 }, { "epoch": 0.2643496577145866, "grad_norm": 0.6118125319480896, "learning_rate": 0.00034567087352418665, "loss": 14.4303, "num_input_tokens_seen": 617294518, "step": 251 }, { "epoch": 0.26540284360189575, "grad_norm": 0.7218714356422424, "learning_rate": 0.0003438539084209315, "loss": 14.22, "num_input_tokens_seen": 619635220, "step": 252 }, { "epoch": 0.2664560294892048, "grad_norm": 0.6843687891960144, "learning_rate": 0.0003420311539680557, "loss": 12.9144, "num_input_tokens_seen": 622075956, "step": 253 }, { "epoch": 0.26750921537651395, "grad_norm": 0.7561080455780029, "learning_rate": 0.00034020272260163977, "loss": 13.2422, "num_input_tokens_seen": 624607758, "step": 254 }, { "epoch": 0.2685624012638231, "grad_norm": 0.7663512229919434, "learning_rate": 0.0003383687271079432, "loss": 14.6787, "num_input_tokens_seen": 627172634, "step": 255 }, { "epoch": 0.26961558715113215, "grad_norm": 0.4203454554080963, "learning_rate": 0.0003365292806164468, "loss": 14.2549, "num_input_tokens_seen": 629690912, "step": 256 }, { "epoch": 0.2706687730384413, "grad_norm": 0.546712338924408, "learning_rate": 0.00033468449659287486, "loss": 14.3124, "num_input_tokens_seen": 632255410, "step": 257 }, { "epoch": 0.2717219589257504, "grad_norm": 0.7108402252197266, "learning_rate": 0.0003328344888321955, "loss": 13.3583, "num_input_tokens_seen": 634582544, "step": 258 }, { "epoch": 0.2727751448130595, "grad_norm": 0.6388810276985168, "learning_rate": 0.0003309793714516019, "loss": 13.9617, "num_input_tokens_seen": 637108470, "step": 259 }, { "epoch": 0.2738283307003686, "grad_norm": 0.5022799372673035, "learning_rate": 0.00032911925888347234, "loss": 13.8352, "num_input_tokens_seen": 639426634, "step": 260 }, { "epoch": 0.27488151658767773, "grad_norm": 0.5447593927383423, "learning_rate": 0.00032725426586831203, "loss": 15.0631, "num_input_tokens_seen": 641900154, "step": 261 }, { "epoch": 0.27593470247498686, "grad_norm": 0.5760501027107239, "learning_rate": 0.0003253845074476749, "loss": 13.7699, "num_input_tokens_seen": 644205144, "step": 262 }, { "epoch": 0.27698788836229593, "grad_norm": 0.5831496715545654, "learning_rate": 0.00032351009895706785, "loss": 14.5406, "num_input_tokens_seen": 646496932, "step": 263 }, { "epoch": 0.27804107424960506, "grad_norm": 0.5406723618507385, "learning_rate": 0.00032163115601883583, "loss": 12.7346, "num_input_tokens_seen": 649096904, "step": 264 }, { "epoch": 0.2790942601369142, "grad_norm": 0.49079757928848267, "learning_rate": 0.0003197477945350297, "loss": 14.2354, "num_input_tokens_seen": 651600900, "step": 265 }, { "epoch": 0.28014744602422326, "grad_norm": 0.6124666929244995, "learning_rate": 0.0003178601306802573, "loss": 14.5682, "num_input_tokens_seen": 654189842, "step": 266 }, { "epoch": 0.2812006319115324, "grad_norm": 0.5420570969581604, "learning_rate": 0.00031596828089451703, "loss": 14.2276, "num_input_tokens_seen": 656636650, "step": 267 }, { "epoch": 0.2822538177988415, "grad_norm": 0.8127923011779785, "learning_rate": 0.00031407236187601487, "loss": 11.0238, "num_input_tokens_seen": 659070838, "step": 268 }, { "epoch": 0.2833070036861506, "grad_norm": 0.6101799011230469, "learning_rate": 0.0003121724905739666, "loss": 12.8694, "num_input_tokens_seen": 661668122, "step": 269 }, { "epoch": 0.2843601895734597, "grad_norm": 0.5585333108901978, "learning_rate": 0.0003102687841813832, "loss": 12.3072, "num_input_tokens_seen": 664055676, "step": 270 }, { "epoch": 0.28541337546076884, "grad_norm": 0.5515812635421753, "learning_rate": 0.00030836136012784226, "loss": 14.4035, "num_input_tokens_seen": 666712162, "step": 271 }, { "epoch": 0.2864665613480779, "grad_norm": 0.873502254486084, "learning_rate": 0.00030645033607224425, "loss": 11.5386, "num_input_tokens_seen": 669145358, "step": 272 }, { "epoch": 0.28751974723538704, "grad_norm": 0.7929896712303162, "learning_rate": 0.0003045358298955546, "loss": 13.7234, "num_input_tokens_seen": 671496934, "step": 273 }, { "epoch": 0.28857293312269616, "grad_norm": 0.6259802579879761, "learning_rate": 0.0003026179596935324, "loss": 12.9467, "num_input_tokens_seen": 673956840, "step": 274 }, { "epoch": 0.2896261190100053, "grad_norm": 0.5827144980430603, "learning_rate": 0.00030069684376944573, "loss": 13.7525, "num_input_tokens_seen": 676522688, "step": 275 }, { "epoch": 0.29067930489731436, "grad_norm": 0.438773512840271, "learning_rate": 0.000298772600626774, "loss": 14.4876, "num_input_tokens_seen": 678987128, "step": 276 }, { "epoch": 0.2917324907846235, "grad_norm": 0.461264431476593, "learning_rate": 0.00029684534896189834, "loss": 13.9557, "num_input_tokens_seen": 681376426, "step": 277 }, { "epoch": 0.2927856766719326, "grad_norm": 0.6426395177841187, "learning_rate": 0.0002949152076567795, "loss": 13.2047, "num_input_tokens_seen": 683908570, "step": 278 }, { "epoch": 0.2938388625592417, "grad_norm": 0.6306378245353699, "learning_rate": 0.0002929822957716248, "loss": 14.3553, "num_input_tokens_seen": 686435460, "step": 279 }, { "epoch": 0.2948920484465508, "grad_norm": 0.7777087092399597, "learning_rate": 0.00029104673253754456, "loss": 14.459, "num_input_tokens_seen": 689001010, "step": 280 }, { "epoch": 0.29594523433385994, "grad_norm": 0.6122662425041199, "learning_rate": 0.00028910863734919615, "loss": 11.8841, "num_input_tokens_seen": 691555714, "step": 281 }, { "epoch": 0.296998420221169, "grad_norm": 0.5533509850502014, "learning_rate": 0.00028716812975741995, "loss": 13.8105, "num_input_tokens_seen": 694038390, "step": 282 }, { "epoch": 0.29805160610847814, "grad_norm": 0.4330850839614868, "learning_rate": 0.00028522532946186486, "loss": 15.3927, "num_input_tokens_seen": 696690606, "step": 283 }, { "epoch": 0.29910479199578727, "grad_norm": 0.42759576439857483, "learning_rate": 0.0002832803563036046, "loss": 14.1626, "num_input_tokens_seen": 698974368, "step": 284 }, { "epoch": 0.3001579778830964, "grad_norm": 0.603848934173584, "learning_rate": 0.00028133333025774524, "loss": 14.7466, "num_input_tokens_seen": 701514324, "step": 285 }, { "epoch": 0.30121116377040547, "grad_norm": 0.7019180655479431, "learning_rate": 0.0002793843714260245, "loss": 15.9288, "num_input_tokens_seen": 703856054, "step": 286 }, { "epoch": 0.3022643496577146, "grad_norm": 0.5272998213768005, "learning_rate": 0.0002774336000294035, "loss": 14.2158, "num_input_tokens_seen": 706380374, "step": 287 }, { "epoch": 0.3033175355450237, "grad_norm": 0.6808348298072815, "learning_rate": 0.0002754811364006511, "loss": 14.3927, "num_input_tokens_seen": 708733546, "step": 288 }, { "epoch": 0.3043707214323328, "grad_norm": 0.5963918566703796, "learning_rate": 0.0002735271009769208, "loss": 15.4471, "num_input_tokens_seen": 711304704, "step": 289 }, { "epoch": 0.3054239073196419, "grad_norm": 0.5460891127586365, "learning_rate": 0.00027157161429232173, "loss": 15.3405, "num_input_tokens_seen": 713915084, "step": 290 }, { "epoch": 0.30647709320695105, "grad_norm": 0.4156426787376404, "learning_rate": 0.00026961479697048385, "loss": 14.4655, "num_input_tokens_seen": 716304858, "step": 291 }, { "epoch": 0.3075302790942601, "grad_norm": 0.6879798173904419, "learning_rate": 0.00026765676971711704, "loss": 13.2176, "num_input_tokens_seen": 718782282, "step": 292 }, { "epoch": 0.30858346498156924, "grad_norm": 0.41598352789878845, "learning_rate": 0.00026569765331256536, "loss": 14.1726, "num_input_tokens_seen": 721256712, "step": 293 }, { "epoch": 0.30963665086887837, "grad_norm": 0.5724912285804749, "learning_rate": 0.000263737568604357, "loss": 13.6108, "num_input_tokens_seen": 723712914, "step": 294 }, { "epoch": 0.31068983675618744, "grad_norm": 0.6704716682434082, "learning_rate": 0.00026177663649974936, "loss": 13.6995, "num_input_tokens_seen": 726014718, "step": 295 }, { "epoch": 0.31174302264349657, "grad_norm": 0.6606157422065735, "learning_rate": 0.00025981497795827174, "loss": 13.7077, "num_input_tokens_seen": 728471242, "step": 296 }, { "epoch": 0.3127962085308057, "grad_norm": 0.5907959342002869, "learning_rate": 0.0002578527139842631, "loss": 13.3751, "num_input_tokens_seen": 730842942, "step": 297 }, { "epoch": 0.3138493944181148, "grad_norm": 0.710864782333374, "learning_rate": 0.00025588996561940846, "loss": 13.9066, "num_input_tokens_seen": 733445910, "step": 298 }, { "epoch": 0.3149025803054239, "grad_norm": 0.44538646936416626, "learning_rate": 0.0002539268539352723, "loss": 14.0321, "num_input_tokens_seen": 735908532, "step": 299 }, { "epoch": 0.315955766192733, "grad_norm": 0.5508079528808594, "learning_rate": 0.00025196350002583027, "loss": 13.4983, "num_input_tokens_seen": 738472692, "step": 300 } ], "logging_steps": 1.0, "max_steps": 500, "num_input_tokens_seen": 738472692, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.0242409621374894e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }