{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3163444639718805, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001054481546572935, "grad_norm": 0.9705461859703064, "learning_rate": 0.0, "loss": 10.9361, "num_input_tokens_seen": 2425848, "step": 1 }, { "epoch": 0.00210896309314587, "grad_norm": 1.3177989721298218, "learning_rate": 5e-06, "loss": 11.6216, "num_input_tokens_seen": 4956204, "step": 2 }, { "epoch": 0.003163444639718805, "grad_norm": 1.0460824966430664, "learning_rate": 1e-05, "loss": 10.5512, "num_input_tokens_seen": 7504132, "step": 3 }, { "epoch": 0.00421792618629174, "grad_norm": 0.892030656337738, "learning_rate": 1.5e-05, "loss": 10.6304, "num_input_tokens_seen": 9828964, "step": 4 }, { "epoch": 0.005272407732864675, "grad_norm": 1.3544925451278687, "learning_rate": 2e-05, "loss": 11.746, "num_input_tokens_seen": 12166228, "step": 5 }, { "epoch": 0.00632688927943761, "grad_norm": 0.9905428886413574, "learning_rate": 2.5e-05, "loss": 11.586, "num_input_tokens_seen": 14513162, "step": 6 }, { "epoch": 0.007381370826010545, "grad_norm": 1.1137465238571167, "learning_rate": 3e-05, "loss": 11.9056, "num_input_tokens_seen": 16999606, "step": 7 }, { "epoch": 0.00843585237258348, "grad_norm": 1.0565546751022339, "learning_rate": 3.5000000000000004e-05, "loss": 11.1544, "num_input_tokens_seen": 19495694, "step": 8 }, { "epoch": 0.009490333919156414, "grad_norm": 1.173275113105774, "learning_rate": 4e-05, "loss": 10.8971, "num_input_tokens_seen": 21885690, "step": 9 }, { "epoch": 0.01054481546572935, "grad_norm": 1.478582501411438, "learning_rate": 4.4999999999999996e-05, "loss": 11.2455, "num_input_tokens_seen": 24390702, "step": 10 }, { "epoch": 0.011599297012302284, "grad_norm": 0.9788720011711121, "learning_rate": 5e-05, "loss": 11.7283, "num_input_tokens_seen": 26861992, "step": 11 }, { "epoch": 0.01265377855887522, "grad_norm": 1.572351336479187, "learning_rate": 5.5e-05, "loss": 11.1688, "num_input_tokens_seen": 29193680, "step": 12 }, { "epoch": 0.013708260105448155, "grad_norm": 1.4559812545776367, "learning_rate": 6e-05, "loss": 11.6758, "num_input_tokens_seen": 31664596, "step": 13 }, { "epoch": 0.01476274165202109, "grad_norm": 0.9523009657859802, "learning_rate": 6.500000000000001e-05, "loss": 10.4219, "num_input_tokens_seen": 34036374, "step": 14 }, { "epoch": 0.015817223198594025, "grad_norm": 1.2957357168197632, "learning_rate": 7.000000000000001e-05, "loss": 11.3482, "num_input_tokens_seen": 36512920, "step": 15 }, { "epoch": 0.01687170474516696, "grad_norm": 0.900630533695221, "learning_rate": 7.5e-05, "loss": 11.6171, "num_input_tokens_seen": 38907186, "step": 16 }, { "epoch": 0.017926186291739893, "grad_norm": 0.9069404006004333, "learning_rate": 8e-05, "loss": 11.163, "num_input_tokens_seen": 41485262, "step": 17 }, { "epoch": 0.01898066783831283, "grad_norm": 1.526408314704895, "learning_rate": 8.5e-05, "loss": 11.8419, "num_input_tokens_seen": 43786674, "step": 18 }, { "epoch": 0.020035149384885764, "grad_norm": 1.1045119762420654, "learning_rate": 8.999999999999999e-05, "loss": 10.9038, "num_input_tokens_seen": 46102944, "step": 19 }, { "epoch": 0.0210896309314587, "grad_norm": 0.66255122423172, "learning_rate": 9.5e-05, "loss": 11.94, "num_input_tokens_seen": 48591602, "step": 20 }, { "epoch": 0.022144112478031636, "grad_norm": 1.4556896686553955, "learning_rate": 0.0001, "loss": 12.2157, "num_input_tokens_seen": 50987434, "step": 21 }, { "epoch": 0.023198594024604568, "grad_norm": 1.3036383390426636, "learning_rate": 0.000105, "loss": 8.8805, "num_input_tokens_seen": 53372194, "step": 22 }, { "epoch": 0.024253075571177504, "grad_norm": 1.1424001455307007, "learning_rate": 0.00011, "loss": 10.7333, "num_input_tokens_seen": 55826644, "step": 23 }, { "epoch": 0.02530755711775044, "grad_norm": 1.3209176063537598, "learning_rate": 0.000115, "loss": 11.7093, "num_input_tokens_seen": 58227112, "step": 24 }, { "epoch": 0.026362038664323375, "grad_norm": 0.9307802319526672, "learning_rate": 0.00012, "loss": 10.5914, "num_input_tokens_seen": 60837186, "step": 25 }, { "epoch": 0.02741652021089631, "grad_norm": 1.4010263681411743, "learning_rate": 0.000125, "loss": 8.3295, "num_input_tokens_seen": 63230980, "step": 26 }, { "epoch": 0.028471001757469243, "grad_norm": 1.1759122610092163, "learning_rate": 0.00013000000000000002, "loss": 10.2116, "num_input_tokens_seen": 65610708, "step": 27 }, { "epoch": 0.02952548330404218, "grad_norm": 0.9759851694107056, "learning_rate": 0.000135, "loss": 10.6781, "num_input_tokens_seen": 68080890, "step": 28 }, { "epoch": 0.030579964850615114, "grad_norm": 1.082846760749817, "learning_rate": 0.00014000000000000001, "loss": 12.8138, "num_input_tokens_seen": 70585882, "step": 29 }, { "epoch": 0.03163444639718805, "grad_norm": 1.0576307773590088, "learning_rate": 0.000145, "loss": 10.8181, "num_input_tokens_seen": 73104506, "step": 30 }, { "epoch": 0.032688927943760986, "grad_norm": 1.0077800750732422, "learning_rate": 0.00015, "loss": 11.2286, "num_input_tokens_seen": 75511970, "step": 31 }, { "epoch": 0.03374340949033392, "grad_norm": 0.8206114172935486, "learning_rate": 0.000155, "loss": 10.6955, "num_input_tokens_seen": 78038336, "step": 32 }, { "epoch": 0.03479789103690686, "grad_norm": 1.0786269903182983, "learning_rate": 0.00016, "loss": 10.8638, "num_input_tokens_seen": 80369394, "step": 33 }, { "epoch": 0.035852372583479786, "grad_norm": 1.1006481647491455, "learning_rate": 0.000165, "loss": 10.5262, "num_input_tokens_seen": 82753120, "step": 34 }, { "epoch": 0.03690685413005272, "grad_norm": 0.8803530335426331, "learning_rate": 0.00017, "loss": 10.4982, "num_input_tokens_seen": 85056206, "step": 35 }, { "epoch": 0.03796133567662566, "grad_norm": 1.2676767110824585, "learning_rate": 0.000175, "loss": 12.4488, "num_input_tokens_seen": 87639648, "step": 36 }, { "epoch": 0.03901581722319859, "grad_norm": 0.9760981202125549, "learning_rate": 0.00017999999999999998, "loss": 11.5831, "num_input_tokens_seen": 89969532, "step": 37 }, { "epoch": 0.04007029876977153, "grad_norm": 1.011448860168457, "learning_rate": 0.000185, "loss": 11.4546, "num_input_tokens_seen": 92389948, "step": 38 }, { "epoch": 0.041124780316344464, "grad_norm": 0.9081957340240479, "learning_rate": 0.00019, "loss": 12.0174, "num_input_tokens_seen": 94893746, "step": 39 }, { "epoch": 0.0421792618629174, "grad_norm": 1.0000590085983276, "learning_rate": 0.00019500000000000002, "loss": 11.0123, "num_input_tokens_seen": 97286502, "step": 40 }, { "epoch": 0.043233743409490336, "grad_norm": 0.9923156499862671, "learning_rate": 0.0002, "loss": 12.0964, "num_input_tokens_seen": 99689044, "step": 41 }, { "epoch": 0.04428822495606327, "grad_norm": 0.9752357006072998, "learning_rate": 0.000205, "loss": 10.3987, "num_input_tokens_seen": 102250738, "step": 42 }, { "epoch": 0.04534270650263621, "grad_norm": 1.0063071250915527, "learning_rate": 0.00021, "loss": 12.1977, "num_input_tokens_seen": 104683216, "step": 43 }, { "epoch": 0.046397188049209136, "grad_norm": 0.9825400710105896, "learning_rate": 0.000215, "loss": 11.8863, "num_input_tokens_seen": 107261470, "step": 44 }, { "epoch": 0.04745166959578207, "grad_norm": 0.8624069094657898, "learning_rate": 0.00022, "loss": 11.4568, "num_input_tokens_seen": 109842334, "step": 45 }, { "epoch": 0.04850615114235501, "grad_norm": 0.989672064781189, "learning_rate": 0.00022500000000000002, "loss": 11.1933, "num_input_tokens_seen": 112206494, "step": 46 }, { "epoch": 0.04956063268892794, "grad_norm": 0.8774502873420715, "learning_rate": 0.00023, "loss": 10.3116, "num_input_tokens_seen": 114594098, "step": 47 }, { "epoch": 0.05061511423550088, "grad_norm": 1.1998200416564941, "learning_rate": 0.000235, "loss": 11.3885, "num_input_tokens_seen": 116970602, "step": 48 }, { "epoch": 0.051669595782073814, "grad_norm": 1.6917927265167236, "learning_rate": 0.00024, "loss": 11.2694, "num_input_tokens_seen": 119606816, "step": 49 }, { "epoch": 0.05272407732864675, "grad_norm": 1.5441681146621704, "learning_rate": 0.000245, "loss": 12.4193, "num_input_tokens_seen": 121995176, "step": 50 }, { "epoch": 0.053778558875219686, "grad_norm": 1.8777674436569214, "learning_rate": 0.00025, "loss": 11.4997, "num_input_tokens_seen": 124419224, "step": 51 }, { "epoch": 0.05483304042179262, "grad_norm": 1.384661316871643, "learning_rate": 0.000255, "loss": 10.7444, "num_input_tokens_seen": 126831556, "step": 52 }, { "epoch": 0.05588752196836556, "grad_norm": 1.0499628782272339, "learning_rate": 0.00026000000000000003, "loss": 9.8281, "num_input_tokens_seen": 129215044, "step": 53 }, { "epoch": 0.056942003514938486, "grad_norm": 1.7948285341262817, "learning_rate": 0.00026500000000000004, "loss": 11.318, "num_input_tokens_seen": 131708150, "step": 54 }, { "epoch": 0.05799648506151142, "grad_norm": 1.69430410861969, "learning_rate": 0.00027, "loss": 10.4812, "num_input_tokens_seen": 134191794, "step": 55 }, { "epoch": 0.05905096660808436, "grad_norm": 1.4970661401748657, "learning_rate": 0.000275, "loss": 10.7448, "num_input_tokens_seen": 136613856, "step": 56 }, { "epoch": 0.06010544815465729, "grad_norm": 1.2830063104629517, "learning_rate": 0.00028000000000000003, "loss": 10.8374, "num_input_tokens_seen": 139052346, "step": 57 }, { "epoch": 0.06115992970123023, "grad_norm": 1.44180428981781, "learning_rate": 0.000285, "loss": 12.5583, "num_input_tokens_seen": 141458664, "step": 58 }, { "epoch": 0.062214411247803164, "grad_norm": 1.5622317790985107, "learning_rate": 0.00029, "loss": 11.1813, "num_input_tokens_seen": 143985770, "step": 59 }, { "epoch": 0.0632688927943761, "grad_norm": 1.9193415641784668, "learning_rate": 0.000295, "loss": 11.025, "num_input_tokens_seen": 146472938, "step": 60 }, { "epoch": 0.06432337434094904, "grad_norm": 2.170022964477539, "learning_rate": 0.0003, "loss": 12.4049, "num_input_tokens_seen": 148907744, "step": 61 }, { "epoch": 0.06537785588752197, "grad_norm": 1.3234895467758179, "learning_rate": 0.000305, "loss": 11.8529, "num_input_tokens_seen": 151448294, "step": 62 }, { "epoch": 0.0664323374340949, "grad_norm": 1.6685116291046143, "learning_rate": 0.00031, "loss": 12.0368, "num_input_tokens_seen": 153917584, "step": 63 }, { "epoch": 0.06748681898066784, "grad_norm": 1.2088539600372314, "learning_rate": 0.000315, "loss": 10.435, "num_input_tokens_seen": 156591012, "step": 64 }, { "epoch": 0.06854130052724078, "grad_norm": 1.2755296230316162, "learning_rate": 0.00032, "loss": 10.8798, "num_input_tokens_seen": 158975418, "step": 65 }, { "epoch": 0.06959578207381371, "grad_norm": 1.654634714126587, "learning_rate": 0.00032500000000000004, "loss": 11.7874, "num_input_tokens_seen": 161420368, "step": 66 }, { "epoch": 0.07065026362038665, "grad_norm": 1.1874513626098633, "learning_rate": 0.00033, "loss": 11.6777, "num_input_tokens_seen": 163923386, "step": 67 }, { "epoch": 0.07170474516695957, "grad_norm": 1.0489180088043213, "learning_rate": 0.000335, "loss": 9.9053, "num_input_tokens_seen": 166570580, "step": 68 }, { "epoch": 0.07275922671353251, "grad_norm": 1.1512370109558105, "learning_rate": 0.00034, "loss": 11.4144, "num_input_tokens_seen": 169085496, "step": 69 }, { "epoch": 0.07381370826010544, "grad_norm": 1.5113091468811035, "learning_rate": 0.000345, "loss": 11.5717, "num_input_tokens_seen": 171510744, "step": 70 }, { "epoch": 0.07486818980667838, "grad_norm": 1.0373127460479736, "learning_rate": 0.00035, "loss": 11.2621, "num_input_tokens_seen": 173889198, "step": 71 }, { "epoch": 0.07592267135325131, "grad_norm": 1.2473938465118408, "learning_rate": 0.000355, "loss": 11.2106, "num_input_tokens_seen": 176486654, "step": 72 }, { "epoch": 0.07697715289982425, "grad_norm": 1.4872411489486694, "learning_rate": 0.00035999999999999997, "loss": 11.5685, "num_input_tokens_seen": 178783436, "step": 73 }, { "epoch": 0.07803163444639719, "grad_norm": 1.2539381980895996, "learning_rate": 0.000365, "loss": 11.9506, "num_input_tokens_seen": 181065054, "step": 74 }, { "epoch": 0.07908611599297012, "grad_norm": 1.0619122982025146, "learning_rate": 0.00037, "loss": 11.5489, "num_input_tokens_seen": 183491460, "step": 75 }, { "epoch": 0.08014059753954306, "grad_norm": 1.1141507625579834, "learning_rate": 0.000375, "loss": 10.4755, "num_input_tokens_seen": 185871790, "step": 76 }, { "epoch": 0.08119507908611599, "grad_norm": 0.8605804443359375, "learning_rate": 0.00038, "loss": 11.0816, "num_input_tokens_seen": 188255230, "step": 77 }, { "epoch": 0.08224956063268893, "grad_norm": 1.2247560024261475, "learning_rate": 0.00038500000000000003, "loss": 11.6941, "num_input_tokens_seen": 190823556, "step": 78 }, { "epoch": 0.08330404217926186, "grad_norm": 0.9986952543258667, "learning_rate": 0.00039000000000000005, "loss": 11.6883, "num_input_tokens_seen": 193241694, "step": 79 }, { "epoch": 0.0843585237258348, "grad_norm": 0.9467179179191589, "learning_rate": 0.000395, "loss": 10.8918, "num_input_tokens_seen": 195610918, "step": 80 }, { "epoch": 0.08541300527240774, "grad_norm": 1.0464240312576294, "learning_rate": 0.0004, "loss": 10.553, "num_input_tokens_seen": 197988346, "step": 81 }, { "epoch": 0.08646748681898067, "grad_norm": 1.1668314933776855, "learning_rate": 0.00040500000000000003, "loss": 11.474, "num_input_tokens_seen": 200427190, "step": 82 }, { "epoch": 0.0875219683655536, "grad_norm": 1.034874439239502, "learning_rate": 0.00041, "loss": 10.7481, "num_input_tokens_seen": 202809982, "step": 83 }, { "epoch": 0.08857644991212654, "grad_norm": 0.8593769669532776, "learning_rate": 0.000415, "loss": 9.1805, "num_input_tokens_seen": 205157580, "step": 84 }, { "epoch": 0.08963093145869948, "grad_norm": 0.8074848651885986, "learning_rate": 0.00042, "loss": 10.5358, "num_input_tokens_seen": 207687796, "step": 85 }, { "epoch": 0.09068541300527241, "grad_norm": 0.9401149153709412, "learning_rate": 0.000425, "loss": 11.6198, "num_input_tokens_seen": 209997718, "step": 86 }, { "epoch": 0.09173989455184535, "grad_norm": 1.0918290615081787, "learning_rate": 0.00043, "loss": 12.5703, "num_input_tokens_seen": 212407844, "step": 87 }, { "epoch": 0.09279437609841827, "grad_norm": 0.9897942543029785, "learning_rate": 0.000435, "loss": 11.042, "num_input_tokens_seen": 215008632, "step": 88 }, { "epoch": 0.09384885764499121, "grad_norm": 0.9593794345855713, "learning_rate": 0.00044, "loss": 10.6658, "num_input_tokens_seen": 217393470, "step": 89 }, { "epoch": 0.09490333919156414, "grad_norm": 0.9671006202697754, "learning_rate": 0.00044500000000000003, "loss": 11.7276, "num_input_tokens_seen": 219847368, "step": 90 }, { "epoch": 0.09595782073813708, "grad_norm": 0.9617960453033447, "learning_rate": 0.00045000000000000004, "loss": 12.1468, "num_input_tokens_seen": 222195586, "step": 91 }, { "epoch": 0.09701230228471001, "grad_norm": 1.0989540815353394, "learning_rate": 0.000455, "loss": 10.5941, "num_input_tokens_seen": 224689946, "step": 92 }, { "epoch": 0.09806678383128295, "grad_norm": 1.1951866149902344, "learning_rate": 0.00046, "loss": 11.0436, "num_input_tokens_seen": 227147082, "step": 93 }, { "epoch": 0.09912126537785589, "grad_norm": 1.192893385887146, "learning_rate": 0.000465, "loss": 11.3061, "num_input_tokens_seen": 229639860, "step": 94 }, { "epoch": 0.10017574692442882, "grad_norm": 1.1032167673110962, "learning_rate": 0.00047, "loss": 10.3604, "num_input_tokens_seen": 232075288, "step": 95 }, { "epoch": 0.10123022847100176, "grad_norm": 1.0772745609283447, "learning_rate": 0.000475, "loss": 11.8123, "num_input_tokens_seen": 234546596, "step": 96 }, { "epoch": 0.10228471001757469, "grad_norm": 1.0267037153244019, "learning_rate": 0.00048, "loss": 11.0106, "num_input_tokens_seen": 237127696, "step": 97 }, { "epoch": 0.10333919156414763, "grad_norm": 0.9997051358222961, "learning_rate": 0.00048499999999999997, "loss": 10.5487, "num_input_tokens_seen": 239595852, "step": 98 }, { "epoch": 0.10439367311072056, "grad_norm": 0.9370649456977844, "learning_rate": 0.00049, "loss": 10.3265, "num_input_tokens_seen": 242124460, "step": 99 }, { "epoch": 0.1054481546572935, "grad_norm": 0.7946972846984863, "learning_rate": 0.000495, "loss": 11.656, "num_input_tokens_seen": 244549536, "step": 100 }, { "epoch": 0.10650263620386644, "grad_norm": 0.8751388192176819, "learning_rate": 0.0005, "loss": 11.3816, "num_input_tokens_seen": 246899676, "step": 101 }, { "epoch": 0.10755711775043937, "grad_norm": 0.7762444019317627, "learning_rate": 0.0004999922894119685, "loss": 11.0355, "num_input_tokens_seen": 249263792, "step": 102 }, { "epoch": 0.1086115992970123, "grad_norm": 1.1343305110931396, "learning_rate": 0.0004999691581234994, "loss": 11.2074, "num_input_tokens_seen": 251764084, "step": 103 }, { "epoch": 0.10966608084358524, "grad_norm": 0.778600811958313, "learning_rate": 0.0004999306075614394, "loss": 8.9238, "num_input_tokens_seen": 254190814, "step": 104 }, { "epoch": 0.11072056239015818, "grad_norm": 0.8278611898422241, "learning_rate": 0.0004998766401037688, "loss": 10.2973, "num_input_tokens_seen": 256625058, "step": 105 }, { "epoch": 0.11177504393673111, "grad_norm": 0.9260748028755188, "learning_rate": 0.0004998072590794548, "loss": 11.447, "num_input_tokens_seen": 259008258, "step": 106 }, { "epoch": 0.11282952548330404, "grad_norm": 0.857938289642334, "learning_rate": 0.0004997224687682457, "loss": 10.8006, "num_input_tokens_seen": 261428048, "step": 107 }, { "epoch": 0.11388400702987697, "grad_norm": 0.8201888799667358, "learning_rate": 0.000499622274400407, "loss": 10.6493, "num_input_tokens_seen": 263817432, "step": 108 }, { "epoch": 0.11493848857644991, "grad_norm": 0.6717099547386169, "learning_rate": 0.0004995066821563998, "loss": 10.8537, "num_input_tokens_seen": 266228540, "step": 109 }, { "epoch": 0.11599297012302284, "grad_norm": 0.6603596210479736, "learning_rate": 0.0004993756991664976, "loss": 10.5697, "num_input_tokens_seen": 268491920, "step": 110 }, { "epoch": 0.11704745166959578, "grad_norm": 0.6117770671844482, "learning_rate": 0.0004992293335103487, "loss": 10.4003, "num_input_tokens_seen": 271003672, "step": 111 }, { "epoch": 0.11810193321616871, "grad_norm": 0.6692019701004028, "learning_rate": 0.0004990675942164759, "loss": 10.7826, "num_input_tokens_seen": 273280584, "step": 112 }, { "epoch": 0.11915641476274165, "grad_norm": 0.6404725909233093, "learning_rate": 0.0004988904912617209, "loss": 10.7471, "num_input_tokens_seen": 275653910, "step": 113 }, { "epoch": 0.12021089630931459, "grad_norm": 0.671772301197052, "learning_rate": 0.000498698035570628, "loss": 10.8945, "num_input_tokens_seen": 278191980, "step": 114 }, { "epoch": 0.12126537785588752, "grad_norm": 0.762337863445282, "learning_rate": 0.0004984902390147711, "loss": 10.5803, "num_input_tokens_seen": 280577286, "step": 115 }, { "epoch": 0.12231985940246046, "grad_norm": 0.5650742053985596, "learning_rate": 0.0004982671144120202, "loss": 11.7586, "num_input_tokens_seen": 283062546, "step": 116 }, { "epoch": 0.12337434094903339, "grad_norm": 0.49756231904029846, "learning_rate": 0.000498028675525752, "loss": 11.7438, "num_input_tokens_seen": 285359532, "step": 117 }, { "epoch": 0.12442882249560633, "grad_norm": 0.5226776599884033, "learning_rate": 0.000497774937064, "loss": 10.7112, "num_input_tokens_seen": 287755300, "step": 118 }, { "epoch": 0.12548330404217925, "grad_norm": 0.633590817451477, "learning_rate": 0.0004975059146785479, "loss": 10.0741, "num_input_tokens_seen": 290138596, "step": 119 }, { "epoch": 0.1265377855887522, "grad_norm": 0.4148622155189514, "learning_rate": 0.0004972216249639638, "loss": 10.797, "num_input_tokens_seen": 292503656, "step": 120 }, { "epoch": 0.12759226713532512, "grad_norm": 0.5795189738273621, "learning_rate": 0.000496922085456576, "loss": 10.1871, "num_input_tokens_seen": 294918832, "step": 121 }, { "epoch": 0.12864674868189807, "grad_norm": 0.6654810309410095, "learning_rate": 0.0004966073146333924, "loss": 11.5674, "num_input_tokens_seen": 297328590, "step": 122 }, { "epoch": 0.129701230228471, "grad_norm": 0.33947330713272095, "learning_rate": 0.0004962773319109604, "loss": 10.8928, "num_input_tokens_seen": 299705308, "step": 123 }, { "epoch": 0.13075571177504394, "grad_norm": 0.5353716611862183, "learning_rate": 0.0004959321576441683, "loss": 10.4202, "num_input_tokens_seen": 302143056, "step": 124 }, { "epoch": 0.13181019332161686, "grad_norm": 0.48882856965065, "learning_rate": 0.0004955718131249909, "loss": 10.4133, "num_input_tokens_seen": 304570734, "step": 125 }, { "epoch": 0.1328646748681898, "grad_norm": 0.455727219581604, "learning_rate": 0.0004951963205811756, "loss": 10.4053, "num_input_tokens_seen": 306918736, "step": 126 }, { "epoch": 0.13391915641476274, "grad_norm": 0.4142698049545288, "learning_rate": 0.0004948057031748712, "loss": 12.2709, "num_input_tokens_seen": 309461984, "step": 127 }, { "epoch": 0.13497363796133569, "grad_norm": 0.5193336606025696, "learning_rate": 0.0004943999850011993, "loss": 9.9356, "num_input_tokens_seen": 311925540, "step": 128 }, { "epoch": 0.1360281195079086, "grad_norm": 0.34628814458847046, "learning_rate": 0.0004939791910867678, "loss": 11.9624, "num_input_tokens_seen": 314443604, "step": 129 }, { "epoch": 0.13708260105448156, "grad_norm": 0.34039705991744995, "learning_rate": 0.0004935433473881276, "loss": 11.998, "num_input_tokens_seen": 316817696, "step": 130 }, { "epoch": 0.13813708260105448, "grad_norm": 0.31744757294654846, "learning_rate": 0.0004930924807901711, "loss": 10.7824, "num_input_tokens_seen": 319389504, "step": 131 }, { "epoch": 0.13919156414762743, "grad_norm": 0.4334086775779724, "learning_rate": 0.0004926266191044738, "loss": 10.9087, "num_input_tokens_seen": 321932538, "step": 132 }, { "epoch": 0.14024604569420035, "grad_norm": 0.39024171233177185, "learning_rate": 0.0004921457910675788, "loss": 11.258, "num_input_tokens_seen": 324411930, "step": 133 }, { "epoch": 0.1413005272407733, "grad_norm": 0.309071809053421, "learning_rate": 0.0004916500263392243, "loss": 10.8821, "num_input_tokens_seen": 326731560, "step": 134 }, { "epoch": 0.14235500878734622, "grad_norm": 0.51357102394104, "learning_rate": 0.000491139355500514, "loss": 10.8771, "num_input_tokens_seen": 329243246, "step": 135 }, { "epoch": 0.14340949033391914, "grad_norm": 0.4383300840854645, "learning_rate": 0.0004906138100520309, "loss": 10.6312, "num_input_tokens_seen": 331831630, "step": 136 }, { "epoch": 0.1444639718804921, "grad_norm": 0.33850258588790894, "learning_rate": 0.0004900734224118936, "loss": 10.5268, "num_input_tokens_seen": 334302218, "step": 137 }, { "epoch": 0.14551845342706501, "grad_norm": 0.5955417156219482, "learning_rate": 0.0004895182259137573, "loss": 11.8226, "num_input_tokens_seen": 336913326, "step": 138 }, { "epoch": 0.14657293497363796, "grad_norm": 0.4914705753326416, "learning_rate": 0.0004889482548047572, "loss": 12.5447, "num_input_tokens_seen": 339358906, "step": 139 }, { "epoch": 0.14762741652021089, "grad_norm": 0.27514171600341797, "learning_rate": 0.0004883635442433959, "loss": 10.3052, "num_input_tokens_seen": 341933644, "step": 140 }, { "epoch": 0.14868189806678384, "grad_norm": 0.35384228825569153, "learning_rate": 0.0004877641302973755, "loss": 10.873, "num_input_tokens_seen": 344324576, "step": 141 }, { "epoch": 0.14973637961335676, "grad_norm": 0.3017413914203644, "learning_rate": 0.00048715004994137124, "loss": 9.9987, "num_input_tokens_seen": 346908972, "step": 142 }, { "epoch": 0.1507908611599297, "grad_norm": 0.4270491898059845, "learning_rate": 0.0004865213410547524, "loss": 10.9375, "num_input_tokens_seen": 349386838, "step": 143 }, { "epoch": 0.15184534270650263, "grad_norm": 0.30478644371032715, "learning_rate": 0.0004858780424192443, "loss": 11.7221, "num_input_tokens_seen": 351755336, "step": 144 }, { "epoch": 0.15289982425307558, "grad_norm": 0.23904182016849518, "learning_rate": 0.0004852201937165372, "loss": 11.2525, "num_input_tokens_seen": 354150620, "step": 145 }, { "epoch": 0.1539543057996485, "grad_norm": 0.5767377018928528, "learning_rate": 0.0004845478355258377, "loss": 10.2205, "num_input_tokens_seen": 356645140, "step": 146 }, { "epoch": 0.15500878734622145, "grad_norm": 0.2709570527076721, "learning_rate": 0.00048386100932136614, "loss": 11.4829, "num_input_tokens_seen": 358928648, "step": 147 }, { "epoch": 0.15606326889279437, "grad_norm": 0.29028162360191345, "learning_rate": 0.00048315975746979797, "loss": 11.9542, "num_input_tokens_seen": 361553144, "step": 148 }, { "epoch": 0.15711775043936732, "grad_norm": 0.4260713458061218, "learning_rate": 0.0004824441232276507, "loss": 11.3444, "num_input_tokens_seen": 364124878, "step": 149 }, { "epoch": 0.15817223198594024, "grad_norm": 0.3330203890800476, "learning_rate": 0.0004817141507386153, "loss": 11.2711, "num_input_tokens_seen": 366451036, "step": 150 }, { "epoch": 0.1592267135325132, "grad_norm": 0.41251805424690247, "learning_rate": 0.0004809698850308334, "loss": 10.778, "num_input_tokens_seen": 368799018, "step": 151 }, { "epoch": 0.16028119507908611, "grad_norm": 0.31036192178726196, "learning_rate": 0.0004802113720141196, "loss": 10.9472, "num_input_tokens_seen": 371286512, "step": 152 }, { "epoch": 0.16133567662565906, "grad_norm": 0.36658334732055664, "learning_rate": 0.00047943865847712965, "loss": 10.1918, "num_input_tokens_seen": 373790542, "step": 153 }, { "epoch": 0.16239015817223199, "grad_norm": 0.16490666568279266, "learning_rate": 0.0004786517920844744, "loss": 11.6581, "num_input_tokens_seen": 376164786, "step": 154 }, { "epoch": 0.1634446397188049, "grad_norm": 0.2791113257408142, "learning_rate": 0.00047785082137377936, "loss": 11.235, "num_input_tokens_seen": 378668934, "step": 155 }, { "epoch": 0.16449912126537786, "grad_norm": 0.6701633930206299, "learning_rate": 0.000477035795752691, "loss": 10.7173, "num_input_tokens_seen": 380982868, "step": 156 }, { "epoch": 0.16555360281195078, "grad_norm": 0.20896555483341217, "learning_rate": 0.0004762067654958286, "loss": 11.2034, "num_input_tokens_seen": 383412848, "step": 157 }, { "epoch": 0.16660808435852373, "grad_norm": 0.33096417784690857, "learning_rate": 0.0004753637817416835, "loss": 11.0934, "num_input_tokens_seen": 385890530, "step": 158 }, { "epoch": 0.16766256590509665, "grad_norm": 0.3027982711791992, "learning_rate": 0.0004745068964894645, "loss": 10.5818, "num_input_tokens_seen": 388331186, "step": 159 }, { "epoch": 0.1687170474516696, "grad_norm": 0.49811211228370667, "learning_rate": 0.00047363616259589025, "loss": 10.2307, "num_input_tokens_seen": 390722478, "step": 160 }, { "epoch": 0.16977152899824252, "grad_norm": 0.2739993929862976, "learning_rate": 0.00047275163377192886, "loss": 10.2033, "num_input_tokens_seen": 393128528, "step": 161 }, { "epoch": 0.17082601054481547, "grad_norm": 0.2883780598640442, "learning_rate": 0.0004718533645794847, "loss": 12.0863, "num_input_tokens_seen": 395710636, "step": 162 }, { "epoch": 0.1718804920913884, "grad_norm": 0.2624651789665222, "learning_rate": 0.0004709414104280326, "loss": 11.3383, "num_input_tokens_seen": 398179608, "step": 163 }, { "epoch": 0.17293497363796134, "grad_norm": 0.47886499762535095, "learning_rate": 0.00047001582757120054, "loss": 10.8533, "num_input_tokens_seen": 400581072, "step": 164 }, { "epoch": 0.17398945518453426, "grad_norm": 0.2257954329252243, "learning_rate": 0.00046907667310329887, "loss": 10.798, "num_input_tokens_seen": 403057566, "step": 165 }, { "epoch": 0.1750439367311072, "grad_norm": 0.4420064091682434, "learning_rate": 0.0004681240049557991, "loss": 10.8315, "num_input_tokens_seen": 405479554, "step": 166 }, { "epoch": 0.17609841827768014, "grad_norm": 0.26745620369911194, "learning_rate": 0.00046715788189375995, "loss": 11.2626, "num_input_tokens_seen": 407921128, "step": 167 }, { "epoch": 0.17715289982425309, "grad_norm": 0.4664440453052521, "learning_rate": 0.0004661783635122028, "loss": 11.0442, "num_input_tokens_seen": 410550558, "step": 168 }, { "epoch": 0.178207381370826, "grad_norm": 0.28467220067977905, "learning_rate": 0.0004651855102324352, "loss": 11.3473, "num_input_tokens_seen": 412953064, "step": 169 }, { "epoch": 0.17926186291739896, "grad_norm": 0.49820107221603394, "learning_rate": 0.0004641793832983245, "loss": 9.2909, "num_input_tokens_seen": 415501368, "step": 170 }, { "epoch": 0.18031634446397188, "grad_norm": 0.3134786784648895, "learning_rate": 0.0004631600447725189, "loss": 11.3416, "num_input_tokens_seen": 417838436, "step": 171 }, { "epoch": 0.18137082601054483, "grad_norm": 0.24203424155712128, "learning_rate": 0.0004621275575326206, "loss": 10.9296, "num_input_tokens_seen": 420236388, "step": 172 }, { "epoch": 0.18242530755711775, "grad_norm": 0.3917903006076813, "learning_rate": 0.00046108198526730563, "loss": 11.122, "num_input_tokens_seen": 422659540, "step": 173 }, { "epoch": 0.1834797891036907, "grad_norm": 0.3565535545349121, "learning_rate": 0.0004600233924723966, "loss": 10.1823, "num_input_tokens_seen": 425004762, "step": 174 }, { "epoch": 0.18453427065026362, "grad_norm": 0.5487532019615173, "learning_rate": 0.0004589518444468836, "loss": 10.6171, "num_input_tokens_seen": 427453028, "step": 175 }, { "epoch": 0.18558875219683654, "grad_norm": 0.37913718819618225, "learning_rate": 0.000457867407288896, "loss": 11.9287, "num_input_tokens_seen": 429911724, "step": 176 }, { "epoch": 0.1866432337434095, "grad_norm": 0.32560214400291443, "learning_rate": 0.0004567701478916261, "loss": 11.9459, "num_input_tokens_seen": 432286308, "step": 177 }, { "epoch": 0.18769771528998241, "grad_norm": 0.48925817012786865, "learning_rate": 0.00045566013393920205, "loss": 11.7383, "num_input_tokens_seen": 434795636, "step": 178 }, { "epoch": 0.18875219683655536, "grad_norm": 0.2835240662097931, "learning_rate": 0.0004545374339025129, "loss": 11.8017, "num_input_tokens_seen": 437315386, "step": 179 }, { "epoch": 0.18980667838312829, "grad_norm": 0.28459757566452026, "learning_rate": 0.0004534021170349856, "loss": 10.8091, "num_input_tokens_seen": 439795046, "step": 180 }, { "epoch": 0.19086115992970124, "grad_norm": 0.32745423913002014, "learning_rate": 0.000452254253368312, "loss": 10.8863, "num_input_tokens_seen": 442280862, "step": 181 }, { "epoch": 0.19191564147627416, "grad_norm": 0.31233006715774536, "learning_rate": 0.0004510939137081302, "loss": 11.2497, "num_input_tokens_seen": 444878730, "step": 182 }, { "epoch": 0.1929701230228471, "grad_norm": 0.4110264480113983, "learning_rate": 0.00044992116962965623, "loss": 11.158, "num_input_tokens_seen": 447229276, "step": 183 }, { "epoch": 0.19402460456942003, "grad_norm": 0.18000631034374237, "learning_rate": 0.00044873609347326866, "loss": 11.9775, "num_input_tokens_seen": 449598746, "step": 184 }, { "epoch": 0.19507908611599298, "grad_norm": 0.4664939045906067, "learning_rate": 0.0004475387583400473, "loss": 9.4072, "num_input_tokens_seen": 452111232, "step": 185 }, { "epoch": 0.1961335676625659, "grad_norm": 0.5280163884162903, "learning_rate": 0.00044632923808726293, "loss": 11.1004, "num_input_tokens_seen": 454483758, "step": 186 }, { "epoch": 0.19718804920913885, "grad_norm": 0.2916633188724518, "learning_rate": 0.0004451076073238223, "loss": 11.4804, "num_input_tokens_seen": 456962194, "step": 187 }, { "epoch": 0.19824253075571177, "grad_norm": 0.335133820772171, "learning_rate": 0.0004438739414056651, "loss": 10.874, "num_input_tokens_seen": 459290608, "step": 188 }, { "epoch": 0.19929701230228472, "grad_norm": 0.2993161082267761, "learning_rate": 0.0004426283164311162, "loss": 12.3625, "num_input_tokens_seen": 461679768, "step": 189 }, { "epoch": 0.20035149384885764, "grad_norm": 0.36241772770881653, "learning_rate": 0.00044137080923619174, "loss": 10.7626, "num_input_tokens_seen": 464264660, "step": 190 }, { "epoch": 0.2014059753954306, "grad_norm": 0.36657917499542236, "learning_rate": 0.0004401014973898586, "loss": 10.042, "num_input_tokens_seen": 466692112, "step": 191 }, { "epoch": 0.2024604569420035, "grad_norm": 0.25538718700408936, "learning_rate": 0.0004388204591892506, "loss": 9.8264, "num_input_tokens_seen": 469183024, "step": 192 }, { "epoch": 0.20351493848857646, "grad_norm": 0.35221487283706665, "learning_rate": 0.00043752777365483816, "loss": 11.481, "num_input_tokens_seen": 471674424, "step": 193 }, { "epoch": 0.20456942003514939, "grad_norm": 0.2864941358566284, "learning_rate": 0.0004362235205255541, "loss": 10.06, "num_input_tokens_seen": 473839978, "step": 194 }, { "epoch": 0.2056239015817223, "grad_norm": 0.3318755030632019, "learning_rate": 0.0004349077802538751, "loss": 10.7263, "num_input_tokens_seen": 476332186, "step": 195 }, { "epoch": 0.20667838312829526, "grad_norm": 0.4062202274799347, "learning_rate": 0.0004335806340008587, "loss": 11.1529, "num_input_tokens_seen": 478785370, "step": 196 }, { "epoch": 0.20773286467486818, "grad_norm": 0.31146612763404846, "learning_rate": 0.00043224216363113723, "loss": 11.5957, "num_input_tokens_seen": 481219648, "step": 197 }, { "epoch": 0.20878734622144113, "grad_norm": 0.3751303255558014, "learning_rate": 0.0004308924517078678, "loss": 11.1931, "num_input_tokens_seen": 483719790, "step": 198 }, { "epoch": 0.20984182776801405, "grad_norm": 0.33134013414382935, "learning_rate": 0.00042953158148763975, "loss": 11.3433, "num_input_tokens_seen": 486326140, "step": 199 }, { "epoch": 0.210896309314587, "grad_norm": 0.2671400308609009, "learning_rate": 0.0004281596369153384, "loss": 10.9545, "num_input_tokens_seen": 488789092, "step": 200 }, { "epoch": 0.21195079086115992, "grad_norm": 0.3682190775871277, "learning_rate": 0.0004267767026189673, "loss": 12.1108, "num_input_tokens_seen": 491188440, "step": 201 }, { "epoch": 0.21300527240773287, "grad_norm": 0.2916874587535858, "learning_rate": 0.00042538286390442833, "loss": 10.8266, "num_input_tokens_seen": 493543466, "step": 202 }, { "epoch": 0.2140597539543058, "grad_norm": 0.4030936658382416, "learning_rate": 0.00042397820675025866, "loss": 10.67, "num_input_tokens_seen": 496149784, "step": 203 }, { "epoch": 0.21511423550087874, "grad_norm": 0.4171995520591736, "learning_rate": 0.0004225628178023283, "loss": 10.5863, "num_input_tokens_seen": 498838520, "step": 204 }, { "epoch": 0.21616871704745166, "grad_norm": 0.41781219840049744, "learning_rate": 0.00042113678436849454, "loss": 10.5018, "num_input_tokens_seen": 501399136, "step": 205 }, { "epoch": 0.2172231985940246, "grad_norm": 0.2998122572898865, "learning_rate": 0.0004197001944132168, "loss": 11.2194, "num_input_tokens_seen": 503776514, "step": 206 }, { "epoch": 0.21827768014059754, "grad_norm": 0.28352978825569153, "learning_rate": 0.0004182531365521305, "loss": 11.5346, "num_input_tokens_seen": 506130888, "step": 207 }, { "epoch": 0.21933216168717048, "grad_norm": 0.20737877488136292, "learning_rate": 0.0004167957000465808, "loss": 11.2783, "num_input_tokens_seen": 508565728, "step": 208 }, { "epoch": 0.2203866432337434, "grad_norm": 0.4097883403301239, "learning_rate": 0.00041532797479811636, "loss": 11.4614, "num_input_tokens_seen": 511010390, "step": 209 }, { "epoch": 0.22144112478031636, "grad_norm": 0.34904664754867554, "learning_rate": 0.00041385005134294417, "loss": 10.7346, "num_input_tokens_seen": 513483182, "step": 210 }, { "epoch": 0.22249560632688928, "grad_norm": 0.31537023186683655, "learning_rate": 0.00041236202084634466, "loss": 11.2394, "num_input_tokens_seen": 515869144, "step": 211 }, { "epoch": 0.22355008787346223, "grad_norm": 0.3396138846874237, "learning_rate": 0.0004108639750970481, "loss": 10.4913, "num_input_tokens_seen": 518419596, "step": 212 }, { "epoch": 0.22460456942003515, "grad_norm": 0.2921527028083801, "learning_rate": 0.00040935600650157265, "loss": 11.1316, "num_input_tokens_seen": 520915978, "step": 213 }, { "epoch": 0.22565905096660807, "grad_norm": 0.23728570342063904, "learning_rate": 0.00040783820807852457, "loss": 10.8167, "num_input_tokens_seen": 523288654, "step": 214 }, { "epoch": 0.22671353251318102, "grad_norm": 0.21741224825382233, "learning_rate": 0.00040631067345285994, "loss": 10.3892, "num_input_tokens_seen": 525939782, "step": 215 }, { "epoch": 0.22776801405975394, "grad_norm": 0.23055513203144073, "learning_rate": 0.0004047734968501098, "loss": 10.4061, "num_input_tokens_seen": 528213108, "step": 216 }, { "epoch": 0.2288224956063269, "grad_norm": 0.30807727575302124, "learning_rate": 0.0004032267730905678, "loss": 10.9689, "num_input_tokens_seen": 530690074, "step": 217 }, { "epoch": 0.22987697715289981, "grad_norm": 0.3024272620677948, "learning_rate": 0.00040167059758344114, "loss": 9.7481, "num_input_tokens_seen": 533032048, "step": 218 }, { "epoch": 0.23093145869947276, "grad_norm": 0.31856945157051086, "learning_rate": 0.00040010506632096537, "loss": 11.5326, "num_input_tokens_seen": 535625558, "step": 219 }, { "epoch": 0.23198594024604569, "grad_norm": 0.2052072137594223, "learning_rate": 0.0003985302758724831, "loss": 10.9529, "num_input_tokens_seen": 538342358, "step": 220 }, { "epoch": 0.23304042179261863, "grad_norm": 0.22243520617485046, "learning_rate": 0.000396946323378487, "loss": 12.1192, "num_input_tokens_seen": 540629126, "step": 221 }, { "epoch": 0.23409490333919156, "grad_norm": 0.43042072653770447, "learning_rate": 0.0003953533065446281, "loss": 11.7515, "num_input_tokens_seen": 543221254, "step": 222 }, { "epoch": 0.2351493848857645, "grad_norm": 0.33945250511169434, "learning_rate": 0.00039375132363568836, "loss": 10.8633, "num_input_tokens_seen": 545592750, "step": 223 }, { "epoch": 0.23620386643233743, "grad_norm": 0.2054436206817627, "learning_rate": 0.00039214047346951974, "loss": 11.2144, "num_input_tokens_seen": 547974264, "step": 224 }, { "epoch": 0.23725834797891038, "grad_norm": 0.30000683665275574, "learning_rate": 0.00039052085541094823, "loss": 11.2938, "num_input_tokens_seen": 550455212, "step": 225 }, { "epoch": 0.2383128295254833, "grad_norm": 0.2809171676635742, "learning_rate": 0.0003888925693656447, "loss": 11.0639, "num_input_tokens_seen": 552934874, "step": 226 }, { "epoch": 0.23936731107205625, "grad_norm": 0.2859019339084625, "learning_rate": 0.00038725571577396254, "loss": 12.5436, "num_input_tokens_seen": 555415658, "step": 227 }, { "epoch": 0.24042179261862917, "grad_norm": 0.30741429328918457, "learning_rate": 0.0003856103956047413, "loss": 11.2325, "num_input_tokens_seen": 557910548, "step": 228 }, { "epoch": 0.24147627416520212, "grad_norm": 0.358772873878479, "learning_rate": 0.0003839567103490793, "loss": 10.0116, "num_input_tokens_seen": 560400606, "step": 229 }, { "epoch": 0.24253075571177504, "grad_norm": 0.3477535843849182, "learning_rate": 0.0003822947620140726, "loss": 10.1542, "num_input_tokens_seen": 562814866, "step": 230 }, { "epoch": 0.243585237258348, "grad_norm": 0.3360324800014496, "learning_rate": 0.0003806246531165231, "loss": 12.0984, "num_input_tokens_seen": 565068902, "step": 231 }, { "epoch": 0.2446397188049209, "grad_norm": 0.3064538538455963, "learning_rate": 0.0003789464866766144, "loss": 10.6158, "num_input_tokens_seen": 567518434, "step": 232 }, { "epoch": 0.24569420035149384, "grad_norm": 0.23387250304222107, "learning_rate": 0.0003772603662115575, "loss": 11.0439, "num_input_tokens_seen": 569850658, "step": 233 }, { "epoch": 0.24674868189806679, "grad_norm": 0.3160035312175751, "learning_rate": 0.0003755663957292048, "loss": 9.4107, "num_input_tokens_seen": 572340312, "step": 234 }, { "epoch": 0.2478031634446397, "grad_norm": 0.20865343511104584, "learning_rate": 0.00037386467972163516, "loss": 10.5183, "num_input_tokens_seen": 574771196, "step": 235 }, { "epoch": 0.24885764499121266, "grad_norm": 0.35180842876434326, "learning_rate": 0.00037215532315870774, "loss": 10.9234, "num_input_tokens_seen": 577234360, "step": 236 }, { "epoch": 0.24991212653778558, "grad_norm": 0.3082864284515381, "learning_rate": 0.00037043843148158696, "loss": 10.4472, "num_input_tokens_seen": 579677372, "step": 237 }, { "epoch": 0.2509666080843585, "grad_norm": 0.4446444511413574, "learning_rate": 0.0003687141105962389, "loss": 11.721, "num_input_tokens_seen": 582233734, "step": 238 }, { "epoch": 0.25202108963093145, "grad_norm": 0.3163931965827942, "learning_rate": 0.000366982466866898, "loss": 12.3157, "num_input_tokens_seen": 584774048, "step": 239 }, { "epoch": 0.2530755711775044, "grad_norm": 0.3964653015136719, "learning_rate": 0.00036524360710950624, "loss": 10.9618, "num_input_tokens_seen": 587165168, "step": 240 }, { "epoch": 0.25413005272407735, "grad_norm": 0.3358474671840668, "learning_rate": 0.0003634976385851242, "loss": 9.4739, "num_input_tokens_seen": 589531588, "step": 241 }, { "epoch": 0.25518453427065024, "grad_norm": 0.2007070630788803, "learning_rate": 0.00036174466899331484, "loss": 10.169, "num_input_tokens_seen": 591851982, "step": 242 }, { "epoch": 0.2562390158172232, "grad_norm": 0.3300761580467224, "learning_rate": 0.0003599848064654995, "loss": 9.6157, "num_input_tokens_seen": 594293620, "step": 243 }, { "epoch": 0.25729349736379614, "grad_norm": 0.33818671107292175, "learning_rate": 0.000358218159558289, "loss": 11.107, "num_input_tokens_seen": 596810748, "step": 244 }, { "epoch": 0.2583479789103691, "grad_norm": 0.3691726326942444, "learning_rate": 0.0003564448372467859, "loss": 11.1006, "num_input_tokens_seen": 599497050, "step": 245 }, { "epoch": 0.259402460456942, "grad_norm": 0.23022472858428955, "learning_rate": 0.0003546649489178636, "loss": 11.5004, "num_input_tokens_seen": 601846484, "step": 246 }, { "epoch": 0.26045694200351494, "grad_norm": 0.2847031354904175, "learning_rate": 0.00035287860436341824, "loss": 10.0628, "num_input_tokens_seen": 604339142, "step": 247 }, { "epoch": 0.2615114235500879, "grad_norm": 0.33991608023643494, "learning_rate": 0.0003510859137735964, "loss": 11.1039, "num_input_tokens_seen": 606890192, "step": 248 }, { "epoch": 0.26256590509666083, "grad_norm": 0.3108097314834595, "learning_rate": 0.00034928698772999787, "loss": 11.3088, "num_input_tokens_seen": 609336404, "step": 249 }, { "epoch": 0.26362038664323373, "grad_norm": 0.3134739100933075, "learning_rate": 0.0003474819371988549, "loss": 10.7229, "num_input_tokens_seen": 611867684, "step": 250 }, { "epoch": 0.2646748681898067, "grad_norm": 0.22635768353939056, "learning_rate": 0.00034567087352418665, "loss": 9.5928, "num_input_tokens_seen": 614297542, "step": 251 }, { "epoch": 0.2657293497363796, "grad_norm": 0.3080601692199707, "learning_rate": 0.0003438539084209315, "loss": 8.6058, "num_input_tokens_seen": 616909048, "step": 252 }, { "epoch": 0.2667838312829525, "grad_norm": 0.21621114015579224, "learning_rate": 0.0003420311539680557, "loss": 10.6293, "num_input_tokens_seen": 619209032, "step": 253 }, { "epoch": 0.26783831282952547, "grad_norm": 0.2749236226081848, "learning_rate": 0.00034020272260163977, "loss": 10.3178, "num_input_tokens_seen": 621491468, "step": 254 }, { "epoch": 0.2688927943760984, "grad_norm": 0.39277732372283936, "learning_rate": 0.0003383687271079432, "loss": 11.1954, "num_input_tokens_seen": 623952608, "step": 255 }, { "epoch": 0.26994727592267137, "grad_norm": 0.22389429807662964, "learning_rate": 0.0003365292806164468, "loss": 10.1923, "num_input_tokens_seen": 626508340, "step": 256 }, { "epoch": 0.27100175746924426, "grad_norm": 0.3458981513977051, "learning_rate": 0.00033468449659287486, "loss": 11.4261, "num_input_tokens_seen": 628924044, "step": 257 }, { "epoch": 0.2720562390158172, "grad_norm": 0.39842402935028076, "learning_rate": 0.0003328344888321955, "loss": 11.8093, "num_input_tokens_seen": 631424656, "step": 258 }, { "epoch": 0.27311072056239016, "grad_norm": 0.1962989717721939, "learning_rate": 0.0003309793714516019, "loss": 10.161, "num_input_tokens_seen": 634003722, "step": 259 }, { "epoch": 0.2741652021089631, "grad_norm": 0.3088131844997406, "learning_rate": 0.00032911925888347234, "loss": 9.1669, "num_input_tokens_seen": 636266230, "step": 260 }, { "epoch": 0.275219683655536, "grad_norm": 0.3336693346500397, "learning_rate": 0.00032725426586831203, "loss": 11.0952, "num_input_tokens_seen": 638937762, "step": 261 }, { "epoch": 0.27627416520210896, "grad_norm": 0.2500801384449005, "learning_rate": 0.0003253845074476749, "loss": 10.6864, "num_input_tokens_seen": 641424216, "step": 262 }, { "epoch": 0.2773286467486819, "grad_norm": 0.25769153237342834, "learning_rate": 0.00032351009895706785, "loss": 11.0083, "num_input_tokens_seen": 643924398, "step": 263 }, { "epoch": 0.27838312829525486, "grad_norm": 0.3037450909614563, "learning_rate": 0.00032163115601883583, "loss": 10.4356, "num_input_tokens_seen": 646442606, "step": 264 }, { "epoch": 0.27943760984182775, "grad_norm": 0.3082483410835266, "learning_rate": 0.0003197477945350297, "loss": 10.755, "num_input_tokens_seen": 648807396, "step": 265 }, { "epoch": 0.2804920913884007, "grad_norm": 0.3269599378108978, "learning_rate": 0.0003178601306802573, "loss": 10.9678, "num_input_tokens_seen": 651255882, "step": 266 }, { "epoch": 0.28154657293497365, "grad_norm": 0.2472166121006012, "learning_rate": 0.00031596828089451703, "loss": 10.9866, "num_input_tokens_seen": 653500404, "step": 267 }, { "epoch": 0.2826010544815466, "grad_norm": 0.2659846842288971, "learning_rate": 0.00031407236187601487, "loss": 11.7376, "num_input_tokens_seen": 656044100, "step": 268 }, { "epoch": 0.2836555360281195, "grad_norm": 0.3505820631980896, "learning_rate": 0.0003121724905739666, "loss": 9.977, "num_input_tokens_seen": 658675374, "step": 269 }, { "epoch": 0.28471001757469244, "grad_norm": 0.1809883415699005, "learning_rate": 0.0003102687841813832, "loss": 11.2172, "num_input_tokens_seen": 661142718, "step": 270 }, { "epoch": 0.2857644991212654, "grad_norm": 0.22445712983608246, "learning_rate": 0.00030836136012784226, "loss": 10.346, "num_input_tokens_seen": 663510590, "step": 271 }, { "epoch": 0.2868189806678383, "grad_norm": 0.3175901472568512, "learning_rate": 0.00030645033607224425, "loss": 11.4312, "num_input_tokens_seen": 666115430, "step": 272 }, { "epoch": 0.28787346221441124, "grad_norm": 0.2390243262052536, "learning_rate": 0.0003045358298955546, "loss": 10.7897, "num_input_tokens_seen": 668556982, "step": 273 }, { "epoch": 0.2889279437609842, "grad_norm": 0.3173353374004364, "learning_rate": 0.0003026179596935324, "loss": 10.044, "num_input_tokens_seen": 671126456, "step": 274 }, { "epoch": 0.28998242530755713, "grad_norm": 0.24416379630565643, "learning_rate": 0.00030069684376944573, "loss": 11.0523, "num_input_tokens_seen": 673578434, "step": 275 }, { "epoch": 0.29103690685413003, "grad_norm": 0.3447067439556122, "learning_rate": 0.000298772600626774, "loss": 10.9912, "num_input_tokens_seen": 675988320, "step": 276 }, { "epoch": 0.292091388400703, "grad_norm": 0.26416268944740295, "learning_rate": 0.00029684534896189834, "loss": 11.2947, "num_input_tokens_seen": 678558982, "step": 277 }, { "epoch": 0.2931458699472759, "grad_norm": 0.20507994294166565, "learning_rate": 0.0002949152076567795, "loss": 10.4179, "num_input_tokens_seen": 681075438, "step": 278 }, { "epoch": 0.2942003514938489, "grad_norm": 0.2848092019557953, "learning_rate": 0.0002929822957716248, "loss": 12.0503, "num_input_tokens_seen": 683571248, "step": 279 }, { "epoch": 0.29525483304042177, "grad_norm": 0.28016605973243713, "learning_rate": 0.00029104673253754456, "loss": 10.6642, "num_input_tokens_seen": 685872324, "step": 280 }, { "epoch": 0.2963093145869947, "grad_norm": 0.1812390387058258, "learning_rate": 0.00028910863734919615, "loss": 11.5015, "num_input_tokens_seen": 688269678, "step": 281 }, { "epoch": 0.29736379613356767, "grad_norm": 0.3147426247596741, "learning_rate": 0.00028716812975741995, "loss": 9.6078, "num_input_tokens_seen": 690789842, "step": 282 }, { "epoch": 0.2984182776801406, "grad_norm": 0.24371254444122314, "learning_rate": 0.00028522532946186486, "loss": 10.7675, "num_input_tokens_seen": 693146222, "step": 283 }, { "epoch": 0.2994727592267135, "grad_norm": 0.3114820122718811, "learning_rate": 0.0002832803563036046, "loss": 10.806, "num_input_tokens_seen": 695527010, "step": 284 }, { "epoch": 0.30052724077328646, "grad_norm": 0.19095203280448914, "learning_rate": 0.00028133333025774524, "loss": 10.8544, "num_input_tokens_seen": 697890080, "step": 285 }, { "epoch": 0.3015817223198594, "grad_norm": 0.2632463574409485, "learning_rate": 0.0002793843714260245, "loss": 9.8436, "num_input_tokens_seen": 700327360, "step": 286 }, { "epoch": 0.30263620386643236, "grad_norm": 0.2614726722240448, "learning_rate": 0.0002774336000294035, "loss": 11.9291, "num_input_tokens_seen": 702777350, "step": 287 }, { "epoch": 0.30369068541300526, "grad_norm": 0.21014803647994995, "learning_rate": 0.0002754811364006511, "loss": 11.455, "num_input_tokens_seen": 705184528, "step": 288 }, { "epoch": 0.3047451669595782, "grad_norm": 0.31345221400260925, "learning_rate": 0.0002735271009769208, "loss": 9.8038, "num_input_tokens_seen": 707551254, "step": 289 }, { "epoch": 0.30579964850615116, "grad_norm": 0.2455073893070221, "learning_rate": 0.00027157161429232173, "loss": 10.5692, "num_input_tokens_seen": 709824754, "step": 290 }, { "epoch": 0.30685413005272405, "grad_norm": 0.2611768841743469, "learning_rate": 0.00026961479697048385, "loss": 10.3353, "num_input_tokens_seen": 712236092, "step": 291 }, { "epoch": 0.307908611599297, "grad_norm": 0.22272686660289764, "learning_rate": 0.00026765676971711704, "loss": 11.5529, "num_input_tokens_seen": 714627792, "step": 292 }, { "epoch": 0.30896309314586995, "grad_norm": 0.3416326940059662, "learning_rate": 0.00026569765331256536, "loss": 11.2797, "num_input_tokens_seen": 716881324, "step": 293 }, { "epoch": 0.3100175746924429, "grad_norm": 0.20943358540534973, "learning_rate": 0.000263737568604357, "loss": 11.5516, "num_input_tokens_seen": 719423592, "step": 294 }, { "epoch": 0.3110720562390158, "grad_norm": 0.3233245313167572, "learning_rate": 0.00026177663649974936, "loss": 10.0471, "num_input_tokens_seen": 721913756, "step": 295 }, { "epoch": 0.31212653778558874, "grad_norm": 0.3134172260761261, "learning_rate": 0.00025981497795827174, "loss": 9.5378, "num_input_tokens_seen": 724446930, "step": 296 }, { "epoch": 0.3131810193321617, "grad_norm": 0.32523971796035767, "learning_rate": 0.0002578527139842631, "loss": 10.4038, "num_input_tokens_seen": 726932626, "step": 297 }, { "epoch": 0.31423550087873464, "grad_norm": 0.2783624827861786, "learning_rate": 0.00025588996561940846, "loss": 10.9209, "num_input_tokens_seen": 729377678, "step": 298 }, { "epoch": 0.31528998242530754, "grad_norm": 0.2915477752685547, "learning_rate": 0.0002539268539352723, "loss": 10.0453, "num_input_tokens_seen": 731835522, "step": 299 }, { "epoch": 0.3163444639718805, "grad_norm": 0.40257516503334045, "learning_rate": 0.00025196350002583027, "loss": 11.8635, "num_input_tokens_seen": 734212212, "step": 300 } ], "logging_steps": 1.0, "max_steps": 500, "num_input_tokens_seen": 734212212, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.14219679856119e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }