{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5262234695667427, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010524469391334855, "grad_norm": 1.2313014268875122, "learning_rate": 0.0, "loss": 14.7817, "num_input_tokens_seen": 2552598, "step": 1 }, { "epoch": 0.002104893878266971, "grad_norm": 1.1914018392562866, "learning_rate": 5e-06, "loss": 14.4504, "num_input_tokens_seen": 4819720, "step": 2 }, { "epoch": 0.003157340817400456, "grad_norm": 1.1657367944717407, "learning_rate": 1e-05, "loss": 12.5957, "num_input_tokens_seen": 7271614, "step": 3 }, { "epoch": 0.004209787756533942, "grad_norm": 1.1560744047164917, "learning_rate": 1.5e-05, "loss": 14.7908, "num_input_tokens_seen": 9877976, "step": 4 }, { "epoch": 0.005262234695667427, "grad_norm": 1.2366737127304077, "learning_rate": 2e-05, "loss": 12.9764, "num_input_tokens_seen": 12272960, "step": 5 }, { "epoch": 0.006314681634800912, "grad_norm": 1.171399712562561, "learning_rate": 2.5e-05, "loss": 14.0837, "num_input_tokens_seen": 14951530, "step": 6 }, { "epoch": 0.007367128573934397, "grad_norm": 0.8171337246894836, "learning_rate": 3e-05, "loss": 14.778, "num_input_tokens_seen": 17510004, "step": 7 }, { "epoch": 0.008419575513067884, "grad_norm": 0.8718011379241943, "learning_rate": 3.5000000000000004e-05, "loss": 14.1793, "num_input_tokens_seen": 20115996, "step": 8 }, { "epoch": 0.009472022452201368, "grad_norm": 1.0820726156234741, "learning_rate": 4e-05, "loss": 13.0845, "num_input_tokens_seen": 22532130, "step": 9 }, { "epoch": 0.010524469391334853, "grad_norm": 1.090196967124939, "learning_rate": 4.4999999999999996e-05, "loss": 14.0881, "num_input_tokens_seen": 24773342, "step": 10 }, { "epoch": 0.011576916330468338, "grad_norm": 0.9762588739395142, "learning_rate": 5e-05, "loss": 14.3957, "num_input_tokens_seen": 27134854, "step": 11 }, { "epoch": 0.012629363269601825, "grad_norm": 0.7875831723213196, "learning_rate": 5.5e-05, "loss": 15.4542, "num_input_tokens_seen": 29750242, "step": 12 }, { "epoch": 0.01368181020873531, "grad_norm": 1.1389998197555542, "learning_rate": 6e-05, "loss": 13.3294, "num_input_tokens_seen": 32126134, "step": 13 }, { "epoch": 0.014734257147868794, "grad_norm": 1.3320008516311646, "learning_rate": 6.500000000000001e-05, "loss": 12.2129, "num_input_tokens_seen": 34693314, "step": 14 }, { "epoch": 0.01578670408700228, "grad_norm": 1.0273903608322144, "learning_rate": 7.000000000000001e-05, "loss": 13.5101, "num_input_tokens_seen": 37253804, "step": 15 }, { "epoch": 0.016839151026135767, "grad_norm": 1.6416386365890503, "learning_rate": 7.5e-05, "loss": 13.2418, "num_input_tokens_seen": 39587136, "step": 16 }, { "epoch": 0.017891597965269252, "grad_norm": 0.7137455940246582, "learning_rate": 8e-05, "loss": 15.9393, "num_input_tokens_seen": 42078598, "step": 17 }, { "epoch": 0.018944044904402737, "grad_norm": 1.0483299493789673, "learning_rate": 8.5e-05, "loss": 14.7757, "num_input_tokens_seen": 44371870, "step": 18 }, { "epoch": 0.01999649184353622, "grad_norm": 0.7091999650001526, "learning_rate": 8.999999999999999e-05, "loss": 14.3012, "num_input_tokens_seen": 47024158, "step": 19 }, { "epoch": 0.021048938782669706, "grad_norm": 1.0973995923995972, "learning_rate": 9.5e-05, "loss": 14.1114, "num_input_tokens_seen": 49484266, "step": 20 }, { "epoch": 0.02210138572180319, "grad_norm": 1.3034206628799438, "learning_rate": 0.0001, "loss": 13.0443, "num_input_tokens_seen": 52013086, "step": 21 }, { "epoch": 0.023153832660936676, "grad_norm": 1.0242785215377808, "learning_rate": 0.000105, "loss": 15.104, "num_input_tokens_seen": 54555770, "step": 22 }, { "epoch": 0.024206279600070164, "grad_norm": 1.0558710098266602, "learning_rate": 0.00011, "loss": 15.583, "num_input_tokens_seen": 56959612, "step": 23 }, { "epoch": 0.02525872653920365, "grad_norm": 0.9309564828872681, "learning_rate": 0.000115, "loss": 14.0038, "num_input_tokens_seen": 59526070, "step": 24 }, { "epoch": 0.026311173478337134, "grad_norm": 1.0949842929840088, "learning_rate": 0.00012, "loss": 15.0757, "num_input_tokens_seen": 61950134, "step": 25 }, { "epoch": 0.02736362041747062, "grad_norm": 1.0121939182281494, "learning_rate": 0.000125, "loss": 13.9633, "num_input_tokens_seen": 64490470, "step": 26 }, { "epoch": 0.028416067356604104, "grad_norm": 0.9804402589797974, "learning_rate": 0.00013000000000000002, "loss": 14.2439, "num_input_tokens_seen": 66982246, "step": 27 }, { "epoch": 0.02946851429573759, "grad_norm": 0.7765673398971558, "learning_rate": 0.000135, "loss": 15.4559, "num_input_tokens_seen": 69402736, "step": 28 }, { "epoch": 0.030520961234871077, "grad_norm": 1.1596146821975708, "learning_rate": 0.00014000000000000001, "loss": 13.7701, "num_input_tokens_seen": 71929896, "step": 29 }, { "epoch": 0.03157340817400456, "grad_norm": 1.2360461950302124, "learning_rate": 0.000145, "loss": 13.3318, "num_input_tokens_seen": 74512508, "step": 30 }, { "epoch": 0.03262585511313804, "grad_norm": 1.2719630002975464, "learning_rate": 0.00015, "loss": 13.1903, "num_input_tokens_seen": 76920882, "step": 31 }, { "epoch": 0.033678302052271535, "grad_norm": 1.1349018812179565, "learning_rate": 0.000155, "loss": 14.9638, "num_input_tokens_seen": 79271826, "step": 32 }, { "epoch": 0.03473074899140502, "grad_norm": 1.2617769241333008, "learning_rate": 0.00016, "loss": 13.5991, "num_input_tokens_seen": 81618752, "step": 33 }, { "epoch": 0.035783195930538504, "grad_norm": 1.1602047681808472, "learning_rate": 0.000165, "loss": 14.4276, "num_input_tokens_seen": 84222280, "step": 34 }, { "epoch": 0.03683564286967199, "grad_norm": 1.4332772493362427, "learning_rate": 0.00017, "loss": 13.7944, "num_input_tokens_seen": 86617216, "step": 35 }, { "epoch": 0.037888089808805474, "grad_norm": 0.9722316861152649, "learning_rate": 0.000175, "loss": 13.4569, "num_input_tokens_seen": 89130406, "step": 36 }, { "epoch": 0.03894053674793896, "grad_norm": 1.4759581089019775, "learning_rate": 0.00017999999999999998, "loss": 12.2782, "num_input_tokens_seen": 91538480, "step": 37 }, { "epoch": 0.03999298368707244, "grad_norm": 1.143380045890808, "learning_rate": 0.000185, "loss": 13.8658, "num_input_tokens_seen": 94141494, "step": 38 }, { "epoch": 0.04104543062620593, "grad_norm": 1.2926298379898071, "learning_rate": 0.00019, "loss": 13.4335, "num_input_tokens_seen": 96591810, "step": 39 }, { "epoch": 0.04209787756533941, "grad_norm": 1.375290870666504, "learning_rate": 0.00019500000000000002, "loss": 11.8362, "num_input_tokens_seen": 99168846, "step": 40 }, { "epoch": 0.0431503245044729, "grad_norm": 1.7422271966934204, "learning_rate": 0.0002, "loss": 14.4799, "num_input_tokens_seen": 101623970, "step": 41 }, { "epoch": 0.04420277144360638, "grad_norm": 1.68968665599823, "learning_rate": 0.000205, "loss": 11.6396, "num_input_tokens_seen": 104013558, "step": 42 }, { "epoch": 0.04525521838273987, "grad_norm": 0.9919915795326233, "learning_rate": 0.00021, "loss": 14.5152, "num_input_tokens_seen": 106553292, "step": 43 }, { "epoch": 0.04630766532187335, "grad_norm": 1.1388020515441895, "learning_rate": 0.000215, "loss": 12.9848, "num_input_tokens_seen": 109061076, "step": 44 }, { "epoch": 0.047360112261006844, "grad_norm": 1.1153753995895386, "learning_rate": 0.00022, "loss": 13.6106, "num_input_tokens_seen": 111638220, "step": 45 }, { "epoch": 0.04841255920014033, "grad_norm": 1.4049729108810425, "learning_rate": 0.00022500000000000002, "loss": 12.1343, "num_input_tokens_seen": 113986254, "step": 46 }, { "epoch": 0.049465006139273814, "grad_norm": 1.2836921215057373, "learning_rate": 0.00023, "loss": 13.5637, "num_input_tokens_seen": 116525820, "step": 47 }, { "epoch": 0.0505174530784073, "grad_norm": 1.6902610063552856, "learning_rate": 0.000235, "loss": 12.636, "num_input_tokens_seen": 118866752, "step": 48 }, { "epoch": 0.05156990001754078, "grad_norm": 1.0417742729187012, "learning_rate": 0.00024, "loss": 14.4791, "num_input_tokens_seen": 121303180, "step": 49 }, { "epoch": 0.05262234695667427, "grad_norm": 0.8952522873878479, "learning_rate": 0.000245, "loss": 13.7733, "num_input_tokens_seen": 123865862, "step": 50 }, { "epoch": 0.05367479389580775, "grad_norm": 0.870211660861969, "learning_rate": 0.00025, "loss": 14.195, "num_input_tokens_seen": 126185956, "step": 51 }, { "epoch": 0.05472724083494124, "grad_norm": 1.2311478853225708, "learning_rate": 0.000255, "loss": 14.081, "num_input_tokens_seen": 128686600, "step": 52 }, { "epoch": 0.05577968777407472, "grad_norm": 0.8930346965789795, "learning_rate": 0.00026000000000000003, "loss": 13.7719, "num_input_tokens_seen": 131214556, "step": 53 }, { "epoch": 0.05683213471320821, "grad_norm": 1.0726232528686523, "learning_rate": 0.00026500000000000004, "loss": 12.4831, "num_input_tokens_seen": 133478050, "step": 54 }, { "epoch": 0.05788458165234169, "grad_norm": 1.1435436010360718, "learning_rate": 0.00027, "loss": 14.7828, "num_input_tokens_seen": 135845886, "step": 55 }, { "epoch": 0.05893702859147518, "grad_norm": 1.3371323347091675, "learning_rate": 0.000275, "loss": 13.6007, "num_input_tokens_seen": 138301528, "step": 56 }, { "epoch": 0.05998947553060867, "grad_norm": 1.4859648942947388, "learning_rate": 0.00028000000000000003, "loss": 11.5014, "num_input_tokens_seen": 140754452, "step": 57 }, { "epoch": 0.06104192246974215, "grad_norm": 1.0733438730239868, "learning_rate": 0.000285, "loss": 14.1451, "num_input_tokens_seen": 143240748, "step": 58 }, { "epoch": 0.06209436940887564, "grad_norm": 1.2057485580444336, "learning_rate": 0.00029, "loss": 13.9744, "num_input_tokens_seen": 145794960, "step": 59 }, { "epoch": 0.06314681634800912, "grad_norm": 1.8196871280670166, "learning_rate": 0.000295, "loss": 10.6486, "num_input_tokens_seen": 148234512, "step": 60 }, { "epoch": 0.06419926328714261, "grad_norm": 1.4653598070144653, "learning_rate": 0.0003, "loss": 12.9739, "num_input_tokens_seen": 150716606, "step": 61 }, { "epoch": 0.06525171022627609, "grad_norm": 1.0080358982086182, "learning_rate": 0.000305, "loss": 13.6266, "num_input_tokens_seen": 153281944, "step": 62 }, { "epoch": 0.06630415716540958, "grad_norm": 1.3227250576019287, "learning_rate": 0.00031, "loss": 13.0072, "num_input_tokens_seen": 155890052, "step": 63 }, { "epoch": 0.06735660410454307, "grad_norm": 0.8885237574577332, "learning_rate": 0.000315, "loss": 12.4343, "num_input_tokens_seen": 158277612, "step": 64 }, { "epoch": 0.06840905104367655, "grad_norm": 0.903510570526123, "learning_rate": 0.00032, "loss": 13.0652, "num_input_tokens_seen": 160877492, "step": 65 }, { "epoch": 0.06946149798281004, "grad_norm": 1.060062289237976, "learning_rate": 0.00032500000000000004, "loss": 14.1838, "num_input_tokens_seen": 163296028, "step": 66 }, { "epoch": 0.07051394492194352, "grad_norm": 0.9471303820610046, "learning_rate": 0.00033, "loss": 13.2799, "num_input_tokens_seen": 165722692, "step": 67 }, { "epoch": 0.07156639186107701, "grad_norm": 0.9983565807342529, "learning_rate": 0.000335, "loss": 13.3277, "num_input_tokens_seen": 168091814, "step": 68 }, { "epoch": 0.07261883880021049, "grad_norm": 0.947719395160675, "learning_rate": 0.00034, "loss": 13.0381, "num_input_tokens_seen": 170645106, "step": 69 }, { "epoch": 0.07367128573934398, "grad_norm": 1.4651892185211182, "learning_rate": 0.000345, "loss": 16.0194, "num_input_tokens_seen": 173162736, "step": 70 }, { "epoch": 0.07472373267847746, "grad_norm": 0.702907145023346, "learning_rate": 0.00035, "loss": 15.0851, "num_input_tokens_seen": 175613962, "step": 71 }, { "epoch": 0.07577617961761095, "grad_norm": 1.0725198984146118, "learning_rate": 0.000355, "loss": 14.9445, "num_input_tokens_seen": 178244028, "step": 72 }, { "epoch": 0.07682862655674443, "grad_norm": 1.0319093465805054, "learning_rate": 0.00035999999999999997, "loss": 13.3601, "num_input_tokens_seen": 180847730, "step": 73 }, { "epoch": 0.07788107349587792, "grad_norm": 0.8903796672821045, "learning_rate": 0.000365, "loss": 13.4871, "num_input_tokens_seen": 183453142, "step": 74 }, { "epoch": 0.0789335204350114, "grad_norm": 0.8626850247383118, "learning_rate": 0.00037, "loss": 13.9116, "num_input_tokens_seen": 185873752, "step": 75 }, { "epoch": 0.07998596737414489, "grad_norm": 1.1449741125106812, "learning_rate": 0.000375, "loss": 13.6459, "num_input_tokens_seen": 188443796, "step": 76 }, { "epoch": 0.08103841431327838, "grad_norm": 0.8763025403022766, "learning_rate": 0.00038, "loss": 13.9269, "num_input_tokens_seen": 190973818, "step": 77 }, { "epoch": 0.08209086125241186, "grad_norm": 0.9122107625007629, "learning_rate": 0.00038500000000000003, "loss": 13.8082, "num_input_tokens_seen": 193317028, "step": 78 }, { "epoch": 0.08314330819154535, "grad_norm": 0.9646636247634888, "learning_rate": 0.00039000000000000005, "loss": 13.5869, "num_input_tokens_seen": 195799620, "step": 79 }, { "epoch": 0.08419575513067883, "grad_norm": 0.6344845294952393, "learning_rate": 0.000395, "loss": 15.0292, "num_input_tokens_seen": 198391400, "step": 80 }, { "epoch": 0.08524820206981232, "grad_norm": 0.9185324907302856, "learning_rate": 0.0004, "loss": 13.9664, "num_input_tokens_seen": 200824232, "step": 81 }, { "epoch": 0.0863006490089458, "grad_norm": 0.8805983066558838, "learning_rate": 0.00040500000000000003, "loss": 14.5958, "num_input_tokens_seen": 203321058, "step": 82 }, { "epoch": 0.08735309594807929, "grad_norm": 1.210644006729126, "learning_rate": 0.00041, "loss": 13.2707, "num_input_tokens_seen": 205759916, "step": 83 }, { "epoch": 0.08840554288721277, "grad_norm": 0.951380729675293, "learning_rate": 0.000415, "loss": 13.3767, "num_input_tokens_seen": 208303126, "step": 84 }, { "epoch": 0.08945798982634626, "grad_norm": 0.7330085039138794, "learning_rate": 0.00042, "loss": 14.4419, "num_input_tokens_seen": 210801786, "step": 85 }, { "epoch": 0.09051043676547973, "grad_norm": 0.651300311088562, "learning_rate": 0.000425, "loss": 12.7518, "num_input_tokens_seen": 213211252, "step": 86 }, { "epoch": 0.09156288370461323, "grad_norm": 1.1722288131713867, "learning_rate": 0.00043, "loss": 12.723, "num_input_tokens_seen": 215702718, "step": 87 }, { "epoch": 0.0926153306437467, "grad_norm": 0.7686027884483337, "learning_rate": 0.000435, "loss": 13.1674, "num_input_tokens_seen": 218178652, "step": 88 }, { "epoch": 0.0936677775828802, "grad_norm": 1.2741395235061646, "learning_rate": 0.00044, "loss": 12.024, "num_input_tokens_seen": 220627634, "step": 89 }, { "epoch": 0.09472022452201369, "grad_norm": 1.2422987222671509, "learning_rate": 0.00044500000000000003, "loss": 13.6306, "num_input_tokens_seen": 223062184, "step": 90 }, { "epoch": 0.09577267146114717, "grad_norm": 1.449549674987793, "learning_rate": 0.00045000000000000004, "loss": 13.0454, "num_input_tokens_seen": 225439150, "step": 91 }, { "epoch": 0.09682511840028066, "grad_norm": 1.0337365865707397, "learning_rate": 0.000455, "loss": 13.5763, "num_input_tokens_seen": 227803092, "step": 92 }, { "epoch": 0.09787756533941414, "grad_norm": 0.6650864481925964, "learning_rate": 0.00046, "loss": 14.431, "num_input_tokens_seen": 230311970, "step": 93 }, { "epoch": 0.09893001227854763, "grad_norm": 1.2744181156158447, "learning_rate": 0.000465, "loss": 13.0944, "num_input_tokens_seen": 232926146, "step": 94 }, { "epoch": 0.0999824592176811, "grad_norm": 1.1105504035949707, "learning_rate": 0.00047, "loss": 13.2775, "num_input_tokens_seen": 235330002, "step": 95 }, { "epoch": 0.1010349061568146, "grad_norm": 1.0728318691253662, "learning_rate": 0.000475, "loss": 11.3577, "num_input_tokens_seen": 237751824, "step": 96 }, { "epoch": 0.10208735309594807, "grad_norm": 0.9665006399154663, "learning_rate": 0.00048, "loss": 12.9271, "num_input_tokens_seen": 240035336, "step": 97 }, { "epoch": 0.10313980003508157, "grad_norm": 0.994549572467804, "learning_rate": 0.00048499999999999997, "loss": 13.4015, "num_input_tokens_seen": 242569996, "step": 98 }, { "epoch": 0.10419224697421504, "grad_norm": 1.0495078563690186, "learning_rate": 0.00049, "loss": 14.2214, "num_input_tokens_seen": 245127946, "step": 99 }, { "epoch": 0.10524469391334854, "grad_norm": 0.905197024345398, "learning_rate": 0.000495, "loss": 14.3041, "num_input_tokens_seen": 247602514, "step": 100 }, { "epoch": 0.10629714085248201, "grad_norm": 1.026200771331787, "learning_rate": 0.0005, "loss": 12.7879, "num_input_tokens_seen": 250104426, "step": 101 }, { "epoch": 0.1073495877916155, "grad_norm": 0.625381350517273, "learning_rate": 0.0004999922894119685, "loss": 14.3942, "num_input_tokens_seen": 252649578, "step": 102 }, { "epoch": 0.108402034730749, "grad_norm": 0.673723578453064, "learning_rate": 0.0004999691581234994, "loss": 14.6965, "num_input_tokens_seen": 255070656, "step": 103 }, { "epoch": 0.10945448166988248, "grad_norm": 1.0828572511672974, "learning_rate": 0.0004999306075614394, "loss": 13.1026, "num_input_tokens_seen": 257597588, "step": 104 }, { "epoch": 0.11050692860901597, "grad_norm": 0.922892689704895, "learning_rate": 0.0004998766401037688, "loss": 11.9671, "num_input_tokens_seen": 260043236, "step": 105 }, { "epoch": 0.11155937554814944, "grad_norm": 1.1064248085021973, "learning_rate": 0.0004998072590794548, "loss": 13.263, "num_input_tokens_seen": 262478222, "step": 106 }, { "epoch": 0.11261182248728294, "grad_norm": 1.4321610927581787, "learning_rate": 0.0004997224687682457, "loss": 14.0588, "num_input_tokens_seen": 264975794, "step": 107 }, { "epoch": 0.11366426942641641, "grad_norm": 1.037821650505066, "learning_rate": 0.000499622274400407, "loss": 13.2263, "num_input_tokens_seen": 267353528, "step": 108 }, { "epoch": 0.1147167163655499, "grad_norm": 1.14232337474823, "learning_rate": 0.0004995066821563998, "loss": 13.9593, "num_input_tokens_seen": 269792618, "step": 109 }, { "epoch": 0.11576916330468338, "grad_norm": 1.222814917564392, "learning_rate": 0.0004993756991664976, "loss": 11.5108, "num_input_tokens_seen": 272380446, "step": 110 }, { "epoch": 0.11682161024381688, "grad_norm": 1.2184088230133057, "learning_rate": 0.0004992293335103487, "loss": 14.7927, "num_input_tokens_seen": 274927750, "step": 111 }, { "epoch": 0.11787405718295035, "grad_norm": 0.9133461117744446, "learning_rate": 0.0004990675942164759, "loss": 13.9425, "num_input_tokens_seen": 277305570, "step": 112 }, { "epoch": 0.11892650412208385, "grad_norm": 0.506476879119873, "learning_rate": 0.0004988904912617209, "loss": 14.123, "num_input_tokens_seen": 279758966, "step": 113 }, { "epoch": 0.11997895106121734, "grad_norm": 0.8582763671875, "learning_rate": 0.000498698035570628, "loss": 14.2176, "num_input_tokens_seen": 282252170, "step": 114 }, { "epoch": 0.12103139800035081, "grad_norm": 0.9771997332572937, "learning_rate": 0.0004984902390147711, "loss": 12.6458, "num_input_tokens_seen": 284755558, "step": 115 }, { "epoch": 0.1220838449394843, "grad_norm": 0.9598507881164551, "learning_rate": 0.0004982671144120202, "loss": 13.8564, "num_input_tokens_seen": 287309306, "step": 116 }, { "epoch": 0.12313629187861778, "grad_norm": 1.1979269981384277, "learning_rate": 0.000498028675525752, "loss": 13.1742, "num_input_tokens_seen": 289898194, "step": 117 }, { "epoch": 0.12418873881775128, "grad_norm": 0.9001169800758362, "learning_rate": 0.000497774937064, "loss": 13.0524, "num_input_tokens_seen": 292185416, "step": 118 }, { "epoch": 0.12524118575688475, "grad_norm": 1.0124322175979614, "learning_rate": 0.0004975059146785479, "loss": 13.2935, "num_input_tokens_seen": 294446504, "step": 119 }, { "epoch": 0.12629363269601823, "grad_norm": 1.3598519563674927, "learning_rate": 0.0004972216249639638, "loss": 14.3977, "num_input_tokens_seen": 296900838, "step": 120 }, { "epoch": 0.12734607963515174, "grad_norm": 0.5201650261878967, "learning_rate": 0.000496922085456576, "loss": 14.0295, "num_input_tokens_seen": 299288000, "step": 121 }, { "epoch": 0.12839852657428522, "grad_norm": 0.834181010723114, "learning_rate": 0.0004966073146333924, "loss": 13.5623, "num_input_tokens_seen": 301537506, "step": 122 }, { "epoch": 0.1294509735134187, "grad_norm": 1.1620839834213257, "learning_rate": 0.0004962773319109604, "loss": 14.1931, "num_input_tokens_seen": 304110792, "step": 123 }, { "epoch": 0.13050342045255217, "grad_norm": 1.145039677619934, "learning_rate": 0.0004959321576441683, "loss": 14.861, "num_input_tokens_seen": 306636802, "step": 124 }, { "epoch": 0.13155586739168568, "grad_norm": 0.9741944074630737, "learning_rate": 0.0004955718131249909, "loss": 12.7266, "num_input_tokens_seen": 309123814, "step": 125 }, { "epoch": 0.13260831433081915, "grad_norm": 1.1798901557922363, "learning_rate": 0.0004951963205811756, "loss": 12.9659, "num_input_tokens_seen": 311665136, "step": 126 }, { "epoch": 0.13366076126995263, "grad_norm": 1.1113797426223755, "learning_rate": 0.0004948057031748712, "loss": 15.4142, "num_input_tokens_seen": 314148958, "step": 127 }, { "epoch": 0.13471320820908614, "grad_norm": 0.8992429971694946, "learning_rate": 0.0004943999850011993, "loss": 13.0283, "num_input_tokens_seen": 316706802, "step": 128 }, { "epoch": 0.13576565514821962, "grad_norm": 0.8786268830299377, "learning_rate": 0.0004939791910867678, "loss": 12.9855, "num_input_tokens_seen": 319032830, "step": 129 }, { "epoch": 0.1368181020873531, "grad_norm": 0.7185277342796326, "learning_rate": 0.0004935433473881276, "loss": 14.5965, "num_input_tokens_seen": 321501656, "step": 130 }, { "epoch": 0.13787054902648657, "grad_norm": 0.6323119401931763, "learning_rate": 0.0004930924807901711, "loss": 11.3642, "num_input_tokens_seen": 324249744, "step": 131 }, { "epoch": 0.13892299596562008, "grad_norm": 0.9608746767044067, "learning_rate": 0.0004926266191044738, "loss": 14.3587, "num_input_tokens_seen": 326785234, "step": 132 }, { "epoch": 0.13997544290475356, "grad_norm": 1.0390254259109497, "learning_rate": 0.0004921457910675788, "loss": 13.6779, "num_input_tokens_seen": 329369442, "step": 133 }, { "epoch": 0.14102788984388703, "grad_norm": 1.1686254739761353, "learning_rate": 0.0004916500263392243, "loss": 13.0251, "num_input_tokens_seen": 331678690, "step": 134 }, { "epoch": 0.1420803367830205, "grad_norm": 0.9643059372901917, "learning_rate": 0.000491139355500514, "loss": 14.3289, "num_input_tokens_seen": 334109168, "step": 135 }, { "epoch": 0.14313278372215402, "grad_norm": 0.972047746181488, "learning_rate": 0.0004906138100520309, "loss": 13.9752, "num_input_tokens_seen": 336490890, "step": 136 }, { "epoch": 0.1441852306612875, "grad_norm": 0.5441385507583618, "learning_rate": 0.0004900734224118936, "loss": 13.4017, "num_input_tokens_seen": 338975062, "step": 137 }, { "epoch": 0.14523767760042097, "grad_norm": 1.0113803148269653, "learning_rate": 0.0004895182259137573, "loss": 12.9987, "num_input_tokens_seen": 341479152, "step": 138 }, { "epoch": 0.14629012453955445, "grad_norm": 0.9675781726837158, "learning_rate": 0.0004889482548047572, "loss": 14.6963, "num_input_tokens_seen": 344011478, "step": 139 }, { "epoch": 0.14734257147868796, "grad_norm": 0.879116415977478, "learning_rate": 0.0004883635442433959, "loss": 12.8071, "num_input_tokens_seen": 346368742, "step": 140 }, { "epoch": 0.14839501841782143, "grad_norm": 0.8611754179000854, "learning_rate": 0.0004877641302973755, "loss": 13.9555, "num_input_tokens_seen": 348946792, "step": 141 }, { "epoch": 0.1494474653569549, "grad_norm": 0.907088577747345, "learning_rate": 0.00048715004994137124, "loss": 14.4171, "num_input_tokens_seen": 351483512, "step": 142 }, { "epoch": 0.15049991229608842, "grad_norm": 0.8465937376022339, "learning_rate": 0.0004865213410547524, "loss": 13.855, "num_input_tokens_seen": 354051172, "step": 143 }, { "epoch": 0.1515523592352219, "grad_norm": 0.7247462868690491, "learning_rate": 0.0004858780424192443, "loss": 13.8533, "num_input_tokens_seen": 356517348, "step": 144 }, { "epoch": 0.15260480617435537, "grad_norm": 0.8869636058807373, "learning_rate": 0.0004852201937165372, "loss": 13.6736, "num_input_tokens_seen": 358865994, "step": 145 }, { "epoch": 0.15365725311348885, "grad_norm": 1.2550673484802246, "learning_rate": 0.0004845478355258377, "loss": 13.504, "num_input_tokens_seen": 361253468, "step": 146 }, { "epoch": 0.15470970005262236, "grad_norm": 0.7472496032714844, "learning_rate": 0.00048386100932136614, "loss": 14.5781, "num_input_tokens_seen": 363682878, "step": 147 }, { "epoch": 0.15576214699175583, "grad_norm": 0.877994954586029, "learning_rate": 0.00048315975746979797, "loss": 13.1523, "num_input_tokens_seen": 366235238, "step": 148 }, { "epoch": 0.1568145939308893, "grad_norm": 0.7401669025421143, "learning_rate": 0.0004824441232276507, "loss": 13.6601, "num_input_tokens_seen": 368605256, "step": 149 }, { "epoch": 0.1578670408700228, "grad_norm": 1.011855959892273, "learning_rate": 0.0004817141507386153, "loss": 12.7433, "num_input_tokens_seen": 371144526, "step": 150 }, { "epoch": 0.1589194878091563, "grad_norm": 0.9353221654891968, "learning_rate": 0.0004809698850308334, "loss": 11.8924, "num_input_tokens_seen": 373671240, "step": 151 }, { "epoch": 0.15997193474828977, "grad_norm": 0.9218998551368713, "learning_rate": 0.0004802113720141196, "loss": 12.8597, "num_input_tokens_seen": 376237680, "step": 152 }, { "epoch": 0.16102438168742325, "grad_norm": 0.6882659792900085, "learning_rate": 0.00047943865847712965, "loss": 13.793, "num_input_tokens_seen": 378722806, "step": 153 }, { "epoch": 0.16207682862655676, "grad_norm": 0.8327513337135315, "learning_rate": 0.0004786517920844744, "loss": 15.2889, "num_input_tokens_seen": 381192922, "step": 154 }, { "epoch": 0.16312927556569023, "grad_norm": 0.8780426383018494, "learning_rate": 0.00047785082137377936, "loss": 13.7398, "num_input_tokens_seen": 383568682, "step": 155 }, { "epoch": 0.1641817225048237, "grad_norm": 0.836845874786377, "learning_rate": 0.000477035795752691, "loss": 12.7139, "num_input_tokens_seen": 385953990, "step": 156 }, { "epoch": 0.1652341694439572, "grad_norm": 0.7991174459457397, "learning_rate": 0.0004762067654958286, "loss": 13.5879, "num_input_tokens_seen": 388329524, "step": 157 }, { "epoch": 0.1662866163830907, "grad_norm": 0.6912776231765747, "learning_rate": 0.0004753637817416835, "loss": 13.885, "num_input_tokens_seen": 390898178, "step": 158 }, { "epoch": 0.16733906332222417, "grad_norm": 0.5859223008155823, "learning_rate": 0.0004745068964894645, "loss": 14.5179, "num_input_tokens_seen": 393483212, "step": 159 }, { "epoch": 0.16839151026135765, "grad_norm": 0.6805679202079773, "learning_rate": 0.00047363616259589025, "loss": 14.0655, "num_input_tokens_seen": 395896478, "step": 160 }, { "epoch": 0.16944395720049113, "grad_norm": 0.741980254650116, "learning_rate": 0.00047275163377192886, "loss": 13.0205, "num_input_tokens_seen": 398340954, "step": 161 }, { "epoch": 0.17049640413962464, "grad_norm": 0.6432321071624756, "learning_rate": 0.0004718533645794847, "loss": 14.6024, "num_input_tokens_seen": 400878746, "step": 162 }, { "epoch": 0.1715488510787581, "grad_norm": 0.7603353261947632, "learning_rate": 0.0004709414104280326, "loss": 12.3878, "num_input_tokens_seen": 403440664, "step": 163 }, { "epoch": 0.1726012980178916, "grad_norm": 1.0762574672698975, "learning_rate": 0.00047001582757120054, "loss": 13.5747, "num_input_tokens_seen": 405884594, "step": 164 }, { "epoch": 0.1736537449570251, "grad_norm": 0.8860725164413452, "learning_rate": 0.00046907667310329887, "loss": 13.8578, "num_input_tokens_seen": 408459258, "step": 165 }, { "epoch": 0.17470619189615857, "grad_norm": 0.7291695475578308, "learning_rate": 0.0004681240049557991, "loss": 13.5135, "num_input_tokens_seen": 410839412, "step": 166 }, { "epoch": 0.17575863883529205, "grad_norm": 0.9141060709953308, "learning_rate": 0.00046715788189375995, "loss": 13.9999, "num_input_tokens_seen": 413265712, "step": 167 }, { "epoch": 0.17681108577442553, "grad_norm": 0.9221991896629333, "learning_rate": 0.0004661783635122028, "loss": 13.5436, "num_input_tokens_seen": 415770362, "step": 168 }, { "epoch": 0.17786353271355904, "grad_norm": 0.7605065703392029, "learning_rate": 0.0004651855102324352, "loss": 14.2376, "num_input_tokens_seen": 418301438, "step": 169 }, { "epoch": 0.17891597965269251, "grad_norm": 0.7537558078765869, "learning_rate": 0.0004641793832983245, "loss": 13.2266, "num_input_tokens_seen": 420728862, "step": 170 }, { "epoch": 0.179968426591826, "grad_norm": 0.8314628601074219, "learning_rate": 0.0004631600447725189, "loss": 13.3294, "num_input_tokens_seen": 423290148, "step": 171 }, { "epoch": 0.18102087353095947, "grad_norm": 0.7050144672393799, "learning_rate": 0.0004621275575326206, "loss": 15.0056, "num_input_tokens_seen": 425681450, "step": 172 }, { "epoch": 0.18207332047009298, "grad_norm": 0.829322874546051, "learning_rate": 0.00046108198526730563, "loss": 12.727, "num_input_tokens_seen": 427996244, "step": 173 }, { "epoch": 0.18312576740922645, "grad_norm": 0.8979415893554688, "learning_rate": 0.0004600233924723966, "loss": 12.8465, "num_input_tokens_seen": 430664766, "step": 174 }, { "epoch": 0.18417821434835993, "grad_norm": 0.8352232575416565, "learning_rate": 0.0004589518444468836, "loss": 14.3189, "num_input_tokens_seen": 433291322, "step": 175 }, { "epoch": 0.1852306612874934, "grad_norm": 0.6047113537788391, "learning_rate": 0.000457867407288896, "loss": 13.6758, "num_input_tokens_seen": 435871848, "step": 176 }, { "epoch": 0.18628310822662691, "grad_norm": 0.9233571887016296, "learning_rate": 0.0004567701478916261, "loss": 15.2745, "num_input_tokens_seen": 438227478, "step": 177 }, { "epoch": 0.1873355551657604, "grad_norm": 0.6702125668525696, "learning_rate": 0.00045566013393920205, "loss": 12.4752, "num_input_tokens_seen": 440734742, "step": 178 }, { "epoch": 0.18838800210489387, "grad_norm": 0.8898003101348877, "learning_rate": 0.0004545374339025129, "loss": 14.4045, "num_input_tokens_seen": 443205588, "step": 179 }, { "epoch": 0.18944044904402738, "grad_norm": 0.9900922775268555, "learning_rate": 0.0004534021170349856, "loss": 12.0756, "num_input_tokens_seen": 445620602, "step": 180 }, { "epoch": 0.19049289598316085, "grad_norm": 1.0743823051452637, "learning_rate": 0.000452254253368312, "loss": 13.2523, "num_input_tokens_seen": 448097850, "step": 181 }, { "epoch": 0.19154534292229433, "grad_norm": 0.6152368187904358, "learning_rate": 0.0004510939137081302, "loss": 13.7245, "num_input_tokens_seen": 450639630, "step": 182 }, { "epoch": 0.1925977898614278, "grad_norm": 0.8884441256523132, "learning_rate": 0.00044992116962965623, "loss": 13.1044, "num_input_tokens_seen": 453263188, "step": 183 }, { "epoch": 0.19365023680056132, "grad_norm": 0.9135752320289612, "learning_rate": 0.00044873609347326866, "loss": 12.92, "num_input_tokens_seen": 455894204, "step": 184 }, { "epoch": 0.1947026837396948, "grad_norm": 0.8144251704216003, "learning_rate": 0.0004475387583400473, "loss": 12.2695, "num_input_tokens_seen": 458127806, "step": 185 }, { "epoch": 0.19575513067882827, "grad_norm": 0.7485114932060242, "learning_rate": 0.00044632923808726293, "loss": 13.106, "num_input_tokens_seen": 460593188, "step": 186 }, { "epoch": 0.19680757761796175, "grad_norm": 0.937698245048523, "learning_rate": 0.0004451076073238223, "loss": 12.5838, "num_input_tokens_seen": 462986828, "step": 187 }, { "epoch": 0.19786002455709525, "grad_norm": 0.8235747218132019, "learning_rate": 0.0004438739414056651, "loss": 13.1601, "num_input_tokens_seen": 465515324, "step": 188 }, { "epoch": 0.19891247149622873, "grad_norm": 0.5789831280708313, "learning_rate": 0.0004426283164311162, "loss": 14.4765, "num_input_tokens_seen": 467803150, "step": 189 }, { "epoch": 0.1999649184353622, "grad_norm": 0.7775219678878784, "learning_rate": 0.00044137080923619174, "loss": 13.1265, "num_input_tokens_seen": 470447066, "step": 190 }, { "epoch": 0.20101736537449572, "grad_norm": 0.7598588466644287, "learning_rate": 0.0004401014973898586, "loss": 14.101, "num_input_tokens_seen": 472923494, "step": 191 }, { "epoch": 0.2020698123136292, "grad_norm": 0.5476176738739014, "learning_rate": 0.0004388204591892506, "loss": 13.7653, "num_input_tokens_seen": 475403632, "step": 192 }, { "epoch": 0.20312225925276267, "grad_norm": 0.5955540537834167, "learning_rate": 0.00043752777365483816, "loss": 15.6452, "num_input_tokens_seen": 477793942, "step": 193 }, { "epoch": 0.20417470619189615, "grad_norm": 0.7483817934989929, "learning_rate": 0.0004362235205255541, "loss": 13.5634, "num_input_tokens_seen": 480231028, "step": 194 }, { "epoch": 0.20522715313102965, "grad_norm": 0.7962237000465393, "learning_rate": 0.0004349077802538751, "loss": 14.4975, "num_input_tokens_seen": 482714392, "step": 195 }, { "epoch": 0.20627960007016313, "grad_norm": 1.1036089658737183, "learning_rate": 0.0004335806340008587, "loss": 14.6236, "num_input_tokens_seen": 485207698, "step": 196 }, { "epoch": 0.2073320470092966, "grad_norm": 0.6767985820770264, "learning_rate": 0.00043224216363113723, "loss": 15.1432, "num_input_tokens_seen": 487761136, "step": 197 }, { "epoch": 0.2083844939484301, "grad_norm": 0.6471667289733887, "learning_rate": 0.0004308924517078678, "loss": 13.9088, "num_input_tokens_seen": 490162634, "step": 198 }, { "epoch": 0.2094369408875636, "grad_norm": 0.7849085927009583, "learning_rate": 0.00042953158148763975, "loss": 12.4976, "num_input_tokens_seen": 492622978, "step": 199 }, { "epoch": 0.21048938782669707, "grad_norm": 0.5790635943412781, "learning_rate": 0.0004281596369153384, "loss": 13.4096, "num_input_tokens_seen": 495148462, "step": 200 }, { "epoch": 0.21154183476583055, "grad_norm": 0.6563763618469238, "learning_rate": 0.0004267767026189673, "loss": 13.4088, "num_input_tokens_seen": 497513016, "step": 201 }, { "epoch": 0.21259428170496403, "grad_norm": 0.865070641040802, "learning_rate": 0.00042538286390442833, "loss": 14.4415, "num_input_tokens_seen": 499867956, "step": 202 }, { "epoch": 0.21364672864409753, "grad_norm": 0.9038957953453064, "learning_rate": 0.00042397820675025866, "loss": 13.3683, "num_input_tokens_seen": 502285638, "step": 203 }, { "epoch": 0.214699175583231, "grad_norm": 0.8758959174156189, "learning_rate": 0.0004225628178023283, "loss": 13.8119, "num_input_tokens_seen": 504687130, "step": 204 }, { "epoch": 0.2157516225223645, "grad_norm": 0.8445808291435242, "learning_rate": 0.00042113678436849454, "loss": 14.048, "num_input_tokens_seen": 507059102, "step": 205 }, { "epoch": 0.216804069461498, "grad_norm": 0.6597614288330078, "learning_rate": 0.0004197001944132168, "loss": 14.8573, "num_input_tokens_seen": 509569362, "step": 206 }, { "epoch": 0.21785651640063147, "grad_norm": 0.8512585759162903, "learning_rate": 0.0004182531365521305, "loss": 12.6988, "num_input_tokens_seen": 511746350, "step": 207 }, { "epoch": 0.21890896333976495, "grad_norm": 0.7619369626045227, "learning_rate": 0.0004167957000465808, "loss": 13.2403, "num_input_tokens_seen": 514219662, "step": 208 }, { "epoch": 0.21996141027889843, "grad_norm": 0.936169445514679, "learning_rate": 0.00041532797479811636, "loss": 13.8222, "num_input_tokens_seen": 516484996, "step": 209 }, { "epoch": 0.22101385721803193, "grad_norm": 0.5255652070045471, "learning_rate": 0.00041385005134294417, "loss": 13.2086, "num_input_tokens_seen": 519017410, "step": 210 }, { "epoch": 0.2220663041571654, "grad_norm": 0.8877251148223877, "learning_rate": 0.00041236202084634466, "loss": 12.4594, "num_input_tokens_seen": 521666908, "step": 211 }, { "epoch": 0.2231187510962989, "grad_norm": 0.8751455545425415, "learning_rate": 0.0004108639750970481, "loss": 12.9147, "num_input_tokens_seen": 524244282, "step": 212 }, { "epoch": 0.22417119803543237, "grad_norm": 0.8350555300712585, "learning_rate": 0.00040935600650157265, "loss": 12.5417, "num_input_tokens_seen": 526734894, "step": 213 }, { "epoch": 0.22522364497456587, "grad_norm": 0.561469554901123, "learning_rate": 0.00040783820807852457, "loss": 15.026, "num_input_tokens_seen": 529070404, "step": 214 }, { "epoch": 0.22627609191369935, "grad_norm": 0.5630365014076233, "learning_rate": 0.00040631067345285994, "loss": 13.6316, "num_input_tokens_seen": 531426160, "step": 215 }, { "epoch": 0.22732853885283283, "grad_norm": 0.49598175287246704, "learning_rate": 0.0004047734968501098, "loss": 13.3619, "num_input_tokens_seen": 533845182, "step": 216 }, { "epoch": 0.22838098579196633, "grad_norm": 0.47708484530448914, "learning_rate": 0.0004032267730905678, "loss": 14.4387, "num_input_tokens_seen": 536268014, "step": 217 }, { "epoch": 0.2294334327310998, "grad_norm": 0.7337295413017273, "learning_rate": 0.00040167059758344114, "loss": 12.0945, "num_input_tokens_seen": 538693364, "step": 218 }, { "epoch": 0.2304858796702333, "grad_norm": 1.0579578876495361, "learning_rate": 0.00040010506632096537, "loss": 12.5091, "num_input_tokens_seen": 541187024, "step": 219 }, { "epoch": 0.23153832660936677, "grad_norm": 0.7490728497505188, "learning_rate": 0.0003985302758724831, "loss": 13.1909, "num_input_tokens_seen": 543604444, "step": 220 }, { "epoch": 0.23259077354850027, "grad_norm": 0.7212266325950623, "learning_rate": 0.000396946323378487, "loss": 12.6965, "num_input_tokens_seen": 545984088, "step": 221 }, { "epoch": 0.23364322048763375, "grad_norm": 0.646439254283905, "learning_rate": 0.0003953533065446281, "loss": 13.8939, "num_input_tokens_seen": 548381794, "step": 222 }, { "epoch": 0.23469566742676723, "grad_norm": 0.8953412771224976, "learning_rate": 0.00039375132363568836, "loss": 13.8391, "num_input_tokens_seen": 550729046, "step": 223 }, { "epoch": 0.2357481143659007, "grad_norm": 0.7593062520027161, "learning_rate": 0.00039214047346951974, "loss": 13.0687, "num_input_tokens_seen": 553143400, "step": 224 }, { "epoch": 0.2368005613050342, "grad_norm": 0.69405198097229, "learning_rate": 0.00039052085541094823, "loss": 11.9046, "num_input_tokens_seen": 555642392, "step": 225 }, { "epoch": 0.2378530082441677, "grad_norm": 0.9592947959899902, "learning_rate": 0.0003888925693656447, "loss": 12.7413, "num_input_tokens_seen": 558135252, "step": 226 }, { "epoch": 0.23890545518330117, "grad_norm": 0.7023069262504578, "learning_rate": 0.00038725571577396254, "loss": 13.4486, "num_input_tokens_seen": 560539252, "step": 227 }, { "epoch": 0.23995790212243467, "grad_norm": 0.7300988435745239, "learning_rate": 0.0003856103956047413, "loss": 13.0376, "num_input_tokens_seen": 562864832, "step": 228 }, { "epoch": 0.24101034906156815, "grad_norm": 0.7498049736022949, "learning_rate": 0.0003839567103490793, "loss": 14.5989, "num_input_tokens_seen": 565436444, "step": 229 }, { "epoch": 0.24206279600070163, "grad_norm": 0.6054151654243469, "learning_rate": 0.0003822947620140726, "loss": 13.0184, "num_input_tokens_seen": 567888342, "step": 230 }, { "epoch": 0.2431152429398351, "grad_norm": 0.9312774538993835, "learning_rate": 0.0003806246531165231, "loss": 13.5703, "num_input_tokens_seen": 570445538, "step": 231 }, { "epoch": 0.2441676898789686, "grad_norm": 0.8951175808906555, "learning_rate": 0.0003789464866766144, "loss": 14.1627, "num_input_tokens_seen": 572908450, "step": 232 }, { "epoch": 0.2452201368181021, "grad_norm": 0.6929954886436462, "learning_rate": 0.0003772603662115575, "loss": 14.1037, "num_input_tokens_seen": 575332936, "step": 233 }, { "epoch": 0.24627258375723557, "grad_norm": 0.881860613822937, "learning_rate": 0.0003755663957292048, "loss": 13.1959, "num_input_tokens_seen": 577743092, "step": 234 }, { "epoch": 0.24732503069636905, "grad_norm": 0.6283185482025146, "learning_rate": 0.00037386467972163516, "loss": 13.8093, "num_input_tokens_seen": 580242946, "step": 235 }, { "epoch": 0.24837747763550255, "grad_norm": 0.7237707376480103, "learning_rate": 0.00037215532315870774, "loss": 12.6056, "num_input_tokens_seen": 582794522, "step": 236 }, { "epoch": 0.24942992457463603, "grad_norm": 0.5777227878570557, "learning_rate": 0.00037043843148158696, "loss": 13.8277, "num_input_tokens_seen": 585148490, "step": 237 }, { "epoch": 0.2504823715137695, "grad_norm": 0.8257551789283752, "learning_rate": 0.0003687141105962389, "loss": 12.8759, "num_input_tokens_seen": 587640978, "step": 238 }, { "epoch": 0.251534818452903, "grad_norm": 0.6794307231903076, "learning_rate": 0.000366982466866898, "loss": 13.4984, "num_input_tokens_seen": 590145570, "step": 239 }, { "epoch": 0.25258726539203646, "grad_norm": 0.7445029616355896, "learning_rate": 0.00036524360710950624, "loss": 14.3103, "num_input_tokens_seen": 592516528, "step": 240 }, { "epoch": 0.25363971233117, "grad_norm": 0.5747973918914795, "learning_rate": 0.0003634976385851242, "loss": 13.8574, "num_input_tokens_seen": 595074072, "step": 241 }, { "epoch": 0.2546921592703035, "grad_norm": 0.600624144077301, "learning_rate": 0.00036174466899331484, "loss": 13.2213, "num_input_tokens_seen": 597677888, "step": 242 }, { "epoch": 0.25574460620943695, "grad_norm": 0.45756641030311584, "learning_rate": 0.0003599848064654995, "loss": 13.7403, "num_input_tokens_seen": 600188888, "step": 243 }, { "epoch": 0.25679705314857043, "grad_norm": 0.8629601001739502, "learning_rate": 0.000358218159558289, "loss": 12.8846, "num_input_tokens_seen": 602602846, "step": 244 }, { "epoch": 0.2578495000877039, "grad_norm": 0.6151244640350342, "learning_rate": 0.0003564448372467859, "loss": 12.9059, "num_input_tokens_seen": 605219918, "step": 245 }, { "epoch": 0.2589019470268374, "grad_norm": 0.5167385339736938, "learning_rate": 0.0003546649489178636, "loss": 13.0844, "num_input_tokens_seen": 607577988, "step": 246 }, { "epoch": 0.25995439396597086, "grad_norm": 0.7532935738563538, "learning_rate": 0.00035287860436341824, "loss": 14.4682, "num_input_tokens_seen": 610085092, "step": 247 }, { "epoch": 0.26100684090510434, "grad_norm": 0.7155802845954895, "learning_rate": 0.0003510859137735964, "loss": 13.8606, "num_input_tokens_seen": 612785012, "step": 248 }, { "epoch": 0.2620592878442379, "grad_norm": 0.6364856362342834, "learning_rate": 0.00034928698772999787, "loss": 14.53, "num_input_tokens_seen": 615290532, "step": 249 }, { "epoch": 0.26311173478337135, "grad_norm": 0.9832903146743774, "learning_rate": 0.0003474819371988549, "loss": 13.1002, "num_input_tokens_seen": 617672096, "step": 250 }, { "epoch": 0.26416418172250483, "grad_norm": 0.9047756791114807, "learning_rate": 0.00034567087352418665, "loss": 14.5856, "num_input_tokens_seen": 620113346, "step": 251 }, { "epoch": 0.2652166286616383, "grad_norm": 0.861052393913269, "learning_rate": 0.0003438539084209315, "loss": 13.5915, "num_input_tokens_seen": 622398438, "step": 252 }, { "epoch": 0.2662690756007718, "grad_norm": 0.7577503323554993, "learning_rate": 0.0003420311539680557, "loss": 12.7828, "num_input_tokens_seen": 624785192, "step": 253 }, { "epoch": 0.26732152253990527, "grad_norm": 0.6618896126747131, "learning_rate": 0.00034020272260163977, "loss": 14.1012, "num_input_tokens_seen": 627227780, "step": 254 }, { "epoch": 0.26837396947903874, "grad_norm": 0.6488011479377747, "learning_rate": 0.0003383687271079432, "loss": 12.9119, "num_input_tokens_seen": 629611168, "step": 255 }, { "epoch": 0.2694264164181723, "grad_norm": 0.6784930229187012, "learning_rate": 0.0003365292806164468, "loss": 12.7832, "num_input_tokens_seen": 632251574, "step": 256 }, { "epoch": 0.27047886335730575, "grad_norm": 0.6751644611358643, "learning_rate": 0.00033468449659287486, "loss": 13.7319, "num_input_tokens_seen": 634797650, "step": 257 }, { "epoch": 0.27153131029643923, "grad_norm": 0.6870245337486267, "learning_rate": 0.0003328344888321955, "loss": 15.2889, "num_input_tokens_seen": 637362118, "step": 258 }, { "epoch": 0.2725837572355727, "grad_norm": 0.656185507774353, "learning_rate": 0.0003309793714516019, "loss": 13.6542, "num_input_tokens_seen": 639901340, "step": 259 }, { "epoch": 0.2736362041747062, "grad_norm": 0.4918140769004822, "learning_rate": 0.00032911925888347234, "loss": 13.8268, "num_input_tokens_seen": 642337410, "step": 260 }, { "epoch": 0.27468865111383967, "grad_norm": 0.6098233461380005, "learning_rate": 0.00032725426586831203, "loss": 13.1035, "num_input_tokens_seen": 644643422, "step": 261 }, { "epoch": 0.27574109805297314, "grad_norm": 0.8890863060951233, "learning_rate": 0.0003253845074476749, "loss": 12.7544, "num_input_tokens_seen": 647242042, "step": 262 }, { "epoch": 0.2767935449921066, "grad_norm": 0.6168068647384644, "learning_rate": 0.00032351009895706785, "loss": 13.7177, "num_input_tokens_seen": 649622532, "step": 263 }, { "epoch": 0.27784599193124015, "grad_norm": 0.8917167782783508, "learning_rate": 0.00032163115601883583, "loss": 13.1059, "num_input_tokens_seen": 652089318, "step": 264 }, { "epoch": 0.27889843887037363, "grad_norm": 0.7454566955566406, "learning_rate": 0.0003197477945350297, "loss": 12.9977, "num_input_tokens_seen": 654556424, "step": 265 }, { "epoch": 0.2799508858095071, "grad_norm": 0.6139355301856995, "learning_rate": 0.0003178601306802573, "loss": 13.3918, "num_input_tokens_seen": 656892266, "step": 266 }, { "epoch": 0.2810033327486406, "grad_norm": 0.7222312688827515, "learning_rate": 0.00031596828089451703, "loss": 14.4271, "num_input_tokens_seen": 659263176, "step": 267 }, { "epoch": 0.28205577968777407, "grad_norm": 0.47004249691963196, "learning_rate": 0.00031407236187601487, "loss": 14.9307, "num_input_tokens_seen": 661699446, "step": 268 }, { "epoch": 0.28310822662690754, "grad_norm": 0.6071431636810303, "learning_rate": 0.0003121724905739666, "loss": 13.8697, "num_input_tokens_seen": 664059960, "step": 269 }, { "epoch": 0.284160673566041, "grad_norm": 0.8162275552749634, "learning_rate": 0.0003102687841813832, "loss": 13.2029, "num_input_tokens_seen": 666544998, "step": 270 }, { "epoch": 0.28521312050517456, "grad_norm": 0.9714298248291016, "learning_rate": 0.00030836136012784226, "loss": 13.505, "num_input_tokens_seen": 669032238, "step": 271 }, { "epoch": 0.28626556744430803, "grad_norm": 0.7387194037437439, "learning_rate": 0.00030645033607224425, "loss": 13.0927, "num_input_tokens_seen": 671661922, "step": 272 }, { "epoch": 0.2873180143834415, "grad_norm": 0.7696450352668762, "learning_rate": 0.0003045358298955546, "loss": 14.0523, "num_input_tokens_seen": 674141342, "step": 273 }, { "epoch": 0.288370461322575, "grad_norm": 0.8243244886398315, "learning_rate": 0.0003026179596935324, "loss": 14.1965, "num_input_tokens_seen": 676547598, "step": 274 }, { "epoch": 0.28942290826170847, "grad_norm": 0.5004672408103943, "learning_rate": 0.00030069684376944573, "loss": 13.9038, "num_input_tokens_seen": 679012666, "step": 275 }, { "epoch": 0.29047535520084194, "grad_norm": 0.5318673849105835, "learning_rate": 0.000298772600626774, "loss": 13.2475, "num_input_tokens_seen": 681469470, "step": 276 }, { "epoch": 0.2915278021399754, "grad_norm": 0.7408674359321594, "learning_rate": 0.00029684534896189834, "loss": 14.4854, "num_input_tokens_seen": 683932156, "step": 277 }, { "epoch": 0.2925802490791089, "grad_norm": 0.6096925139427185, "learning_rate": 0.0002949152076567795, "loss": 14.9576, "num_input_tokens_seen": 686358838, "step": 278 }, { "epoch": 0.29363269601824243, "grad_norm": 0.9611358642578125, "learning_rate": 0.0002929822957716248, "loss": 12.8637, "num_input_tokens_seen": 688790324, "step": 279 }, { "epoch": 0.2946851429573759, "grad_norm": 0.8757907748222351, "learning_rate": 0.00029104673253754456, "loss": 12.1678, "num_input_tokens_seen": 691194216, "step": 280 }, { "epoch": 0.2957375898965094, "grad_norm": 0.6873354315757751, "learning_rate": 0.00028910863734919615, "loss": 14.3149, "num_input_tokens_seen": 693647450, "step": 281 }, { "epoch": 0.29679003683564287, "grad_norm": 0.5603722929954529, "learning_rate": 0.00028716812975741995, "loss": 13.9725, "num_input_tokens_seen": 696074394, "step": 282 }, { "epoch": 0.29784248377477635, "grad_norm": 0.9630053639411926, "learning_rate": 0.00028522532946186486, "loss": 12.7401, "num_input_tokens_seen": 698509918, "step": 283 }, { "epoch": 0.2988949307139098, "grad_norm": 0.7901732921600342, "learning_rate": 0.0002832803563036046, "loss": 12.5756, "num_input_tokens_seen": 701006970, "step": 284 }, { "epoch": 0.2999473776530433, "grad_norm": 0.9098412990570068, "learning_rate": 0.00028133333025774524, "loss": 12.6859, "num_input_tokens_seen": 703451216, "step": 285 }, { "epoch": 0.30099982459217683, "grad_norm": 0.5902009606361389, "learning_rate": 0.0002793843714260245, "loss": 12.9448, "num_input_tokens_seen": 705793372, "step": 286 }, { "epoch": 0.3020522715313103, "grad_norm": 0.8642719984054565, "learning_rate": 0.0002774336000294035, "loss": 13.6738, "num_input_tokens_seen": 708235934, "step": 287 }, { "epoch": 0.3031047184704438, "grad_norm": 0.5522446632385254, "learning_rate": 0.0002754811364006511, "loss": 13.7713, "num_input_tokens_seen": 710526636, "step": 288 }, { "epoch": 0.30415716540957727, "grad_norm": 0.5527954697608948, "learning_rate": 0.0002735271009769208, "loss": 13.1263, "num_input_tokens_seen": 713041976, "step": 289 }, { "epoch": 0.30520961234871075, "grad_norm": 0.6918050646781921, "learning_rate": 0.00027157161429232173, "loss": 13.9847, "num_input_tokens_seen": 715500808, "step": 290 }, { "epoch": 0.3062620592878442, "grad_norm": 0.761479377746582, "learning_rate": 0.00026961479697048385, "loss": 15.1361, "num_input_tokens_seen": 717789966, "step": 291 }, { "epoch": 0.3073145062269777, "grad_norm": 0.7217794060707092, "learning_rate": 0.00026765676971711704, "loss": 14.3201, "num_input_tokens_seen": 720050894, "step": 292 }, { "epoch": 0.30836695316611124, "grad_norm": 0.6466794610023499, "learning_rate": 0.00026569765331256536, "loss": 12.3215, "num_input_tokens_seen": 722610666, "step": 293 }, { "epoch": 0.3094194001052447, "grad_norm": 0.4965052902698517, "learning_rate": 0.000263737568604357, "loss": 14.798, "num_input_tokens_seen": 725032234, "step": 294 }, { "epoch": 0.3104718470443782, "grad_norm": 0.7593343257904053, "learning_rate": 0.00026177663649974936, "loss": 13.7831, "num_input_tokens_seen": 727438016, "step": 295 }, { "epoch": 0.31152429398351167, "grad_norm": 0.5348352193832397, "learning_rate": 0.00025981497795827174, "loss": 14.22, "num_input_tokens_seen": 729835496, "step": 296 }, { "epoch": 0.31257674092264515, "grad_norm": 1.0191282033920288, "learning_rate": 0.0002578527139842631, "loss": 13.3076, "num_input_tokens_seen": 732386278, "step": 297 }, { "epoch": 0.3136291878617786, "grad_norm": 0.7821326851844788, "learning_rate": 0.00025588996561940846, "loss": 13.0542, "num_input_tokens_seen": 734887418, "step": 298 }, { "epoch": 0.3146816348009121, "grad_norm": 0.9365267157554626, "learning_rate": 0.0002539268539352723, "loss": 11.8609, "num_input_tokens_seen": 737306656, "step": 299 }, { "epoch": 0.3157340817400456, "grad_norm": 0.8272538185119629, "learning_rate": 0.00025196350002583027, "loss": 12.2074, "num_input_tokens_seen": 739805892, "step": 300 }, { "epoch": 0.3167865286791791, "grad_norm": 0.6483294367790222, "learning_rate": 0.000250000025, "loss": 12.868, "num_input_tokens_seen": 742274446, "step": 301 }, { "epoch": 0.3178389756183126, "grad_norm": 0.6506504416465759, "learning_rate": 0.0002480365499741698, "loss": 13.2374, "num_input_tokens_seen": 744888840, "step": 302 }, { "epoch": 0.31889142255744607, "grad_norm": 0.868910014629364, "learning_rate": 0.00024607319606472785, "loss": 14.1672, "num_input_tokens_seen": 747432162, "step": 303 }, { "epoch": 0.31994386949657955, "grad_norm": 0.7248483300209045, "learning_rate": 0.00024411008438059164, "loss": 13.3665, "num_input_tokens_seen": 749975482, "step": 304 }, { "epoch": 0.320996316435713, "grad_norm": 0.565228283405304, "learning_rate": 0.00024214733601573695, "loss": 13.5641, "num_input_tokens_seen": 752470316, "step": 305 }, { "epoch": 0.3220487633748465, "grad_norm": 0.9313757419586182, "learning_rate": 0.00024018507204172831, "loss": 12.2515, "num_input_tokens_seen": 754904094, "step": 306 }, { "epoch": 0.32310121031398, "grad_norm": 0.6211026310920715, "learning_rate": 0.00023822341350025064, "loss": 13.9024, "num_input_tokens_seen": 757596584, "step": 307 }, { "epoch": 0.3241536572531135, "grad_norm": 0.6052622199058533, "learning_rate": 0.00023626248139564312, "loss": 13.7315, "num_input_tokens_seen": 760089904, "step": 308 }, { "epoch": 0.325206104192247, "grad_norm": 0.629616916179657, "learning_rate": 0.00023430239668743467, "loss": 13.5593, "num_input_tokens_seen": 762676990, "step": 309 }, { "epoch": 0.32625855113138047, "grad_norm": 0.5862231254577637, "learning_rate": 0.00023234328028288304, "loss": 14.7201, "num_input_tokens_seen": 765269936, "step": 310 }, { "epoch": 0.32731099807051395, "grad_norm": 0.33856290578842163, "learning_rate": 0.0002303852530295162, "loss": 14.3706, "num_input_tokens_seen": 767708356, "step": 311 }, { "epoch": 0.3283634450096474, "grad_norm": 0.6568730473518372, "learning_rate": 0.00022842843570767835, "loss": 14.1941, "num_input_tokens_seen": 770239018, "step": 312 }, { "epoch": 0.3294158919487809, "grad_norm": 0.7415146231651306, "learning_rate": 0.00022647294902307927, "loss": 12.9074, "num_input_tokens_seen": 772730824, "step": 313 }, { "epoch": 0.3304683388879144, "grad_norm": 0.6825469136238098, "learning_rate": 0.00022451891359934894, "loss": 13.7534, "num_input_tokens_seen": 775075628, "step": 314 }, { "epoch": 0.33152078582704786, "grad_norm": 0.9483029842376709, "learning_rate": 0.00022256644997059648, "loss": 12.2248, "num_input_tokens_seen": 777489320, "step": 315 }, { "epoch": 0.3325732327661814, "grad_norm": 0.6055121421813965, "learning_rate": 0.0002206156785739756, "loss": 11.7426, "num_input_tokens_seen": 779994114, "step": 316 }, { "epoch": 0.33362567970531487, "grad_norm": 0.8969120383262634, "learning_rate": 0.00021866671974225477, "loss": 12.3666, "num_input_tokens_seen": 782438644, "step": 317 }, { "epoch": 0.33467812664444835, "grad_norm": 0.6655110120773315, "learning_rate": 0.00021671969369639545, "loss": 13.1219, "num_input_tokens_seen": 784904920, "step": 318 }, { "epoch": 0.3357305735835818, "grad_norm": 0.8371227979660034, "learning_rate": 0.00021477472053813517, "loss": 12.9637, "num_input_tokens_seen": 787325224, "step": 319 }, { "epoch": 0.3367830205227153, "grad_norm": 0.9360111355781555, "learning_rate": 0.00021283192024258013, "loss": 12.2981, "num_input_tokens_seen": 789859994, "step": 320 }, { "epoch": 0.3378354674618488, "grad_norm": 0.8280896544456482, "learning_rate": 0.00021089141265080388, "loss": 13.6604, "num_input_tokens_seen": 792449130, "step": 321 }, { "epoch": 0.33888791440098226, "grad_norm": 0.6332969069480896, "learning_rate": 0.00020895331746245547, "loss": 14.7076, "num_input_tokens_seen": 795064220, "step": 322 }, { "epoch": 0.3399403613401158, "grad_norm": 0.6264622807502747, "learning_rate": 0.0002070177542283751, "loss": 13.9214, "num_input_tokens_seen": 797467722, "step": 323 }, { "epoch": 0.34099280827924927, "grad_norm": 0.7760950326919556, "learning_rate": 0.00020508484234322058, "loss": 12.4921, "num_input_tokens_seen": 800080298, "step": 324 }, { "epoch": 0.34204525521838275, "grad_norm": 0.6662678122520447, "learning_rate": 0.0002031547010381017, "loss": 14.852, "num_input_tokens_seen": 802433890, "step": 325 }, { "epoch": 0.3430977021575162, "grad_norm": 0.7059805989265442, "learning_rate": 0.00020122744937322602, "loss": 13.2825, "num_input_tokens_seen": 804881138, "step": 326 }, { "epoch": 0.3441501490966497, "grad_norm": 0.6212575435638428, "learning_rate": 0.00019930320623055435, "loss": 14.3103, "num_input_tokens_seen": 807339016, "step": 327 }, { "epoch": 0.3452025960357832, "grad_norm": 0.6667155623435974, "learning_rate": 0.00019738209030646764, "loss": 13.4151, "num_input_tokens_seen": 809901380, "step": 328 }, { "epoch": 0.34625504297491666, "grad_norm": 0.5312402248382568, "learning_rate": 0.00019546422010444548, "loss": 13.9075, "num_input_tokens_seen": 812291846, "step": 329 }, { "epoch": 0.3473074899140502, "grad_norm": 0.727912187576294, "learning_rate": 0.00019354971392775578, "loss": 13.0755, "num_input_tokens_seen": 814880042, "step": 330 }, { "epoch": 0.34835993685318367, "grad_norm": 0.675326943397522, "learning_rate": 0.00019163868987215785, "loss": 13.3719, "num_input_tokens_seen": 817425360, "step": 331 }, { "epoch": 0.34941238379231715, "grad_norm": 0.8266693949699402, "learning_rate": 0.0001897312658186169, "loss": 13.2629, "num_input_tokens_seen": 819757120, "step": 332 }, { "epoch": 0.3504648307314506, "grad_norm": 0.5413087606430054, "learning_rate": 0.00018782755942603355, "loss": 13.8487, "num_input_tokens_seen": 822236352, "step": 333 }, { "epoch": 0.3515172776705841, "grad_norm": 0.5950444936752319, "learning_rate": 0.00018592768812398516, "loss": 13.2504, "num_input_tokens_seen": 824801066, "step": 334 }, { "epoch": 0.3525697246097176, "grad_norm": 0.7902871370315552, "learning_rate": 0.0001840317691054831, "loss": 13.141, "num_input_tokens_seen": 827214108, "step": 335 }, { "epoch": 0.35362217154885106, "grad_norm": 0.6074835658073425, "learning_rate": 0.00018213991931974273, "loss": 13.1971, "num_input_tokens_seen": 829688706, "step": 336 }, { "epoch": 0.35467461848798454, "grad_norm": 0.8195119500160217, "learning_rate": 0.00018025225546497038, "loss": 13.7278, "num_input_tokens_seen": 832226940, "step": 337 }, { "epoch": 0.35572706542711807, "grad_norm": 0.6209080815315247, "learning_rate": 0.00017836889398116423, "loss": 12.7686, "num_input_tokens_seen": 834609840, "step": 338 }, { "epoch": 0.35677951236625155, "grad_norm": 0.741378128528595, "learning_rate": 0.0001764899510429322, "loss": 15.7956, "num_input_tokens_seen": 836916686, "step": 339 }, { "epoch": 0.35783195930538503, "grad_norm": 0.6312536597251892, "learning_rate": 0.00017461554255232512, "loss": 13.872, "num_input_tokens_seen": 839443446, "step": 340 }, { "epoch": 0.3588844062445185, "grad_norm": 0.495096892118454, "learning_rate": 0.00017274578413168805, "loss": 14.3015, "num_input_tokens_seen": 841951454, "step": 341 }, { "epoch": 0.359936853183652, "grad_norm": 0.7591367959976196, "learning_rate": 0.00017088079111652767, "loss": 13.2749, "num_input_tokens_seen": 844453022, "step": 342 }, { "epoch": 0.36098930012278546, "grad_norm": 0.6761007308959961, "learning_rate": 0.00016902067854839817, "loss": 13.3922, "num_input_tokens_seen": 846780950, "step": 343 }, { "epoch": 0.36204174706191894, "grad_norm": 0.7525628209114075, "learning_rate": 0.00016716556116780446, "loss": 14.6929, "num_input_tokens_seen": 849183708, "step": 344 }, { "epoch": 0.3630941940010525, "grad_norm": 0.9306416511535645, "learning_rate": 0.00016531555340712516, "loss": 13.432, "num_input_tokens_seen": 851511450, "step": 345 }, { "epoch": 0.36414664094018595, "grad_norm": 0.6304547786712646, "learning_rate": 0.00016347076938355316, "loss": 15.4361, "num_input_tokens_seen": 853993150, "step": 346 }, { "epoch": 0.36519908787931943, "grad_norm": 1.0461174249649048, "learning_rate": 0.00016163132289205687, "loss": 13.2764, "num_input_tokens_seen": 856344778, "step": 347 }, { "epoch": 0.3662515348184529, "grad_norm": 0.7617403268814087, "learning_rate": 0.00015979732739836023, "loss": 14.3852, "num_input_tokens_seen": 858729952, "step": 348 }, { "epoch": 0.3673039817575864, "grad_norm": 0.5618032813072205, "learning_rate": 0.00015796889603194434, "loss": 12.0861, "num_input_tokens_seen": 861130632, "step": 349 }, { "epoch": 0.36835642869671986, "grad_norm": 0.6796286702156067, "learning_rate": 0.00015614614157906848, "loss": 14.1596, "num_input_tokens_seen": 863712730, "step": 350 }, { "epoch": 0.36940887563585334, "grad_norm": 0.6022249460220337, "learning_rate": 0.00015432917647581338, "loss": 13.8241, "num_input_tokens_seen": 866066478, "step": 351 }, { "epoch": 0.3704613225749868, "grad_norm": 0.5393820405006409, "learning_rate": 0.00015251811280114522, "loss": 14.4018, "num_input_tokens_seen": 868398442, "step": 352 }, { "epoch": 0.37151376951412035, "grad_norm": 0.5526579022407532, "learning_rate": 0.00015071306227000213, "loss": 13.5037, "num_input_tokens_seen": 871065104, "step": 353 }, { "epoch": 0.37256621645325383, "grad_norm": 0.6372029781341553, "learning_rate": 0.00014891413622640368, "loss": 14.1205, "num_input_tokens_seen": 873457652, "step": 354 }, { "epoch": 0.3736186633923873, "grad_norm": 0.5851763486862183, "learning_rate": 0.00014712144563658178, "loss": 13.8458, "num_input_tokens_seen": 875897950, "step": 355 }, { "epoch": 0.3746711103315208, "grad_norm": 0.5874693393707275, "learning_rate": 0.0001453351010821365, "loss": 13.1795, "num_input_tokens_seen": 878370924, "step": 356 }, { "epoch": 0.37572355727065426, "grad_norm": 0.8740347623825073, "learning_rate": 0.00014355521275321415, "loss": 11.385, "num_input_tokens_seen": 880969016, "step": 357 }, { "epoch": 0.37677600420978774, "grad_norm": 0.5970618724822998, "learning_rate": 0.00014178189044171117, "loss": 14.6007, "num_input_tokens_seen": 883387056, "step": 358 }, { "epoch": 0.3778284511489212, "grad_norm": 0.7638245224952698, "learning_rate": 0.00014001524353450046, "loss": 14.0914, "num_input_tokens_seen": 885934126, "step": 359 }, { "epoch": 0.37888089808805475, "grad_norm": 0.7875133156776428, "learning_rate": 0.00013825538100668526, "loss": 12.3132, "num_input_tokens_seen": 888385086, "step": 360 }, { "epoch": 0.37993334502718823, "grad_norm": 0.6146247386932373, "learning_rate": 0.00013650241141487582, "loss": 13.5411, "num_input_tokens_seen": 890881778, "step": 361 }, { "epoch": 0.3809857919663217, "grad_norm": 0.6689059138298035, "learning_rate": 0.00013475644289049382, "loss": 13.9515, "num_input_tokens_seen": 893336660, "step": 362 }, { "epoch": 0.3820382389054552, "grad_norm": 0.862234890460968, "learning_rate": 0.00013301758313310206, "loss": 13.7711, "num_input_tokens_seen": 895904604, "step": 363 }, { "epoch": 0.38309068584458866, "grad_norm": 1.0258100032806396, "learning_rate": 0.00013128593940376116, "loss": 10.847, "num_input_tokens_seen": 898445222, "step": 364 }, { "epoch": 0.38414313278372214, "grad_norm": 0.6869201064109802, "learning_rate": 0.000129561618518413, "loss": 15.5992, "num_input_tokens_seen": 900863048, "step": 365 }, { "epoch": 0.3851955797228556, "grad_norm": 0.5204945206642151, "learning_rate": 0.0001278447268412924, "loss": 14.8587, "num_input_tokens_seen": 903463666, "step": 366 }, { "epoch": 0.38624802666198915, "grad_norm": 0.721432626247406, "learning_rate": 0.00012613537027836484, "loss": 13.0708, "num_input_tokens_seen": 905997924, "step": 367 }, { "epoch": 0.38730047360112263, "grad_norm": 0.9438528418540955, "learning_rate": 0.00012443365427079522, "loss": 13.3503, "num_input_tokens_seen": 908380228, "step": 368 }, { "epoch": 0.3883529205402561, "grad_norm": 1.0060820579528809, "learning_rate": 0.00012273968378844258, "loss": 13.2575, "num_input_tokens_seen": 910897324, "step": 369 }, { "epoch": 0.3894053674793896, "grad_norm": 0.78006511926651, "learning_rate": 0.00012105356332338561, "loss": 12.1983, "num_input_tokens_seen": 913353406, "step": 370 }, { "epoch": 0.39045781441852306, "grad_norm": 0.7305580973625183, "learning_rate": 0.00011937539688347693, "loss": 13.0814, "num_input_tokens_seen": 915785748, "step": 371 }, { "epoch": 0.39151026135765654, "grad_norm": 0.6684518456459045, "learning_rate": 0.00011770528798592742, "loss": 13.2799, "num_input_tokens_seen": 918190994, "step": 372 }, { "epoch": 0.39256270829679, "grad_norm": 0.6175460815429688, "learning_rate": 0.00011604333965092066, "loss": 11.9522, "num_input_tokens_seen": 920686404, "step": 373 }, { "epoch": 0.3936151552359235, "grad_norm": 0.5656607747077942, "learning_rate": 0.00011438965439525878, "loss": 13.4447, "num_input_tokens_seen": 923095658, "step": 374 }, { "epoch": 0.39466760217505703, "grad_norm": 0.6983336210250854, "learning_rate": 0.00011274433422603749, "loss": 14.1552, "num_input_tokens_seen": 925586940, "step": 375 }, { "epoch": 0.3957200491141905, "grad_norm": 0.5841440558433533, "learning_rate": 0.00011110748063435535, "loss": 13.7801, "num_input_tokens_seen": 927981268, "step": 376 }, { "epoch": 0.396772496053324, "grad_norm": 0.6267594695091248, "learning_rate": 0.00010947919458905186, "loss": 13.4331, "num_input_tokens_seen": 930493644, "step": 377 }, { "epoch": 0.39782494299245746, "grad_norm": 0.7009475231170654, "learning_rate": 0.00010785957653048027, "loss": 13.0888, "num_input_tokens_seen": 932987250, "step": 378 }, { "epoch": 0.39887738993159094, "grad_norm": 0.8064966797828674, "learning_rate": 0.00010624872636431175, "loss": 14.1741, "num_input_tokens_seen": 935503420, "step": 379 }, { "epoch": 0.3999298368707244, "grad_norm": 0.9372491836547852, "learning_rate": 0.00010464674345537197, "loss": 12.661, "num_input_tokens_seen": 937916022, "step": 380 }, { "epoch": 0.4009822838098579, "grad_norm": 0.7157238125801086, "learning_rate": 0.00010305372662151306, "loss": 12.9818, "num_input_tokens_seen": 940356578, "step": 381 }, { "epoch": 0.40203473074899143, "grad_norm": 0.8827713131904602, "learning_rate": 0.00010146977412751696, "loss": 13.0314, "num_input_tokens_seen": 942857210, "step": 382 }, { "epoch": 0.4030871776881249, "grad_norm": 0.751990795135498, "learning_rate": 9.989498367903467e-05, "loss": 12.3977, "num_input_tokens_seen": 945433158, "step": 383 }, { "epoch": 0.4041396246272584, "grad_norm": 0.761873185634613, "learning_rate": 9.83294524165589e-05, "loss": 13.9703, "num_input_tokens_seen": 947919298, "step": 384 }, { "epoch": 0.40519207156639186, "grad_norm": 0.4988192915916443, "learning_rate": 9.677327690943229e-05, "loss": 11.4229, "num_input_tokens_seen": 950383016, "step": 385 }, { "epoch": 0.40624451850552534, "grad_norm": 0.6990517973899841, "learning_rate": 9.522655314989022e-05, "loss": 12.8871, "num_input_tokens_seen": 952918510, "step": 386 }, { "epoch": 0.4072969654446588, "grad_norm": 0.745590329170227, "learning_rate": 9.36893765471402e-05, "loss": 13.1671, "num_input_tokens_seen": 955359356, "step": 387 }, { "epoch": 0.4083494123837923, "grad_norm": 0.6261069774627686, "learning_rate": 9.216184192147546e-05, "loss": 13.4955, "num_input_tokens_seen": 957716656, "step": 388 }, { "epoch": 0.4094018593229258, "grad_norm": 0.9301333427429199, "learning_rate": 9.064404349842731e-05, "loss": 13.2041, "num_input_tokens_seen": 960222350, "step": 389 }, { "epoch": 0.4104543062620593, "grad_norm": 0.7653475999832153, "learning_rate": 8.913607490295189e-05, "loss": 13.6695, "num_input_tokens_seen": 962817616, "step": 390 }, { "epoch": 0.4115067532011928, "grad_norm": 0.5150465369224548, "learning_rate": 8.763802915365534e-05, "loss": 13.9587, "num_input_tokens_seen": 965379812, "step": 391 }, { "epoch": 0.41255920014032627, "grad_norm": 0.6819850206375122, "learning_rate": 8.614999865705583e-05, "loss": 13.9956, "num_input_tokens_seen": 967770264, "step": 392 }, { "epoch": 0.41361164707945974, "grad_norm": 0.6354870200157166, "learning_rate": 8.46720752018837e-05, "loss": 14.0567, "num_input_tokens_seen": 970191078, "step": 393 }, { "epoch": 0.4146640940185932, "grad_norm": 0.7000157833099365, "learning_rate": 8.320434995341921e-05, "loss": 12.864, "num_input_tokens_seen": 972628920, "step": 394 }, { "epoch": 0.4157165409577267, "grad_norm": 0.6780827045440674, "learning_rate": 8.174691344786956e-05, "loss": 15.1169, "num_input_tokens_seen": 975172648, "step": 395 }, { "epoch": 0.4167689878968602, "grad_norm": 0.6616666316986084, "learning_rate": 8.02998555867832e-05, "loss": 13.6806, "num_input_tokens_seen": 977537432, "step": 396 }, { "epoch": 0.4178214348359937, "grad_norm": 0.6873002648353577, "learning_rate": 7.886326563150547e-05, "loss": 14.3124, "num_input_tokens_seen": 980061168, "step": 397 }, { "epoch": 0.4188738817751272, "grad_norm": 0.6357054710388184, "learning_rate": 7.743723219767172e-05, "loss": 12.8707, "num_input_tokens_seen": 982492018, "step": 398 }, { "epoch": 0.41992632871426067, "grad_norm": 0.6620740294456482, "learning_rate": 7.602184324974135e-05, "loss": 13.7987, "num_input_tokens_seen": 984909712, "step": 399 }, { "epoch": 0.42097877565339414, "grad_norm": 0.8262244462966919, "learning_rate": 7.461718609557173e-05, "loss": 12.0499, "num_input_tokens_seen": 987300404, "step": 400 }, { "epoch": 0.4220312225925276, "grad_norm": 0.40218013525009155, "learning_rate": 7.322334738103267e-05, "loss": 14.411, "num_input_tokens_seen": 989692606, "step": 401 }, { "epoch": 0.4230836695316611, "grad_norm": 0.5674455165863037, "learning_rate": 7.184041308466164e-05, "loss": 13.5246, "num_input_tokens_seen": 992001040, "step": 402 }, { "epoch": 0.4241361164707946, "grad_norm": 0.8044447898864746, "learning_rate": 7.046846851236027e-05, "loss": 13.9591, "num_input_tokens_seen": 994545566, "step": 403 }, { "epoch": 0.42518856340992806, "grad_norm": 0.8320671319961548, "learning_rate": 6.910759829213217e-05, "loss": 12.727, "num_input_tokens_seen": 996909322, "step": 404 }, { "epoch": 0.4262410103490616, "grad_norm": 0.45550259947776794, "learning_rate": 6.775788636886286e-05, "loss": 14.0767, "num_input_tokens_seen": 999395798, "step": 405 }, { "epoch": 0.42729345728819507, "grad_norm": 0.5679682493209839, "learning_rate": 6.64194159991414e-05, "loss": 13.6373, "num_input_tokens_seen": 1001628552, "step": 406 }, { "epoch": 0.42834590422732854, "grad_norm": 0.6373701691627502, "learning_rate": 6.509226974612494e-05, "loss": 14.5913, "num_input_tokens_seen": 1004121580, "step": 407 }, { "epoch": 0.429398351166462, "grad_norm": 0.9872604608535767, "learning_rate": 6.377652947444598e-05, "loss": 12.2578, "num_input_tokens_seen": 1006616742, "step": 408 }, { "epoch": 0.4304507981055955, "grad_norm": 0.8993603587150574, "learning_rate": 6.247227634516185e-05, "loss": 14.2681, "num_input_tokens_seen": 1009046128, "step": 409 }, { "epoch": 0.431503245044729, "grad_norm": 0.5047195553779602, "learning_rate": 6.117959081074942e-05, "loss": 13.8359, "num_input_tokens_seen": 1011365008, "step": 410 }, { "epoch": 0.43255569198386246, "grad_norm": 0.7039942145347595, "learning_rate": 5.989855261014141e-05, "loss": 13.6884, "num_input_tokens_seen": 1013738386, "step": 411 }, { "epoch": 0.433608138922996, "grad_norm": 0.640019953250885, "learning_rate": 5.862924076380834e-05, "loss": 13.8977, "num_input_tokens_seen": 1016294640, "step": 412 }, { "epoch": 0.43466058586212947, "grad_norm": 0.7602420449256897, "learning_rate": 5.737173356888379e-05, "loss": 12.9368, "num_input_tokens_seen": 1018758128, "step": 413 }, { "epoch": 0.43571303280126294, "grad_norm": 0.5742185711860657, "learning_rate": 5.6126108594335e-05, "loss": 13.6752, "num_input_tokens_seen": 1021235728, "step": 414 }, { "epoch": 0.4367654797403964, "grad_norm": 0.8279076814651489, "learning_rate": 5.489244267617774e-05, "loss": 11.8657, "num_input_tokens_seen": 1023706144, "step": 415 }, { "epoch": 0.4378179266795299, "grad_norm": 0.6805025339126587, "learning_rate": 5.3670811912737094e-05, "loss": 12.8483, "num_input_tokens_seen": 1026352302, "step": 416 }, { "epoch": 0.4388703736186634, "grad_norm": 0.6015728712081909, "learning_rate": 5.246129165995271e-05, "loss": 14.792, "num_input_tokens_seen": 1028913122, "step": 417 }, { "epoch": 0.43992282055779686, "grad_norm": 0.546303391456604, "learning_rate": 5.126395652673133e-05, "loss": 12.1327, "num_input_tokens_seen": 1031498770, "step": 418 }, { "epoch": 0.4409752674969304, "grad_norm": 0.8424621820449829, "learning_rate": 5.0078880370343816e-05, "loss": 14.136, "num_input_tokens_seen": 1034059758, "step": 419 }, { "epoch": 0.44202771443606387, "grad_norm": 0.6246097087860107, "learning_rate": 4.890613629186976e-05, "loss": 14.3438, "num_input_tokens_seen": 1036227524, "step": 420 }, { "epoch": 0.44308016137519735, "grad_norm": 0.7312206029891968, "learning_rate": 4.774579663168803e-05, "loss": 13.7757, "num_input_tokens_seen": 1038566110, "step": 421 }, { "epoch": 0.4441326083143308, "grad_norm": 1.095587134361267, "learning_rate": 4.659793296501451e-05, "loss": 11.8454, "num_input_tokens_seen": 1040926360, "step": 422 }, { "epoch": 0.4451850552534643, "grad_norm": 0.7614743113517761, "learning_rate": 4.546261609748703e-05, "loss": 13.9703, "num_input_tokens_seen": 1043399064, "step": 423 }, { "epoch": 0.4462375021925978, "grad_norm": 0.5542216300964355, "learning_rate": 4.4339916060798035e-05, "loss": 14.1185, "num_input_tokens_seen": 1045906682, "step": 424 }, { "epoch": 0.44728994913173126, "grad_norm": 0.6299486756324768, "learning_rate": 4.322990210837387e-05, "loss": 13.9111, "num_input_tokens_seen": 1048364322, "step": 425 }, { "epoch": 0.44834239607086473, "grad_norm": 0.5358718633651733, "learning_rate": 4.213264271110397e-05, "loss": 13.7337, "num_input_tokens_seen": 1050787094, "step": 426 }, { "epoch": 0.44939484300999827, "grad_norm": 0.5977242588996887, "learning_rate": 4.1048205553116524e-05, "loss": 14.4294, "num_input_tokens_seen": 1053206500, "step": 427 }, { "epoch": 0.45044728994913175, "grad_norm": 0.6246611475944519, "learning_rate": 3.997665752760337e-05, "loss": 14.1778, "num_input_tokens_seen": 1055658840, "step": 428 }, { "epoch": 0.4514997368882652, "grad_norm": 0.7311842441558838, "learning_rate": 3.891806473269443e-05, "loss": 12.7367, "num_input_tokens_seen": 1058022042, "step": 429 }, { "epoch": 0.4525521838273987, "grad_norm": 0.6863019466400146, "learning_rate": 3.7872492467379484e-05, "loss": 13.1931, "num_input_tokens_seen": 1060447832, "step": 430 }, { "epoch": 0.4536046307665322, "grad_norm": 0.5671864151954651, "learning_rate": 3.684000522748107e-05, "loss": 14.1438, "num_input_tokens_seen": 1063104176, "step": 431 }, { "epoch": 0.45465707770566566, "grad_norm": 0.7966405749320984, "learning_rate": 3.582066670167559e-05, "loss": 12.3563, "num_input_tokens_seen": 1065688708, "step": 432 }, { "epoch": 0.45570952464479914, "grad_norm": 0.4855959713459015, "learning_rate": 3.4814539767564794e-05, "loss": 14.5512, "num_input_tokens_seen": 1068178408, "step": 433 }, { "epoch": 0.45676197158393267, "grad_norm": 0.6974388360977173, "learning_rate": 3.382168648779727e-05, "loss": 14.3312, "num_input_tokens_seen": 1070677814, "step": 434 }, { "epoch": 0.45781441852306615, "grad_norm": 0.7173691391944885, "learning_rate": 3.284216810624009e-05, "loss": 14.734, "num_input_tokens_seen": 1073127242, "step": 435 }, { "epoch": 0.4588668654621996, "grad_norm": 0.7349993586540222, "learning_rate": 3.1876045044200884e-05, "loss": 13.1637, "num_input_tokens_seen": 1075536720, "step": 436 }, { "epoch": 0.4599193124013331, "grad_norm": 0.8061495423316956, "learning_rate": 3.092337689670117e-05, "loss": 11.83, "num_input_tokens_seen": 1078097762, "step": 437 }, { "epoch": 0.4609717593404666, "grad_norm": 0.6008654832839966, "learning_rate": 2.9984222428799473e-05, "loss": 14.8336, "num_input_tokens_seen": 1080718132, "step": 438 }, { "epoch": 0.46202420627960006, "grad_norm": 0.6301268935203552, "learning_rate": 2.9058639571967387e-05, "loss": 13.1775, "num_input_tokens_seen": 1083127310, "step": 439 }, { "epoch": 0.46307665321873354, "grad_norm": 0.4426977038383484, "learning_rate": 2.814668542051539e-05, "loss": 13.0319, "num_input_tokens_seen": 1085568326, "step": 440 }, { "epoch": 0.464129100157867, "grad_norm": 0.8434951305389404, "learning_rate": 2.724841622807116e-05, "loss": 13.3485, "num_input_tokens_seen": 1088007162, "step": 441 }, { "epoch": 0.46518154709700055, "grad_norm": 0.7317315340042114, "learning_rate": 2.6363887404109744e-05, "loss": 13.436, "num_input_tokens_seen": 1090544604, "step": 442 }, { "epoch": 0.466233994036134, "grad_norm": 0.6956573128700256, "learning_rate": 2.5493153510535512e-05, "loss": 12.6054, "num_input_tokens_seen": 1093110288, "step": 443 }, { "epoch": 0.4672864409752675, "grad_norm": 0.6341850161552429, "learning_rate": 2.4636268258316483e-05, "loss": 13.9492, "num_input_tokens_seen": 1095456540, "step": 444 }, { "epoch": 0.468338887914401, "grad_norm": 0.6020154356956482, "learning_rate": 2.3793284504171477e-05, "loss": 12.8645, "num_input_tokens_seen": 1098018502, "step": 445 }, { "epoch": 0.46939133485353446, "grad_norm": 0.7971248030662537, "learning_rate": 2.2964254247309006e-05, "loss": 13.2963, "num_input_tokens_seen": 1100536788, "step": 446 }, { "epoch": 0.47044378179266794, "grad_norm": 0.5500831604003906, "learning_rate": 2.2149228626220597e-05, "loss": 14.3365, "num_input_tokens_seen": 1103063202, "step": 447 }, { "epoch": 0.4714962287318014, "grad_norm": 0.6911066174507141, "learning_rate": 2.1348257915525593e-05, "loss": 13.1402, "num_input_tokens_seen": 1105464122, "step": 448 }, { "epoch": 0.47254867567093495, "grad_norm": 0.8113967180252075, "learning_rate": 2.056139152287036e-05, "loss": 12.0192, "num_input_tokens_seen": 1107882000, "step": 449 }, { "epoch": 0.4736011226100684, "grad_norm": 0.7161016464233398, "learning_rate": 1.9788677985880458e-05, "loss": 13.2784, "num_input_tokens_seen": 1110250460, "step": 450 }, { "epoch": 0.4746535695492019, "grad_norm": 0.657977819442749, "learning_rate": 1.9030164969166632e-05, "loss": 13.867, "num_input_tokens_seen": 1112709658, "step": 451 }, { "epoch": 0.4757060164883354, "grad_norm": 0.7000032663345337, "learning_rate": 1.8285899261384692e-05, "loss": 14.4708, "num_input_tokens_seen": 1115334966, "step": 452 }, { "epoch": 0.47675846342746886, "grad_norm": 0.6344372630119324, "learning_rate": 1.7555926772349314e-05, "loss": 13.3293, "num_input_tokens_seen": 1117752722, "step": 453 }, { "epoch": 0.47781091036660234, "grad_norm": 0.5095113515853882, "learning_rate": 1.6840292530202083e-05, "loss": 14.4371, "num_input_tokens_seen": 1120147724, "step": 454 }, { "epoch": 0.4788633573057358, "grad_norm": 0.7433231472969055, "learning_rate": 1.6139040678633956e-05, "loss": 14.4739, "num_input_tokens_seen": 1122526110, "step": 455 }, { "epoch": 0.47991580424486935, "grad_norm": 0.6738459467887878, "learning_rate": 1.545221447416239e-05, "loss": 14.1935, "num_input_tokens_seen": 1125050832, "step": 456 }, { "epoch": 0.4809682511840028, "grad_norm": 0.5307145714759827, "learning_rate": 1.477985628346286e-05, "loss": 14.1727, "num_input_tokens_seen": 1127531366, "step": 457 }, { "epoch": 0.4820206981231363, "grad_norm": 0.671658456325531, "learning_rate": 1.412200758075573e-05, "loss": 14.8937, "num_input_tokens_seen": 1130070944, "step": 458 }, { "epoch": 0.4830731450622698, "grad_norm": 0.8140004277229309, "learning_rate": 1.3478708945247646e-05, "loss": 13.9695, "num_input_tokens_seen": 1132505090, "step": 459 }, { "epoch": 0.48412559200140326, "grad_norm": 0.541232168674469, "learning_rate": 1.2850000058628718e-05, "loss": 13.0277, "num_input_tokens_seen": 1134933018, "step": 460 }, { "epoch": 0.48517803894053674, "grad_norm": 0.5849113464355469, "learning_rate": 1.2235919702624524e-05, "loss": 13.395, "num_input_tokens_seen": 1137318036, "step": 461 }, { "epoch": 0.4862304858796702, "grad_norm": 0.8292491436004639, "learning_rate": 1.1636505756604008e-05, "loss": 12.6616, "num_input_tokens_seen": 1139822632, "step": 462 }, { "epoch": 0.4872829328188037, "grad_norm": 0.37731918692588806, "learning_rate": 1.105179519524284e-05, "loss": 14.3294, "num_input_tokens_seen": 1142311370, "step": 463 }, { "epoch": 0.4883353797579372, "grad_norm": 0.8327419757843018, "learning_rate": 1.048182408624271e-05, "loss": 13.062, "num_input_tokens_seen": 1144778138, "step": 464 }, { "epoch": 0.4893878266970707, "grad_norm": 0.4809480905532837, "learning_rate": 9.926627588106376e-06, "loss": 13.46, "num_input_tokens_seen": 1147178708, "step": 465 }, { "epoch": 0.4904402736362042, "grad_norm": 0.7914392948150635, "learning_rate": 9.38623994796912e-06, "loss": 12.4968, "num_input_tokens_seen": 1149583894, "step": 466 }, { "epoch": 0.49149272057533766, "grad_norm": 0.741800844669342, "learning_rate": 8.860694499485942e-06, "loss": 13.6282, "num_input_tokens_seen": 1152083358, "step": 467 }, { "epoch": 0.49254516751447114, "grad_norm": 0.5907045602798462, "learning_rate": 8.350023660775733e-06, "loss": 13.7365, "num_input_tokens_seen": 1154606974, "step": 468 }, { "epoch": 0.4935976144536046, "grad_norm": 0.8996415138244629, "learning_rate": 7.85425893242126e-06, "loss": 14.2235, "num_input_tokens_seen": 1156988262, "step": 469 }, { "epoch": 0.4946500613927381, "grad_norm": 0.5644741654396057, "learning_rate": 7.373430895526221e-06, "loss": 14.1453, "num_input_tokens_seen": 1159550376, "step": 470 }, { "epoch": 0.4957025083318716, "grad_norm": 0.5078418254852295, "learning_rate": 6.907569209828871e-06, "loss": 14.5073, "num_input_tokens_seen": 1161953972, "step": 471 }, { "epoch": 0.4967549552710051, "grad_norm": 0.5072042346000671, "learning_rate": 6.4567026118723395e-06, "loss": 13.6735, "num_input_tokens_seen": 1164459876, "step": 472 }, { "epoch": 0.4978074022101386, "grad_norm": 0.8498991131782532, "learning_rate": 6.02085891323219e-06, "loss": 12.9484, "num_input_tokens_seen": 1166944212, "step": 473 }, { "epoch": 0.49885984914927206, "grad_norm": 0.5259454250335693, "learning_rate": 5.600064998800795e-06, "loss": 13.0602, "num_input_tokens_seen": 1169363446, "step": 474 }, { "epoch": 0.49991229608840554, "grad_norm": 0.8748670816421509, "learning_rate": 5.1943468251288e-06, "loss": 12.8074, "num_input_tokens_seen": 1171773940, "step": 475 }, { "epoch": 0.500964743027539, "grad_norm": 0.7007580995559692, "learning_rate": 4.803729418824403e-06, "loss": 14.0435, "num_input_tokens_seen": 1174175376, "step": 476 }, { "epoch": 0.5020171899666726, "grad_norm": 0.8660174012184143, "learning_rate": 4.428236875009116e-06, "loss": 12.0599, "num_input_tokens_seen": 1176702780, "step": 477 }, { "epoch": 0.503069636905806, "grad_norm": 0.5998972654342651, "learning_rate": 4.067892355831737e-06, "loss": 12.7006, "num_input_tokens_seen": 1179103346, "step": 478 }, { "epoch": 0.5041220838449395, "grad_norm": 0.523493766784668, "learning_rate": 3.7227180890396875e-06, "loss": 13.5402, "num_input_tokens_seen": 1181641814, "step": 479 }, { "epoch": 0.5051745307840729, "grad_norm": 0.7041788101196289, "learning_rate": 3.3927353666075715e-06, "loss": 12.8639, "num_input_tokens_seen": 1184182596, "step": 480 }, { "epoch": 0.5062269777232065, "grad_norm": 0.7297492027282715, "learning_rate": 3.0779645434241003e-06, "loss": 12.8612, "num_input_tokens_seen": 1186681022, "step": 481 }, { "epoch": 0.50727942466234, "grad_norm": 0.4691142737865448, "learning_rate": 2.77842503603634e-06, "loss": 14.5915, "num_input_tokens_seen": 1189091364, "step": 482 }, { "epoch": 0.5083318716014734, "grad_norm": 0.5547676086425781, "learning_rate": 2.494135321452059e-06, "loss": 13.4161, "num_input_tokens_seen": 1191699806, "step": 483 }, { "epoch": 0.509384318540607, "grad_norm": 0.7380577325820923, "learning_rate": 2.225112936000003e-06, "loss": 13.8698, "num_input_tokens_seen": 1194212232, "step": 484 }, { "epoch": 0.5104367654797404, "grad_norm": 0.9378113746643066, "learning_rate": 1.971374474248092e-06, "loss": 14.4525, "num_input_tokens_seen": 1196750508, "step": 485 }, { "epoch": 0.5114892124188739, "grad_norm": 0.8839085102081299, "learning_rate": 1.7329355879798507e-06, "loss": 12.9042, "num_input_tokens_seen": 1199088972, "step": 486 }, { "epoch": 0.5125416593580073, "grad_norm": 0.6598412990570068, "learning_rate": 1.5098109852289876e-06, "loss": 12.2719, "num_input_tokens_seen": 1201585668, "step": 487 }, { "epoch": 0.5135941062971409, "grad_norm": 0.5915337800979614, "learning_rate": 1.3020144293719245e-06, "loss": 13.4832, "num_input_tokens_seen": 1204102596, "step": 488 }, { "epoch": 0.5146465532362743, "grad_norm": 0.7219597697257996, "learning_rate": 1.1095587382791162e-06, "loss": 12.7485, "num_input_tokens_seen": 1206590908, "step": 489 }, { "epoch": 0.5156990001754078, "grad_norm": 0.5665842294692993, "learning_rate": 9.324557835240982e-07, "loss": 14.8871, "num_input_tokens_seen": 1208913176, "step": 490 }, { "epoch": 0.5167514471145414, "grad_norm": 0.7350465059280396, "learning_rate": 7.707164896513524e-07, "loss": 12.5099, "num_input_tokens_seen": 1211451838, "step": 491 }, { "epoch": 0.5178038940536748, "grad_norm": 0.797666072845459, "learning_rate": 6.243508335023997e-07, "loss": 13.7286, "num_input_tokens_seen": 1213826532, "step": 492 }, { "epoch": 0.5188563409928083, "grad_norm": 1.060686707496643, "learning_rate": 4.93367843600321e-07, "loss": 13.8689, "num_input_tokens_seen": 1216247526, "step": 493 }, { "epoch": 0.5199087879319417, "grad_norm": 0.860808789730072, "learning_rate": 3.7777559959295314e-07, "loss": 12.8744, "num_input_tokens_seen": 1218736208, "step": 494 }, { "epoch": 0.5209612348710753, "grad_norm": 0.5102307796478271, "learning_rate": 2.775812317543982e-07, "loss": 15.3648, "num_input_tokens_seen": 1221143152, "step": 495 }, { "epoch": 0.5220136818102087, "grad_norm": 0.8146570324897766, "learning_rate": 1.927909205451808e-07, "loss": 13.0279, "num_input_tokens_seen": 1223602950, "step": 496 }, { "epoch": 0.5230661287493422, "grad_norm": 0.6213780045509338, "learning_rate": 1.2340989623110917e-07, "loss": 14.1967, "num_input_tokens_seen": 1226026300, "step": 497 }, { "epoch": 0.5241185756884758, "grad_norm": 0.9054573178291321, "learning_rate": 6.944243856061614e-08, "loss": 11.9951, "num_input_tokens_seen": 1228463134, "step": 498 }, { "epoch": 0.5251710226276092, "grad_norm": 0.6855276226997375, "learning_rate": 3.089187650066512e-08, "loss": 13.0875, "num_input_tokens_seen": 1230894900, "step": 499 }, { "epoch": 0.5262234695667427, "grad_norm": 0.7106302380561829, "learning_rate": 7.76058803152525e-09, "loss": 13.6421, "num_input_tokens_seen": 1233302178, "step": 500 }, { "epoch": 0.5262234695667427, "num_input_tokens_seen": 1233302178, "step": 500, "total_flos": 3.380627081653125e+18, "train_loss": 13.568301763534546, "train_runtime": 49956.6557, "train_samples_per_second": 0.48, "train_steps_per_second": 0.01 } ], "logging_steps": 1.0, "max_steps": 500, "num_input_tokens_seen": 1233302178, "num_train_epochs": 1, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.380627081653125e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }