{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 558, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003590664272890485, "grad_norm": 2.390625, "learning_rate": 0.0, "loss": 1.3118, "step": 1 }, { "epoch": 0.00718132854578097, "grad_norm": 1.78125, "learning_rate": 5.000000000000001e-07, "loss": 1.3109, "step": 2 }, { "epoch": 0.010771992818671455, "grad_norm": 2.3125, "learning_rate": 1.0000000000000002e-06, "loss": 1.3326, "step": 3 }, { "epoch": 0.01436265709156194, "grad_norm": 2.1875, "learning_rate": 1.5e-06, "loss": 1.1802, "step": 4 }, { "epoch": 0.017953321364452424, "grad_norm": 1.875, "learning_rate": 2.0000000000000003e-06, "loss": 1.1417, "step": 5 }, { "epoch": 0.02154398563734291, "grad_norm": 1.9296875, "learning_rate": 2.5e-06, "loss": 1.1354, "step": 6 }, { "epoch": 0.025134649910233394, "grad_norm": 1.8515625, "learning_rate": 3e-06, "loss": 1.3538, "step": 7 }, { "epoch": 0.02872531418312388, "grad_norm": 1.5078125, "learning_rate": 3.5000000000000004e-06, "loss": 1.5037, "step": 8 }, { "epoch": 0.03231597845601436, "grad_norm": 1.7265625, "learning_rate": 4.000000000000001e-06, "loss": 1.3873, "step": 9 }, { "epoch": 0.03590664272890485, "grad_norm": 1.859375, "learning_rate": 4.5e-06, "loss": 1.493, "step": 10 }, { "epoch": 0.03949730700179533, "grad_norm": 1.3515625, "learning_rate": 5e-06, "loss": 1.0249, "step": 11 }, { "epoch": 0.04308797127468582, "grad_norm": 1.4296875, "learning_rate": 5.500000000000001e-06, "loss": 1.348, "step": 12 }, { "epoch": 0.0466786355475763, "grad_norm": 1.53125, "learning_rate": 6e-06, "loss": 1.1326, "step": 13 }, { "epoch": 0.05026929982046679, "grad_norm": 1.5625, "learning_rate": 6.5000000000000004e-06, "loss": 1.483, "step": 14 }, { "epoch": 0.05385996409335727, "grad_norm": 1.5625, "learning_rate": 7.000000000000001e-06, "loss": 1.319, "step": 15 }, { "epoch": 0.05745062836624776, "grad_norm": 1.125, "learning_rate": 7.5e-06, "loss": 1.2356, "step": 16 }, { "epoch": 0.06104129263913824, "grad_norm": 0.8828125, "learning_rate": 8.000000000000001e-06, "loss": 1.3812, "step": 17 }, { "epoch": 0.06463195691202872, "grad_norm": 0.88671875, "learning_rate": 8.500000000000002e-06, "loss": 1.09, "step": 18 }, { "epoch": 0.06822262118491922, "grad_norm": 0.84765625, "learning_rate": 9e-06, "loss": 1.129, "step": 19 }, { "epoch": 0.0718132854578097, "grad_norm": 0.6640625, "learning_rate": 9.5e-06, "loss": 1.0359, "step": 20 }, { "epoch": 0.07540394973070018, "grad_norm": 0.66796875, "learning_rate": 1e-05, "loss": 1.0774, "step": 21 }, { "epoch": 0.07899461400359066, "grad_norm": 0.470703125, "learning_rate": 1.05e-05, "loss": 1.17, "step": 22 }, { "epoch": 0.08258527827648116, "grad_norm": 0.41796875, "learning_rate": 1.1000000000000001e-05, "loss": 1.3448, "step": 23 }, { "epoch": 0.08617594254937164, "grad_norm": 0.423828125, "learning_rate": 1.1500000000000002e-05, "loss": 0.9623, "step": 24 }, { "epoch": 0.08976660682226212, "grad_norm": 0.392578125, "learning_rate": 1.2e-05, "loss": 1.1739, "step": 25 }, { "epoch": 0.0933572710951526, "grad_norm": 0.333984375, "learning_rate": 1.25e-05, "loss": 1.3992, "step": 26 }, { "epoch": 0.09694793536804308, "grad_norm": 0.33203125, "learning_rate": 1.3000000000000001e-05, "loss": 1.1802, "step": 27 }, { "epoch": 0.10053859964093358, "grad_norm": 0.2451171875, "learning_rate": 1.3500000000000001e-05, "loss": 0.9227, "step": 28 }, { "epoch": 0.10412926391382406, "grad_norm": 0.37890625, "learning_rate": 1.4000000000000001e-05, "loss": 1.0569, "step": 29 }, { "epoch": 0.10771992818671454, "grad_norm": 0.34375, "learning_rate": 1.45e-05, "loss": 1.0893, "step": 30 }, { "epoch": 0.11131059245960502, "grad_norm": 0.291015625, "learning_rate": 1.5e-05, "loss": 1.0711, "step": 31 }, { "epoch": 0.11490125673249552, "grad_norm": 0.259765625, "learning_rate": 1.55e-05, "loss": 0.9898, "step": 32 }, { "epoch": 0.118491921005386, "grad_norm": 0.228515625, "learning_rate": 1.6000000000000003e-05, "loss": 1.2314, "step": 33 }, { "epoch": 0.12208258527827648, "grad_norm": 0.279296875, "learning_rate": 1.65e-05, "loss": 0.9239, "step": 34 }, { "epoch": 0.12567324955116696, "grad_norm": 0.263671875, "learning_rate": 1.7000000000000003e-05, "loss": 0.9493, "step": 35 }, { "epoch": 0.12926391382405744, "grad_norm": 0.212890625, "learning_rate": 1.75e-05, "loss": 0.816, "step": 36 }, { "epoch": 0.13285457809694792, "grad_norm": 0.2158203125, "learning_rate": 1.8e-05, "loss": 0.8736, "step": 37 }, { "epoch": 0.13644524236983843, "grad_norm": 0.2265625, "learning_rate": 1.85e-05, "loss": 0.947, "step": 38 }, { "epoch": 0.1400359066427289, "grad_norm": 0.1875, "learning_rate": 1.9e-05, "loss": 1.0573, "step": 39 }, { "epoch": 0.1436265709156194, "grad_norm": 0.228515625, "learning_rate": 1.9500000000000003e-05, "loss": 0.8813, "step": 40 }, { "epoch": 0.14721723518850988, "grad_norm": 0.1982421875, "learning_rate": 2e-05, "loss": 1.0222, "step": 41 }, { "epoch": 0.15080789946140036, "grad_norm": 0.2138671875, "learning_rate": 2.05e-05, "loss": 1.0931, "step": 42 }, { "epoch": 0.15439856373429084, "grad_norm": 0.240234375, "learning_rate": 2.1e-05, "loss": 1.0039, "step": 43 }, { "epoch": 0.15798922800718132, "grad_norm": 0.2119140625, "learning_rate": 2.15e-05, "loss": 0.862, "step": 44 }, { "epoch": 0.1615798922800718, "grad_norm": 0.2216796875, "learning_rate": 2.2000000000000003e-05, "loss": 1.1314, "step": 45 }, { "epoch": 0.1651705565529623, "grad_norm": 0.22265625, "learning_rate": 2.25e-05, "loss": 0.9189, "step": 46 }, { "epoch": 0.1687612208258528, "grad_norm": 0.205078125, "learning_rate": 2.3000000000000003e-05, "loss": 0.8286, "step": 47 }, { "epoch": 0.17235188509874327, "grad_norm": 0.1455078125, "learning_rate": 2.35e-05, "loss": 0.6787, "step": 48 }, { "epoch": 0.17594254937163376, "grad_norm": 0.1923828125, "learning_rate": 2.4e-05, "loss": 0.8788, "step": 49 }, { "epoch": 0.17953321364452424, "grad_norm": 0.20703125, "learning_rate": 2.45e-05, "loss": 0.7822, "step": 50 }, { "epoch": 0.18312387791741472, "grad_norm": 0.19140625, "learning_rate": 2.5e-05, "loss": 1.0333, "step": 51 }, { "epoch": 0.1867145421903052, "grad_norm": 0.1904296875, "learning_rate": 2.5500000000000003e-05, "loss": 0.8933, "step": 52 }, { "epoch": 0.19030520646319568, "grad_norm": 0.2333984375, "learning_rate": 2.6000000000000002e-05, "loss": 0.8214, "step": 53 }, { "epoch": 0.19389587073608616, "grad_norm": 0.2109375, "learning_rate": 2.6500000000000004e-05, "loss": 0.801, "step": 54 }, { "epoch": 0.19748653500897667, "grad_norm": 0.1904296875, "learning_rate": 2.7000000000000002e-05, "loss": 1.0485, "step": 55 }, { "epoch": 0.20107719928186715, "grad_norm": 0.208984375, "learning_rate": 2.7500000000000004e-05, "loss": 0.8809, "step": 56 }, { "epoch": 0.20466786355475763, "grad_norm": 0.205078125, "learning_rate": 2.8000000000000003e-05, "loss": 0.6927, "step": 57 }, { "epoch": 0.20825852782764812, "grad_norm": 0.1572265625, "learning_rate": 2.8499999999999998e-05, "loss": 0.8153, "step": 58 }, { "epoch": 0.2118491921005386, "grad_norm": 0.1708984375, "learning_rate": 2.9e-05, "loss": 0.7591, "step": 59 }, { "epoch": 0.21543985637342908, "grad_norm": 0.21484375, "learning_rate": 2.95e-05, "loss": 0.9105, "step": 60 }, { "epoch": 0.21903052064631956, "grad_norm": 0.255859375, "learning_rate": 3e-05, "loss": 0.6683, "step": 61 }, { "epoch": 0.22262118491921004, "grad_norm": 0.2392578125, "learning_rate": 3.05e-05, "loss": 0.7747, "step": 62 }, { "epoch": 0.22621184919210055, "grad_norm": 0.212890625, "learning_rate": 3.1e-05, "loss": 0.9528, "step": 63 }, { "epoch": 0.22980251346499103, "grad_norm": 0.181640625, "learning_rate": 3.15e-05, "loss": 0.6261, "step": 64 }, { "epoch": 0.2333931777378815, "grad_norm": 0.25, "learning_rate": 3.2000000000000005e-05, "loss": 0.7867, "step": 65 }, { "epoch": 0.236983842010772, "grad_norm": 0.2412109375, "learning_rate": 3.2500000000000004e-05, "loss": 0.6613, "step": 66 }, { "epoch": 0.24057450628366248, "grad_norm": 0.34765625, "learning_rate": 3.3e-05, "loss": 0.6554, "step": 67 }, { "epoch": 0.24416517055655296, "grad_norm": 0.244140625, "learning_rate": 3.35e-05, "loss": 0.5564, "step": 68 }, { "epoch": 0.24775583482944344, "grad_norm": 0.2578125, "learning_rate": 3.4000000000000007e-05, "loss": 0.5263, "step": 69 }, { "epoch": 0.2513464991023339, "grad_norm": 0.263671875, "learning_rate": 3.45e-05, "loss": 0.7767, "step": 70 }, { "epoch": 0.25493716337522443, "grad_norm": 0.2080078125, "learning_rate": 3.5e-05, "loss": 0.5112, "step": 71 }, { "epoch": 0.2585278276481149, "grad_norm": 0.259765625, "learning_rate": 3.55e-05, "loss": 0.5489, "step": 72 }, { "epoch": 0.2621184919210054, "grad_norm": 0.263671875, "learning_rate": 3.6e-05, "loss": 0.8188, "step": 73 }, { "epoch": 0.26570915619389585, "grad_norm": 0.248046875, "learning_rate": 3.65e-05, "loss": 0.436, "step": 74 }, { "epoch": 0.26929982046678635, "grad_norm": 0.2353515625, "learning_rate": 3.7e-05, "loss": 0.5416, "step": 75 }, { "epoch": 0.27289048473967686, "grad_norm": 0.267578125, "learning_rate": 3.7500000000000003e-05, "loss": 0.4053, "step": 76 }, { "epoch": 0.2764811490125673, "grad_norm": 0.193359375, "learning_rate": 3.8e-05, "loss": 0.8033, "step": 77 }, { "epoch": 0.2800718132854578, "grad_norm": 0.1962890625, "learning_rate": 3.85e-05, "loss": 0.6404, "step": 78 }, { "epoch": 0.2836624775583483, "grad_norm": 0.1552734375, "learning_rate": 3.9000000000000006e-05, "loss": 0.5165, "step": 79 }, { "epoch": 0.2872531418312388, "grad_norm": 0.173828125, "learning_rate": 3.9500000000000005e-05, "loss": 0.5472, "step": 80 }, { "epoch": 0.29084380610412924, "grad_norm": 0.173828125, "learning_rate": 4e-05, "loss": 0.4093, "step": 81 }, { "epoch": 0.29443447037701975, "grad_norm": 0.2060546875, "learning_rate": 4.05e-05, "loss": 0.6968, "step": 82 }, { "epoch": 0.2980251346499102, "grad_norm": 0.1748046875, "learning_rate": 4.1e-05, "loss": 0.3833, "step": 83 }, { "epoch": 0.3016157989228007, "grad_norm": 0.1865234375, "learning_rate": 4.15e-05, "loss": 0.885, "step": 84 }, { "epoch": 0.3052064631956912, "grad_norm": 0.1796875, "learning_rate": 4.2e-05, "loss": 0.754, "step": 85 }, { "epoch": 0.3087971274685817, "grad_norm": 0.171875, "learning_rate": 4.25e-05, "loss": 0.718, "step": 86 }, { "epoch": 0.3123877917414722, "grad_norm": 0.1767578125, "learning_rate": 4.3e-05, "loss": 0.4215, "step": 87 }, { "epoch": 0.31597845601436264, "grad_norm": 0.1630859375, "learning_rate": 4.35e-05, "loss": 0.3714, "step": 88 }, { "epoch": 0.31956912028725315, "grad_norm": 0.1669921875, "learning_rate": 4.4000000000000006e-05, "loss": 0.6781, "step": 89 }, { "epoch": 0.3231597845601436, "grad_norm": 0.1787109375, "learning_rate": 4.4500000000000004e-05, "loss": 0.3836, "step": 90 }, { "epoch": 0.3267504488330341, "grad_norm": 0.1630859375, "learning_rate": 4.5e-05, "loss": 0.4808, "step": 91 }, { "epoch": 0.3303411131059246, "grad_norm": 0.146484375, "learning_rate": 4.55e-05, "loss": 0.5064, "step": 92 }, { "epoch": 0.3339317773788151, "grad_norm": 0.16015625, "learning_rate": 4.600000000000001e-05, "loss": 0.5408, "step": 93 }, { "epoch": 0.3375224416517056, "grad_norm": 0.1923828125, "learning_rate": 4.6500000000000005e-05, "loss": 0.3994, "step": 94 }, { "epoch": 0.34111310592459604, "grad_norm": 0.1826171875, "learning_rate": 4.7e-05, "loss": 0.3588, "step": 95 }, { "epoch": 0.34470377019748655, "grad_norm": 0.2255859375, "learning_rate": 4.75e-05, "loss": 0.5898, "step": 96 }, { "epoch": 0.348294434470377, "grad_norm": 0.171875, "learning_rate": 4.8e-05, "loss": 0.3161, "step": 97 }, { "epoch": 0.3518850987432675, "grad_norm": 0.1650390625, "learning_rate": 4.85e-05, "loss": 0.3188, "step": 98 }, { "epoch": 0.35547576301615796, "grad_norm": 0.1650390625, "learning_rate": 4.9e-05, "loss": 0.3662, "step": 99 }, { "epoch": 0.3590664272890485, "grad_norm": 0.1650390625, "learning_rate": 4.9500000000000004e-05, "loss": 0.3499, "step": 100 }, { "epoch": 0.362657091561939, "grad_norm": 0.21484375, "learning_rate": 5e-05, "loss": 1.0732, "step": 101 }, { "epoch": 0.36624775583482944, "grad_norm": 0.1923828125, "learning_rate": 4.9981412639405204e-05, "loss": 0.3832, "step": 102 }, { "epoch": 0.36983842010771995, "grad_norm": 0.1748046875, "learning_rate": 4.996282527881041e-05, "loss": 0.3694, "step": 103 }, { "epoch": 0.3734290843806104, "grad_norm": 0.173828125, "learning_rate": 4.9944237918215614e-05, "loss": 0.7403, "step": 104 }, { "epoch": 0.3770197486535009, "grad_norm": 0.169921875, "learning_rate": 4.9925650557620816e-05, "loss": 0.718, "step": 105 }, { "epoch": 0.38061041292639136, "grad_norm": 0.1904296875, "learning_rate": 4.9907063197026024e-05, "loss": 0.3711, "step": 106 }, { "epoch": 0.38420107719928187, "grad_norm": 0.1728515625, "learning_rate": 4.988847583643123e-05, "loss": 0.3222, "step": 107 }, { "epoch": 0.3877917414721723, "grad_norm": 0.17578125, "learning_rate": 4.9869888475836434e-05, "loss": 0.3346, "step": 108 }, { "epoch": 0.39138240574506283, "grad_norm": 0.166015625, "learning_rate": 4.9851301115241635e-05, "loss": 0.6638, "step": 109 }, { "epoch": 0.39497307001795334, "grad_norm": 0.1826171875, "learning_rate": 4.9832713754646844e-05, "loss": 0.4425, "step": 110 }, { "epoch": 0.3985637342908438, "grad_norm": 0.2001953125, "learning_rate": 4.9814126394052045e-05, "loss": 0.4638, "step": 111 }, { "epoch": 0.4021543985637343, "grad_norm": 0.2197265625, "learning_rate": 4.979553903345725e-05, "loss": 0.6048, "step": 112 }, { "epoch": 0.40574506283662476, "grad_norm": 0.2236328125, "learning_rate": 4.9776951672862455e-05, "loss": 0.3296, "step": 113 }, { "epoch": 0.40933572710951527, "grad_norm": 0.1630859375, "learning_rate": 4.975836431226766e-05, "loss": 0.2778, "step": 114 }, { "epoch": 0.4129263913824057, "grad_norm": 0.17578125, "learning_rate": 4.9739776951672865e-05, "loss": 0.6095, "step": 115 }, { "epoch": 0.41651705565529623, "grad_norm": 0.1845703125, "learning_rate": 4.9721189591078074e-05, "loss": 0.3734, "step": 116 }, { "epoch": 0.42010771992818674, "grad_norm": 0.20703125, "learning_rate": 4.9702602230483275e-05, "loss": 0.406, "step": 117 }, { "epoch": 0.4236983842010772, "grad_norm": 0.1455078125, "learning_rate": 4.968401486988848e-05, "loss": 0.5716, "step": 118 }, { "epoch": 0.4272890484739677, "grad_norm": 0.2294921875, "learning_rate": 4.9665427509293685e-05, "loss": 0.436, "step": 119 }, { "epoch": 0.43087971274685816, "grad_norm": 0.1953125, "learning_rate": 4.964684014869889e-05, "loss": 0.3044, "step": 120 }, { "epoch": 0.43447037701974867, "grad_norm": 0.203125, "learning_rate": 4.962825278810409e-05, "loss": 0.9405, "step": 121 }, { "epoch": 0.4380610412926391, "grad_norm": 0.181640625, "learning_rate": 4.96096654275093e-05, "loss": 0.6842, "step": 122 }, { "epoch": 0.44165170556552963, "grad_norm": 0.169921875, "learning_rate": 4.95910780669145e-05, "loss": 0.6166, "step": 123 }, { "epoch": 0.4452423698384201, "grad_norm": 0.1884765625, "learning_rate": 4.957249070631971e-05, "loss": 0.3507, "step": 124 }, { "epoch": 0.4488330341113106, "grad_norm": 0.1689453125, "learning_rate": 4.955390334572491e-05, "loss": 0.4158, "step": 125 }, { "epoch": 0.4524236983842011, "grad_norm": 0.197265625, "learning_rate": 4.953531598513012e-05, "loss": 0.5619, "step": 126 }, { "epoch": 0.45601436265709155, "grad_norm": 0.1875, "learning_rate": 4.951672862453532e-05, "loss": 0.6515, "step": 127 }, { "epoch": 0.45960502692998206, "grad_norm": 0.2001953125, "learning_rate": 4.949814126394052e-05, "loss": 0.5829, "step": 128 }, { "epoch": 0.4631956912028725, "grad_norm": 0.2275390625, "learning_rate": 4.947955390334573e-05, "loss": 0.3444, "step": 129 }, { "epoch": 0.466786355475763, "grad_norm": 0.291015625, "learning_rate": 4.946096654275093e-05, "loss": 0.7474, "step": 130 }, { "epoch": 0.4703770197486535, "grad_norm": 0.1962890625, "learning_rate": 4.944237918215613e-05, "loss": 0.291, "step": 131 }, { "epoch": 0.473967684021544, "grad_norm": 0.1767578125, "learning_rate": 4.942379182156134e-05, "loss": 0.482, "step": 132 }, { "epoch": 0.47755834829443444, "grad_norm": 0.15625, "learning_rate": 4.940520446096655e-05, "loss": 0.775, "step": 133 }, { "epoch": 0.48114901256732495, "grad_norm": 0.1845703125, "learning_rate": 4.938661710037175e-05, "loss": 0.266, "step": 134 }, { "epoch": 0.48473967684021546, "grad_norm": 0.19921875, "learning_rate": 4.936802973977696e-05, "loss": 0.3308, "step": 135 }, { "epoch": 0.4883303411131059, "grad_norm": 0.189453125, "learning_rate": 4.934944237918216e-05, "loss": 0.5599, "step": 136 }, { "epoch": 0.4919210053859964, "grad_norm": 0.1875, "learning_rate": 4.933085501858736e-05, "loss": 0.4163, "step": 137 }, { "epoch": 0.4955116696588869, "grad_norm": 0.2578125, "learning_rate": 4.931226765799257e-05, "loss": 0.3903, "step": 138 }, { "epoch": 0.4991023339317774, "grad_norm": 0.1728515625, "learning_rate": 4.929368029739777e-05, "loss": 0.5689, "step": 139 }, { "epoch": 0.5026929982046678, "grad_norm": 0.2109375, "learning_rate": 4.927509293680297e-05, "loss": 0.3814, "step": 140 }, { "epoch": 0.5062836624775583, "grad_norm": 0.1845703125, "learning_rate": 4.925650557620818e-05, "loss": 0.6202, "step": 141 }, { "epoch": 0.5098743267504489, "grad_norm": 0.193359375, "learning_rate": 4.923791821561339e-05, "loss": 0.4465, "step": 142 }, { "epoch": 0.5134649910233393, "grad_norm": 0.201171875, "learning_rate": 4.921933085501859e-05, "loss": 0.3151, "step": 143 }, { "epoch": 0.5170556552962298, "grad_norm": 0.2109375, "learning_rate": 4.920074349442379e-05, "loss": 0.5802, "step": 144 }, { "epoch": 0.5206463195691203, "grad_norm": 0.173828125, "learning_rate": 4.9182156133829e-05, "loss": 0.4289, "step": 145 }, { "epoch": 0.5242369838420108, "grad_norm": 0.21875, "learning_rate": 4.91635687732342e-05, "loss": 0.7393, "step": 146 }, { "epoch": 0.5278276481149012, "grad_norm": 0.1806640625, "learning_rate": 4.9144981412639404e-05, "loss": 0.4351, "step": 147 }, { "epoch": 0.5314183123877917, "grad_norm": 0.20703125, "learning_rate": 4.912639405204461e-05, "loss": 0.5175, "step": 148 }, { "epoch": 0.5350089766606823, "grad_norm": 0.228515625, "learning_rate": 4.9107806691449814e-05, "loss": 0.5497, "step": 149 }, { "epoch": 0.5385996409335727, "grad_norm": 0.267578125, "learning_rate": 4.9089219330855016e-05, "loss": 0.3371, "step": 150 }, { "epoch": 0.5421903052064632, "grad_norm": 0.201171875, "learning_rate": 4.907063197026023e-05, "loss": 0.2978, "step": 151 }, { "epoch": 0.5457809694793537, "grad_norm": 0.2060546875, "learning_rate": 4.905204460966543e-05, "loss": 0.5361, "step": 152 }, { "epoch": 0.5493716337522442, "grad_norm": 0.1943359375, "learning_rate": 4.9033457249070634e-05, "loss": 0.6422, "step": 153 }, { "epoch": 0.5529622980251346, "grad_norm": 0.1552734375, "learning_rate": 4.901486988847584e-05, "loss": 0.3489, "step": 154 }, { "epoch": 0.5565529622980251, "grad_norm": 0.197265625, "learning_rate": 4.8996282527881044e-05, "loss": 0.2929, "step": 155 }, { "epoch": 0.5601436265709157, "grad_norm": 0.2177734375, "learning_rate": 4.8977695167286246e-05, "loss": 0.5586, "step": 156 }, { "epoch": 0.5637342908438061, "grad_norm": 0.208984375, "learning_rate": 4.8959107806691454e-05, "loss": 0.2645, "step": 157 }, { "epoch": 0.5673249551166966, "grad_norm": 0.173828125, "learning_rate": 4.8940520446096656e-05, "loss": 0.4642, "step": 158 }, { "epoch": 0.5709156193895871, "grad_norm": 0.2265625, "learning_rate": 4.892193308550186e-05, "loss": 0.4657, "step": 159 }, { "epoch": 0.5745062836624776, "grad_norm": 0.1982421875, "learning_rate": 4.8903345724907066e-05, "loss": 0.2824, "step": 160 }, { "epoch": 0.578096947935368, "grad_norm": 0.1982421875, "learning_rate": 4.8884758364312274e-05, "loss": 0.3551, "step": 161 }, { "epoch": 0.5816876122082585, "grad_norm": 0.1767578125, "learning_rate": 4.8866171003717476e-05, "loss": 0.4457, "step": 162 }, { "epoch": 0.585278276481149, "grad_norm": 0.2177734375, "learning_rate": 4.884758364312268e-05, "loss": 0.5692, "step": 163 }, { "epoch": 0.5888689407540395, "grad_norm": 0.2060546875, "learning_rate": 4.8828996282527886e-05, "loss": 0.2869, "step": 164 }, { "epoch": 0.59245960502693, "grad_norm": 0.1767578125, "learning_rate": 4.881040892193309e-05, "loss": 0.3708, "step": 165 }, { "epoch": 0.5960502692998204, "grad_norm": 0.1884765625, "learning_rate": 4.879182156133829e-05, "loss": 0.5186, "step": 166 }, { "epoch": 0.599640933572711, "grad_norm": 0.1552734375, "learning_rate": 4.87732342007435e-05, "loss": 0.531, "step": 167 }, { "epoch": 0.6032315978456014, "grad_norm": 0.1884765625, "learning_rate": 4.87546468401487e-05, "loss": 0.2633, "step": 168 }, { "epoch": 0.6068222621184919, "grad_norm": 0.2109375, "learning_rate": 4.873605947955391e-05, "loss": 0.5839, "step": 169 }, { "epoch": 0.6104129263913824, "grad_norm": 0.2197265625, "learning_rate": 4.8717472118959115e-05, "loss": 0.3252, "step": 170 }, { "epoch": 0.6140035906642729, "grad_norm": 0.2734375, "learning_rate": 4.869888475836432e-05, "loss": 0.4997, "step": 171 }, { "epoch": 0.6175942549371634, "grad_norm": 0.1826171875, "learning_rate": 4.868029739776952e-05, "loss": 0.2423, "step": 172 }, { "epoch": 0.6211849192100538, "grad_norm": 0.236328125, "learning_rate": 4.866171003717473e-05, "loss": 0.3213, "step": 173 }, { "epoch": 0.6247755834829444, "grad_norm": 0.19140625, "learning_rate": 4.864312267657993e-05, "loss": 0.3245, "step": 174 }, { "epoch": 0.6283662477558348, "grad_norm": 0.224609375, "learning_rate": 4.862453531598513e-05, "loss": 0.319, "step": 175 }, { "epoch": 0.6319569120287253, "grad_norm": 0.162109375, "learning_rate": 4.860594795539034e-05, "loss": 0.3641, "step": 176 }, { "epoch": 0.6355475763016158, "grad_norm": 0.220703125, "learning_rate": 4.858736059479554e-05, "loss": 0.5961, "step": 177 }, { "epoch": 0.6391382405745063, "grad_norm": 0.1962890625, "learning_rate": 4.856877323420075e-05, "loss": 0.2885, "step": 178 }, { "epoch": 0.6427289048473968, "grad_norm": 0.2021484375, "learning_rate": 4.855018587360595e-05, "loss": 0.4729, "step": 179 }, { "epoch": 0.6463195691202872, "grad_norm": 0.2060546875, "learning_rate": 4.853159851301116e-05, "loss": 0.2408, "step": 180 }, { "epoch": 0.6499102333931778, "grad_norm": 0.2158203125, "learning_rate": 4.851301115241636e-05, "loss": 0.5975, "step": 181 }, { "epoch": 0.6535008976660682, "grad_norm": 0.2197265625, "learning_rate": 4.849442379182156e-05, "loss": 0.5767, "step": 182 }, { "epoch": 0.6570915619389587, "grad_norm": 0.1640625, "learning_rate": 4.847583643122677e-05, "loss": 0.4695, "step": 183 }, { "epoch": 0.6606822262118492, "grad_norm": 0.2158203125, "learning_rate": 4.845724907063197e-05, "loss": 0.6781, "step": 184 }, { "epoch": 0.6642728904847397, "grad_norm": 0.203125, "learning_rate": 4.843866171003717e-05, "loss": 0.6286, "step": 185 }, { "epoch": 0.6678635547576302, "grad_norm": 0.2041015625, "learning_rate": 4.842007434944238e-05, "loss": 0.2714, "step": 186 }, { "epoch": 0.6714542190305206, "grad_norm": 0.208984375, "learning_rate": 4.840148698884759e-05, "loss": 0.4527, "step": 187 }, { "epoch": 0.6750448833034112, "grad_norm": 0.2138671875, "learning_rate": 4.838289962825279e-05, "loss": 0.6688, "step": 188 }, { "epoch": 0.6786355475763016, "grad_norm": 0.255859375, "learning_rate": 4.836431226765799e-05, "loss": 0.3316, "step": 189 }, { "epoch": 0.6822262118491921, "grad_norm": 0.1826171875, "learning_rate": 4.83457249070632e-05, "loss": 0.3403, "step": 190 }, { "epoch": 0.6858168761220825, "grad_norm": 0.2041015625, "learning_rate": 4.83271375464684e-05, "loss": 0.631, "step": 191 }, { "epoch": 0.6894075403949731, "grad_norm": 0.1884765625, "learning_rate": 4.8308550185873605e-05, "loss": 0.7783, "step": 192 }, { "epoch": 0.6929982046678635, "grad_norm": 0.23046875, "learning_rate": 4.828996282527881e-05, "loss": 0.3601, "step": 193 }, { "epoch": 0.696588868940754, "grad_norm": 0.201171875, "learning_rate": 4.8271375464684015e-05, "loss": 0.5412, "step": 194 }, { "epoch": 0.7001795332136446, "grad_norm": 0.2080078125, "learning_rate": 4.8252788104089216e-05, "loss": 0.269, "step": 195 }, { "epoch": 0.703770197486535, "grad_norm": 0.19140625, "learning_rate": 4.823420074349443e-05, "loss": 0.5399, "step": 196 }, { "epoch": 0.7073608617594255, "grad_norm": 0.224609375, "learning_rate": 4.821561338289963e-05, "loss": 0.641, "step": 197 }, { "epoch": 0.7109515260323159, "grad_norm": 0.216796875, "learning_rate": 4.8197026022304835e-05, "loss": 0.5893, "step": 198 }, { "epoch": 0.7145421903052065, "grad_norm": 0.3046875, "learning_rate": 4.817843866171004e-05, "loss": 0.8207, "step": 199 }, { "epoch": 0.718132854578097, "grad_norm": 0.2421875, "learning_rate": 4.8159851301115244e-05, "loss": 0.4649, "step": 200 }, { "epoch": 0.7217235188509874, "grad_norm": 0.2138671875, "learning_rate": 4.8141263940520446e-05, "loss": 0.5323, "step": 201 }, { "epoch": 0.725314183123878, "grad_norm": 0.2119140625, "learning_rate": 4.8122676579925654e-05, "loss": 0.3443, "step": 202 }, { "epoch": 0.7289048473967684, "grad_norm": 0.2041015625, "learning_rate": 4.8104089219330856e-05, "loss": 0.3208, "step": 203 }, { "epoch": 0.7324955116696589, "grad_norm": 0.2275390625, "learning_rate": 4.808550185873606e-05, "loss": 0.5681, "step": 204 }, { "epoch": 0.7360861759425493, "grad_norm": 0.236328125, "learning_rate": 4.8066914498141266e-05, "loss": 0.2878, "step": 205 }, { "epoch": 0.7396768402154399, "grad_norm": 0.203125, "learning_rate": 4.8048327137546474e-05, "loss": 0.2644, "step": 206 }, { "epoch": 0.7432675044883303, "grad_norm": 0.2890625, "learning_rate": 4.8029739776951676e-05, "loss": 0.3917, "step": 207 }, { "epoch": 0.7468581687612208, "grad_norm": 0.2158203125, "learning_rate": 4.801115241635688e-05, "loss": 0.287, "step": 208 }, { "epoch": 0.7504488330341114, "grad_norm": 0.1845703125, "learning_rate": 4.7992565055762086e-05, "loss": 0.5449, "step": 209 }, { "epoch": 0.7540394973070018, "grad_norm": 0.2314453125, "learning_rate": 4.797397769516729e-05, "loss": 0.3362, "step": 210 }, { "epoch": 0.7576301615798923, "grad_norm": 0.2060546875, "learning_rate": 4.795539033457249e-05, "loss": 0.2707, "step": 211 }, { "epoch": 0.7612208258527827, "grad_norm": 0.21484375, "learning_rate": 4.79368029739777e-05, "loss": 0.3236, "step": 212 }, { "epoch": 0.7648114901256733, "grad_norm": 0.197265625, "learning_rate": 4.79182156133829e-05, "loss": 0.3069, "step": 213 }, { "epoch": 0.7684021543985637, "grad_norm": 0.1953125, "learning_rate": 4.789962825278811e-05, "loss": 0.5512, "step": 214 }, { "epoch": 0.7719928186714542, "grad_norm": 0.62890625, "learning_rate": 4.7881040892193316e-05, "loss": 0.2504, "step": 215 }, { "epoch": 0.7755834829443446, "grad_norm": 0.20703125, "learning_rate": 4.786245353159852e-05, "loss": 0.2519, "step": 216 }, { "epoch": 0.7791741472172352, "grad_norm": 0.259765625, "learning_rate": 4.784386617100372e-05, "loss": 0.2239, "step": 217 }, { "epoch": 0.7827648114901257, "grad_norm": 0.2236328125, "learning_rate": 4.782527881040893e-05, "loss": 0.3473, "step": 218 }, { "epoch": 0.7863554757630161, "grad_norm": 0.24609375, "learning_rate": 4.780669144981413e-05, "loss": 0.5388, "step": 219 }, { "epoch": 0.7899461400359067, "grad_norm": 0.2412109375, "learning_rate": 4.778810408921933e-05, "loss": 0.3809, "step": 220 }, { "epoch": 0.7935368043087971, "grad_norm": 0.232421875, "learning_rate": 4.776951672862454e-05, "loss": 0.3564, "step": 221 }, { "epoch": 0.7971274685816876, "grad_norm": 0.2060546875, "learning_rate": 4.775092936802974e-05, "loss": 0.2457, "step": 222 }, { "epoch": 0.800718132854578, "grad_norm": 0.2275390625, "learning_rate": 4.773234200743495e-05, "loss": 0.6414, "step": 223 }, { "epoch": 0.8043087971274686, "grad_norm": 0.1962890625, "learning_rate": 4.771375464684015e-05, "loss": 0.3071, "step": 224 }, { "epoch": 0.8078994614003591, "grad_norm": 0.2080078125, "learning_rate": 4.769516728624536e-05, "loss": 0.2726, "step": 225 }, { "epoch": 0.8114901256732495, "grad_norm": 0.2177734375, "learning_rate": 4.767657992565056e-05, "loss": 0.2465, "step": 226 }, { "epoch": 0.8150807899461401, "grad_norm": 0.2197265625, "learning_rate": 4.765799256505576e-05, "loss": 0.4524, "step": 227 }, { "epoch": 0.8186714542190305, "grad_norm": 0.2001953125, "learning_rate": 4.763940520446097e-05, "loss": 0.2691, "step": 228 }, { "epoch": 0.822262118491921, "grad_norm": 0.224609375, "learning_rate": 4.762081784386617e-05, "loss": 0.4049, "step": 229 }, { "epoch": 0.8258527827648114, "grad_norm": 0.1640625, "learning_rate": 4.7602230483271374e-05, "loss": 0.2995, "step": 230 }, { "epoch": 0.829443447037702, "grad_norm": 0.2001953125, "learning_rate": 4.758364312267658e-05, "loss": 0.2954, "step": 231 }, { "epoch": 0.8330341113105925, "grad_norm": 0.216796875, "learning_rate": 4.756505576208179e-05, "loss": 0.5055, "step": 232 }, { "epoch": 0.8366247755834829, "grad_norm": 0.2275390625, "learning_rate": 4.754646840148699e-05, "loss": 0.2917, "step": 233 }, { "epoch": 0.8402154398563735, "grad_norm": 0.2080078125, "learning_rate": 4.75278810408922e-05, "loss": 0.5433, "step": 234 }, { "epoch": 0.8438061041292639, "grad_norm": 0.2109375, "learning_rate": 4.75092936802974e-05, "loss": 0.2799, "step": 235 }, { "epoch": 0.8473967684021544, "grad_norm": 0.2080078125, "learning_rate": 4.7490706319702603e-05, "loss": 0.3019, "step": 236 }, { "epoch": 0.8509874326750448, "grad_norm": 0.2265625, "learning_rate": 4.747211895910781e-05, "loss": 0.6139, "step": 237 }, { "epoch": 0.8545780969479354, "grad_norm": 0.1982421875, "learning_rate": 4.745353159851301e-05, "loss": 0.4745, "step": 238 }, { "epoch": 0.8581687612208259, "grad_norm": 0.2080078125, "learning_rate": 4.7434944237918215e-05, "loss": 0.2757, "step": 239 }, { "epoch": 0.8617594254937163, "grad_norm": 0.2109375, "learning_rate": 4.741635687732342e-05, "loss": 0.2292, "step": 240 }, { "epoch": 0.8653500897666068, "grad_norm": 0.203125, "learning_rate": 4.739776951672863e-05, "loss": 0.2724, "step": 241 }, { "epoch": 0.8689407540394973, "grad_norm": 0.201171875, "learning_rate": 4.737918215613383e-05, "loss": 0.7023, "step": 242 }, { "epoch": 0.8725314183123878, "grad_norm": 0.2158203125, "learning_rate": 4.7360594795539035e-05, "loss": 0.5998, "step": 243 }, { "epoch": 0.8761220825852782, "grad_norm": 0.2236328125, "learning_rate": 4.734200743494424e-05, "loss": 0.2796, "step": 244 }, { "epoch": 0.8797127468581688, "grad_norm": 0.193359375, "learning_rate": 4.7323420074349445e-05, "loss": 0.3897, "step": 245 }, { "epoch": 0.8833034111310593, "grad_norm": 0.2119140625, "learning_rate": 4.7304832713754646e-05, "loss": 0.8566, "step": 246 }, { "epoch": 0.8868940754039497, "grad_norm": 0.19140625, "learning_rate": 4.7286245353159855e-05, "loss": 0.2867, "step": 247 }, { "epoch": 0.8904847396768402, "grad_norm": 0.21484375, "learning_rate": 4.7267657992565056e-05, "loss": 0.5576, "step": 248 }, { "epoch": 0.8940754039497307, "grad_norm": 0.1953125, "learning_rate": 4.724907063197026e-05, "loss": 0.2801, "step": 249 }, { "epoch": 0.8976660682226212, "grad_norm": 0.2041015625, "learning_rate": 4.7230483271375466e-05, "loss": 0.307, "step": 250 }, { "epoch": 0.9012567324955116, "grad_norm": 0.2197265625, "learning_rate": 4.7211895910780675e-05, "loss": 0.6804, "step": 251 }, { "epoch": 0.9048473967684022, "grad_norm": 0.197265625, "learning_rate": 4.7193308550185876e-05, "loss": 0.2864, "step": 252 }, { "epoch": 0.9084380610412927, "grad_norm": 0.2041015625, "learning_rate": 4.717472118959108e-05, "loss": 0.309, "step": 253 }, { "epoch": 0.9120287253141831, "grad_norm": 0.2041015625, "learning_rate": 4.7156133828996286e-05, "loss": 0.3269, "step": 254 }, { "epoch": 0.9156193895870736, "grad_norm": 0.228515625, "learning_rate": 4.713754646840149e-05, "loss": 0.3827, "step": 255 }, { "epoch": 0.9192100538599641, "grad_norm": 0.201171875, "learning_rate": 4.711895910780669e-05, "loss": 0.2834, "step": 256 }, { "epoch": 0.9228007181328546, "grad_norm": 0.201171875, "learning_rate": 4.71003717472119e-05, "loss": 0.2551, "step": 257 }, { "epoch": 0.926391382405745, "grad_norm": 0.2080078125, "learning_rate": 4.70817843866171e-05, "loss": 0.2416, "step": 258 }, { "epoch": 0.9299820466786356, "grad_norm": 0.216796875, "learning_rate": 4.706319702602231e-05, "loss": 0.2697, "step": 259 }, { "epoch": 0.933572710951526, "grad_norm": 0.2265625, "learning_rate": 4.7044609665427516e-05, "loss": 0.3121, "step": 260 }, { "epoch": 0.9371633752244165, "grad_norm": 0.2431640625, "learning_rate": 4.702602230483272e-05, "loss": 0.3622, "step": 261 }, { "epoch": 0.940754039497307, "grad_norm": 0.203125, "learning_rate": 4.700743494423792e-05, "loss": 0.3064, "step": 262 }, { "epoch": 0.9443447037701975, "grad_norm": 0.2138671875, "learning_rate": 4.698884758364313e-05, "loss": 0.399, "step": 263 }, { "epoch": 0.947935368043088, "grad_norm": 0.212890625, "learning_rate": 4.697026022304833e-05, "loss": 0.2546, "step": 264 }, { "epoch": 0.9515260323159784, "grad_norm": 0.341796875, "learning_rate": 4.695167286245353e-05, "loss": 0.4054, "step": 265 }, { "epoch": 0.9551166965888689, "grad_norm": 0.232421875, "learning_rate": 4.693308550185874e-05, "loss": 0.657, "step": 266 }, { "epoch": 0.9587073608617595, "grad_norm": 0.20703125, "learning_rate": 4.691449814126394e-05, "loss": 0.2879, "step": 267 }, { "epoch": 0.9622980251346499, "grad_norm": 0.19921875, "learning_rate": 4.689591078066915e-05, "loss": 0.4132, "step": 268 }, { "epoch": 0.9658886894075404, "grad_norm": 0.28515625, "learning_rate": 4.687732342007435e-05, "loss": 0.4783, "step": 269 }, { "epoch": 0.9694793536804309, "grad_norm": 0.22265625, "learning_rate": 4.685873605947956e-05, "loss": 0.4306, "step": 270 }, { "epoch": 0.9730700179533214, "grad_norm": 0.279296875, "learning_rate": 4.684014869888476e-05, "loss": 0.7503, "step": 271 }, { "epoch": 0.9766606822262118, "grad_norm": 0.193359375, "learning_rate": 4.682156133828996e-05, "loss": 0.5797, "step": 272 }, { "epoch": 0.9802513464991023, "grad_norm": 0.2119140625, "learning_rate": 4.680297397769517e-05, "loss": 0.2591, "step": 273 }, { "epoch": 0.9838420107719928, "grad_norm": 0.251953125, "learning_rate": 4.678438661710037e-05, "loss": 0.6978, "step": 274 }, { "epoch": 0.9874326750448833, "grad_norm": 0.2373046875, "learning_rate": 4.6765799256505574e-05, "loss": 0.2719, "step": 275 }, { "epoch": 0.9910233393177738, "grad_norm": 0.15625, "learning_rate": 4.674721189591078e-05, "loss": 0.43, "step": 276 }, { "epoch": 0.9946140035906643, "grad_norm": 0.2001953125, "learning_rate": 4.6728624535315984e-05, "loss": 0.5569, "step": 277 }, { "epoch": 0.9982046678635548, "grad_norm": 0.201171875, "learning_rate": 4.671003717472119e-05, "loss": 0.2543, "step": 278 }, { "epoch": 1.0, "grad_norm": 0.38671875, "learning_rate": 4.66914498141264e-05, "loss": 0.2229, "step": 279 }, { "epoch": 1.0, "eval_loss": 0.3737908899784088, "eval_model_preparation_time": 0.008, "eval_runtime": 3.9865, "eval_samples_per_second": 1.254, "eval_steps_per_second": 1.254, "step": 279 }, { "epoch": 1.0, "eval_loss": 0.7759888768196106, "eval_model_preparation_time": 0.008, "eval_runtime": 3.0783, "eval_samples_per_second": 1.624, "eval_steps_per_second": 1.624, "step": 279 }, { "epoch": 1.0, "eval_loss": 0.45089930295944214, "eval_model_preparation_time": 0.008, "eval_runtime": 4.5609, "eval_samples_per_second": 1.096, "eval_steps_per_second": 1.096, "step": 279 }, { "epoch": 1.0, "eval_loss": 0.991237998008728, "eval_model_preparation_time": 0.008, "eval_runtime": 4.769, "eval_samples_per_second": 1.048, "eval_steps_per_second": 1.048, "step": 279 }, { "epoch": 1.0035906642728905, "grad_norm": 0.142578125, "learning_rate": 4.66728624535316e-05, "loss": 0.2825, "step": 280 }, { "epoch": 1.007181328545781, "grad_norm": 0.236328125, "learning_rate": 4.6654275092936804e-05, "loss": 0.3988, "step": 281 }, { "epoch": 1.0107719928186714, "grad_norm": 0.21484375, "learning_rate": 4.663568773234201e-05, "loss": 0.3361, "step": 282 }, { "epoch": 1.014362657091562, "grad_norm": 0.2353515625, "learning_rate": 4.6617100371747214e-05, "loss": 0.268, "step": 283 }, { "epoch": 1.0179533213644525, "grad_norm": 0.2265625, "learning_rate": 4.6598513011152415e-05, "loss": 0.2578, "step": 284 }, { "epoch": 1.021543985637343, "grad_norm": 0.205078125, "learning_rate": 4.6579925650557624e-05, "loss": 0.6743, "step": 285 }, { "epoch": 1.0251346499102334, "grad_norm": 0.205078125, "learning_rate": 4.6561338289962825e-05, "loss": 0.4614, "step": 286 }, { "epoch": 1.0287253141831239, "grad_norm": 0.2109375, "learning_rate": 4.6542750929368034e-05, "loss": 0.2527, "step": 287 }, { "epoch": 1.0323159784560143, "grad_norm": 0.21875, "learning_rate": 4.6524163568773235e-05, "loss": 0.247, "step": 288 }, { "epoch": 1.0359066427289048, "grad_norm": 0.1904296875, "learning_rate": 4.6505576208178444e-05, "loss": 0.3458, "step": 289 }, { "epoch": 1.0394973070017954, "grad_norm": 0.2421875, "learning_rate": 4.6486988847583645e-05, "loss": 0.3973, "step": 290 }, { "epoch": 1.0430879712746859, "grad_norm": 0.203125, "learning_rate": 4.646840148698885e-05, "loss": 0.4701, "step": 291 }, { "epoch": 1.0466786355475763, "grad_norm": 0.22265625, "learning_rate": 4.6449814126394055e-05, "loss": 0.2721, "step": 292 }, { "epoch": 1.0502692998204668, "grad_norm": 0.1962890625, "learning_rate": 4.643122676579926e-05, "loss": 0.6061, "step": 293 }, { "epoch": 1.0538599640933572, "grad_norm": 0.2265625, "learning_rate": 4.641263940520446e-05, "loss": 0.8369, "step": 294 }, { "epoch": 1.0574506283662477, "grad_norm": 0.1669921875, "learning_rate": 4.639405204460967e-05, "loss": 0.3504, "step": 295 }, { "epoch": 1.0610412926391382, "grad_norm": 0.20703125, "learning_rate": 4.6375464684014875e-05, "loss": 0.2423, "step": 296 }, { "epoch": 1.0646319569120286, "grad_norm": 0.251953125, "learning_rate": 4.635687732342008e-05, "loss": 0.305, "step": 297 }, { "epoch": 1.0682226211849193, "grad_norm": 0.251953125, "learning_rate": 4.6338289962825285e-05, "loss": 0.2884, "step": 298 }, { "epoch": 1.0718132854578097, "grad_norm": 0.2236328125, "learning_rate": 4.6319702602230487e-05, "loss": 0.2225, "step": 299 }, { "epoch": 1.0754039497307002, "grad_norm": 0.2158203125, "learning_rate": 4.630111524163569e-05, "loss": 0.7404, "step": 300 }, { "epoch": 1.0789946140035906, "grad_norm": 0.2060546875, "learning_rate": 4.6282527881040897e-05, "loss": 0.4138, "step": 301 }, { "epoch": 1.082585278276481, "grad_norm": 0.2353515625, "learning_rate": 4.62639405204461e-05, "loss": 0.4618, "step": 302 }, { "epoch": 1.0861759425493716, "grad_norm": 0.251953125, "learning_rate": 4.62453531598513e-05, "loss": 0.4119, "step": 303 }, { "epoch": 1.0897666068222622, "grad_norm": 0.2294921875, "learning_rate": 4.622676579925651e-05, "loss": 0.3272, "step": 304 }, { "epoch": 1.0933572710951527, "grad_norm": 0.216796875, "learning_rate": 4.6208178438661716e-05, "loss": 0.6113, "step": 305 }, { "epoch": 1.0969479353680431, "grad_norm": 0.2021484375, "learning_rate": 4.618959107806692e-05, "loss": 0.5141, "step": 306 }, { "epoch": 1.1005385996409336, "grad_norm": 0.23046875, "learning_rate": 4.617100371747212e-05, "loss": 0.5032, "step": 307 }, { "epoch": 1.104129263913824, "grad_norm": 0.205078125, "learning_rate": 4.615241635687733e-05, "loss": 0.4546, "step": 308 }, { "epoch": 1.1077199281867145, "grad_norm": 0.265625, "learning_rate": 4.613382899628253e-05, "loss": 0.8695, "step": 309 }, { "epoch": 1.111310592459605, "grad_norm": 0.216796875, "learning_rate": 4.611524163568773e-05, "loss": 0.4311, "step": 310 }, { "epoch": 1.1149012567324954, "grad_norm": 0.2119140625, "learning_rate": 4.609665427509294e-05, "loss": 0.2857, "step": 311 }, { "epoch": 1.118491921005386, "grad_norm": 0.20703125, "learning_rate": 4.607806691449814e-05, "loss": 0.2421, "step": 312 }, { "epoch": 1.1220825852782765, "grad_norm": 0.2119140625, "learning_rate": 4.605947955390334e-05, "loss": 0.2709, "step": 313 }, { "epoch": 1.125673249551167, "grad_norm": 0.208984375, "learning_rate": 4.604089219330856e-05, "loss": 0.2203, "step": 314 }, { "epoch": 1.1292639138240574, "grad_norm": 0.1865234375, "learning_rate": 4.602230483271376e-05, "loss": 0.3568, "step": 315 }, { "epoch": 1.132854578096948, "grad_norm": 0.1923828125, "learning_rate": 4.600371747211896e-05, "loss": 0.2197, "step": 316 }, { "epoch": 1.1364452423698383, "grad_norm": 0.2177734375, "learning_rate": 4.598513011152417e-05, "loss": 0.4316, "step": 317 }, { "epoch": 1.140035906642729, "grad_norm": 0.2119140625, "learning_rate": 4.596654275092937e-05, "loss": 0.2792, "step": 318 }, { "epoch": 1.1436265709156195, "grad_norm": 0.2060546875, "learning_rate": 4.594795539033457e-05, "loss": 0.2294, "step": 319 }, { "epoch": 1.14721723518851, "grad_norm": 0.21875, "learning_rate": 4.592936802973978e-05, "loss": 0.3019, "step": 320 }, { "epoch": 1.1508078994614004, "grad_norm": 0.18359375, "learning_rate": 4.591078066914498e-05, "loss": 0.4066, "step": 321 }, { "epoch": 1.1543985637342908, "grad_norm": 0.302734375, "learning_rate": 4.5892193308550184e-05, "loss": 0.7007, "step": 322 }, { "epoch": 1.1579892280071813, "grad_norm": 0.2412109375, "learning_rate": 4.587360594795539e-05, "loss": 0.2212, "step": 323 }, { "epoch": 1.1615798922800717, "grad_norm": 0.2080078125, "learning_rate": 4.58550185873606e-05, "loss": 0.268, "step": 324 }, { "epoch": 1.1651705565529622, "grad_norm": 0.22265625, "learning_rate": 4.58364312267658e-05, "loss": 0.3889, "step": 325 }, { "epoch": 1.1687612208258529, "grad_norm": 0.2216796875, "learning_rate": 4.5817843866171004e-05, "loss": 0.3795, "step": 326 }, { "epoch": 1.1723518850987433, "grad_norm": 0.2109375, "learning_rate": 4.579925650557621e-05, "loss": 0.2576, "step": 327 }, { "epoch": 1.1759425493716338, "grad_norm": 0.224609375, "learning_rate": 4.5780669144981414e-05, "loss": 0.5303, "step": 328 }, { "epoch": 1.1795332136445242, "grad_norm": 0.2080078125, "learning_rate": 4.5762081784386616e-05, "loss": 0.2587, "step": 329 }, { "epoch": 1.1831238779174147, "grad_norm": 0.208984375, "learning_rate": 4.5743494423791824e-05, "loss": 0.2683, "step": 330 }, { "epoch": 1.1867145421903051, "grad_norm": 0.228515625, "learning_rate": 4.5724907063197026e-05, "loss": 0.2963, "step": 331 }, { "epoch": 1.1903052064631956, "grad_norm": 0.2021484375, "learning_rate": 4.5706319702602234e-05, "loss": 0.5846, "step": 332 }, { "epoch": 1.1938958707360863, "grad_norm": 0.201171875, "learning_rate": 4.5687732342007436e-05, "loss": 0.3283, "step": 333 }, { "epoch": 1.1974865350089767, "grad_norm": 0.21875, "learning_rate": 4.5669144981412644e-05, "loss": 0.2244, "step": 334 }, { "epoch": 1.2010771992818672, "grad_norm": 0.271484375, "learning_rate": 4.5650557620817846e-05, "loss": 0.6349, "step": 335 }, { "epoch": 1.2046678635547576, "grad_norm": 0.21875, "learning_rate": 4.563197026022305e-05, "loss": 0.5115, "step": 336 }, { "epoch": 1.208258527827648, "grad_norm": 0.2099609375, "learning_rate": 4.5613382899628255e-05, "loss": 0.2732, "step": 337 }, { "epoch": 1.2118491921005385, "grad_norm": 0.19140625, "learning_rate": 4.559479553903346e-05, "loss": 0.3806, "step": 338 }, { "epoch": 1.215439856373429, "grad_norm": 0.26171875, "learning_rate": 4.557620817843866e-05, "loss": 0.2904, "step": 339 }, { "epoch": 1.2190305206463194, "grad_norm": 0.20703125, "learning_rate": 4.555762081784387e-05, "loss": 0.4204, "step": 340 }, { "epoch": 1.2226211849192101, "grad_norm": 0.205078125, "learning_rate": 4.5539033457249075e-05, "loss": 0.2501, "step": 341 }, { "epoch": 1.2262118491921006, "grad_norm": 0.2216796875, "learning_rate": 4.552044609665428e-05, "loss": 0.6786, "step": 342 }, { "epoch": 1.229802513464991, "grad_norm": 0.244140625, "learning_rate": 4.5501858736059485e-05, "loss": 0.3281, "step": 343 }, { "epoch": 1.2333931777378815, "grad_norm": 0.2470703125, "learning_rate": 4.548327137546469e-05, "loss": 0.3257, "step": 344 }, { "epoch": 1.236983842010772, "grad_norm": 0.2216796875, "learning_rate": 4.546468401486989e-05, "loss": 0.4858, "step": 345 }, { "epoch": 1.2405745062836624, "grad_norm": 0.2353515625, "learning_rate": 4.54460966542751e-05, "loss": 0.3437, "step": 346 }, { "epoch": 1.244165170556553, "grad_norm": 0.2177734375, "learning_rate": 4.54275092936803e-05, "loss": 0.2798, "step": 347 }, { "epoch": 1.2477558348294435, "grad_norm": 0.2294921875, "learning_rate": 4.54089219330855e-05, "loss": 0.3048, "step": 348 }, { "epoch": 1.251346499102334, "grad_norm": 0.1865234375, "learning_rate": 4.539033457249071e-05, "loss": 0.4216, "step": 349 }, { "epoch": 1.2549371633752244, "grad_norm": 0.2255859375, "learning_rate": 4.537174721189592e-05, "loss": 0.2394, "step": 350 }, { "epoch": 1.2585278276481149, "grad_norm": 0.2216796875, "learning_rate": 4.535315985130112e-05, "loss": 0.5155, "step": 351 }, { "epoch": 1.2621184919210053, "grad_norm": 0.236328125, "learning_rate": 4.533457249070632e-05, "loss": 0.5352, "step": 352 }, { "epoch": 1.2657091561938958, "grad_norm": 0.1943359375, "learning_rate": 4.531598513011153e-05, "loss": 0.2784, "step": 353 }, { "epoch": 1.2692998204667862, "grad_norm": 0.2001953125, "learning_rate": 4.529739776951673e-05, "loss": 0.4403, "step": 354 }, { "epoch": 1.272890484739677, "grad_norm": 0.2119140625, "learning_rate": 4.527881040892193e-05, "loss": 0.296, "step": 355 }, { "epoch": 1.2764811490125674, "grad_norm": 0.2138671875, "learning_rate": 4.526022304832714e-05, "loss": 0.491, "step": 356 }, { "epoch": 1.2800718132854578, "grad_norm": 0.2470703125, "learning_rate": 4.524163568773234e-05, "loss": 0.2491, "step": 357 }, { "epoch": 1.2836624775583483, "grad_norm": 0.2236328125, "learning_rate": 4.522304832713754e-05, "loss": 0.5229, "step": 358 }, { "epoch": 1.2872531418312387, "grad_norm": 0.2099609375, "learning_rate": 4.520446096654276e-05, "loss": 0.2133, "step": 359 }, { "epoch": 1.2908438061041292, "grad_norm": 0.2578125, "learning_rate": 4.518587360594796e-05, "loss": 0.2535, "step": 360 }, { "epoch": 1.2944344703770199, "grad_norm": 0.166015625, "learning_rate": 4.516728624535316e-05, "loss": 0.3225, "step": 361 }, { "epoch": 1.2980251346499103, "grad_norm": 0.2275390625, "learning_rate": 4.514869888475837e-05, "loss": 0.4062, "step": 362 }, { "epoch": 1.3016157989228008, "grad_norm": 0.2294921875, "learning_rate": 4.513011152416357e-05, "loss": 0.2444, "step": 363 }, { "epoch": 1.3052064631956912, "grad_norm": 0.2216796875, "learning_rate": 4.511152416356877e-05, "loss": 0.2864, "step": 364 }, { "epoch": 1.3087971274685817, "grad_norm": 0.1875, "learning_rate": 4.509293680297398e-05, "loss": 0.3735, "step": 365 }, { "epoch": 1.3123877917414721, "grad_norm": 0.1943359375, "learning_rate": 4.507434944237918e-05, "loss": 0.1985, "step": 366 }, { "epoch": 1.3159784560143626, "grad_norm": 0.220703125, "learning_rate": 4.5055762081784385e-05, "loss": 0.5282, "step": 367 }, { "epoch": 1.319569120287253, "grad_norm": 0.2294921875, "learning_rate": 4.503717472118959e-05, "loss": 0.2346, "step": 368 }, { "epoch": 1.3231597845601435, "grad_norm": 0.2412109375, "learning_rate": 4.50185873605948e-05, "loss": 0.2417, "step": 369 }, { "epoch": 1.3267504488330342, "grad_norm": 0.1728515625, "learning_rate": 4.5e-05, "loss": 0.3541, "step": 370 }, { "epoch": 1.3303411131059246, "grad_norm": 0.251953125, "learning_rate": 4.4981412639405204e-05, "loss": 0.4394, "step": 371 }, { "epoch": 1.333931777378815, "grad_norm": 0.22265625, "learning_rate": 4.496282527881041e-05, "loss": 0.2102, "step": 372 }, { "epoch": 1.3375224416517055, "grad_norm": 0.1806640625, "learning_rate": 4.4944237918215614e-05, "loss": 0.4646, "step": 373 }, { "epoch": 1.341113105924596, "grad_norm": 0.2119140625, "learning_rate": 4.4925650557620816e-05, "loss": 0.2573, "step": 374 }, { "epoch": 1.3447037701974867, "grad_norm": 0.2197265625, "learning_rate": 4.4907063197026024e-05, "loss": 0.6274, "step": 375 }, { "epoch": 1.3482944344703771, "grad_norm": 0.23828125, "learning_rate": 4.4888475836431226e-05, "loss": 0.2583, "step": 376 }, { "epoch": 1.3518850987432676, "grad_norm": 0.224609375, "learning_rate": 4.4869888475836434e-05, "loss": 0.2022, "step": 377 }, { "epoch": 1.355475763016158, "grad_norm": 0.224609375, "learning_rate": 4.485130111524164e-05, "loss": 0.5312, "step": 378 }, { "epoch": 1.3590664272890485, "grad_norm": 0.232421875, "learning_rate": 4.4832713754646844e-05, "loss": 0.6319, "step": 379 }, { "epoch": 1.362657091561939, "grad_norm": 0.232421875, "learning_rate": 4.4814126394052046e-05, "loss": 0.366, "step": 380 }, { "epoch": 1.3662477558348294, "grad_norm": 0.2119140625, "learning_rate": 4.4795539033457254e-05, "loss": 0.3536, "step": 381 }, { "epoch": 1.3698384201077198, "grad_norm": 0.26171875, "learning_rate": 4.4776951672862456e-05, "loss": 0.5689, "step": 382 }, { "epoch": 1.3734290843806103, "grad_norm": 0.2421875, "learning_rate": 4.475836431226766e-05, "loss": 0.5611, "step": 383 }, { "epoch": 1.377019748653501, "grad_norm": 0.1962890625, "learning_rate": 4.4739776951672866e-05, "loss": 0.3653, "step": 384 }, { "epoch": 1.3806104129263914, "grad_norm": 0.2236328125, "learning_rate": 4.472118959107807e-05, "loss": 0.2861, "step": 385 }, { "epoch": 1.3842010771992819, "grad_norm": 0.1962890625, "learning_rate": 4.4702602230483276e-05, "loss": 0.3731, "step": 386 }, { "epoch": 1.3877917414721723, "grad_norm": 0.2236328125, "learning_rate": 4.468401486988848e-05, "loss": 0.8125, "step": 387 }, { "epoch": 1.3913824057450628, "grad_norm": 0.1953125, "learning_rate": 4.4665427509293686e-05, "loss": 0.2126, "step": 388 }, { "epoch": 1.3949730700179535, "grad_norm": 0.21875, "learning_rate": 4.464684014869889e-05, "loss": 0.5073, "step": 389 }, { "epoch": 1.398563734290844, "grad_norm": 0.2060546875, "learning_rate": 4.462825278810409e-05, "loss": 0.5119, "step": 390 }, { "epoch": 1.4021543985637344, "grad_norm": 0.2197265625, "learning_rate": 4.46096654275093e-05, "loss": 0.5054, "step": 391 }, { "epoch": 1.4057450628366248, "grad_norm": 0.2119140625, "learning_rate": 4.45910780669145e-05, "loss": 0.2192, "step": 392 }, { "epoch": 1.4093357271095153, "grad_norm": 0.2177734375, "learning_rate": 4.45724907063197e-05, "loss": 0.553, "step": 393 }, { "epoch": 1.4129263913824057, "grad_norm": 0.2236328125, "learning_rate": 4.455390334572491e-05, "loss": 0.8202, "step": 394 }, { "epoch": 1.4165170556552962, "grad_norm": 0.208984375, "learning_rate": 4.453531598513012e-05, "loss": 0.269, "step": 395 }, { "epoch": 1.4201077199281866, "grad_norm": 0.216796875, "learning_rate": 4.451672862453532e-05, "loss": 0.2566, "step": 396 }, { "epoch": 1.423698384201077, "grad_norm": 0.212890625, "learning_rate": 4.449814126394053e-05, "loss": 0.2365, "step": 397 }, { "epoch": 1.4272890484739678, "grad_norm": 0.2265625, "learning_rate": 4.447955390334573e-05, "loss": 0.3126, "step": 398 }, { "epoch": 1.4308797127468582, "grad_norm": 0.2470703125, "learning_rate": 4.446096654275093e-05, "loss": 0.6362, "step": 399 }, { "epoch": 1.4344703770197487, "grad_norm": 0.2294921875, "learning_rate": 4.444237918215614e-05, "loss": 0.3714, "step": 400 }, { "epoch": 1.4380610412926391, "grad_norm": 0.234375, "learning_rate": 4.442379182156134e-05, "loss": 0.3601, "step": 401 }, { "epoch": 1.4416517055655296, "grad_norm": 0.2109375, "learning_rate": 4.440520446096654e-05, "loss": 0.2388, "step": 402 }, { "epoch": 1.44524236983842, "grad_norm": 0.17578125, "learning_rate": 4.4386617100371743e-05, "loss": 0.5235, "step": 403 }, { "epoch": 1.4488330341113107, "grad_norm": 0.21875, "learning_rate": 4.436802973977696e-05, "loss": 0.245, "step": 404 }, { "epoch": 1.4524236983842012, "grad_norm": 0.201171875, "learning_rate": 4.434944237918216e-05, "loss": 0.5509, "step": 405 }, { "epoch": 1.4560143626570916, "grad_norm": 0.2333984375, "learning_rate": 4.433085501858736e-05, "loss": 0.2173, "step": 406 }, { "epoch": 1.459605026929982, "grad_norm": 0.25, "learning_rate": 4.431226765799257e-05, "loss": 0.493, "step": 407 }, { "epoch": 1.4631956912028725, "grad_norm": 0.220703125, "learning_rate": 4.429368029739777e-05, "loss": 0.6478, "step": 408 }, { "epoch": 1.466786355475763, "grad_norm": 0.2216796875, "learning_rate": 4.427509293680297e-05, "loss": 0.5515, "step": 409 }, { "epoch": 1.4703770197486534, "grad_norm": 0.263671875, "learning_rate": 4.425650557620818e-05, "loss": 0.7162, "step": 410 }, { "epoch": 1.4739676840215439, "grad_norm": 0.2216796875, "learning_rate": 4.423791821561338e-05, "loss": 0.5562, "step": 411 }, { "epoch": 1.4775583482944343, "grad_norm": 0.205078125, "learning_rate": 4.4219330855018585e-05, "loss": 0.2492, "step": 412 }, { "epoch": 1.481149012567325, "grad_norm": 0.2041015625, "learning_rate": 4.420074349442379e-05, "loss": 0.2045, "step": 413 }, { "epoch": 1.4847396768402155, "grad_norm": 0.228515625, "learning_rate": 4.4182156133829e-05, "loss": 0.6559, "step": 414 }, { "epoch": 1.488330341113106, "grad_norm": 0.205078125, "learning_rate": 4.41635687732342e-05, "loss": 0.2367, "step": 415 }, { "epoch": 1.4919210053859964, "grad_norm": 0.19921875, "learning_rate": 4.4144981412639405e-05, "loss": 0.2112, "step": 416 }, { "epoch": 1.4955116696588868, "grad_norm": 0.2314453125, "learning_rate": 4.412639405204461e-05, "loss": 0.2841, "step": 417 }, { "epoch": 1.4991023339317775, "grad_norm": 0.2255859375, "learning_rate": 4.4107806691449815e-05, "loss": 0.4962, "step": 418 }, { "epoch": 1.502692998204668, "grad_norm": 0.2041015625, "learning_rate": 4.4089219330855016e-05, "loss": 0.5164, "step": 419 }, { "epoch": 1.5062836624775584, "grad_norm": 0.2109375, "learning_rate": 4.4070631970260225e-05, "loss": 0.2734, "step": 420 }, { "epoch": 1.5098743267504489, "grad_norm": 0.2451171875, "learning_rate": 4.4052044609665426e-05, "loss": 0.3068, "step": 421 }, { "epoch": 1.5134649910233393, "grad_norm": 0.2412109375, "learning_rate": 4.4033457249070635e-05, "loss": 0.5347, "step": 422 }, { "epoch": 1.5170556552962298, "grad_norm": 0.2197265625, "learning_rate": 4.401486988847584e-05, "loss": 0.2524, "step": 423 }, { "epoch": 1.5206463195691202, "grad_norm": 0.2236328125, "learning_rate": 4.3996282527881045e-05, "loss": 0.7398, "step": 424 }, { "epoch": 1.5242369838420107, "grad_norm": 0.185546875, "learning_rate": 4.3977695167286246e-05, "loss": 0.3826, "step": 425 }, { "epoch": 1.5278276481149011, "grad_norm": 0.228515625, "learning_rate": 4.3959107806691455e-05, "loss": 0.4545, "step": 426 }, { "epoch": 1.5314183123877916, "grad_norm": 0.2275390625, "learning_rate": 4.3940520446096656e-05, "loss": 0.2171, "step": 427 }, { "epoch": 1.5350089766606823, "grad_norm": 0.2236328125, "learning_rate": 4.392193308550186e-05, "loss": 0.2556, "step": 428 }, { "epoch": 1.5385996409335727, "grad_norm": 0.22265625, "learning_rate": 4.3903345724907066e-05, "loss": 0.4926, "step": 429 }, { "epoch": 1.5421903052064632, "grad_norm": 0.2470703125, "learning_rate": 4.388475836431227e-05, "loss": 0.6468, "step": 430 }, { "epoch": 1.5457809694793538, "grad_norm": 0.208984375, "learning_rate": 4.3866171003717476e-05, "loss": 0.2415, "step": 431 }, { "epoch": 1.5493716337522443, "grad_norm": 0.240234375, "learning_rate": 4.384758364312268e-05, "loss": 0.2634, "step": 432 }, { "epoch": 1.5529622980251347, "grad_norm": 0.2216796875, "learning_rate": 4.3828996282527886e-05, "loss": 0.3863, "step": 433 }, { "epoch": 1.5565529622980252, "grad_norm": 0.2177734375, "learning_rate": 4.381040892193309e-05, "loss": 0.3253, "step": 434 }, { "epoch": 1.5601436265709157, "grad_norm": 0.2138671875, "learning_rate": 4.379182156133829e-05, "loss": 0.6391, "step": 435 }, { "epoch": 1.563734290843806, "grad_norm": 0.173828125, "learning_rate": 4.37732342007435e-05, "loss": 0.3246, "step": 436 }, { "epoch": 1.5673249551166966, "grad_norm": 0.216796875, "learning_rate": 4.37546468401487e-05, "loss": 0.5692, "step": 437 }, { "epoch": 1.570915619389587, "grad_norm": 0.2412109375, "learning_rate": 4.37360594795539e-05, "loss": 0.2787, "step": 438 }, { "epoch": 1.5745062836624775, "grad_norm": 0.2421875, "learning_rate": 4.371747211895911e-05, "loss": 0.2529, "step": 439 }, { "epoch": 1.578096947935368, "grad_norm": 0.232421875, "learning_rate": 4.369888475836432e-05, "loss": 0.2168, "step": 440 }, { "epoch": 1.5816876122082584, "grad_norm": 0.2451171875, "learning_rate": 4.368029739776952e-05, "loss": 0.3306, "step": 441 }, { "epoch": 1.585278276481149, "grad_norm": 0.1923828125, "learning_rate": 4.366171003717473e-05, "loss": 0.3574, "step": 442 }, { "epoch": 1.5888689407540395, "grad_norm": 0.212890625, "learning_rate": 4.364312267657993e-05, "loss": 0.2277, "step": 443 }, { "epoch": 1.59245960502693, "grad_norm": 0.25390625, "learning_rate": 4.362453531598513e-05, "loss": 0.4842, "step": 444 }, { "epoch": 1.5960502692998204, "grad_norm": 0.2275390625, "learning_rate": 4.360594795539034e-05, "loss": 0.623, "step": 445 }, { "epoch": 1.599640933572711, "grad_norm": 0.2080078125, "learning_rate": 4.358736059479554e-05, "loss": 0.2313, "step": 446 }, { "epoch": 1.6032315978456015, "grad_norm": 0.220703125, "learning_rate": 4.356877323420074e-05, "loss": 0.3802, "step": 447 }, { "epoch": 1.606822262118492, "grad_norm": 0.220703125, "learning_rate": 4.355018587360595e-05, "loss": 0.6244, "step": 448 }, { "epoch": 1.6104129263913824, "grad_norm": 0.1572265625, "learning_rate": 4.353159851301116e-05, "loss": 0.287, "step": 449 }, { "epoch": 1.614003590664273, "grad_norm": 0.2236328125, "learning_rate": 4.351301115241636e-05, "loss": 0.5317, "step": 450 }, { "epoch": 1.6175942549371634, "grad_norm": 0.212890625, "learning_rate": 4.349442379182156e-05, "loss": 0.241, "step": 451 }, { "epoch": 1.6211849192100538, "grad_norm": 0.22265625, "learning_rate": 4.347583643122677e-05, "loss": 0.2728, "step": 452 }, { "epoch": 1.6247755834829443, "grad_norm": 0.2265625, "learning_rate": 4.345724907063197e-05, "loss": 0.4384, "step": 453 }, { "epoch": 1.6283662477558347, "grad_norm": 0.2451171875, "learning_rate": 4.3438661710037174e-05, "loss": 0.3325, "step": 454 }, { "epoch": 1.6319569120287252, "grad_norm": 0.21484375, "learning_rate": 4.342007434944238e-05, "loss": 0.263, "step": 455 }, { "epoch": 1.6355475763016158, "grad_norm": 0.169921875, "learning_rate": 4.3401486988847584e-05, "loss": 0.301, "step": 456 }, { "epoch": 1.6391382405745063, "grad_norm": 0.216796875, "learning_rate": 4.3382899628252785e-05, "loss": 0.2594, "step": 457 }, { "epoch": 1.6427289048473968, "grad_norm": 0.2119140625, "learning_rate": 4.3364312267658e-05, "loss": 0.205, "step": 458 }, { "epoch": 1.6463195691202872, "grad_norm": 0.2255859375, "learning_rate": 4.33457249070632e-05, "loss": 0.7932, "step": 459 }, { "epoch": 1.6499102333931779, "grad_norm": 0.240234375, "learning_rate": 4.3327137546468404e-05, "loss": 0.2878, "step": 460 }, { "epoch": 1.6535008976660683, "grad_norm": 0.1923828125, "learning_rate": 4.330855018587361e-05, "loss": 0.2058, "step": 461 }, { "epoch": 1.6570915619389588, "grad_norm": 0.212890625, "learning_rate": 4.3289962825278813e-05, "loss": 0.2042, "step": 462 }, { "epoch": 1.6606822262118492, "grad_norm": 0.244140625, "learning_rate": 4.3271375464684015e-05, "loss": 0.2794, "step": 463 }, { "epoch": 1.6642728904847397, "grad_norm": 0.251953125, "learning_rate": 4.3252788104089223e-05, "loss": 0.5715, "step": 464 }, { "epoch": 1.6678635547576302, "grad_norm": 0.2490234375, "learning_rate": 4.3234200743494425e-05, "loss": 0.6816, "step": 465 }, { "epoch": 1.6714542190305206, "grad_norm": 0.2197265625, "learning_rate": 4.3215613382899627e-05, "loss": 0.5783, "step": 466 }, { "epoch": 1.675044883303411, "grad_norm": 0.208984375, "learning_rate": 4.3197026022304835e-05, "loss": 0.5576, "step": 467 }, { "epoch": 1.6786355475763015, "grad_norm": 0.19140625, "learning_rate": 4.317843866171004e-05, "loss": 0.3913, "step": 468 }, { "epoch": 1.682226211849192, "grad_norm": 0.1787109375, "learning_rate": 4.3159851301115245e-05, "loss": 0.3219, "step": 469 }, { "epoch": 1.6858168761220824, "grad_norm": 0.1796875, "learning_rate": 4.3141263940520447e-05, "loss": 0.2854, "step": 470 }, { "epoch": 1.689407540394973, "grad_norm": 0.23828125, "learning_rate": 4.3122676579925655e-05, "loss": 0.1921, "step": 471 }, { "epoch": 1.6929982046678635, "grad_norm": 0.2314453125, "learning_rate": 4.3104089219330856e-05, "loss": 0.5357, "step": 472 }, { "epoch": 1.696588868940754, "grad_norm": 0.25390625, "learning_rate": 4.308550185873606e-05, "loss": 0.4323, "step": 473 }, { "epoch": 1.7001795332136447, "grad_norm": 0.2060546875, "learning_rate": 4.3066914498141266e-05, "loss": 0.5633, "step": 474 }, { "epoch": 1.7037701974865351, "grad_norm": 0.21484375, "learning_rate": 4.304832713754647e-05, "loss": 0.2324, "step": 475 }, { "epoch": 1.7073608617594256, "grad_norm": 0.2275390625, "learning_rate": 4.3029739776951676e-05, "loss": 0.4631, "step": 476 }, { "epoch": 1.710951526032316, "grad_norm": 0.2333984375, "learning_rate": 4.301115241635688e-05, "loss": 0.2248, "step": 477 }, { "epoch": 1.7145421903052065, "grad_norm": 0.2431640625, "learning_rate": 4.2992565055762086e-05, "loss": 0.2052, "step": 478 }, { "epoch": 1.718132854578097, "grad_norm": 0.2392578125, "learning_rate": 4.297397769516729e-05, "loss": 0.2809, "step": 479 }, { "epoch": 1.7217235188509874, "grad_norm": 0.21484375, "learning_rate": 4.295539033457249e-05, "loss": 0.2086, "step": 480 }, { "epoch": 1.7253141831238779, "grad_norm": 0.28515625, "learning_rate": 4.29368029739777e-05, "loss": 0.4203, "step": 481 }, { "epoch": 1.7289048473967683, "grad_norm": 0.287109375, "learning_rate": 4.29182156133829e-05, "loss": 0.2797, "step": 482 }, { "epoch": 1.7324955116696588, "grad_norm": 0.228515625, "learning_rate": 4.28996282527881e-05, "loss": 0.4699, "step": 483 }, { "epoch": 1.7360861759425492, "grad_norm": 0.171875, "learning_rate": 4.288104089219331e-05, "loss": 0.2998, "step": 484 }, { "epoch": 1.73967684021544, "grad_norm": 0.185546875, "learning_rate": 4.286245353159852e-05, "loss": 0.4453, "step": 485 }, { "epoch": 1.7432675044883303, "grad_norm": 0.23828125, "learning_rate": 4.284386617100372e-05, "loss": 0.2455, "step": 486 }, { "epoch": 1.7468581687612208, "grad_norm": 0.2265625, "learning_rate": 4.282527881040893e-05, "loss": 0.4399, "step": 487 }, { "epoch": 1.7504488330341115, "grad_norm": 0.2255859375, "learning_rate": 4.280669144981413e-05, "loss": 0.4758, "step": 488 }, { "epoch": 1.754039497307002, "grad_norm": 0.2001953125, "learning_rate": 4.278810408921933e-05, "loss": 0.301, "step": 489 }, { "epoch": 1.7576301615798924, "grad_norm": 0.283203125, "learning_rate": 4.276951672862454e-05, "loss": 0.3291, "step": 490 }, { "epoch": 1.7612208258527828, "grad_norm": 0.2421875, "learning_rate": 4.275092936802974e-05, "loss": 0.2654, "step": 491 }, { "epoch": 1.7648114901256733, "grad_norm": 0.25, "learning_rate": 4.273234200743494e-05, "loss": 0.2526, "step": 492 }, { "epoch": 1.7684021543985637, "grad_norm": 0.2470703125, "learning_rate": 4.271375464684015e-05, "loss": 0.2548, "step": 493 }, { "epoch": 1.7719928186714542, "grad_norm": 0.2490234375, "learning_rate": 4.269516728624536e-05, "loss": 0.2842, "step": 494 }, { "epoch": 1.7755834829443446, "grad_norm": 0.2470703125, "learning_rate": 4.267657992565056e-05, "loss": 0.2201, "step": 495 }, { "epoch": 1.779174147217235, "grad_norm": 0.2734375, "learning_rate": 4.265799256505576e-05, "loss": 0.341, "step": 496 }, { "epoch": 1.7827648114901256, "grad_norm": 0.201171875, "learning_rate": 4.263940520446097e-05, "loss": 0.2174, "step": 497 }, { "epoch": 1.786355475763016, "grad_norm": 0.2890625, "learning_rate": 4.262081784386617e-05, "loss": 0.6514, "step": 498 }, { "epoch": 1.7899461400359067, "grad_norm": 0.2392578125, "learning_rate": 4.2602230483271374e-05, "loss": 0.328, "step": 499 }, { "epoch": 1.7935368043087971, "grad_norm": 0.322265625, "learning_rate": 4.258364312267658e-05, "loss": 0.5382, "step": 500 }, { "epoch": 1.7971274685816876, "grad_norm": 0.25390625, "learning_rate": 4.2565055762081784e-05, "loss": 0.4579, "step": 501 }, { "epoch": 1.800718132854578, "grad_norm": 0.2177734375, "learning_rate": 4.2546468401486986e-05, "loss": 0.1797, "step": 502 }, { "epoch": 1.8043087971274687, "grad_norm": 0.240234375, "learning_rate": 4.25278810408922e-05, "loss": 0.3398, "step": 503 }, { "epoch": 1.8078994614003592, "grad_norm": 0.275390625, "learning_rate": 4.25092936802974e-05, "loss": 0.3381, "step": 504 }, { "epoch": 1.8114901256732496, "grad_norm": 0.236328125, "learning_rate": 4.2490706319702604e-05, "loss": 0.5352, "step": 505 }, { "epoch": 1.81508078994614, "grad_norm": 0.267578125, "learning_rate": 4.247211895910781e-05, "loss": 0.3696, "step": 506 }, { "epoch": 1.8186714542190305, "grad_norm": 0.21484375, "learning_rate": 4.2453531598513014e-05, "loss": 0.2222, "step": 507 }, { "epoch": 1.822262118491921, "grad_norm": 0.234375, "learning_rate": 4.2434944237918215e-05, "loss": 0.4542, "step": 508 }, { "epoch": 1.8258527827648114, "grad_norm": 0.21484375, "learning_rate": 4.2416356877323424e-05, "loss": 0.2314, "step": 509 }, { "epoch": 1.829443447037702, "grad_norm": 0.2197265625, "learning_rate": 4.2397769516728625e-05, "loss": 0.8278, "step": 510 }, { "epoch": 1.8330341113105924, "grad_norm": 0.2255859375, "learning_rate": 4.237918215613383e-05, "loss": 0.4584, "step": 511 }, { "epoch": 1.8366247755834828, "grad_norm": 0.1767578125, "learning_rate": 4.2360594795539035e-05, "loss": 0.3394, "step": 512 }, { "epoch": 1.8402154398563735, "grad_norm": 0.2099609375, "learning_rate": 4.2342007434944244e-05, "loss": 0.4449, "step": 513 }, { "epoch": 1.843806104129264, "grad_norm": 0.21875, "learning_rate": 4.2323420074349445e-05, "loss": 0.2344, "step": 514 }, { "epoch": 1.8473967684021544, "grad_norm": 0.2080078125, "learning_rate": 4.230483271375465e-05, "loss": 0.4271, "step": 515 }, { "epoch": 1.8509874326750448, "grad_norm": 0.2177734375, "learning_rate": 4.2286245353159855e-05, "loss": 0.2834, "step": 516 }, { "epoch": 1.8545780969479355, "grad_norm": 0.2255859375, "learning_rate": 4.226765799256506e-05, "loss": 0.3447, "step": 517 }, { "epoch": 1.858168761220826, "grad_norm": 0.2099609375, "learning_rate": 4.224907063197026e-05, "loss": 0.25, "step": 518 }, { "epoch": 1.8617594254937164, "grad_norm": 0.26953125, "learning_rate": 4.223048327137547e-05, "loss": 0.3412, "step": 519 }, { "epoch": 1.8653500897666069, "grad_norm": 0.21875, "learning_rate": 4.221189591078067e-05, "loss": 0.2483, "step": 520 }, { "epoch": 1.8689407540394973, "grad_norm": 0.2275390625, "learning_rate": 4.219330855018588e-05, "loss": 0.5079, "step": 521 }, { "epoch": 1.8725314183123878, "grad_norm": 0.1787109375, "learning_rate": 4.2174721189591085e-05, "loss": 0.3158, "step": 522 }, { "epoch": 1.8761220825852782, "grad_norm": 0.2021484375, "learning_rate": 4.215613382899629e-05, "loss": 0.2663, "step": 523 }, { "epoch": 1.8797127468581687, "grad_norm": 0.26953125, "learning_rate": 4.213754646840149e-05, "loss": 0.55, "step": 524 }, { "epoch": 1.8833034111310591, "grad_norm": 0.2158203125, "learning_rate": 4.21189591078067e-05, "loss": 0.194, "step": 525 }, { "epoch": 1.8868940754039496, "grad_norm": 0.23828125, "learning_rate": 4.21003717472119e-05, "loss": 0.4912, "step": 526 }, { "epoch": 1.89048473967684, "grad_norm": 0.2041015625, "learning_rate": 4.20817843866171e-05, "loss": 0.2155, "step": 527 }, { "epoch": 1.8940754039497307, "grad_norm": 0.283203125, "learning_rate": 4.206319702602231e-05, "loss": 0.4561, "step": 528 }, { "epoch": 1.8976660682226212, "grad_norm": 0.232421875, "learning_rate": 4.204460966542751e-05, "loss": 0.2733, "step": 529 }, { "epoch": 1.9012567324955116, "grad_norm": 0.248046875, "learning_rate": 4.202602230483272e-05, "loss": 0.2774, "step": 530 }, { "epoch": 1.9048473967684023, "grad_norm": 0.228515625, "learning_rate": 4.200743494423792e-05, "loss": 0.2533, "step": 531 }, { "epoch": 1.9084380610412928, "grad_norm": 0.263671875, "learning_rate": 4.198884758364313e-05, "loss": 0.2638, "step": 532 }, { "epoch": 1.9120287253141832, "grad_norm": 0.25, "learning_rate": 4.197026022304833e-05, "loss": 0.4109, "step": 533 }, { "epoch": 1.9156193895870737, "grad_norm": 0.2236328125, "learning_rate": 4.195167286245353e-05, "loss": 0.4329, "step": 534 }, { "epoch": 1.9192100538599641, "grad_norm": 0.2470703125, "learning_rate": 4.193308550185874e-05, "loss": 0.5587, "step": 535 }, { "epoch": 1.9228007181328546, "grad_norm": 0.2001953125, "learning_rate": 4.191449814126394e-05, "loss": 0.3477, "step": 536 }, { "epoch": 1.926391382405745, "grad_norm": 0.267578125, "learning_rate": 4.189591078066914e-05, "loss": 0.6133, "step": 537 }, { "epoch": 1.9299820466786355, "grad_norm": 0.259765625, "learning_rate": 4.187732342007435e-05, "loss": 0.4435, "step": 538 }, { "epoch": 1.933572710951526, "grad_norm": 0.2421875, "learning_rate": 4.185873605947956e-05, "loss": 0.2621, "step": 539 }, { "epoch": 1.9371633752244164, "grad_norm": 0.228515625, "learning_rate": 4.184014869888476e-05, "loss": 0.2486, "step": 540 }, { "epoch": 1.9407540394973068, "grad_norm": 0.1884765625, "learning_rate": 4.182156133828997e-05, "loss": 0.4535, "step": 541 }, { "epoch": 1.9443447037701975, "grad_norm": 0.2490234375, "learning_rate": 4.180297397769517e-05, "loss": 0.2746, "step": 542 }, { "epoch": 1.947935368043088, "grad_norm": 0.2451171875, "learning_rate": 4.178438661710037e-05, "loss": 0.2327, "step": 543 }, { "epoch": 1.9515260323159784, "grad_norm": 0.216796875, "learning_rate": 4.176579925650558e-05, "loss": 0.5695, "step": 544 }, { "epoch": 1.9551166965888689, "grad_norm": 0.21875, "learning_rate": 4.174721189591078e-05, "loss": 0.1911, "step": 545 }, { "epoch": 1.9587073608617596, "grad_norm": 0.236328125, "learning_rate": 4.1728624535315984e-05, "loss": 0.7008, "step": 546 }, { "epoch": 1.96229802513465, "grad_norm": 0.171875, "learning_rate": 4.171003717472119e-05, "loss": 0.3025, "step": 547 }, { "epoch": 1.9658886894075405, "grad_norm": 0.255859375, "learning_rate": 4.16914498141264e-05, "loss": 0.4982, "step": 548 }, { "epoch": 1.969479353680431, "grad_norm": 0.2275390625, "learning_rate": 4.16728624535316e-05, "loss": 0.258, "step": 549 }, { "epoch": 1.9730700179533214, "grad_norm": 0.228515625, "learning_rate": 4.1654275092936804e-05, "loss": 0.2937, "step": 550 }, { "epoch": 1.9766606822262118, "grad_norm": 0.25390625, "learning_rate": 4.163568773234201e-05, "loss": 0.2185, "step": 551 }, { "epoch": 1.9802513464991023, "grad_norm": 0.26953125, "learning_rate": 4.1617100371747214e-05, "loss": 0.4574, "step": 552 }, { "epoch": 1.9838420107719927, "grad_norm": 0.25, "learning_rate": 4.1598513011152416e-05, "loss": 0.6302, "step": 553 }, { "epoch": 1.9874326750448832, "grad_norm": 0.2353515625, "learning_rate": 4.1579925650557624e-05, "loss": 0.3314, "step": 554 }, { "epoch": 1.9910233393177736, "grad_norm": 0.294921875, "learning_rate": 4.1561338289962826e-05, "loss": 0.5739, "step": 555 }, { "epoch": 1.9946140035906643, "grad_norm": 0.271484375, "learning_rate": 4.154275092936803e-05, "loss": 0.665, "step": 556 }, { "epoch": 1.9982046678635548, "grad_norm": 0.2431640625, "learning_rate": 4.1524163568773236e-05, "loss": 0.2907, "step": 557 }, { "epoch": 2.0, "grad_norm": 0.421875, "learning_rate": 4.1505576208178444e-05, "loss": 0.246, "step": 558 }, { "epoch": 2.0, "eval_loss": 0.363563597202301, "eval_model_preparation_time": 0.008, "eval_runtime": 3.9845, "eval_samples_per_second": 1.255, "eval_steps_per_second": 1.255, "step": 558 }, { "epoch": 2.0, "eval_loss": 0.7576545476913452, "eval_model_preparation_time": 0.008, "eval_runtime": 3.0768, "eval_samples_per_second": 1.625, "eval_steps_per_second": 1.625, "step": 558 }, { "epoch": 2.0, "eval_loss": 0.43855899572372437, "eval_model_preparation_time": 0.008, "eval_runtime": 4.5631, "eval_samples_per_second": 1.096, "eval_steps_per_second": 1.096, "step": 558 }, { "epoch": 2.0, "eval_loss": 0.955260157585144, "eval_model_preparation_time": 0.008, "eval_runtime": 4.7659, "eval_samples_per_second": 1.049, "eval_steps_per_second": 1.049, "step": 558 } ], "logging_steps": 1, "max_steps": 2790, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.660680277697311e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }