{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.25, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "Batch Mean": -0.7529067993164062, "accuracy": 0.4609375, "epoch": 0, "step": 0 }, { "epoch": 0.0025, "grad_norm": 68.71814727783203, "learning_rate": 1.5000000000000002e-07, "loss": 2.2308, "step": 1 }, { "Batch Mean": -0.7621135711669922, "accuracy": 0.6015625, "epoch": 0.0025, "step": 1 }, { "epoch": 0.005, "grad_norm": 69.99313354492188, "learning_rate": 3.0000000000000004e-07, "loss": 2.2058, "step": 2 }, { "Batch Mean": -0.7623519897460938, "accuracy": 0.5, "epoch": 0.005, "step": 2 }, { "epoch": 0.0075, "grad_norm": 69.62191009521484, "learning_rate": 4.5e-07, "loss": 2.2136, "step": 3 }, { "Batch Mean": -0.7178099155426025, "accuracy": 0.5390625, "epoch": 0.0075, "step": 3 }, { "epoch": 0.01, "grad_norm": 66.7057876586914, "learning_rate": 6.000000000000001e-07, "loss": 2.1995, "step": 4 }, { "Batch Mean": -0.73297119140625, "accuracy": 0.5, "epoch": 0.01, "step": 4 }, { "epoch": 0.0125, "grad_norm": 68.0609359741211, "learning_rate": 7.5e-07, "loss": 2.2217, "step": 5 }, { "Batch Mean": -0.65020751953125, "accuracy": 0.515625, "epoch": 0.0125, "step": 5 }, { "epoch": 0.015, "grad_norm": 60.05727767944336, "learning_rate": 9e-07, "loss": 2.1842, "step": 6 }, { "Batch Mean": -0.4335050582885742, "accuracy": 0.4765625, "epoch": 0.015, "step": 6 }, { "epoch": 0.0175, "grad_norm": 42.355037689208984, "learning_rate": 1.05e-06, "loss": 2.1272, "step": 7 }, { "Batch Mean": -0.34102821350097656, "accuracy": 0.59375, "epoch": 0.0175, "step": 7 }, { "epoch": 0.02, "grad_norm": 33.14927291870117, "learning_rate": 1.2000000000000002e-06, "loss": 2.0863, "step": 8 }, { "Batch Mean": 0.4715290069580078, "accuracy": 0.5078125, "epoch": 0.02, "step": 8 }, { "epoch": 0.0225, "grad_norm": 44.73906326293945, "learning_rate": 1.35e-06, "loss": 2.1269, "step": 9 }, { "Batch Mean": 0.5773391723632812, "accuracy": 0.4921875, "epoch": 0.0225, "step": 9 }, { "epoch": 0.025, "grad_norm": 53.71864318847656, "learning_rate": 1.5e-06, "loss": 2.1479, "step": 10 }, { "Batch Mean": 0.6384601593017578, "accuracy": 0.5859375, "epoch": 0.025, "step": 10 }, { "epoch": 0.0275, "grad_norm": 59.3089714050293, "learning_rate": 1.65e-06, "loss": 2.1613, "step": 11 }, { "Batch Mean": 0.7264938354492188, "accuracy": 0.609375, "epoch": 0.0275, "step": 11 }, { "epoch": 0.03, "grad_norm": 66.10692596435547, "learning_rate": 1.8e-06, "loss": 2.1863, "step": 12 }, { "Batch Mean": 0.7215728759765625, "accuracy": 0.6015625, "epoch": 0.03, "step": 12 }, { "epoch": 0.0325, "grad_norm": 65.17693328857422, "learning_rate": 1.95e-06, "loss": 2.1765, "step": 13 }, { "Batch Mean": 0.6965312957763672, "accuracy": 0.6796875, "epoch": 0.0325, "step": 13 }, { "epoch": 0.035, "grad_norm": 62.07670593261719, "learning_rate": 2.1e-06, "loss": 2.1453, "step": 14 }, { "Batch Mean": 0.5494976043701172, "accuracy": 0.609375, "epoch": 0.035, "step": 14 }, { "epoch": 0.0375, "grad_norm": 49.54670333862305, "learning_rate": 2.25e-06, "loss": 2.1056, "step": 15 }, { "Batch Mean": 0.3555606007575989, "accuracy": 0.5859375, "epoch": 0.0375, "step": 15 }, { "epoch": 0.04, "grad_norm": 32.00790786743164, "learning_rate": 2.4000000000000003e-06, "loss": 2.0704, "step": 16 }, { "Batch Mean": -0.04851384460926056, "accuracy": 0.625, "epoch": 0.04, "step": 16 }, { "epoch": 0.0425, "grad_norm": 6.706290245056152, "learning_rate": 2.55e-06, "loss": 2.0362, "step": 17 }, { "Batch Mean": -0.2664855122566223, "accuracy": 0.6796875, "epoch": 0.0425, "step": 17 }, { "epoch": 0.045, "grad_norm": 25.608001708984375, "learning_rate": 2.7e-06, "loss": 2.0153, "step": 18 }, { "Batch Mean": -0.30804145336151123, "accuracy": 0.6484375, "epoch": 0.045, "step": 18 }, { "epoch": 0.0475, "grad_norm": 28.354061126708984, "learning_rate": 2.85e-06, "loss": 2.025, "step": 19 }, { "Batch Mean": -0.19035214185714722, "accuracy": 0.640625, "epoch": 0.0475, "step": 19 }, { "epoch": 0.05, "grad_norm": 17.62181282043457, "learning_rate": 3e-06, "loss": 2.0205, "step": 20 }, { "Batch Mean": 0.039848215878009796, "accuracy": 0.671875, "epoch": 0.05, "step": 20 }, { "epoch": 0.0525, "grad_norm": 6.653412818908691, "learning_rate": 2.992105263157895e-06, "loss": 1.9401, "step": 21 }, { "Batch Mean": 0.1799747347831726, "accuracy": 0.703125, "epoch": 0.0525, "step": 21 }, { "epoch": 0.055, "grad_norm": 17.049795150756836, "learning_rate": 2.9842105263157896e-06, "loss": 1.9494, "step": 22 }, { "Batch Mean": 0.24787741899490356, "accuracy": 0.671875, "epoch": 0.055, "step": 22 }, { "epoch": 0.0575, "grad_norm": 23.90768814086914, "learning_rate": 2.9763157894736843e-06, "loss": 1.9571, "step": 23 }, { "Batch Mean": 0.19024890661239624, "accuracy": 0.6640625, "epoch": 0.0575, "step": 23 }, { "epoch": 0.06, "grad_norm": 20.936309814453125, "learning_rate": 2.968421052631579e-06, "loss": 1.956, "step": 24 }, { "Batch Mean": -0.17568430304527283, "accuracy": 0.6640625, "epoch": 0.06, "step": 24 }, { "epoch": 0.0625, "grad_norm": 15.15001106262207, "learning_rate": 2.960526315789474e-06, "loss": 1.9323, "step": 25 }, { "Batch Mean": -0.42217159271240234, "accuracy": 0.6796875, "epoch": 0.0625, "step": 25 }, { "epoch": 0.065, "grad_norm": 36.079620361328125, "learning_rate": 2.9526315789473685e-06, "loss": 1.92, "step": 26 }, { "Batch Mean": -0.343856543302536, "accuracy": 0.7265625, "epoch": 0.065, "step": 26 }, { "epoch": 0.0675, "grad_norm": 27.65888786315918, "learning_rate": 2.9447368421052633e-06, "loss": 1.8904, "step": 27 }, { "Batch Mean": -0.018065452575683594, "accuracy": 0.734375, "epoch": 0.0675, "step": 27 }, { "epoch": 0.07, "grad_norm": 12.963483810424805, "learning_rate": 2.936842105263158e-06, "loss": 1.8525, "step": 28 }, { "Batch Mean": 0.1784379482269287, "accuracy": 0.671875, "epoch": 0.07, "step": 28 }, { "epoch": 0.0725, "grad_norm": 24.365882873535156, "learning_rate": 2.9289473684210528e-06, "loss": 1.9962, "step": 29 }, { "Batch Mean": 0.030784010887145996, "accuracy": 0.6640625, "epoch": 0.0725, "step": 29 }, { "epoch": 0.075, "grad_norm": 14.051633834838867, "learning_rate": 2.9210526315789475e-06, "loss": 1.9876, "step": 30 }, { "Batch Mean": -0.15207087993621826, "accuracy": 0.734375, "epoch": 0.075, "step": 30 }, { "epoch": 0.0775, "grad_norm": 14.313821792602539, "learning_rate": 2.9131578947368423e-06, "loss": 1.8637, "step": 31 }, { "Batch Mean": -0.18611329793930054, "accuracy": 0.671875, "epoch": 0.0775, "step": 31 }, { "epoch": 0.08, "grad_norm": 16.775203704833984, "learning_rate": 2.905263157894737e-06, "loss": 1.8864, "step": 32 }, { "Batch Mean": -0.024571657180786133, "accuracy": 0.765625, "epoch": 0.08, "step": 32 }, { "epoch": 0.0825, "grad_norm": 8.754944801330566, "learning_rate": 2.8973684210526318e-06, "loss": 1.7853, "step": 33 }, { "Batch Mean": -0.02640557289123535, "accuracy": 0.65625, "epoch": 0.0825, "step": 33 }, { "epoch": 0.085, "grad_norm": 9.384634017944336, "learning_rate": 2.8894736842105265e-06, "loss": 1.8665, "step": 34 }, { "Batch Mean": 0.21018171310424805, "accuracy": 0.796875, "epoch": 0.085, "step": 34 }, { "epoch": 0.0875, "grad_norm": 19.718061447143555, "learning_rate": 2.8815789473684213e-06, "loss": 1.7445, "step": 35 }, { "Batch Mean": 0.13466757535934448, "accuracy": 0.7578125, "epoch": 0.0875, "step": 35 }, { "epoch": 0.09, "grad_norm": 15.639932632446289, "learning_rate": 2.873684210526316e-06, "loss": 1.8598, "step": 36 }, { "Batch Mean": -0.1862117350101471, "accuracy": 0.7109375, "epoch": 0.09, "step": 36 }, { "epoch": 0.0925, "grad_norm": 17.499744415283203, "learning_rate": 2.8657894736842103e-06, "loss": 1.8443, "step": 37 }, { "Batch Mean": -0.22551095485687256, "accuracy": 0.7265625, "epoch": 0.0925, "step": 37 }, { "epoch": 0.095, "grad_norm": 20.46396255493164, "learning_rate": 2.857894736842105e-06, "loss": 1.8398, "step": 38 }, { "Batch Mean": -0.13848888874053955, "accuracy": 0.796875, "epoch": 0.095, "step": 38 }, { "epoch": 0.0975, "grad_norm": 14.592827796936035, "learning_rate": 2.85e-06, "loss": 1.7161, "step": 39 }, { "Batch Mean": 0.24413752555847168, "accuracy": 0.734375, "epoch": 0.0975, "step": 39 }, { "epoch": 0.1, "grad_norm": 24.573457717895508, "learning_rate": 2.8421052631578946e-06, "loss": 1.843, "step": 40 }, { "Batch Mean": 0.2594280242919922, "accuracy": 0.78125, "epoch": 0.1, "step": 40 }, { "epoch": 0.1025, "grad_norm": 22.92070960998535, "learning_rate": 2.8342105263157897e-06, "loss": 1.8117, "step": 41 }, { "Batch Mean": 0.24114465713500977, "accuracy": 0.7109375, "epoch": 0.1025, "step": 41 }, { "epoch": 0.105, "grad_norm": 24.361190795898438, "learning_rate": 2.8263157894736845e-06, "loss": 1.858, "step": 42 }, { "Batch Mean": -0.2355879247188568, "accuracy": 0.734375, "epoch": 0.105, "step": 42 }, { "epoch": 0.1075, "grad_norm": 22.678878784179688, "learning_rate": 2.8184210526315792e-06, "loss": 1.746, "step": 43 }, { "Batch Mean": -0.3663749694824219, "accuracy": 0.75, "epoch": 0.1075, "step": 43 }, { "epoch": 0.11, "grad_norm": 32.974552154541016, "learning_rate": 2.810526315789474e-06, "loss": 1.8107, "step": 44 }, { "Batch Mean": -0.12125219404697418, "accuracy": 0.6796875, "epoch": 0.11, "step": 44 }, { "epoch": 0.1125, "grad_norm": 15.224480628967285, "learning_rate": 2.8026315789473687e-06, "loss": 1.8054, "step": 45 }, { "Batch Mean": 0.12122505903244019, "accuracy": 0.8046875, "epoch": 0.1125, "step": 45 }, { "epoch": 0.115, "grad_norm": 15.367413520812988, "learning_rate": 2.7947368421052635e-06, "loss": 1.6621, "step": 46 }, { "Batch Mean": 0.5257976055145264, "accuracy": 0.6875, "epoch": 0.115, "step": 46 }, { "epoch": 0.1175, "grad_norm": 46.61113739013672, "learning_rate": 2.7868421052631578e-06, "loss": 1.8809, "step": 47 }, { "Batch Mean": 0.26863813400268555, "accuracy": 0.8046875, "epoch": 0.1175, "step": 47 }, { "epoch": 0.12, "grad_norm": 25.281469345092773, "learning_rate": 2.7789473684210525e-06, "loss": 1.6595, "step": 48 }, { "Batch Mean": -0.052232980728149414, "accuracy": 0.7265625, "epoch": 0.12, "step": 48 }, { "epoch": 0.1225, "grad_norm": 14.708171844482422, "learning_rate": 2.7710526315789473e-06, "loss": 1.8164, "step": 49 }, { "Batch Mean": -0.44012928009033203, "accuracy": 0.8046875, "epoch": 0.1225, "step": 49 }, { "epoch": 0.125, "grad_norm": 37.772132873535156, "learning_rate": 2.763157894736842e-06, "loss": 1.7354, "step": 50 }, { "Batch Mean": -0.3808678388595581, "accuracy": 0.7578125, "epoch": 0.125, "step": 50 }, { "epoch": 0.1275, "grad_norm": 33.93559265136719, "learning_rate": 2.7552631578947368e-06, "loss": 1.7865, "step": 51 }, { "Batch Mean": -0.16808462142944336, "accuracy": 0.796875, "epoch": 0.1275, "step": 51 }, { "epoch": 0.13, "grad_norm": 18.351085662841797, "learning_rate": 2.7473684210526315e-06, "loss": 1.637, "step": 52 }, { "Batch Mean": 0.4238893985748291, "accuracy": 0.75, "epoch": 0.13, "step": 52 }, { "epoch": 0.1325, "grad_norm": 39.48286056518555, "learning_rate": 2.7394736842105263e-06, "loss": 1.7314, "step": 53 }, { "Batch Mean": 0.6634122133255005, "accuracy": 0.734375, "epoch": 0.1325, "step": 53 }, { "epoch": 0.135, "grad_norm": 58.41835021972656, "learning_rate": 2.7315789473684214e-06, "loss": 1.8783, "step": 54 }, { "Batch Mean": 0.38930320739746094, "accuracy": 0.7734375, "epoch": 0.135, "step": 54 }, { "epoch": 0.1375, "grad_norm": 37.195316314697266, "learning_rate": 2.723684210526316e-06, "loss": 1.707, "step": 55 }, { "Batch Mean": 0.054001301527023315, "accuracy": 0.7734375, "epoch": 0.1375, "step": 55 }, { "epoch": 0.14, "grad_norm": 17.124969482421875, "learning_rate": 2.715789473684211e-06, "loss": 1.86, "step": 56 }, { "Batch Mean": -0.6273295879364014, "accuracy": 0.765625, "epoch": 0.14, "step": 56 }, { "epoch": 0.1425, "grad_norm": 51.50697708129883, "learning_rate": 2.7078947368421052e-06, "loss": 1.7128, "step": 57 }, { "Batch Mean": -0.939124345779419, "accuracy": 0.7578125, "epoch": 0.1425, "step": 57 }, { "epoch": 0.145, "grad_norm": 76.270263671875, "learning_rate": 2.7e-06, "loss": 1.8785, "step": 58 }, { "Batch Mean": -0.724966287612915, "accuracy": 0.7890625, "epoch": 0.145, "step": 58 }, { "epoch": 0.1475, "grad_norm": 58.816280364990234, "learning_rate": 2.6921052631578947e-06, "loss": 1.7561, "step": 59 }, { "Batch Mean": -0.3745152950286865, "accuracy": 0.78125, "epoch": 0.1475, "step": 59 }, { "epoch": 0.15, "grad_norm": 30.07851791381836, "learning_rate": 2.6842105263157895e-06, "loss": 1.7832, "step": 60 }, { "Batch Mean": 0.20729660987854004, "accuracy": 0.7734375, "epoch": 0.15, "step": 60 }, { "epoch": 0.1525, "grad_norm": 22.731454849243164, "learning_rate": 2.6763157894736842e-06, "loss": 1.6978, "step": 61 }, { "Batch Mean": 0.5101933479309082, "accuracy": 0.78125, "epoch": 0.1525, "step": 61 }, { "epoch": 0.155, "grad_norm": 43.194828033447266, "learning_rate": 2.668421052631579e-06, "loss": 1.6722, "step": 62 }, { "Batch Mean": 0.4944732189178467, "accuracy": 0.75, "epoch": 0.155, "step": 62 }, { "epoch": 0.1575, "grad_norm": 40.437767028808594, "learning_rate": 2.6605263157894737e-06, "loss": 1.7644, "step": 63 }, { "Batch Mean": 0.2801190912723541, "accuracy": 0.7890625, "epoch": 0.1575, "step": 63 }, { "epoch": 0.16, "grad_norm": 25.225372314453125, "learning_rate": 2.6526315789473685e-06, "loss": 1.6741, "step": 64 }, { "Batch Mean": -0.27378302812576294, "accuracy": 0.796875, "epoch": 0.16, "step": 64 }, { "epoch": 0.1625, "grad_norm": 22.944217681884766, "learning_rate": 2.644736842105263e-06, "loss": 1.6181, "step": 65 }, { "Batch Mean": -0.401081919670105, "accuracy": 0.7734375, "epoch": 0.1625, "step": 65 }, { "epoch": 0.165, "grad_norm": 31.199247360229492, "learning_rate": 2.636842105263158e-06, "loss": 1.7014, "step": 66 }, { "Batch Mean": -0.34598207473754883, "accuracy": 0.796875, "epoch": 0.165, "step": 66 }, { "epoch": 0.1675, "grad_norm": 27.971452713012695, "learning_rate": 2.6289473684210527e-06, "loss": 1.6905, "step": 67 }, { "Batch Mean": -0.15681158006191254, "accuracy": 0.7890625, "epoch": 0.1675, "step": 67 }, { "epoch": 0.17, "grad_norm": 14.416444778442383, "learning_rate": 2.6210526315789474e-06, "loss": 1.7231, "step": 68 }, { "Batch Mean": 0.2771502137184143, "accuracy": 0.7421875, "epoch": 0.17, "step": 68 }, { "epoch": 0.1725, "grad_norm": 25.049633026123047, "learning_rate": 2.613157894736842e-06, "loss": 1.8426, "step": 69 }, { "Batch Mean": 0.49720096588134766, "accuracy": 0.71875, "epoch": 0.1725, "step": 69 }, { "epoch": 0.175, "grad_norm": 39.608943939208984, "learning_rate": 2.605263157894737e-06, "loss": 1.7503, "step": 70 }, { "Batch Mean": 0.30828404426574707, "accuracy": 0.7734375, "epoch": 0.175, "step": 70 }, { "epoch": 0.1775, "grad_norm": 26.048912048339844, "learning_rate": 2.5973684210526317e-06, "loss": 1.7175, "step": 71 }, { "Batch Mean": -0.031830430030822754, "accuracy": 0.84375, "epoch": 0.1775, "step": 71 }, { "epoch": 0.18, "grad_norm": 11.506328582763672, "learning_rate": 2.5894736842105264e-06, "loss": 1.616, "step": 72 }, { "Batch Mean": -0.2989373207092285, "accuracy": 0.8828125, "epoch": 0.18, "step": 72 }, { "epoch": 0.1825, "grad_norm": 25.800830841064453, "learning_rate": 2.581578947368421e-06, "loss": 1.4934, "step": 73 }, { "Batch Mean": -0.22713744640350342, "accuracy": 0.7421875, "epoch": 0.1825, "step": 73 }, { "epoch": 0.185, "grad_norm": 20.301925659179688, "learning_rate": 2.573684210526316e-06, "loss": 1.8363, "step": 74 }, { "Batch Mean": -0.1990160048007965, "accuracy": 0.78125, "epoch": 0.185, "step": 74 }, { "epoch": 0.1875, "grad_norm": 21.120750427246094, "learning_rate": 2.5657894736842107e-06, "loss": 1.5762, "step": 75 }, { "Batch Mean": 0.12756872177124023, "accuracy": 0.75, "epoch": 0.1875, "step": 75 }, { "epoch": 0.19, "grad_norm": 16.904523849487305, "learning_rate": 2.5578947368421054e-06, "loss": 1.8199, "step": 76 }, { "Batch Mean": 0.3172837495803833, "accuracy": 0.7734375, "epoch": 0.19, "step": 76 }, { "epoch": 0.1925, "grad_norm": 24.2791690826416, "learning_rate": 2.55e-06, "loss": 1.6359, "step": 77 }, { "Batch Mean": 0.2543300986289978, "accuracy": 0.71875, "epoch": 0.1925, "step": 77 }, { "epoch": 0.195, "grad_norm": 21.98880958557129, "learning_rate": 2.542105263157895e-06, "loss": 1.7617, "step": 78 }, { "Batch Mean": -0.14586102962493896, "accuracy": 0.8203125, "epoch": 0.195, "step": 78 }, { "epoch": 0.1975, "grad_norm": 16.464216232299805, "learning_rate": 2.5342105263157892e-06, "loss": 1.6585, "step": 79 }, { "Batch Mean": -0.0921928882598877, "accuracy": 0.8046875, "epoch": 0.1975, "step": 79 }, { "epoch": 0.2, "grad_norm": 15.101926803588867, "learning_rate": 2.526315789473684e-06, "loss": 1.6269, "step": 80 }, { "Batch Mean": 0.1173478364944458, "accuracy": 0.78125, "epoch": 0.2, "step": 80 }, { "epoch": 0.2025, "grad_norm": 16.049543380737305, "learning_rate": 2.5184210526315787e-06, "loss": 1.7549, "step": 81 }, { "Batch Mean": -0.0017396211624145508, "accuracy": 0.796875, "epoch": 0.2025, "step": 81 }, { "epoch": 0.205, "grad_norm": 12.137611389160156, "learning_rate": 2.510526315789474e-06, "loss": 1.7105, "step": 82 }, { "Batch Mean": 0.023346424102783203, "accuracy": 0.78125, "epoch": 0.205, "step": 82 }, { "epoch": 0.2075, "grad_norm": 12.019243240356445, "learning_rate": 2.5026315789473686e-06, "loss": 1.7107, "step": 83 }, { "Batch Mean": -0.2651742398738861, "accuracy": 0.765625, "epoch": 0.2075, "step": 83 }, { "epoch": 0.21, "grad_norm": 23.745969772338867, "learning_rate": 2.4947368421052634e-06, "loss": 1.732, "step": 84 }, { "Batch Mean": 0.19950759410858154, "accuracy": 0.7890625, "epoch": 0.21, "step": 84 }, { "epoch": 0.2125, "grad_norm": 18.219072341918945, "learning_rate": 2.486842105263158e-06, "loss": 1.6071, "step": 85 }, { "Batch Mean": 0.034912109375, "accuracy": 0.796875, "epoch": 0.2125, "step": 85 }, { "epoch": 0.215, "grad_norm": 11.669795989990234, "learning_rate": 2.478947368421053e-06, "loss": 1.6051, "step": 86 }, { "Batch Mean": -0.03407713770866394, "accuracy": 0.703125, "epoch": 0.215, "step": 86 }, { "epoch": 0.2175, "grad_norm": 12.774690628051758, "learning_rate": 2.4710526315789476e-06, "loss": 1.6696, "step": 87 }, { "Batch Mean": 0.08496832847595215, "accuracy": 0.796875, "epoch": 0.2175, "step": 87 }, { "epoch": 0.22, "grad_norm": 14.214601516723633, "learning_rate": 2.4631578947368424e-06, "loss": 1.6722, "step": 88 }, { "Batch Mean": -0.08868759870529175, "accuracy": 0.7421875, "epoch": 0.22, "step": 88 }, { "epoch": 0.2225, "grad_norm": 14.382110595703125, "learning_rate": 2.4552631578947367e-06, "loss": 1.7245, "step": 89 }, { "Batch Mean": -0.13518363237380981, "accuracy": 0.7890625, "epoch": 0.2225, "step": 89 }, { "epoch": 0.225, "grad_norm": 17.522390365600586, "learning_rate": 2.4473684210526314e-06, "loss": 1.6425, "step": 90 }, { "Batch Mean": 0.09633910655975342, "accuracy": 0.828125, "epoch": 0.225, "step": 90 }, { "epoch": 0.2275, "grad_norm": 14.231717109680176, "learning_rate": 2.439473684210526e-06, "loss": 1.5357, "step": 91 }, { "Batch Mean": -0.008280754089355469, "accuracy": 0.8203125, "epoch": 0.2275, "step": 91 }, { "epoch": 0.23, "grad_norm": 12.2304048538208, "learning_rate": 2.431578947368421e-06, "loss": 1.5443, "step": 92 }, { "Batch Mean": 0.059021830558776855, "accuracy": 0.7421875, "epoch": 0.23, "step": 92 }, { "epoch": 0.2325, "grad_norm": 14.887194633483887, "learning_rate": 2.4236842105263157e-06, "loss": 1.6318, "step": 93 }, { "Batch Mean": -0.06925773620605469, "accuracy": 0.7734375, "epoch": 0.2325, "step": 93 }, { "epoch": 0.235, "grad_norm": 14.755175590515137, "learning_rate": 2.4157894736842104e-06, "loss": 1.6039, "step": 94 }, { "Batch Mean": -0.05870896577835083, "accuracy": 0.796875, "epoch": 0.235, "step": 94 }, { "epoch": 0.2375, "grad_norm": 14.340346336364746, "learning_rate": 2.4078947368421056e-06, "loss": 1.5973, "step": 95 }, { "Batch Mean": 0.07913792133331299, "accuracy": 0.8203125, "epoch": 0.2375, "step": 95 }, { "epoch": 0.24, "grad_norm": 18.1204891204834, "learning_rate": 2.4000000000000003e-06, "loss": 1.6845, "step": 96 }, { "Batch Mean": -0.06982064247131348, "accuracy": 0.8203125, "epoch": 0.24, "step": 96 }, { "epoch": 0.2425, "grad_norm": 15.789064407348633, "learning_rate": 2.392105263157895e-06, "loss": 1.5769, "step": 97 }, { "Batch Mean": -0.13198983669281006, "accuracy": 0.84375, "epoch": 0.2425, "step": 97 }, { "epoch": 0.245, "grad_norm": 17.205394744873047, "learning_rate": 2.38421052631579e-06, "loss": 1.5754, "step": 98 }, { "Batch Mean": 0.029864728450775146, "accuracy": 0.7265625, "epoch": 0.245, "step": 98 }, { "epoch": 0.2475, "grad_norm": 14.032808303833008, "learning_rate": 2.376315789473684e-06, "loss": 1.6284, "step": 99 }, { "Batch Mean": 0.21759963035583496, "accuracy": 0.796875, "epoch": 0.2475, "step": 99 }, { "epoch": 0.25, "grad_norm": 23.904977798461914, "learning_rate": 2.368421052631579e-06, "loss": 1.5753, "step": 100 } ], "logging_steps": 1, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }