{ "best_metric": 1.161059856414795, "best_model_checkpoint": "miner_id_besimray/checkpoint-225", "epoch": 1.263157894736842, "eval_steps": 5, "global_step": 240, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005263157894736842, "grad_norm": 0.7649895548820496, "learning_rate": 2.0000000000000003e-06, "loss": 1.3316, "step": 1 }, { "epoch": 0.005263157894736842, "eval_loss": 1.2585704326629639, "eval_runtime": 22.246, "eval_samples_per_second": 4.495, "eval_steps_per_second": 0.45, "step": 1 }, { "epoch": 0.010526315789473684, "grad_norm": 0.5886185169219971, "learning_rate": 4.000000000000001e-06, "loss": 1.3575, "step": 2 }, { "epoch": 0.015789473684210527, "grad_norm": 0.7825772762298584, "learning_rate": 6e-06, "loss": 1.2322, "step": 3 }, { "epoch": 0.021052631578947368, "grad_norm": 0.8063048720359802, "learning_rate": 8.000000000000001e-06, "loss": 1.2845, "step": 4 }, { "epoch": 0.02631578947368421, "grad_norm": 0.7027026414871216, "learning_rate": 1e-05, "loss": 1.1351, "step": 5 }, { "epoch": 0.02631578947368421, "eval_loss": 1.2596009969711304, "eval_runtime": 22.3875, "eval_samples_per_second": 4.467, "eval_steps_per_second": 0.447, "step": 5 }, { "epoch": 0.031578947368421054, "grad_norm": 0.5887891054153442, "learning_rate": 1.2e-05, "loss": 1.3178, "step": 6 }, { "epoch": 0.03684210526315789, "grad_norm": 0.7495357990264893, "learning_rate": 1.4e-05, "loss": 1.601, "step": 7 }, { "epoch": 0.042105263157894736, "grad_norm": 0.624264657497406, "learning_rate": 1.6000000000000003e-05, "loss": 1.7585, "step": 8 }, { "epoch": 0.04736842105263158, "grad_norm": 0.7106643319129944, "learning_rate": 1.8e-05, "loss": 1.229, "step": 9 }, { "epoch": 0.05263157894736842, "grad_norm": 0.6951448321342468, "learning_rate": 2e-05, "loss": 1.2604, "step": 10 }, { "epoch": 0.05263157894736842, "eval_loss": 1.256644606590271, "eval_runtime": 22.4707, "eval_samples_per_second": 4.45, "eval_steps_per_second": 0.445, "step": 10 }, { "epoch": 0.05789473684210526, "grad_norm": 0.5871052742004395, "learning_rate": 1.999994415122672e-05, "loss": 1.2751, "step": 11 }, { "epoch": 0.06315789473684211, "grad_norm": 0.631374180316925, "learning_rate": 1.9999776605530693e-05, "loss": 1.3235, "step": 12 }, { "epoch": 0.06842105263157895, "grad_norm": 0.7013016939163208, "learning_rate": 1.9999497364783364e-05, "loss": 1.3739, "step": 13 }, { "epoch": 0.07368421052631578, "grad_norm": 0.5672230124473572, "learning_rate": 1.9999106432103785e-05, "loss": 1.3509, "step": 14 }, { "epoch": 0.07894736842105263, "grad_norm": 0.7572811841964722, "learning_rate": 1.9998603811858573e-05, "loss": 1.5396, "step": 15 }, { "epoch": 0.07894736842105263, "eval_loss": 1.2453573942184448, "eval_runtime": 22.4924, "eval_samples_per_second": 4.446, "eval_steps_per_second": 0.445, "step": 15 }, { "epoch": 0.08421052631578947, "grad_norm": 0.8022782802581787, "learning_rate": 1.999798950966188e-05, "loss": 1.422, "step": 16 }, { "epoch": 0.08947368421052632, "grad_norm": 0.48137226700782776, "learning_rate": 1.9997263532375303e-05, "loss": 0.9486, "step": 17 }, { "epoch": 0.09473684210526316, "grad_norm": 0.5582326650619507, "learning_rate": 1.999642588810784e-05, "loss": 1.2432, "step": 18 }, { "epoch": 0.1, "grad_norm": 0.561917245388031, "learning_rate": 1.9995476586215764e-05, "loss": 1.2858, "step": 19 }, { "epoch": 0.10526315789473684, "grad_norm": 0.80231112241745, "learning_rate": 1.9994415637302545e-05, "loss": 1.4895, "step": 20 }, { "epoch": 0.10526315789473684, "eval_loss": 1.2336317300796509, "eval_runtime": 22.5114, "eval_samples_per_second": 4.442, "eval_steps_per_second": 0.444, "step": 20 }, { "epoch": 0.11052631578947368, "grad_norm": 0.9142282009124756, "learning_rate": 1.999324305321873e-05, "loss": 1.5426, "step": 21 }, { "epoch": 0.11578947368421053, "grad_norm": 0.6059389114379883, "learning_rate": 1.9991958847061786e-05, "loss": 1.357, "step": 22 }, { "epoch": 0.12105263157894737, "grad_norm": 0.700927197933197, "learning_rate": 1.9990563033175984e-05, "loss": 1.1997, "step": 23 }, { "epoch": 0.12631578947368421, "grad_norm": 0.6965627074241638, "learning_rate": 1.9989055627152222e-05, "loss": 1.4878, "step": 24 }, { "epoch": 0.13157894736842105, "grad_norm": 1.1500164270401, "learning_rate": 1.998743664582786e-05, "loss": 1.1625, "step": 25 }, { "epoch": 0.13157894736842105, "eval_loss": 1.2236310243606567, "eval_runtime": 22.5023, "eval_samples_per_second": 4.444, "eval_steps_per_second": 0.444, "step": 25 }, { "epoch": 0.1368421052631579, "grad_norm": 0.5978737473487854, "learning_rate": 1.9985706107286515e-05, "loss": 1.4717, "step": 26 }, { "epoch": 0.14210526315789473, "grad_norm": 0.581161618232727, "learning_rate": 1.9983864030857883e-05, "loss": 1.3742, "step": 27 }, { "epoch": 0.14736842105263157, "grad_norm": 0.6772504448890686, "learning_rate": 1.9981910437117502e-05, "loss": 1.3352, "step": 28 }, { "epoch": 0.15263157894736842, "grad_norm": 0.4610370397567749, "learning_rate": 1.9979845347886543e-05, "loss": 1.1699, "step": 29 }, { "epoch": 0.15789473684210525, "grad_norm": 0.6659037470817566, "learning_rate": 1.9977668786231536e-05, "loss": 1.3554, "step": 30 }, { "epoch": 0.15789473684210525, "eval_loss": 1.2150318622589111, "eval_runtime": 22.5146, "eval_samples_per_second": 4.442, "eval_steps_per_second": 0.444, "step": 30 }, { "epoch": 0.1631578947368421, "grad_norm": 0.54472416639328, "learning_rate": 1.9975380776464143e-05, "loss": 1.104, "step": 31 }, { "epoch": 0.16842105263157894, "grad_norm": 0.5897062420845032, "learning_rate": 1.9972981344140875e-05, "loss": 1.2643, "step": 32 }, { "epoch": 0.1736842105263158, "grad_norm": 0.6215894222259521, "learning_rate": 1.99704705160628e-05, "loss": 1.06, "step": 33 }, { "epoch": 0.17894736842105263, "grad_norm": 0.5205391645431519, "learning_rate": 1.9967848320275253e-05, "loss": 1.2555, "step": 34 }, { "epoch": 0.18421052631578946, "grad_norm": 0.7713693380355835, "learning_rate": 1.9965114786067515e-05, "loss": 1.3275, "step": 35 }, { "epoch": 0.18421052631578946, "eval_loss": 1.210020899772644, "eval_runtime": 22.4964, "eval_samples_per_second": 4.445, "eval_steps_per_second": 0.445, "step": 35 }, { "epoch": 0.18947368421052632, "grad_norm": 0.669132649898529, "learning_rate": 1.99622699439725e-05, "loss": 1.1772, "step": 36 }, { "epoch": 0.19473684210526315, "grad_norm": 0.48863786458969116, "learning_rate": 1.9959313825766385e-05, "loss": 1.277, "step": 37 }, { "epoch": 0.2, "grad_norm": 0.5156219601631165, "learning_rate": 1.9956246464468294e-05, "loss": 1.6873, "step": 38 }, { "epoch": 0.20526315789473684, "grad_norm": 0.6323127150535583, "learning_rate": 1.9953067894339897e-05, "loss": 1.3801, "step": 39 }, { "epoch": 0.21052631578947367, "grad_norm": 0.5111508965492249, "learning_rate": 1.994977815088504e-05, "loss": 1.1912, "step": 40 }, { "epoch": 0.21052631578947367, "eval_loss": 1.205823302268982, "eval_runtime": 22.5125, "eval_samples_per_second": 4.442, "eval_steps_per_second": 0.444, "step": 40 }, { "epoch": 0.21578947368421053, "grad_norm": 0.6412497162818909, "learning_rate": 1.994637727084936e-05, "loss": 1.0898, "step": 41 }, { "epoch": 0.22105263157894736, "grad_norm": 0.48939433693885803, "learning_rate": 1.9942865292219837e-05, "loss": 1.0148, "step": 42 }, { "epoch": 0.22631578947368422, "grad_norm": 0.5883517861366272, "learning_rate": 1.9939242254224422e-05, "loss": 1.4908, "step": 43 }, { "epoch": 0.23157894736842105, "grad_norm": 0.590800404548645, "learning_rate": 1.9935508197331556e-05, "loss": 1.1897, "step": 44 }, { "epoch": 0.23684210526315788, "grad_norm": 0.5026158094406128, "learning_rate": 1.9931663163249744e-05, "loss": 1.2335, "step": 45 }, { "epoch": 0.23684210526315788, "eval_loss": 1.2029670476913452, "eval_runtime": 22.4943, "eval_samples_per_second": 4.446, "eval_steps_per_second": 0.445, "step": 45 }, { "epoch": 0.24210526315789474, "grad_norm": 0.7319494485855103, "learning_rate": 1.9927707194927067e-05, "loss": 1.4356, "step": 46 }, { "epoch": 0.24736842105263157, "grad_norm": 0.4193449020385742, "learning_rate": 1.992364033655072e-05, "loss": 1.1428, "step": 47 }, { "epoch": 0.25263157894736843, "grad_norm": 0.4515672028064728, "learning_rate": 1.991946263354652e-05, "loss": 1.0943, "step": 48 }, { "epoch": 0.2578947368421053, "grad_norm": 0.471021831035614, "learning_rate": 1.9915174132578377e-05, "loss": 1.0749, "step": 49 }, { "epoch": 0.2631578947368421, "grad_norm": 0.473336398601532, "learning_rate": 1.9910774881547803e-05, "loss": 1.0253, "step": 50 }, { "epoch": 0.2631578947368421, "eval_loss": 1.1978802680969238, "eval_runtime": 22.5092, "eval_samples_per_second": 4.443, "eval_steps_per_second": 0.444, "step": 50 }, { "epoch": 0.26842105263157895, "grad_norm": 0.4880618751049042, "learning_rate": 1.9906264929593348e-05, "loss": 1.188, "step": 51 }, { "epoch": 0.2736842105263158, "grad_norm": 0.5559892654418945, "learning_rate": 1.9901644327090063e-05, "loss": 1.0132, "step": 52 }, { "epoch": 0.2789473684210526, "grad_norm": 0.6128857135772705, "learning_rate": 1.9896913125648957e-05, "loss": 1.1513, "step": 53 }, { "epoch": 0.28421052631578947, "grad_norm": 0.4551655650138855, "learning_rate": 1.9892071378116378e-05, "loss": 1.0801, "step": 54 }, { "epoch": 0.2894736842105263, "grad_norm": 0.4341316223144531, "learning_rate": 1.9887119138573462e-05, "loss": 1.1242, "step": 55 }, { "epoch": 0.2894736842105263, "eval_loss": 1.1970165967941284, "eval_runtime": 22.5043, "eval_samples_per_second": 4.444, "eval_steps_per_second": 0.444, "step": 55 }, { "epoch": 0.29473684210526313, "grad_norm": 0.6209389567375183, "learning_rate": 1.9882056462335513e-05, "loss": 0.9911, "step": 56 }, { "epoch": 0.3, "grad_norm": 0.46495211124420166, "learning_rate": 1.9876883405951378e-05, "loss": 0.8705, "step": 57 }, { "epoch": 0.30526315789473685, "grad_norm": 0.5986958146095276, "learning_rate": 1.987160002720283e-05, "loss": 1.0298, "step": 58 }, { "epoch": 0.3105263157894737, "grad_norm": 0.43767863512039185, "learning_rate": 1.9866206385103915e-05, "loss": 1.1415, "step": 59 }, { "epoch": 0.3157894736842105, "grad_norm": 0.7036319971084595, "learning_rate": 1.9860702539900288e-05, "loss": 0.9963, "step": 60 }, { "epoch": 0.3157894736842105, "eval_loss": 1.1909849643707275, "eval_runtime": 22.5149, "eval_samples_per_second": 4.442, "eval_steps_per_second": 0.444, "step": 60 }, { "epoch": 0.32105263157894737, "grad_norm": 0.483674019575119, "learning_rate": 1.985508855306855e-05, "loss": 1.1012, "step": 61 }, { "epoch": 0.3263157894736842, "grad_norm": 0.48016050457954407, "learning_rate": 1.984936448731556e-05, "loss": 0.9743, "step": 62 }, { "epoch": 0.33157894736842103, "grad_norm": 0.4120512902736664, "learning_rate": 1.9843530406577725e-05, "loss": 1.2744, "step": 63 }, { "epoch": 0.3368421052631579, "grad_norm": 0.5034608244895935, "learning_rate": 1.9837586376020293e-05, "loss": 1.5509, "step": 64 }, { "epoch": 0.34210526315789475, "grad_norm": 0.46461227536201477, "learning_rate": 1.9831532462036634e-05, "loss": 1.0977, "step": 65 }, { "epoch": 0.34210526315789475, "eval_loss": 1.1918507814407349, "eval_runtime": 22.517, "eval_samples_per_second": 4.441, "eval_steps_per_second": 0.444, "step": 65 }, { "epoch": 0.3473684210526316, "grad_norm": 0.6687688231468201, "learning_rate": 1.982536873224748e-05, "loss": 1.4021, "step": 66 }, { "epoch": 0.3526315789473684, "grad_norm": 0.6348972916603088, "learning_rate": 1.9819095255500178e-05, "loss": 1.4068, "step": 67 }, { "epoch": 0.35789473684210527, "grad_norm": 0.5892879962921143, "learning_rate": 1.9812712101867923e-05, "loss": 1.271, "step": 68 }, { "epoch": 0.3631578947368421, "grad_norm": 0.49744340777397156, "learning_rate": 1.9806219342648977e-05, "loss": 1.2763, "step": 69 }, { "epoch": 0.3684210526315789, "grad_norm": 0.5569796562194824, "learning_rate": 1.979961705036587e-05, "loss": 1.1263, "step": 70 }, { "epoch": 0.3684210526315789, "eval_loss": 1.1880035400390625, "eval_runtime": 22.4976, "eval_samples_per_second": 4.445, "eval_steps_per_second": 0.444, "step": 70 }, { "epoch": 0.3736842105263158, "grad_norm": 0.4775034785270691, "learning_rate": 1.9792905298764582e-05, "loss": 1.1954, "step": 71 }, { "epoch": 0.37894736842105264, "grad_norm": 0.6287166476249695, "learning_rate": 1.9786084162813735e-05, "loss": 1.3243, "step": 72 }, { "epoch": 0.38421052631578945, "grad_norm": 0.7143110632896423, "learning_rate": 1.9779153718703746e-05, "loss": 1.2978, "step": 73 }, { "epoch": 0.3894736842105263, "grad_norm": 0.6126271486282349, "learning_rate": 1.9772114043845968e-05, "loss": 1.2755, "step": 74 }, { "epoch": 0.39473684210526316, "grad_norm": 0.592986524105072, "learning_rate": 1.9764965216871848e-05, "loss": 1.2144, "step": 75 }, { "epoch": 0.39473684210526316, "eval_loss": 1.185987114906311, "eval_runtime": 26.2117, "eval_samples_per_second": 3.815, "eval_steps_per_second": 0.382, "step": 75 }, { "epoch": 0.4, "grad_norm": 0.5967031717300415, "learning_rate": 1.975770731763203e-05, "loss": 1.3366, "step": 76 }, { "epoch": 0.4052631578947368, "grad_norm": 0.5800940990447998, "learning_rate": 1.9750340427195462e-05, "loss": 1.3066, "step": 77 }, { "epoch": 0.4105263157894737, "grad_norm": 0.6693890690803528, "learning_rate": 1.974286462784851e-05, "loss": 1.3971, "step": 78 }, { "epoch": 0.41578947368421054, "grad_norm": 0.6859252452850342, "learning_rate": 1.9735280003094014e-05, "loss": 1.4623, "step": 79 }, { "epoch": 0.42105263157894735, "grad_norm": 0.5931267142295837, "learning_rate": 1.9727586637650373e-05, "loss": 1.3055, "step": 80 }, { "epoch": 0.42105263157894735, "eval_loss": 1.1839442253112793, "eval_runtime": 22.9186, "eval_samples_per_second": 4.363, "eval_steps_per_second": 0.436, "step": 80 }, { "epoch": 0.4263157894736842, "grad_norm": 0.9936294555664062, "learning_rate": 1.9719784617450593e-05, "loss": 1.1704, "step": 81 }, { "epoch": 0.43157894736842106, "grad_norm": 0.48384058475494385, "learning_rate": 1.971187402964132e-05, "loss": 1.0459, "step": 82 }, { "epoch": 0.4368421052631579, "grad_norm": 0.6293362379074097, "learning_rate": 1.9703854962581886e-05, "loss": 1.2003, "step": 83 }, { "epoch": 0.4421052631578947, "grad_norm": 0.5654959082603455, "learning_rate": 1.9695727505843298e-05, "loss": 1.0349, "step": 84 }, { "epoch": 0.4473684210526316, "grad_norm": 0.4325632154941559, "learning_rate": 1.9687491750207255e-05, "loss": 1.1513, "step": 85 }, { "epoch": 0.4473684210526316, "eval_loss": 1.1818199157714844, "eval_runtime": 22.5259, "eval_samples_per_second": 4.439, "eval_steps_per_second": 0.444, "step": 85 }, { "epoch": 0.45263157894736844, "grad_norm": 0.6001327037811279, "learning_rate": 1.9679147787665128e-05, "loss": 1.1973, "step": 86 }, { "epoch": 0.45789473684210524, "grad_norm": 0.637558102607727, "learning_rate": 1.967069571141693e-05, "loss": 1.1293, "step": 87 }, { "epoch": 0.4631578947368421, "grad_norm": 0.49120739102363586, "learning_rate": 1.966213561587028e-05, "loss": 1.2718, "step": 88 }, { "epoch": 0.46842105263157896, "grad_norm": 0.750114917755127, "learning_rate": 1.965346759663934e-05, "loss": 1.2126, "step": 89 }, { "epoch": 0.47368421052631576, "grad_norm": 0.5382435917854309, "learning_rate": 1.964469175054377e-05, "loss": 1.0702, "step": 90 }, { "epoch": 0.47368421052631576, "eval_loss": 1.1818785667419434, "eval_runtime": 22.5099, "eval_samples_per_second": 4.442, "eval_steps_per_second": 0.444, "step": 90 }, { "epoch": 0.4789473684210526, "grad_norm": 0.662380039691925, "learning_rate": 1.9635808175607604e-05, "loss": 1.2796, "step": 91 }, { "epoch": 0.4842105263157895, "grad_norm": 0.6774739027023315, "learning_rate": 1.9626816971058205e-05, "loss": 1.435, "step": 92 }, { "epoch": 0.48947368421052634, "grad_norm": 0.5468034148216248, "learning_rate": 1.9617718237325116e-05, "loss": 1.0335, "step": 93 }, { "epoch": 0.49473684210526314, "grad_norm": 1.4940450191497803, "learning_rate": 1.9608512076038964e-05, "loss": 1.4274, "step": 94 }, { "epoch": 0.5, "grad_norm": 0.45356297492980957, "learning_rate": 1.9599198590030308e-05, "loss": 1.2561, "step": 95 }, { "epoch": 0.5, "eval_loss": 1.1796730756759644, "eval_runtime": 26.2149, "eval_samples_per_second": 3.815, "eval_steps_per_second": 0.381, "step": 95 }, { "epoch": 0.5052631578947369, "grad_norm": 0.7565329670906067, "learning_rate": 1.9589777883328506e-05, "loss": 1.2047, "step": 96 }, { "epoch": 0.5105263157894737, "grad_norm": 0.7254753112792969, "learning_rate": 1.958025006116054e-05, "loss": 1.7007, "step": 97 }, { "epoch": 0.5157894736842106, "grad_norm": 0.47857415676116943, "learning_rate": 1.9570615229949844e-05, "loss": 0.8284, "step": 98 }, { "epoch": 0.5210526315789473, "grad_norm": 0.5894288420677185, "learning_rate": 1.9560873497315118e-05, "loss": 1.18, "step": 99 }, { "epoch": 0.5263157894736842, "grad_norm": 0.6634359359741211, "learning_rate": 1.9551024972069127e-05, "loss": 1.1373, "step": 100 }, { "epoch": 0.5263157894736842, "eval_loss": 1.1775206327438354, "eval_runtime": 26.6039, "eval_samples_per_second": 3.759, "eval_steps_per_second": 0.376, "step": 100 }, { "epoch": 0.531578947368421, "grad_norm": 0.8511372208595276, "learning_rate": 1.954106976421748e-05, "loss": 1.3198, "step": 101 }, { "epoch": 0.5368421052631579, "grad_norm": 0.6378451585769653, "learning_rate": 1.9531007984957408e-05, "loss": 1.3432, "step": 102 }, { "epoch": 0.5421052631578948, "grad_norm": 0.7719001770019531, "learning_rate": 1.9520839746676522e-05, "loss": 1.1474, "step": 103 }, { "epoch": 0.5473684210526316, "grad_norm": 0.7478865385055542, "learning_rate": 1.9510565162951538e-05, "loss": 1.1988, "step": 104 }, { "epoch": 0.5526315789473685, "grad_norm": 0.6055854558944702, "learning_rate": 1.950018434854704e-05, "loss": 1.2136, "step": 105 }, { "epoch": 0.5526315789473685, "eval_loss": 1.1779677867889404, "eval_runtime": 22.8963, "eval_samples_per_second": 4.368, "eval_steps_per_second": 0.437, "step": 105 }, { "epoch": 0.5578947368421052, "grad_norm": 1.0286527872085571, "learning_rate": 1.948969741941418e-05, "loss": 1.0934, "step": 106 }, { "epoch": 0.5631578947368421, "grad_norm": 0.5236276984214783, "learning_rate": 1.9479104492689384e-05, "loss": 1.2468, "step": 107 }, { "epoch": 0.5684210526315789, "grad_norm": 0.7151781320571899, "learning_rate": 1.9468405686693044e-05, "loss": 1.5067, "step": 108 }, { "epoch": 0.5736842105263158, "grad_norm": 0.5677098631858826, "learning_rate": 1.9457601120928194e-05, "loss": 1.3059, "step": 109 }, { "epoch": 0.5789473684210527, "grad_norm": 0.6112512946128845, "learning_rate": 1.944669091607919e-05, "loss": 1.3591, "step": 110 }, { "epoch": 0.5789473684210527, "eval_loss": 1.1770991086959839, "eval_runtime": 22.5378, "eval_samples_per_second": 4.437, "eval_steps_per_second": 0.444, "step": 110 }, { "epoch": 0.5842105263157895, "grad_norm": 0.8143637776374817, "learning_rate": 1.9435675194010338e-05, "loss": 1.1316, "step": 111 }, { "epoch": 0.5894736842105263, "grad_norm": 0.8077625036239624, "learning_rate": 1.9424554077764548e-05, "loss": 1.1663, "step": 112 }, { "epoch": 0.5947368421052631, "grad_norm": 0.7377018332481384, "learning_rate": 1.9413327691561967e-05, "loss": 1.4111, "step": 113 }, { "epoch": 0.6, "grad_norm": 0.5326575636863708, "learning_rate": 1.9401996160798574e-05, "loss": 1.2205, "step": 114 }, { "epoch": 0.6052631578947368, "grad_norm": 0.8595677018165588, "learning_rate": 1.9390559612044783e-05, "loss": 1.5703, "step": 115 }, { "epoch": 0.6052631578947368, "eval_loss": 1.174440622329712, "eval_runtime": 22.5072, "eval_samples_per_second": 4.443, "eval_steps_per_second": 0.444, "step": 115 }, { "epoch": 0.6105263157894737, "grad_norm": 0.6437440514564514, "learning_rate": 1.9379018173044038e-05, "loss": 1.1587, "step": 116 }, { "epoch": 0.6157894736842106, "grad_norm": 0.5231457948684692, "learning_rate": 1.9367371972711384e-05, "loss": 0.9803, "step": 117 }, { "epoch": 0.6210526315789474, "grad_norm": 0.6611769795417786, "learning_rate": 1.9355621141132022e-05, "loss": 1.3107, "step": 118 }, { "epoch": 0.6263157894736842, "grad_norm": 0.5940364003181458, "learning_rate": 1.9343765809559854e-05, "loss": 1.1463, "step": 119 }, { "epoch": 0.631578947368421, "grad_norm": 0.5217463374137878, "learning_rate": 1.9331806110416027e-05, "loss": 1.1601, "step": 120 }, { "epoch": 0.631578947368421, "eval_loss": 1.1753787994384766, "eval_runtime": 22.5001, "eval_samples_per_second": 4.444, "eval_steps_per_second": 0.444, "step": 120 }, { "epoch": 0.6368421052631579, "grad_norm": 0.5099672675132751, "learning_rate": 1.9319742177287447e-05, "loss": 1.0889, "step": 121 }, { "epoch": 0.6421052631578947, "grad_norm": 0.5484328866004944, "learning_rate": 1.9307574144925288e-05, "loss": 1.0682, "step": 122 }, { "epoch": 0.6473684210526316, "grad_norm": 0.5644655823707581, "learning_rate": 1.9295302149243482e-05, "loss": 1.1083, "step": 123 }, { "epoch": 0.6526315789473685, "grad_norm": 0.8160917162895203, "learning_rate": 1.9282926327317213e-05, "loss": 1.1152, "step": 124 }, { "epoch": 0.6578947368421053, "grad_norm": 0.6236352324485779, "learning_rate": 1.9270446817381377e-05, "loss": 1.1412, "step": 125 }, { "epoch": 0.6578947368421053, "eval_loss": 1.174829363822937, "eval_runtime": 22.5164, "eval_samples_per_second": 4.441, "eval_steps_per_second": 0.444, "step": 125 }, { "epoch": 0.6631578947368421, "grad_norm": 0.6855475306510925, "learning_rate": 1.9257863758829038e-05, "loss": 1.1203, "step": 126 }, { "epoch": 0.6684210526315789, "grad_norm": 0.6221477389335632, "learning_rate": 1.9245177292209867e-05, "loss": 1.099, "step": 127 }, { "epoch": 0.6736842105263158, "grad_norm": 0.5227069854736328, "learning_rate": 1.9232387559228587e-05, "loss": 1.2765, "step": 128 }, { "epoch": 0.6789473684210526, "grad_norm": 0.6748736500740051, "learning_rate": 1.921949470274338e-05, "loss": 1.3664, "step": 129 }, { "epoch": 0.6842105263157895, "grad_norm": 0.598393976688385, "learning_rate": 1.920649886676429e-05, "loss": 1.1449, "step": 130 }, { "epoch": 0.6842105263157895, "eval_loss": 1.1731172800064087, "eval_runtime": 22.5238, "eval_samples_per_second": 4.44, "eval_steps_per_second": 0.444, "step": 130 }, { "epoch": 0.6894736842105263, "grad_norm": 0.6862363219261169, "learning_rate": 1.919340019645161e-05, "loss": 1.1826, "step": 131 }, { "epoch": 0.6947368421052632, "grad_norm": 0.6221817135810852, "learning_rate": 1.9180198838114284e-05, "loss": 1.2809, "step": 132 }, { "epoch": 0.7, "grad_norm": 0.7746241688728333, "learning_rate": 1.916689493920824e-05, "loss": 1.1165, "step": 133 }, { "epoch": 0.7052631578947368, "grad_norm": 1.4680792093276978, "learning_rate": 1.915348864833476e-05, "loss": 1.2015, "step": 134 }, { "epoch": 0.7105263157894737, "grad_norm": 0.6323035955429077, "learning_rate": 1.9139980115238827e-05, "loss": 1.1706, "step": 135 }, { "epoch": 0.7105263157894737, "eval_loss": 1.173556923866272, "eval_runtime": 26.2964, "eval_samples_per_second": 3.803, "eval_steps_per_second": 0.38, "step": 135 }, { "epoch": 0.7157894736842105, "grad_norm": 0.5653284192085266, "learning_rate": 1.912636949080745e-05, "loss": 1.3116, "step": 136 }, { "epoch": 0.7210526315789474, "grad_norm": 0.7396507859230042, "learning_rate": 1.9112656927067952e-05, "loss": 1.1869, "step": 137 }, { "epoch": 0.7263157894736842, "grad_norm": 0.9360893368721008, "learning_rate": 1.9098842577186315e-05, "loss": 1.2102, "step": 138 }, { "epoch": 0.7315789473684211, "grad_norm": 0.6051463484764099, "learning_rate": 1.908492659546543e-05, "loss": 1.0973, "step": 139 }, { "epoch": 0.7368421052631579, "grad_norm": 0.5625165700912476, "learning_rate": 1.907090913734341e-05, "loss": 1.0503, "step": 140 }, { "epoch": 0.7368421052631579, "eval_loss": 1.1730149984359741, "eval_runtime": 23.3522, "eval_samples_per_second": 4.282, "eval_steps_per_second": 0.428, "step": 140 }, { "epoch": 0.7421052631578947, "grad_norm": 0.7346214652061462, "learning_rate": 1.9056790359391813e-05, "loss": 1.2, "step": 141 }, { "epoch": 0.7473684210526316, "grad_norm": 0.5735582709312439, "learning_rate": 1.9042570419313927e-05, "loss": 1.2624, "step": 142 }, { "epoch": 0.7526315789473684, "grad_norm": 0.6319230794906616, "learning_rate": 1.902824947594299e-05, "loss": 1.219, "step": 143 }, { "epoch": 0.7578947368421053, "grad_norm": 0.5974968671798706, "learning_rate": 1.9013827689240434e-05, "loss": 1.3021, "step": 144 }, { "epoch": 0.7631578947368421, "grad_norm": 0.5415375232696533, "learning_rate": 1.899930522029408e-05, "loss": 1.1938, "step": 145 }, { "epoch": 0.7631578947368421, "eval_loss": 1.1729778051376343, "eval_runtime": 22.5298, "eval_samples_per_second": 4.439, "eval_steps_per_second": 0.444, "step": 145 }, { "epoch": 0.7684210526315789, "grad_norm": 0.5870479345321655, "learning_rate": 1.8984682231316335e-05, "loss": 1.1926, "step": 146 }, { "epoch": 0.7736842105263158, "grad_norm": 0.5426827669143677, "learning_rate": 1.8969958885642398e-05, "loss": 1.3281, "step": 147 }, { "epoch": 0.7789473684210526, "grad_norm": 0.7911564707756042, "learning_rate": 1.8955135347728434e-05, "loss": 1.4215, "step": 148 }, { "epoch": 0.7842105263157895, "grad_norm": 0.686764121055603, "learning_rate": 1.894021178314972e-05, "loss": 1.1371, "step": 149 }, { "epoch": 0.7894736842105263, "grad_norm": 0.7921651601791382, "learning_rate": 1.8925188358598815e-05, "loss": 1.4802, "step": 150 }, { "epoch": 0.7894736842105263, "eval_loss": 1.1709694862365723, "eval_runtime": 22.5036, "eval_samples_per_second": 4.444, "eval_steps_per_second": 0.444, "step": 150 }, { "epoch": 0.7947368421052632, "grad_norm": 0.7445337772369385, "learning_rate": 1.891006524188368e-05, "loss": 1.227, "step": 151 }, { "epoch": 0.8, "grad_norm": 0.6124373078346252, "learning_rate": 1.8894842601925823e-05, "loss": 1.2834, "step": 152 }, { "epoch": 0.8052631578947368, "grad_norm": 0.5827168822288513, "learning_rate": 1.8879520608758394e-05, "loss": 1.182, "step": 153 }, { "epoch": 0.8105263157894737, "grad_norm": 0.8173070549964905, "learning_rate": 1.8864099433524302e-05, "loss": 1.3835, "step": 154 }, { "epoch": 0.8157894736842105, "grad_norm": 0.6745061874389648, "learning_rate": 1.8848579248474286e-05, "loss": 1.1359, "step": 155 }, { "epoch": 0.8157894736842105, "eval_loss": 1.1687862873077393, "eval_runtime": 22.5242, "eval_samples_per_second": 4.44, "eval_steps_per_second": 0.444, "step": 155 }, { "epoch": 0.8210526315789474, "grad_norm": 0.6754941344261169, "learning_rate": 1.883296022696501e-05, "loss": 0.9308, "step": 156 }, { "epoch": 0.8263157894736842, "grad_norm": 0.723594605922699, "learning_rate": 1.881724254345711e-05, "loss": 1.3096, "step": 157 }, { "epoch": 0.8315789473684211, "grad_norm": 0.7102451324462891, "learning_rate": 1.880142637351325e-05, "loss": 1.0751, "step": 158 }, { "epoch": 0.8368421052631579, "grad_norm": 0.9131245613098145, "learning_rate": 1.8785511893796174e-05, "loss": 1.1138, "step": 159 }, { "epoch": 0.8421052631578947, "grad_norm": 0.7415843605995178, "learning_rate": 1.8769499282066716e-05, "loss": 1.3575, "step": 160 }, { "epoch": 0.8421052631578947, "eval_loss": 1.1709015369415283, "eval_runtime": 22.513, "eval_samples_per_second": 4.442, "eval_steps_per_second": 0.444, "step": 160 }, { "epoch": 0.8473684210526315, "grad_norm": 1.0621364116668701, "learning_rate": 1.875338871718182e-05, "loss": 1.1144, "step": 161 }, { "epoch": 0.8526315789473684, "grad_norm": 0.6710101366043091, "learning_rate": 1.8737180379092536e-05, "loss": 1.3185, "step": 162 }, { "epoch": 0.8578947368421053, "grad_norm": 0.7945305705070496, "learning_rate": 1.8720874448842035e-05, "loss": 1.0666, "step": 163 }, { "epoch": 0.8631578947368421, "grad_norm": 1.025848150253296, "learning_rate": 1.870447110856355e-05, "loss": 1.5478, "step": 164 }, { "epoch": 0.868421052631579, "grad_norm": 0.5534456372261047, "learning_rate": 1.8687970541478367e-05, "loss": 1.0188, "step": 165 }, { "epoch": 0.868421052631579, "eval_loss": 1.1685324907302856, "eval_runtime": 22.9215, "eval_samples_per_second": 4.363, "eval_steps_per_second": 0.436, "step": 165 }, { "epoch": 0.8736842105263158, "grad_norm": 0.814322829246521, "learning_rate": 1.8671372931893775e-05, "loss": 1.3719, "step": 166 }, { "epoch": 0.8789473684210526, "grad_norm": 0.6162807941436768, "learning_rate": 1.8654678465201e-05, "loss": 1.0076, "step": 167 }, { "epoch": 0.8842105263157894, "grad_norm": 0.6922213435173035, "learning_rate": 1.863788732787314e-05, "loss": 1.3598, "step": 168 }, { "epoch": 0.8894736842105263, "grad_norm": 0.6565741896629333, "learning_rate": 1.862099970746308e-05, "loss": 1.3049, "step": 169 }, { "epoch": 0.8947368421052632, "grad_norm": 1.109479546546936, "learning_rate": 1.8604015792601395e-05, "loss": 1.147, "step": 170 }, { "epoch": 0.8947368421052632, "eval_loss": 1.1683638095855713, "eval_runtime": 22.5639, "eval_samples_per_second": 4.432, "eval_steps_per_second": 0.443, "step": 170 }, { "epoch": 0.9, "grad_norm": 0.579093873500824, "learning_rate": 1.8586935772994247e-05, "loss": 0.9814, "step": 171 }, { "epoch": 0.9052631578947369, "grad_norm": 0.7083842158317566, "learning_rate": 1.8569759839421263e-05, "loss": 1.2772, "step": 172 }, { "epoch": 0.9105263157894737, "grad_norm": 0.7668213248252869, "learning_rate": 1.8552488183733413e-05, "loss": 1.2974, "step": 173 }, { "epoch": 0.9157894736842105, "grad_norm": 0.5876043438911438, "learning_rate": 1.853512099885085e-05, "loss": 1.1506, "step": 174 }, { "epoch": 0.9210526315789473, "grad_norm": 0.6190224289894104, "learning_rate": 1.851765847876076e-05, "loss": 0.9949, "step": 175 }, { "epoch": 0.9210526315789473, "eval_loss": 1.1667996644973755, "eval_runtime": 22.5526, "eval_samples_per_second": 4.434, "eval_steps_per_second": 0.443, "step": 175 }, { "epoch": 0.9263157894736842, "grad_norm": 0.8201778531074524, "learning_rate": 1.8500100818515224e-05, "loss": 1.1964, "step": 176 }, { "epoch": 0.9315789473684211, "grad_norm": 0.6571232080459595, "learning_rate": 1.848244821422899e-05, "loss": 1.127, "step": 177 }, { "epoch": 0.9368421052631579, "grad_norm": 0.74791419506073, "learning_rate": 1.8464700863077313e-05, "loss": 1.183, "step": 178 }, { "epoch": 0.9421052631578948, "grad_norm": 0.9551082253456116, "learning_rate": 1.8446858963293762e-05, "loss": 1.3346, "step": 179 }, { "epoch": 0.9473684210526315, "grad_norm": 0.6122288703918457, "learning_rate": 1.842892271416797e-05, "loss": 1.3082, "step": 180 }, { "epoch": 0.9473684210526315, "eval_loss": 1.1672919988632202, "eval_runtime": 22.5711, "eval_samples_per_second": 4.43, "eval_steps_per_second": 0.443, "step": 180 }, { "epoch": 0.9526315789473684, "grad_norm": 0.7107993960380554, "learning_rate": 1.841089231604345e-05, "loss": 1.1885, "step": 181 }, { "epoch": 0.9578947368421052, "grad_norm": 0.7170677185058594, "learning_rate": 1.8392767970315314e-05, "loss": 1.0256, "step": 182 }, { "epoch": 0.9631578947368421, "grad_norm": 0.6467927694320679, "learning_rate": 1.837454987942806e-05, "loss": 1.1658, "step": 183 }, { "epoch": 0.968421052631579, "grad_norm": 0.8207054138183594, "learning_rate": 1.8356238246873302e-05, "loss": 1.4187, "step": 184 }, { "epoch": 0.9736842105263158, "grad_norm": 0.6529420018196106, "learning_rate": 1.8337833277187472e-05, "loss": 1.1995, "step": 185 }, { "epoch": 0.9736842105263158, "eval_loss": 1.1654378175735474, "eval_runtime": 22.5235, "eval_samples_per_second": 4.44, "eval_steps_per_second": 0.444, "step": 185 }, { "epoch": 0.9789473684210527, "grad_norm": 0.7171172499656677, "learning_rate": 1.831933517594957e-05, "loss": 1.1462, "step": 186 }, { "epoch": 0.9842105263157894, "grad_norm": 0.7106444835662842, "learning_rate": 1.8300744149778853e-05, "loss": 0.9308, "step": 187 }, { "epoch": 0.9894736842105263, "grad_norm": 0.9578307867050171, "learning_rate": 1.8282060406332513e-05, "loss": 0.9275, "step": 188 }, { "epoch": 0.9947368421052631, "grad_norm": 0.7951686978340149, "learning_rate": 1.826328415430339e-05, "loss": 1.379, "step": 189 }, { "epoch": 1.0, "grad_norm": 0.5988521575927734, "learning_rate": 1.8244415603417603e-05, "loss": 1.2346, "step": 190 }, { "epoch": 1.0, "eval_loss": 1.1653565168380737, "eval_runtime": 22.5513, "eval_samples_per_second": 4.434, "eval_steps_per_second": 0.443, "step": 190 }, { "epoch": 1.0052631578947369, "grad_norm": 0.7035298943519592, "learning_rate": 1.8225454964432247e-05, "loss": 1.3235, "step": 191 }, { "epoch": 1.0105263157894737, "grad_norm": 0.6608556509017944, "learning_rate": 1.8206402449132997e-05, "loss": 1.4307, "step": 192 }, { "epoch": 1.0157894736842106, "grad_norm": 1.0675935745239258, "learning_rate": 1.818725827033178e-05, "loss": 1.1922, "step": 193 }, { "epoch": 1.0210526315789474, "grad_norm": 0.7653377652168274, "learning_rate": 1.816802264186438e-05, "loss": 1.1628, "step": 194 }, { "epoch": 1.0263157894736843, "grad_norm": 0.5536810755729675, "learning_rate": 1.8148695778588034e-05, "loss": 1.0948, "step": 195 }, { "epoch": 1.0263157894736843, "eval_loss": 1.1659743785858154, "eval_runtime": 22.5094, "eval_samples_per_second": 4.443, "eval_steps_per_second": 0.444, "step": 195 }, { "epoch": 1.0315789473684212, "grad_norm": 0.805934727191925, "learning_rate": 1.8129277896379077e-05, "loss": 1.1665, "step": 196 }, { "epoch": 1.0368421052631578, "grad_norm": 0.6778495907783508, "learning_rate": 1.8109769212130487e-05, "loss": 1.1583, "step": 197 }, { "epoch": 1.0421052631578946, "grad_norm": 0.5672218799591064, "learning_rate": 1.8090169943749477e-05, "loss": 1.1316, "step": 198 }, { "epoch": 1.0473684210526315, "grad_norm": 0.8666687607765198, "learning_rate": 1.8070480310155064e-05, "loss": 1.2402, "step": 199 }, { "epoch": 1.0526315789473684, "grad_norm": 1.4576120376586914, "learning_rate": 1.8050700531275632e-05, "loss": 1.3838, "step": 200 }, { "epoch": 1.0526315789473684, "eval_loss": 1.1642940044403076, "eval_runtime": 22.5116, "eval_samples_per_second": 4.442, "eval_steps_per_second": 0.444, "step": 200 }, { "epoch": 1.0578947368421052, "grad_norm": 0.8853726387023926, "learning_rate": 1.8030830828046452e-05, "loss": 1.2938, "step": 201 }, { "epoch": 1.063157894736842, "grad_norm": 0.6276920437812805, "learning_rate": 1.8010871422407238e-05, "loss": 1.1433, "step": 202 }, { "epoch": 1.068421052631579, "grad_norm": 0.8250148892402649, "learning_rate": 1.7990822537299647e-05, "loss": 1.2107, "step": 203 }, { "epoch": 1.0736842105263158, "grad_norm": 0.9768710732460022, "learning_rate": 1.7970684396664814e-05, "loss": 1.1804, "step": 204 }, { "epoch": 1.0789473684210527, "grad_norm": 0.8847206830978394, "learning_rate": 1.795045722544083e-05, "loss": 0.9594, "step": 205 }, { "epoch": 1.0789473684210527, "eval_loss": 1.1643882989883423, "eval_runtime": 26.2467, "eval_samples_per_second": 3.81, "eval_steps_per_second": 0.381, "step": 205 }, { "epoch": 1.0842105263157895, "grad_norm": 0.7735342383384705, "learning_rate": 1.7930141249560235e-05, "loss": 1.2791, "step": 206 }, { "epoch": 1.0894736842105264, "grad_norm": 0.8692260980606079, "learning_rate": 1.7909736695947485e-05, "loss": 1.2304, "step": 207 }, { "epoch": 1.0947368421052632, "grad_norm": 0.7190925478935242, "learning_rate": 1.7889243792516452e-05, "loss": 1.2876, "step": 208 }, { "epoch": 1.1, "grad_norm": 0.8737074732780457, "learning_rate": 1.7868662768167827e-05, "loss": 1.347, "step": 209 }, { "epoch": 1.1052631578947367, "grad_norm": 0.6932772994041443, "learning_rate": 1.7847993852786612e-05, "loss": 1.1423, "step": 210 }, { "epoch": 1.1052631578947367, "eval_loss": 1.1634669303894043, "eval_runtime": 22.9024, "eval_samples_per_second": 4.366, "eval_steps_per_second": 0.437, "step": 210 }, { "epoch": 1.1105263157894736, "grad_norm": 1.0242347717285156, "learning_rate": 1.7827237277239514e-05, "loss": 1.3044, "step": 211 }, { "epoch": 1.1157894736842104, "grad_norm": 0.6930939555168152, "learning_rate": 1.7806393273372396e-05, "loss": 1.2462, "step": 212 }, { "epoch": 1.1210526315789473, "grad_norm": 0.8810515403747559, "learning_rate": 1.778546207400766e-05, "loss": 1.1348, "step": 213 }, { "epoch": 1.1263157894736842, "grad_norm": 0.7547497749328613, "learning_rate": 1.7764443912941675e-05, "loss": 1.2151, "step": 214 }, { "epoch": 1.131578947368421, "grad_norm": 0.6814379096031189, "learning_rate": 1.7743339024942135e-05, "loss": 1.1774, "step": 215 }, { "epoch": 1.131578947368421, "eval_loss": 1.1645286083221436, "eval_runtime": 22.5112, "eval_samples_per_second": 4.442, "eval_steps_per_second": 0.444, "step": 215 }, { "epoch": 1.1368421052631579, "grad_norm": 0.9576784372329712, "learning_rate": 1.772214764574547e-05, "loss": 1.217, "step": 216 }, { "epoch": 1.1421052631578947, "grad_norm": 0.6980108618736267, "learning_rate": 1.770087001205418e-05, "loss": 1.0534, "step": 217 }, { "epoch": 1.1473684210526316, "grad_norm": 0.8377910256385803, "learning_rate": 1.7679506361534216e-05, "loss": 1.0931, "step": 218 }, { "epoch": 1.1526315789473685, "grad_norm": 0.6630823612213135, "learning_rate": 1.7658056932812312e-05, "loss": 1.2069, "step": 219 }, { "epoch": 1.1578947368421053, "grad_norm": 0.9491656422615051, "learning_rate": 1.7636521965473324e-05, "loss": 1.0085, "step": 220 }, { "epoch": 1.1578947368421053, "eval_loss": 1.1642261743545532, "eval_runtime": 22.5183, "eval_samples_per_second": 4.441, "eval_steps_per_second": 0.444, "step": 220 }, { "epoch": 1.1631578947368422, "grad_norm": 0.5266606211662292, "learning_rate": 1.761490170005755e-05, "loss": 1.1703, "step": 221 }, { "epoch": 1.168421052631579, "grad_norm": 0.8231062889099121, "learning_rate": 1.759319637805806e-05, "loss": 1.2785, "step": 222 }, { "epoch": 1.1736842105263159, "grad_norm": 0.8533615469932556, "learning_rate": 1.757140624191797e-05, "loss": 0.9474, "step": 223 }, { "epoch": 1.1789473684210527, "grad_norm": 0.7277817726135254, "learning_rate": 1.754953153502775e-05, "loss": 1.1459, "step": 224 }, { "epoch": 1.1842105263157894, "grad_norm": 0.6767086386680603, "learning_rate": 1.7527572501722516e-05, "loss": 1.0912, "step": 225 }, { "epoch": 1.1842105263157894, "eval_loss": 1.161059856414795, "eval_runtime": 22.5434, "eval_samples_per_second": 4.436, "eval_steps_per_second": 0.444, "step": 225 }, { "epoch": 1.1894736842105262, "grad_norm": 1.098981261253357, "learning_rate": 1.750552938727928e-05, "loss": 0.9843, "step": 226 }, { "epoch": 1.194736842105263, "grad_norm": 1.1435344219207764, "learning_rate": 1.748340243791422e-05, "loss": 1.2896, "step": 227 }, { "epoch": 1.2, "grad_norm": 0.8031597137451172, "learning_rate": 1.7461191900779936e-05, "loss": 1.0101, "step": 228 }, { "epoch": 1.2052631578947368, "grad_norm": 0.6429868340492249, "learning_rate": 1.743889802396268e-05, "loss": 1.045, "step": 229 }, { "epoch": 1.2105263157894737, "grad_norm": 0.5834282636642456, "learning_rate": 1.7416521056479577e-05, "loss": 1.193, "step": 230 }, { "epoch": 1.2105263157894737, "eval_loss": 1.1627156734466553, "eval_runtime": 22.5213, "eval_samples_per_second": 4.44, "eval_steps_per_second": 0.444, "step": 230 }, { "epoch": 1.2157894736842105, "grad_norm": 1.1269385814666748, "learning_rate": 1.7394061248275874e-05, "loss": 1.1835, "step": 231 }, { "epoch": 1.2210526315789474, "grad_norm": 0.6989235281944275, "learning_rate": 1.737151885022211e-05, "loss": 1.0071, "step": 232 }, { "epoch": 1.2263157894736842, "grad_norm": 1.0005873441696167, "learning_rate": 1.7348894114111346e-05, "loss": 0.9699, "step": 233 }, { "epoch": 1.231578947368421, "grad_norm": 0.8165798187255859, "learning_rate": 1.7326187292656332e-05, "loss": 1.0115, "step": 234 }, { "epoch": 1.236842105263158, "grad_norm": 0.7189733982086182, "learning_rate": 1.7303398639486696e-05, "loss": 1.2437, "step": 235 }, { "epoch": 1.236842105263158, "eval_loss": 1.1640002727508545, "eval_runtime": 22.5421, "eval_samples_per_second": 4.436, "eval_steps_per_second": 0.444, "step": 235 }, { "epoch": 1.2421052631578948, "grad_norm": 1.3955504894256592, "learning_rate": 1.7280528409146097e-05, "loss": 1.6323, "step": 236 }, { "epoch": 1.2473684210526317, "grad_norm": 0.9354751110076904, "learning_rate": 1.7257576857089397e-05, "loss": 1.2273, "step": 237 }, { "epoch": 1.2526315789473683, "grad_norm": 0.7192204594612122, "learning_rate": 1.7234544239679807e-05, "loss": 1.2498, "step": 238 }, { "epoch": 1.2578947368421054, "grad_norm": 0.706244170665741, "learning_rate": 1.721143081418601e-05, "loss": 1.0584, "step": 239 }, { "epoch": 1.263157894736842, "grad_norm": 0.7391364574432373, "learning_rate": 1.7188236838779297e-05, "loss": 1.1814, "step": 240 }, { "epoch": 1.263157894736842, "eval_loss": 1.162298321723938, "eval_runtime": 22.5363, "eval_samples_per_second": 4.437, "eval_steps_per_second": 0.444, "step": 240 } ], "logging_steps": 1, "max_steps": 950, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 5, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 3 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.77309237051392e+16, "train_batch_size": 10, "trial_name": null, "trial_params": null }