diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16314 @@ +{ + "best_global_step": 1380, + "best_metric": 0.7464115023612976, + "best_model_checkpoint": "saves/qwen3-1.7B/Qwen3-1.7B-SFT-science-2e-5/checkpoint-1380", + "epoch": 3.0, + "eval_steps": 230, + "global_step": 2313, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0012977269501388974, + "grad_norm": 18.96442413330078, + "learning_rate": 0.0, + "loss": 1.341123104095459, + "step": 1 + }, + { + "epoch": 0.0025954539002777948, + "grad_norm": 17.5643310546875, + "learning_rate": 1.7241379310344828e-07, + "loss": 1.240975022315979, + "step": 2 + }, + { + "epoch": 0.003893180850416692, + "grad_norm": 18.22071075439453, + "learning_rate": 3.4482758620689656e-07, + "loss": 1.3369407653808594, + "step": 3 + }, + { + "epoch": 0.0051909078005555895, + "grad_norm": 19.40529441833496, + "learning_rate": 5.172413793103449e-07, + "loss": 1.4051162004470825, + "step": 4 + }, + { + "epoch": 0.006488634750694487, + "grad_norm": 17.282682418823242, + "learning_rate": 6.896551724137931e-07, + "loss": 1.318056344985962, + "step": 5 + }, + { + "epoch": 0.007786361700833384, + "grad_norm": 18.145490646362305, + "learning_rate": 8.620689655172415e-07, + "loss": 1.3011627197265625, + "step": 6 + }, + { + "epoch": 0.009084088650972282, + "grad_norm": 18.944950103759766, + "learning_rate": 1.0344827586206898e-06, + "loss": 1.2762426137924194, + "step": 7 + }, + { + "epoch": 0.010381815601111179, + "grad_norm": 16.987550735473633, + "learning_rate": 1.2068965517241381e-06, + "loss": 1.2320008277893066, + "step": 8 + }, + { + "epoch": 0.011679542551250076, + "grad_norm": 15.374279975891113, + "learning_rate": 1.3793103448275862e-06, + "loss": 1.1568862199783325, + "step": 9 + }, + { + "epoch": 0.012977269501388973, + "grad_norm": 15.470294952392578, + "learning_rate": 1.5517241379310346e-06, + "loss": 1.2633228302001953, + "step": 10 + }, + { + "epoch": 0.01427499645152787, + "grad_norm": 13.62917709350586, + "learning_rate": 1.724137931034483e-06, + "loss": 1.2120124101638794, + "step": 11 + }, + { + "epoch": 0.015572723401666768, + "grad_norm": 11.841530799865723, + "learning_rate": 1.896551724137931e-06, + "loss": 1.15806245803833, + "step": 12 + }, + { + "epoch": 0.016870450351805667, + "grad_norm": 11.673654556274414, + "learning_rate": 2.0689655172413796e-06, + "loss": 1.1886231899261475, + "step": 13 + }, + { + "epoch": 0.018168177301944564, + "grad_norm": 11.115256309509277, + "learning_rate": 2.241379310344828e-06, + "loss": 1.1659168004989624, + "step": 14 + }, + { + "epoch": 0.01946590425208346, + "grad_norm": 8.34097671508789, + "learning_rate": 2.4137931034482762e-06, + "loss": 1.1347044706344604, + "step": 15 + }, + { + "epoch": 0.020763631202222358, + "grad_norm": 6.3707804679870605, + "learning_rate": 2.5862068965517246e-06, + "loss": 1.097546935081482, + "step": 16 + }, + { + "epoch": 0.022061358152361255, + "grad_norm": 6.07731294631958, + "learning_rate": 2.7586206896551725e-06, + "loss": 1.1303181648254395, + "step": 17 + }, + { + "epoch": 0.023359085102500152, + "grad_norm": 5.143428802490234, + "learning_rate": 2.931034482758621e-06, + "loss": 1.087995171546936, + "step": 18 + }, + { + "epoch": 0.02465681205263905, + "grad_norm": 5.108595371246338, + "learning_rate": 3.103448275862069e-06, + "loss": 1.09377121925354, + "step": 19 + }, + { + "epoch": 0.025954539002777947, + "grad_norm": 4.329593658447266, + "learning_rate": 3.2758620689655175e-06, + "loss": 0.9835488200187683, + "step": 20 + }, + { + "epoch": 0.027252265952916844, + "grad_norm": 2.5329697132110596, + "learning_rate": 3.448275862068966e-06, + "loss": 1.1068130731582642, + "step": 21 + }, + { + "epoch": 0.02854999290305574, + "grad_norm": 2.4052135944366455, + "learning_rate": 3.620689655172414e-06, + "loss": 0.9785792827606201, + "step": 22 + }, + { + "epoch": 0.029847719853194638, + "grad_norm": 2.2059929370880127, + "learning_rate": 3.793103448275862e-06, + "loss": 1.0071507692337036, + "step": 23 + }, + { + "epoch": 0.031145446803333535, + "grad_norm": 1.962939977645874, + "learning_rate": 3.96551724137931e-06, + "loss": 0.9504339694976807, + "step": 24 + }, + { + "epoch": 0.032443173753472436, + "grad_norm": 1.8630015850067139, + "learning_rate": 4.137931034482759e-06, + "loss": 0.9488564133644104, + "step": 25 + }, + { + "epoch": 0.03374090070361133, + "grad_norm": 1.9074621200561523, + "learning_rate": 4.310344827586207e-06, + "loss": 0.9918304681777954, + "step": 26 + }, + { + "epoch": 0.03503862765375023, + "grad_norm": 1.7486937046051025, + "learning_rate": 4.482758620689656e-06, + "loss": 0.9598171710968018, + "step": 27 + }, + { + "epoch": 0.03633635460388913, + "grad_norm": 1.5654025077819824, + "learning_rate": 4.655172413793104e-06, + "loss": 0.9875293970108032, + "step": 28 + }, + { + "epoch": 0.037634081554028025, + "grad_norm": 1.5146547555923462, + "learning_rate": 4.8275862068965525e-06, + "loss": 0.9899477958679199, + "step": 29 + }, + { + "epoch": 0.03893180850416692, + "grad_norm": 1.4136415719985962, + "learning_rate": 5e-06, + "loss": 1.0122514963150024, + "step": 30 + }, + { + "epoch": 0.04022953545430582, + "grad_norm": 1.3606868982315063, + "learning_rate": 5.172413793103449e-06, + "loss": 0.9211847186088562, + "step": 31 + }, + { + "epoch": 0.041527262404444716, + "grad_norm": 1.1916248798370361, + "learning_rate": 5.344827586206896e-06, + "loss": 0.9429690837860107, + "step": 32 + }, + { + "epoch": 0.04282498935458361, + "grad_norm": 1.1089906692504883, + "learning_rate": 5.517241379310345e-06, + "loss": 0.9432889819145203, + "step": 33 + }, + { + "epoch": 0.04412271630472251, + "grad_norm": 1.0991381406784058, + "learning_rate": 5.689655172413794e-06, + "loss": 0.8937160968780518, + "step": 34 + }, + { + "epoch": 0.04542044325486141, + "grad_norm": 1.1420905590057373, + "learning_rate": 5.862068965517242e-06, + "loss": 0.9616763591766357, + "step": 35 + }, + { + "epoch": 0.046718170205000305, + "grad_norm": 1.22003972530365, + "learning_rate": 6.03448275862069e-06, + "loss": 0.991248369216919, + "step": 36 + }, + { + "epoch": 0.0480158971551392, + "grad_norm": 1.0027211904525757, + "learning_rate": 6.206896551724138e-06, + "loss": 0.8961243033409119, + "step": 37 + }, + { + "epoch": 0.0493136241052781, + "grad_norm": 0.948948085308075, + "learning_rate": 6.379310344827587e-06, + "loss": 0.8873807787895203, + "step": 38 + }, + { + "epoch": 0.050611351055416996, + "grad_norm": 0.906653106212616, + "learning_rate": 6.551724137931035e-06, + "loss": 0.9843493103981018, + "step": 39 + }, + { + "epoch": 0.05190907800555589, + "grad_norm": 0.9032185077667236, + "learning_rate": 6.724137931034484e-06, + "loss": 0.9521259069442749, + "step": 40 + }, + { + "epoch": 0.05320680495569479, + "grad_norm": 0.9004918336868286, + "learning_rate": 6.896551724137932e-06, + "loss": 0.9388642311096191, + "step": 41 + }, + { + "epoch": 0.05450453190583369, + "grad_norm": 0.9163469672203064, + "learning_rate": 7.0689655172413796e-06, + "loss": 0.8808169364929199, + "step": 42 + }, + { + "epoch": 0.055802258855972585, + "grad_norm": 0.8777008056640625, + "learning_rate": 7.241379310344828e-06, + "loss": 0.8969473242759705, + "step": 43 + }, + { + "epoch": 0.05709998580611148, + "grad_norm": 0.8831114768981934, + "learning_rate": 7.413793103448277e-06, + "loss": 0.8995171189308167, + "step": 44 + }, + { + "epoch": 0.05839771275625038, + "grad_norm": 0.8527185320854187, + "learning_rate": 7.586206896551724e-06, + "loss": 0.9566978216171265, + "step": 45 + }, + { + "epoch": 0.059695439706389276, + "grad_norm": 0.8445229530334473, + "learning_rate": 7.758620689655173e-06, + "loss": 0.8870581388473511, + "step": 46 + }, + { + "epoch": 0.060993166656528174, + "grad_norm": 0.7909572720527649, + "learning_rate": 7.93103448275862e-06, + "loss": 0.839882493019104, + "step": 47 + }, + { + "epoch": 0.06229089360666707, + "grad_norm": 0.9035473465919495, + "learning_rate": 8.103448275862069e-06, + "loss": 0.9470881223678589, + "step": 48 + }, + { + "epoch": 0.06358862055680597, + "grad_norm": 0.812706708908081, + "learning_rate": 8.275862068965518e-06, + "loss": 0.9084426760673523, + "step": 49 + }, + { + "epoch": 0.06488634750694487, + "grad_norm": 0.7788446545600891, + "learning_rate": 8.448275862068966e-06, + "loss": 0.9100271463394165, + "step": 50 + }, + { + "epoch": 0.06618407445708377, + "grad_norm": 0.7733594179153442, + "learning_rate": 8.620689655172414e-06, + "loss": 0.9046688675880432, + "step": 51 + }, + { + "epoch": 0.06748180140722267, + "grad_norm": 0.8074057698249817, + "learning_rate": 8.793103448275862e-06, + "loss": 0.9495884776115417, + "step": 52 + }, + { + "epoch": 0.06877952835736156, + "grad_norm": 0.7883110642433167, + "learning_rate": 8.965517241379312e-06, + "loss": 0.944835901260376, + "step": 53 + }, + { + "epoch": 0.07007725530750046, + "grad_norm": 0.7795141935348511, + "learning_rate": 9.13793103448276e-06, + "loss": 0.8827984929084778, + "step": 54 + }, + { + "epoch": 0.07137498225763936, + "grad_norm": 0.7496516704559326, + "learning_rate": 9.310344827586207e-06, + "loss": 0.8837717771530151, + "step": 55 + }, + { + "epoch": 0.07267270920777825, + "grad_norm": 0.7296638488769531, + "learning_rate": 9.482758620689655e-06, + "loss": 0.9134169220924377, + "step": 56 + }, + { + "epoch": 0.07397043615791715, + "grad_norm": 0.7594932913780212, + "learning_rate": 9.655172413793105e-06, + "loss": 0.8602768182754517, + "step": 57 + }, + { + "epoch": 0.07526816310805605, + "grad_norm": 0.7925019264221191, + "learning_rate": 9.827586206896553e-06, + "loss": 0.9638795852661133, + "step": 58 + }, + { + "epoch": 0.07656589005819495, + "grad_norm": 0.7823756337165833, + "learning_rate": 1e-05, + "loss": 0.9325800538063049, + "step": 59 + }, + { + "epoch": 0.07786361700833384, + "grad_norm": 0.7671526074409485, + "learning_rate": 1.0172413793103449e-05, + "loss": 0.8490806221961975, + "step": 60 + }, + { + "epoch": 0.07916134395847274, + "grad_norm": 0.7950026392936707, + "learning_rate": 1.0344827586206898e-05, + "loss": 0.8811596632003784, + "step": 61 + }, + { + "epoch": 0.08045907090861164, + "grad_norm": 0.7760382294654846, + "learning_rate": 1.0517241379310346e-05, + "loss": 0.9363852739334106, + "step": 62 + }, + { + "epoch": 0.08175679785875054, + "grad_norm": 0.7695664763450623, + "learning_rate": 1.0689655172413792e-05, + "loss": 0.9032339453697205, + "step": 63 + }, + { + "epoch": 0.08305452480888943, + "grad_norm": 0.7472826838493347, + "learning_rate": 1.0862068965517242e-05, + "loss": 0.9319165349006653, + "step": 64 + }, + { + "epoch": 0.08435225175902833, + "grad_norm": 0.7492451667785645, + "learning_rate": 1.103448275862069e-05, + "loss": 0.9181802272796631, + "step": 65 + }, + { + "epoch": 0.08564997870916723, + "grad_norm": 0.7906931042671204, + "learning_rate": 1.1206896551724138e-05, + "loss": 0.9204844236373901, + "step": 66 + }, + { + "epoch": 0.08694770565930612, + "grad_norm": 0.7987682223320007, + "learning_rate": 1.1379310344827587e-05, + "loss": 0.9132669568061829, + "step": 67 + }, + { + "epoch": 0.08824543260944502, + "grad_norm": 0.7293349504470825, + "learning_rate": 1.1551724137931035e-05, + "loss": 0.840244472026825, + "step": 68 + }, + { + "epoch": 0.08954315955958392, + "grad_norm": 0.7649659514427185, + "learning_rate": 1.1724137931034483e-05, + "loss": 0.9429194331169128, + "step": 69 + }, + { + "epoch": 0.09084088650972282, + "grad_norm": 0.7362731695175171, + "learning_rate": 1.1896551724137933e-05, + "loss": 0.910248339176178, + "step": 70 + }, + { + "epoch": 0.09213861345986171, + "grad_norm": 0.7714956402778625, + "learning_rate": 1.206896551724138e-05, + "loss": 0.9148205518722534, + "step": 71 + }, + { + "epoch": 0.09343634041000061, + "grad_norm": 0.8190087676048279, + "learning_rate": 1.2241379310344827e-05, + "loss": 1.0036617517471313, + "step": 72 + }, + { + "epoch": 0.0947340673601395, + "grad_norm": 0.7508696913719177, + "learning_rate": 1.2413793103448277e-05, + "loss": 0.8585586547851562, + "step": 73 + }, + { + "epoch": 0.0960317943102784, + "grad_norm": 0.7731637358665466, + "learning_rate": 1.2586206896551725e-05, + "loss": 0.8797649145126343, + "step": 74 + }, + { + "epoch": 0.0973295212604173, + "grad_norm": 0.7766374349594116, + "learning_rate": 1.2758620689655174e-05, + "loss": 0.8823714852333069, + "step": 75 + }, + { + "epoch": 0.0986272482105562, + "grad_norm": 0.7738403677940369, + "learning_rate": 1.2931034482758622e-05, + "loss": 0.9374374747276306, + "step": 76 + }, + { + "epoch": 0.0999249751606951, + "grad_norm": 0.7996422648429871, + "learning_rate": 1.310344827586207e-05, + "loss": 0.8985888957977295, + "step": 77 + }, + { + "epoch": 0.10122270211083399, + "grad_norm": 0.8077470064163208, + "learning_rate": 1.327586206896552e-05, + "loss": 0.8687019944190979, + "step": 78 + }, + { + "epoch": 0.10252042906097289, + "grad_norm": 0.7868083715438843, + "learning_rate": 1.3448275862068967e-05, + "loss": 0.9471523761749268, + "step": 79 + }, + { + "epoch": 0.10381815601111179, + "grad_norm": 0.7429269552230835, + "learning_rate": 1.3620689655172414e-05, + "loss": 0.8650257587432861, + "step": 80 + }, + { + "epoch": 0.10511588296125068, + "grad_norm": 0.736170768737793, + "learning_rate": 1.3793103448275863e-05, + "loss": 0.8755403757095337, + "step": 81 + }, + { + "epoch": 0.10641360991138958, + "grad_norm": 0.7359841465950012, + "learning_rate": 1.3965517241379311e-05, + "loss": 0.8383484482765198, + "step": 82 + }, + { + "epoch": 0.10771133686152848, + "grad_norm": 0.7211300730705261, + "learning_rate": 1.4137931034482759e-05, + "loss": 0.8565696477890015, + "step": 83 + }, + { + "epoch": 0.10900906381166738, + "grad_norm": 0.7671189308166504, + "learning_rate": 1.4310344827586209e-05, + "loss": 0.9218558073043823, + "step": 84 + }, + { + "epoch": 0.11030679076180627, + "grad_norm": 0.816425085067749, + "learning_rate": 1.4482758620689657e-05, + "loss": 0.870709240436554, + "step": 85 + }, + { + "epoch": 0.11160451771194517, + "grad_norm": 0.7335647940635681, + "learning_rate": 1.4655172413793105e-05, + "loss": 0.8868783116340637, + "step": 86 + }, + { + "epoch": 0.11290224466208407, + "grad_norm": 0.7765848636627197, + "learning_rate": 1.4827586206896554e-05, + "loss": 0.8968692421913147, + "step": 87 + }, + { + "epoch": 0.11419997161222296, + "grad_norm": 0.7707907557487488, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.8512423634529114, + "step": 88 + }, + { + "epoch": 0.11549769856236186, + "grad_norm": 0.7698812484741211, + "learning_rate": 1.5172413793103448e-05, + "loss": 0.9038546085357666, + "step": 89 + }, + { + "epoch": 0.11679542551250076, + "grad_norm": 0.7673100829124451, + "learning_rate": 1.5344827586206898e-05, + "loss": 0.9032548666000366, + "step": 90 + }, + { + "epoch": 0.11809315246263966, + "grad_norm": 0.7782520055770874, + "learning_rate": 1.5517241379310346e-05, + "loss": 0.8969484567642212, + "step": 91 + }, + { + "epoch": 0.11939087941277855, + "grad_norm": 0.7486196756362915, + "learning_rate": 1.5689655172413794e-05, + "loss": 0.9460266828536987, + "step": 92 + }, + { + "epoch": 0.12068860636291745, + "grad_norm": 0.7591387033462524, + "learning_rate": 1.586206896551724e-05, + "loss": 0.8913143277168274, + "step": 93 + }, + { + "epoch": 0.12198633331305635, + "grad_norm": 0.7186006903648376, + "learning_rate": 1.603448275862069e-05, + "loss": 0.817532479763031, + "step": 94 + }, + { + "epoch": 0.12328406026319524, + "grad_norm": 0.8398354053497314, + "learning_rate": 1.6206896551724137e-05, + "loss": 0.9849364161491394, + "step": 95 + }, + { + "epoch": 0.12458178721333414, + "grad_norm": 0.7659850120544434, + "learning_rate": 1.637931034482759e-05, + "loss": 0.8463207483291626, + "step": 96 + }, + { + "epoch": 0.12587951416347304, + "grad_norm": 0.7916679978370667, + "learning_rate": 1.6551724137931037e-05, + "loss": 0.87321537733078, + "step": 97 + }, + { + "epoch": 0.12717724111361195, + "grad_norm": 0.7151588201522827, + "learning_rate": 1.6724137931034485e-05, + "loss": 0.8810160160064697, + "step": 98 + }, + { + "epoch": 0.12847496806375083, + "grad_norm": 0.7750177383422852, + "learning_rate": 1.6896551724137932e-05, + "loss": 0.7909659147262573, + "step": 99 + }, + { + "epoch": 0.12977269501388974, + "grad_norm": 0.7832080125808716, + "learning_rate": 1.706896551724138e-05, + "loss": 0.9595565795898438, + "step": 100 + }, + { + "epoch": 0.13107042196402863, + "grad_norm": 0.764074444770813, + "learning_rate": 1.7241379310344828e-05, + "loss": 0.9244315028190613, + "step": 101 + }, + { + "epoch": 0.13236814891416754, + "grad_norm": 0.8302505016326904, + "learning_rate": 1.7413793103448276e-05, + "loss": 0.8567872643470764, + "step": 102 + }, + { + "epoch": 0.13366587586430642, + "grad_norm": 0.7476164102554321, + "learning_rate": 1.7586206896551724e-05, + "loss": 0.8335643410682678, + "step": 103 + }, + { + "epoch": 0.13496360281444533, + "grad_norm": 0.7683222889900208, + "learning_rate": 1.7758620689655175e-05, + "loss": 0.92899489402771, + "step": 104 + }, + { + "epoch": 0.13626132976458422, + "grad_norm": 0.8164420127868652, + "learning_rate": 1.7931034482758623e-05, + "loss": 0.9577179551124573, + "step": 105 + }, + { + "epoch": 0.13755905671472313, + "grad_norm": 0.7937741279602051, + "learning_rate": 1.810344827586207e-05, + "loss": 0.9404830932617188, + "step": 106 + }, + { + "epoch": 0.138856783664862, + "grad_norm": 0.7443995475769043, + "learning_rate": 1.827586206896552e-05, + "loss": 0.8533992171287537, + "step": 107 + }, + { + "epoch": 0.14015451061500092, + "grad_norm": 0.7239556312561035, + "learning_rate": 1.8448275862068967e-05, + "loss": 0.8692059516906738, + "step": 108 + }, + { + "epoch": 0.1414522375651398, + "grad_norm": 0.7722207903862, + "learning_rate": 1.8620689655172415e-05, + "loss": 0.9231195449829102, + "step": 109 + }, + { + "epoch": 0.14274996451527872, + "grad_norm": 0.8155950307846069, + "learning_rate": 1.8793103448275863e-05, + "loss": 0.9769394397735596, + "step": 110 + }, + { + "epoch": 0.1440476914654176, + "grad_norm": 0.8122441172599792, + "learning_rate": 1.896551724137931e-05, + "loss": 0.9506130218505859, + "step": 111 + }, + { + "epoch": 0.1453454184155565, + "grad_norm": 0.748271644115448, + "learning_rate": 1.913793103448276e-05, + "loss": 0.8314372897148132, + "step": 112 + }, + { + "epoch": 0.1466431453656954, + "grad_norm": 0.7835760712623596, + "learning_rate": 1.931034482758621e-05, + "loss": 0.9071435332298279, + "step": 113 + }, + { + "epoch": 0.1479408723158343, + "grad_norm": 0.7403405904769897, + "learning_rate": 1.9482758620689658e-05, + "loss": 0.8897596597671509, + "step": 114 + }, + { + "epoch": 0.1492385992659732, + "grad_norm": 0.8157104849815369, + "learning_rate": 1.9655172413793106e-05, + "loss": 0.8683630228042603, + "step": 115 + }, + { + "epoch": 0.1505363262161121, + "grad_norm": 0.8036532402038574, + "learning_rate": 1.9827586206896554e-05, + "loss": 0.8975539207458496, + "step": 116 + }, + { + "epoch": 0.15183405316625098, + "grad_norm": 0.7673157453536987, + "learning_rate": 2e-05, + "loss": 0.938015341758728, + "step": 117 + }, + { + "epoch": 0.1531317801163899, + "grad_norm": 0.8311364650726318, + "learning_rate": 1.999998977626552e-05, + "loss": 0.927339494228363, + "step": 118 + }, + { + "epoch": 0.15442950706652878, + "grad_norm": 0.8438189029693604, + "learning_rate": 1.999995910508299e-05, + "loss": 0.8367739319801331, + "step": 119 + }, + { + "epoch": 0.1557272340166677, + "grad_norm": 0.7619196176528931, + "learning_rate": 1.999990798651512e-05, + "loss": 0.8823627829551697, + "step": 120 + }, + { + "epoch": 0.15702496096680657, + "grad_norm": 0.8044223785400391, + "learning_rate": 1.9999836420666438e-05, + "loss": 0.9462600350379944, + "step": 121 + }, + { + "epoch": 0.15832268791694548, + "grad_norm": 0.7767183780670166, + "learning_rate": 1.999974440768327e-05, + "loss": 0.8584571480751038, + "step": 122 + }, + { + "epoch": 0.15962041486708436, + "grad_norm": 0.8261749148368835, + "learning_rate": 1.9999631947753776e-05, + "loss": 0.8864863514900208, + "step": 123 + }, + { + "epoch": 0.16091814181722328, + "grad_norm": 0.7884521484375, + "learning_rate": 1.999949904110789e-05, + "loss": 0.9228469133377075, + "step": 124 + }, + { + "epoch": 0.16221586876736216, + "grad_norm": 0.7482346296310425, + "learning_rate": 1.999934568801738e-05, + "loss": 0.8749440908432007, + "step": 125 + }, + { + "epoch": 0.16351359571750107, + "grad_norm": 0.7735321521759033, + "learning_rate": 1.999917188879582e-05, + "loss": 0.8487443327903748, + "step": 126 + }, + { + "epoch": 0.16481132266763995, + "grad_norm": 0.7950016856193542, + "learning_rate": 1.9998977643798572e-05, + "loss": 0.8879282474517822, + "step": 127 + }, + { + "epoch": 0.16610904961777886, + "grad_norm": 0.7628664374351501, + "learning_rate": 1.999876295342283e-05, + "loss": 0.8263102173805237, + "step": 128 + }, + { + "epoch": 0.16740677656791775, + "grad_norm": 0.7986794114112854, + "learning_rate": 1.9998527818107577e-05, + "loss": 0.8462676405906677, + "step": 129 + }, + { + "epoch": 0.16870450351805666, + "grad_norm": 0.7867287993431091, + "learning_rate": 1.9998272238333606e-05, + "loss": 0.8144584894180298, + "step": 130 + }, + { + "epoch": 0.17000223046819554, + "grad_norm": 0.7938011288642883, + "learning_rate": 1.9997996214623515e-05, + "loss": 0.9469823837280273, + "step": 131 + }, + { + "epoch": 0.17129995741833445, + "grad_norm": 0.7824422717094421, + "learning_rate": 1.9997699747541698e-05, + "loss": 0.8819964528083801, + "step": 132 + }, + { + "epoch": 0.17259768436847334, + "grad_norm": 0.7831183075904846, + "learning_rate": 1.9997382837694355e-05, + "loss": 0.8070334196090698, + "step": 133 + }, + { + "epoch": 0.17389541131861225, + "grad_norm": 0.7970272302627563, + "learning_rate": 1.999704548572949e-05, + "loss": 0.9148434996604919, + "step": 134 + }, + { + "epoch": 0.17519313826875113, + "grad_norm": 0.7763343453407288, + "learning_rate": 1.9996687692336896e-05, + "loss": 0.8732989430427551, + "step": 135 + }, + { + "epoch": 0.17649086521889004, + "grad_norm": 0.7826754450798035, + "learning_rate": 1.9996309458248184e-05, + "loss": 0.8220726847648621, + "step": 136 + }, + { + "epoch": 0.17778859216902893, + "grad_norm": 0.761687159538269, + "learning_rate": 1.999591078423673e-05, + "loss": 0.8763125538825989, + "step": 137 + }, + { + "epoch": 0.17908631911916784, + "grad_norm": 0.7728819251060486, + "learning_rate": 1.9995491671117734e-05, + "loss": 0.804518461227417, + "step": 138 + }, + { + "epoch": 0.18038404606930672, + "grad_norm": 0.7697947025299072, + "learning_rate": 1.999505211974817e-05, + "loss": 0.8979027271270752, + "step": 139 + }, + { + "epoch": 0.18168177301944563, + "grad_norm": 0.7905195951461792, + "learning_rate": 1.999459213102681e-05, + "loss": 0.8996750116348267, + "step": 140 + }, + { + "epoch": 0.1829794999695845, + "grad_norm": 0.7597678899765015, + "learning_rate": 1.9994111705894218e-05, + "loss": 0.9672253727912903, + "step": 141 + }, + { + "epoch": 0.18427722691972342, + "grad_norm": 0.7724127769470215, + "learning_rate": 1.9993610845332734e-05, + "loss": 0.9037659764289856, + "step": 142 + }, + { + "epoch": 0.1855749538698623, + "grad_norm": 0.8090096712112427, + "learning_rate": 1.99930895503665e-05, + "loss": 0.9177453517913818, + "step": 143 + }, + { + "epoch": 0.18687268082000122, + "grad_norm": 0.7363874316215515, + "learning_rate": 1.9992547822061427e-05, + "loss": 0.8449195027351379, + "step": 144 + }, + { + "epoch": 0.1881704077701401, + "grad_norm": 0.8058642745018005, + "learning_rate": 1.9991985661525217e-05, + "loss": 0.998737096786499, + "step": 145 + }, + { + "epoch": 0.189468134720279, + "grad_norm": 0.7756547927856445, + "learning_rate": 1.999140306990734e-05, + "loss": 0.8317436575889587, + "step": 146 + }, + { + "epoch": 0.1907658616704179, + "grad_norm": 0.7556934952735901, + "learning_rate": 1.999080004839905e-05, + "loss": 0.8867667317390442, + "step": 147 + }, + { + "epoch": 0.1920635886205568, + "grad_norm": 0.8031500577926636, + "learning_rate": 1.999017659823338e-05, + "loss": 0.9501492381095886, + "step": 148 + }, + { + "epoch": 0.1933613155706957, + "grad_norm": 0.7905899882316589, + "learning_rate": 1.9989532720685115e-05, + "loss": 0.9475319981575012, + "step": 149 + }, + { + "epoch": 0.1946590425208346, + "grad_norm": 0.7352354526519775, + "learning_rate": 1.998886841707083e-05, + "loss": 0.8857019543647766, + "step": 150 + }, + { + "epoch": 0.19595676947097349, + "grad_norm": 0.7715173363685608, + "learning_rate": 1.9988183688748862e-05, + "loss": 0.9451955556869507, + "step": 151 + }, + { + "epoch": 0.1972544964211124, + "grad_norm": 0.7771379351615906, + "learning_rate": 1.9987478537119297e-05, + "loss": 0.9485697150230408, + "step": 152 + }, + { + "epoch": 0.19855222337125128, + "grad_norm": 0.7867424488067627, + "learning_rate": 1.9986752963624002e-05, + "loss": 0.9234886169433594, + "step": 153 + }, + { + "epoch": 0.1998499503213902, + "grad_norm": 0.8710278272628784, + "learning_rate": 1.998600696974658e-05, + "loss": 0.9107885956764221, + "step": 154 + }, + { + "epoch": 0.20114767727152907, + "grad_norm": 0.7554876208305359, + "learning_rate": 1.9985240557012406e-05, + "loss": 0.9065303206443787, + "step": 155 + }, + { + "epoch": 0.20244540422166798, + "grad_norm": 0.7357529997825623, + "learning_rate": 1.99844537269886e-05, + "loss": 0.7701905965805054, + "step": 156 + }, + { + "epoch": 0.20374313117180687, + "grad_norm": 0.8202847242355347, + "learning_rate": 1.9983646481284028e-05, + "loss": 0.992992103099823, + "step": 157 + }, + { + "epoch": 0.20504085812194578, + "grad_norm": 0.7828136682510376, + "learning_rate": 1.9982818821549308e-05, + "loss": 0.9072571992874146, + "step": 158 + }, + { + "epoch": 0.2063385850720847, + "grad_norm": 0.7381945252418518, + "learning_rate": 1.9981970749476792e-05, + "loss": 0.8416173458099365, + "step": 159 + }, + { + "epoch": 0.20763631202222357, + "grad_norm": 0.7436814308166504, + "learning_rate": 1.998110226680057e-05, + "loss": 0.860198438167572, + "step": 160 + }, + { + "epoch": 0.20893403897236248, + "grad_norm": 0.7724810242652893, + "learning_rate": 1.9980213375296468e-05, + "loss": 0.8358607292175293, + "step": 161 + }, + { + "epoch": 0.21023176592250137, + "grad_norm": 0.7248872518539429, + "learning_rate": 1.997930407678205e-05, + "loss": 0.8103194236755371, + "step": 162 + }, + { + "epoch": 0.21152949287264028, + "grad_norm": 0.7623717784881592, + "learning_rate": 1.99783743731166e-05, + "loss": 0.8410395383834839, + "step": 163 + }, + { + "epoch": 0.21282721982277916, + "grad_norm": 0.7665237188339233, + "learning_rate": 1.9977424266201126e-05, + "loss": 0.9623262286186218, + "step": 164 + }, + { + "epoch": 0.21412494677291807, + "grad_norm": 0.7374143600463867, + "learning_rate": 1.9976453757978355e-05, + "loss": 0.8592593669891357, + "step": 165 + }, + { + "epoch": 0.21542267372305696, + "grad_norm": 0.7116683721542358, + "learning_rate": 1.997546285043273e-05, + "loss": 0.7682055234909058, + "step": 166 + }, + { + "epoch": 0.21672040067319587, + "grad_norm": 0.8028838038444519, + "learning_rate": 1.9974451545590407e-05, + "loss": 0.9229005575180054, + "step": 167 + }, + { + "epoch": 0.21801812762333475, + "grad_norm": 0.8015571236610413, + "learning_rate": 1.997341984551925e-05, + "loss": 0.8815708756446838, + "step": 168 + }, + { + "epoch": 0.21931585457347366, + "grad_norm": 0.7032439708709717, + "learning_rate": 1.9972367752328824e-05, + "loss": 0.7823411822319031, + "step": 169 + }, + { + "epoch": 0.22061358152361255, + "grad_norm": 0.7352714538574219, + "learning_rate": 1.9971295268170393e-05, + "loss": 0.8304542899131775, + "step": 170 + }, + { + "epoch": 0.22191130847375146, + "grad_norm": 0.7774588465690613, + "learning_rate": 1.9970202395236913e-05, + "loss": 0.8442955017089844, + "step": 171 + }, + { + "epoch": 0.22320903542389034, + "grad_norm": 0.8193069696426392, + "learning_rate": 1.996908913576304e-05, + "loss": 0.8395213484764099, + "step": 172 + }, + { + "epoch": 0.22450676237402925, + "grad_norm": 0.805517852306366, + "learning_rate": 1.9967955492025094e-05, + "loss": 0.8934487104415894, + "step": 173 + }, + { + "epoch": 0.22580448932416813, + "grad_norm": 0.7246384620666504, + "learning_rate": 1.9966801466341107e-05, + "loss": 0.8137494325637817, + "step": 174 + }, + { + "epoch": 0.22710221627430704, + "grad_norm": 0.7587799429893494, + "learning_rate": 1.9965627061070755e-05, + "loss": 0.8050680756568909, + "step": 175 + }, + { + "epoch": 0.22839994322444593, + "grad_norm": 0.744683027267456, + "learning_rate": 1.996443227861541e-05, + "loss": 0.9190195798873901, + "step": 176 + }, + { + "epoch": 0.22969767017458484, + "grad_norm": 0.7057942748069763, + "learning_rate": 1.996321712141809e-05, + "loss": 0.771306574344635, + "step": 177 + }, + { + "epoch": 0.23099539712472372, + "grad_norm": 0.758804440498352, + "learning_rate": 1.9961981591963494e-05, + "loss": 0.9052093029022217, + "step": 178 + }, + { + "epoch": 0.23229312407486263, + "grad_norm": 0.761832058429718, + "learning_rate": 1.9960725692777956e-05, + "loss": 0.8963150382041931, + "step": 179 + }, + { + "epoch": 0.23359085102500152, + "grad_norm": 0.7698036432266235, + "learning_rate": 1.995944942642948e-05, + "loss": 0.879082202911377, + "step": 180 + }, + { + "epoch": 0.23488857797514043, + "grad_norm": 0.7247833013534546, + "learning_rate": 1.9958152795527706e-05, + "loss": 0.8330357074737549, + "step": 181 + }, + { + "epoch": 0.2361863049252793, + "grad_norm": 0.8077431321144104, + "learning_rate": 1.9956835802723916e-05, + "loss": 0.94368577003479, + "step": 182 + }, + { + "epoch": 0.23748403187541822, + "grad_norm": 0.7545983195304871, + "learning_rate": 1.9955498450711026e-05, + "loss": 0.8294435739517212, + "step": 183 + }, + { + "epoch": 0.2387817588255571, + "grad_norm": 0.7249157428741455, + "learning_rate": 1.9954140742223586e-05, + "loss": 0.8432042598724365, + "step": 184 + }, + { + "epoch": 0.24007948577569602, + "grad_norm": 0.7442438006401062, + "learning_rate": 1.9952762680037758e-05, + "loss": 0.8805173635482788, + "step": 185 + }, + { + "epoch": 0.2413772127258349, + "grad_norm": 0.7329111695289612, + "learning_rate": 1.995136426697134e-05, + "loss": 0.863207221031189, + "step": 186 + }, + { + "epoch": 0.2426749396759738, + "grad_norm": 0.716304361820221, + "learning_rate": 1.9949945505883723e-05, + "loss": 0.8094059824943542, + "step": 187 + }, + { + "epoch": 0.2439726666261127, + "grad_norm": 0.7312113046646118, + "learning_rate": 1.994850639967592e-05, + "loss": 0.9180686473846436, + "step": 188 + }, + { + "epoch": 0.2452703935762516, + "grad_norm": 0.7700150609016418, + "learning_rate": 1.994704695129054e-05, + "loss": 0.8603487610816956, + "step": 189 + }, + { + "epoch": 0.2465681205263905, + "grad_norm": 0.7655259370803833, + "learning_rate": 1.9945567163711788e-05, + "loss": 0.8780601620674133, + "step": 190 + }, + { + "epoch": 0.2478658474765294, + "grad_norm": 0.7268514633178711, + "learning_rate": 1.9944067039965445e-05, + "loss": 0.8242926001548767, + "step": 191 + }, + { + "epoch": 0.24916357442666828, + "grad_norm": 0.7264497876167297, + "learning_rate": 1.9942546583118894e-05, + "loss": 0.894584596157074, + "step": 192 + }, + { + "epoch": 0.25046130137680717, + "grad_norm": 0.773765504360199, + "learning_rate": 1.994100579628108e-05, + "loss": 0.8504235744476318, + "step": 193 + }, + { + "epoch": 0.2517590283269461, + "grad_norm": 0.6867210865020752, + "learning_rate": 1.9939444682602522e-05, + "loss": 0.7794942259788513, + "step": 194 + }, + { + "epoch": 0.253056755277085, + "grad_norm": 0.7574644684791565, + "learning_rate": 1.9937863245275303e-05, + "loss": 0.8992743492126465, + "step": 195 + }, + { + "epoch": 0.2543544822272239, + "grad_norm": 0.7294052243232727, + "learning_rate": 1.9936261487533066e-05, + "loss": 0.8371526002883911, + "step": 196 + }, + { + "epoch": 0.25565220917736275, + "grad_norm": 0.7199873924255371, + "learning_rate": 1.993463941265099e-05, + "loss": 0.8135456442832947, + "step": 197 + }, + { + "epoch": 0.25694993612750167, + "grad_norm": 0.7726846933364868, + "learning_rate": 1.993299702394582e-05, + "loss": 0.8241779804229736, + "step": 198 + }, + { + "epoch": 0.2582476630776406, + "grad_norm": 0.7929345369338989, + "learning_rate": 1.9931334324775817e-05, + "loss": 0.9309947490692139, + "step": 199 + }, + { + "epoch": 0.2595453900277795, + "grad_norm": 0.7434781193733215, + "learning_rate": 1.9929651318540783e-05, + "loss": 0.8470789790153503, + "step": 200 + }, + { + "epoch": 0.26084311697791834, + "grad_norm": 0.8077720403671265, + "learning_rate": 1.9927948008682038e-05, + "loss": 0.8455624580383301, + "step": 201 + }, + { + "epoch": 0.26214084392805725, + "grad_norm": 0.7723199725151062, + "learning_rate": 1.9926224398682424e-05, + "loss": 0.8877855539321899, + "step": 202 + }, + { + "epoch": 0.26343857087819617, + "grad_norm": 0.723115861415863, + "learning_rate": 1.992448049206628e-05, + "loss": 0.7923484444618225, + "step": 203 + }, + { + "epoch": 0.2647362978283351, + "grad_norm": 0.7819997072219849, + "learning_rate": 1.9922716292399458e-05, + "loss": 0.8195080757141113, + "step": 204 + }, + { + "epoch": 0.26603402477847393, + "grad_norm": 0.7534734010696411, + "learning_rate": 1.9920931803289302e-05, + "loss": 0.8843890428543091, + "step": 205 + }, + { + "epoch": 0.26733175172861284, + "grad_norm": 0.6980569362640381, + "learning_rate": 1.9919127028384634e-05, + "loss": 0.841879665851593, + "step": 206 + }, + { + "epoch": 0.26862947867875175, + "grad_norm": 0.7415062189102173, + "learning_rate": 1.9917301971375767e-05, + "loss": 0.910488486289978, + "step": 207 + }, + { + "epoch": 0.26992720562889067, + "grad_norm": 0.7163265347480774, + "learning_rate": 1.991545663599448e-05, + "loss": 0.8969396948814392, + "step": 208 + }, + { + "epoch": 0.2712249325790295, + "grad_norm": 0.7287595868110657, + "learning_rate": 1.9913591026014016e-05, + "loss": 0.8557533621788025, + "step": 209 + }, + { + "epoch": 0.27252265952916843, + "grad_norm": 0.8144972324371338, + "learning_rate": 1.9911705145249076e-05, + "loss": 0.9075403809547424, + "step": 210 + }, + { + "epoch": 0.27382038647930734, + "grad_norm": 0.6856523156166077, + "learning_rate": 1.9909798997555806e-05, + "loss": 0.9015495777130127, + "step": 211 + }, + { + "epoch": 0.27511811342944625, + "grad_norm": 0.7224120497703552, + "learning_rate": 1.99078725868318e-05, + "loss": 0.8107393383979797, + "step": 212 + }, + { + "epoch": 0.2764158403795851, + "grad_norm": 0.783104419708252, + "learning_rate": 1.9905925917016077e-05, + "loss": 0.831728458404541, + "step": 213 + }, + { + "epoch": 0.277713567329724, + "grad_norm": 0.765583872795105, + "learning_rate": 1.9903958992089087e-05, + "loss": 0.872807502746582, + "step": 214 + }, + { + "epoch": 0.27901129427986293, + "grad_norm": 0.7342137098312378, + "learning_rate": 1.990197181607269e-05, + "loss": 0.8797867298126221, + "step": 215 + }, + { + "epoch": 0.28030902123000184, + "grad_norm": 0.7050272822380066, + "learning_rate": 1.989996439303016e-05, + "loss": 0.8417098522186279, + "step": 216 + }, + { + "epoch": 0.2816067481801407, + "grad_norm": 0.7334570288658142, + "learning_rate": 1.989793672706617e-05, + "loss": 0.8433218598365784, + "step": 217 + }, + { + "epoch": 0.2829044751302796, + "grad_norm": 0.7583123445510864, + "learning_rate": 1.9895888822326783e-05, + "loss": 0.8300482034683228, + "step": 218 + }, + { + "epoch": 0.2842022020804185, + "grad_norm": 0.7325905561447144, + "learning_rate": 1.9893820682999444e-05, + "loss": 0.8698530197143555, + "step": 219 + }, + { + "epoch": 0.28549992903055743, + "grad_norm": 0.7196786403656006, + "learning_rate": 1.9891732313312973e-05, + "loss": 0.8875235915184021, + "step": 220 + }, + { + "epoch": 0.2867976559806963, + "grad_norm": 0.7486999034881592, + "learning_rate": 1.9889623717537564e-05, + "loss": 0.8711264729499817, + "step": 221 + }, + { + "epoch": 0.2880953829308352, + "grad_norm": 0.7866005897521973, + "learning_rate": 1.9887494899984757e-05, + "loss": 0.9035714268684387, + "step": 222 + }, + { + "epoch": 0.2893931098809741, + "grad_norm": 0.698315441608429, + "learning_rate": 1.9885345865007444e-05, + "loss": 0.873035728931427, + "step": 223 + }, + { + "epoch": 0.290690836831113, + "grad_norm": 0.7287175059318542, + "learning_rate": 1.9883176616999863e-05, + "loss": 0.9040322303771973, + "step": 224 + }, + { + "epoch": 0.2919885637812519, + "grad_norm": 0.6973027586936951, + "learning_rate": 1.9880987160397573e-05, + "loss": 0.8214952349662781, + "step": 225 + }, + { + "epoch": 0.2932862907313908, + "grad_norm": 0.7529054880142212, + "learning_rate": 1.987877749967746e-05, + "loss": 0.8002289533615112, + "step": 226 + }, + { + "epoch": 0.2945840176815297, + "grad_norm": 0.7562571167945862, + "learning_rate": 1.987654763935772e-05, + "loss": 0.8632272481918335, + "step": 227 + }, + { + "epoch": 0.2958817446316686, + "grad_norm": 0.7309690713882446, + "learning_rate": 1.9874297583997852e-05, + "loss": 0.835785984992981, + "step": 228 + }, + { + "epoch": 0.29717947158180746, + "grad_norm": 0.7542479038238525, + "learning_rate": 1.9872027338198652e-05, + "loss": 0.8635554909706116, + "step": 229 + }, + { + "epoch": 0.2984771985319464, + "grad_norm": 0.743453860282898, + "learning_rate": 1.98697369066022e-05, + "loss": 0.918680727481842, + "step": 230 + }, + { + "epoch": 0.2984771985319464, + "eval_loss": 0.818739116191864, + "eval_runtime": 153.6061, + "eval_samples_per_second": 33.801, + "eval_steps_per_second": 8.45, + "step": 230 + }, + { + "epoch": 0.2997749254820853, + "grad_norm": 0.766386091709137, + "learning_rate": 1.986742629389184e-05, + "loss": 0.8685123324394226, + "step": 231 + }, + { + "epoch": 0.3010726524322242, + "grad_norm": 0.7218268513679504, + "learning_rate": 1.98650955047922e-05, + "loss": 0.8525049090385437, + "step": 232 + }, + { + "epoch": 0.30237037938236305, + "grad_norm": 0.7203767895698547, + "learning_rate": 1.9862744544069146e-05, + "loss": 0.867932915687561, + "step": 233 + }, + { + "epoch": 0.30366810633250196, + "grad_norm": 0.7556924819946289, + "learning_rate": 1.9860373416529804e-05, + "loss": 0.8170772790908813, + "step": 234 + }, + { + "epoch": 0.3049658332826409, + "grad_norm": 0.7739233374595642, + "learning_rate": 1.9857982127022527e-05, + "loss": 0.8461399674415588, + "step": 235 + }, + { + "epoch": 0.3062635602327798, + "grad_norm": 0.7455801367759705, + "learning_rate": 1.9855570680436896e-05, + "loss": 0.8253067135810852, + "step": 236 + }, + { + "epoch": 0.3075612871829187, + "grad_norm": 0.7704318761825562, + "learning_rate": 1.9853139081703712e-05, + "loss": 0.9142767786979675, + "step": 237 + }, + { + "epoch": 0.30885901413305755, + "grad_norm": 0.7740578651428223, + "learning_rate": 1.9850687335794974e-05, + "loss": 0.8383587002754211, + "step": 238 + }, + { + "epoch": 0.31015674108319646, + "grad_norm": 0.7392247319221497, + "learning_rate": 1.9848215447723888e-05, + "loss": 0.8735100030899048, + "step": 239 + }, + { + "epoch": 0.3114544680333354, + "grad_norm": 0.7605814337730408, + "learning_rate": 1.9845723422544834e-05, + "loss": 0.9212141633033752, + "step": 240 + }, + { + "epoch": 0.3127521949834743, + "grad_norm": 0.7394529581069946, + "learning_rate": 1.9843211265353376e-05, + "loss": 0.8197087049484253, + "step": 241 + }, + { + "epoch": 0.31404992193361314, + "grad_norm": 0.6981598138809204, + "learning_rate": 1.9840678981286237e-05, + "loss": 0.77371746301651, + "step": 242 + }, + { + "epoch": 0.31534764888375205, + "grad_norm": 0.6841283440589905, + "learning_rate": 1.98381265755213e-05, + "loss": 0.7815872430801392, + "step": 243 + }, + { + "epoch": 0.31664537583389096, + "grad_norm": 0.7323400974273682, + "learning_rate": 1.9835554053277587e-05, + "loss": 0.8495661616325378, + "step": 244 + }, + { + "epoch": 0.3179431027840299, + "grad_norm": 0.7340859174728394, + "learning_rate": 1.9832961419815253e-05, + "loss": 0.7806031107902527, + "step": 245 + }, + { + "epoch": 0.31924082973416873, + "grad_norm": 0.7229768633842468, + "learning_rate": 1.983034868043558e-05, + "loss": 0.8009724617004395, + "step": 246 + }, + { + "epoch": 0.32053855668430764, + "grad_norm": 0.7510941624641418, + "learning_rate": 1.9827715840480962e-05, + "loss": 0.9413229823112488, + "step": 247 + }, + { + "epoch": 0.32183628363444655, + "grad_norm": 0.6999549269676208, + "learning_rate": 1.9825062905334883e-05, + "loss": 0.7988513112068176, + "step": 248 + }, + { + "epoch": 0.32313401058458546, + "grad_norm": 0.7060723304748535, + "learning_rate": 1.9822389880421927e-05, + "loss": 0.8266105651855469, + "step": 249 + }, + { + "epoch": 0.3244317375347243, + "grad_norm": 0.7090180516242981, + "learning_rate": 1.9819696771207756e-05, + "loss": 0.8882022500038147, + "step": 250 + }, + { + "epoch": 0.32572946448486323, + "grad_norm": 0.7266640663146973, + "learning_rate": 1.981698358319909e-05, + "loss": 0.8313782215118408, + "step": 251 + }, + { + "epoch": 0.32702719143500214, + "grad_norm": 0.7484982013702393, + "learning_rate": 1.981425032194372e-05, + "loss": 0.9093562960624695, + "step": 252 + }, + { + "epoch": 0.32832491838514105, + "grad_norm": 0.7394732236862183, + "learning_rate": 1.981149699303047e-05, + "loss": 0.8808751106262207, + "step": 253 + }, + { + "epoch": 0.3296226453352799, + "grad_norm": 0.7643232345581055, + "learning_rate": 1.9808723602089198e-05, + "loss": 0.9079170823097229, + "step": 254 + }, + { + "epoch": 0.3309203722854188, + "grad_norm": 0.7218993902206421, + "learning_rate": 1.980593015479079e-05, + "loss": 0.8374384641647339, + "step": 255 + }, + { + "epoch": 0.33221809923555773, + "grad_norm": 0.7780535221099854, + "learning_rate": 1.9803116656847136e-05, + "loss": 0.9171014428138733, + "step": 256 + }, + { + "epoch": 0.33351582618569664, + "grad_norm": 0.7390936613082886, + "learning_rate": 1.9800283114011134e-05, + "loss": 0.8307523131370544, + "step": 257 + }, + { + "epoch": 0.3348135531358355, + "grad_norm": 0.7285546064376831, + "learning_rate": 1.9797429532076652e-05, + "loss": 0.8579209446907043, + "step": 258 + }, + { + "epoch": 0.3361112800859744, + "grad_norm": 0.7298453450202942, + "learning_rate": 1.9794555916878548e-05, + "loss": 0.9177393317222595, + "step": 259 + }, + { + "epoch": 0.3374090070361133, + "grad_norm": 0.7240604758262634, + "learning_rate": 1.9791662274292638e-05, + "loss": 0.8674473166465759, + "step": 260 + }, + { + "epoch": 0.33870673398625223, + "grad_norm": 0.6959360241889954, + "learning_rate": 1.978874861023569e-05, + "loss": 0.8340597152709961, + "step": 261 + }, + { + "epoch": 0.3400044609363911, + "grad_norm": 0.711373507976532, + "learning_rate": 1.9785814930665404e-05, + "loss": 0.8793005347251892, + "step": 262 + }, + { + "epoch": 0.34130218788653, + "grad_norm": 0.721527099609375, + "learning_rate": 1.9782861241580417e-05, + "loss": 0.7826907634735107, + "step": 263 + }, + { + "epoch": 0.3425999148366689, + "grad_norm": 0.7333364486694336, + "learning_rate": 1.9779887549020273e-05, + "loss": 0.8747556209564209, + "step": 264 + }, + { + "epoch": 0.3438976417868078, + "grad_norm": 0.6954993605613708, + "learning_rate": 1.9776893859065424e-05, + "loss": 0.825065553188324, + "step": 265 + }, + { + "epoch": 0.3451953687369467, + "grad_norm": 0.7496482729911804, + "learning_rate": 1.9773880177837202e-05, + "loss": 0.8960598111152649, + "step": 266 + }, + { + "epoch": 0.3464930956870856, + "grad_norm": 0.7554039359092712, + "learning_rate": 1.9770846511497833e-05, + "loss": 0.8298478722572327, + "step": 267 + }, + { + "epoch": 0.3477908226372245, + "grad_norm": 0.7233474850654602, + "learning_rate": 1.9767792866250386e-05, + "loss": 0.8535934090614319, + "step": 268 + }, + { + "epoch": 0.3490885495873634, + "grad_norm": 0.7677019238471985, + "learning_rate": 1.97647192483388e-05, + "loss": 0.8413315415382385, + "step": 269 + }, + { + "epoch": 0.35038627653750226, + "grad_norm": 0.7146613597869873, + "learning_rate": 1.976162566404784e-05, + "loss": 0.7900301814079285, + "step": 270 + }, + { + "epoch": 0.3516840034876412, + "grad_norm": 0.7061136364936829, + "learning_rate": 1.9758512119703106e-05, + "loss": 0.8699895739555359, + "step": 271 + }, + { + "epoch": 0.3529817304377801, + "grad_norm": 0.7685773968696594, + "learning_rate": 1.9755378621671006e-05, + "loss": 0.9059665203094482, + "step": 272 + }, + { + "epoch": 0.354279457387919, + "grad_norm": 0.7667369842529297, + "learning_rate": 1.9752225176358757e-05, + "loss": 0.8284919857978821, + "step": 273 + }, + { + "epoch": 0.35557718433805785, + "grad_norm": 0.7389227151870728, + "learning_rate": 1.974905179021435e-05, + "loss": 0.8445216417312622, + "step": 274 + }, + { + "epoch": 0.35687491128819676, + "grad_norm": 0.7373800873756409, + "learning_rate": 1.9745858469726555e-05, + "loss": 0.8499696254730225, + "step": 275 + }, + { + "epoch": 0.35817263823833567, + "grad_norm": 0.6966509222984314, + "learning_rate": 1.9742645221424905e-05, + "loss": 0.7845723032951355, + "step": 276 + }, + { + "epoch": 0.3594703651884746, + "grad_norm": 0.7133153080940247, + "learning_rate": 1.9739412051879686e-05, + "loss": 0.7712838053703308, + "step": 277 + }, + { + "epoch": 0.36076809213861344, + "grad_norm": 0.7376941442489624, + "learning_rate": 1.973615896770191e-05, + "loss": 0.8497350811958313, + "step": 278 + }, + { + "epoch": 0.36206581908875235, + "grad_norm": 0.7676963806152344, + "learning_rate": 1.97328859755433e-05, + "loss": 0.8830881714820862, + "step": 279 + }, + { + "epoch": 0.36336354603889126, + "grad_norm": 0.7721049785614014, + "learning_rate": 1.972959308209631e-05, + "loss": 0.9047907590866089, + "step": 280 + }, + { + "epoch": 0.36466127298903017, + "grad_norm": 0.7234658598899841, + "learning_rate": 1.9726280294094067e-05, + "loss": 0.8566961288452148, + "step": 281 + }, + { + "epoch": 0.365958999939169, + "grad_norm": 0.7352125644683838, + "learning_rate": 1.9722947618310384e-05, + "loss": 0.8019842505455017, + "step": 282 + }, + { + "epoch": 0.36725672688930794, + "grad_norm": 0.7341601848602295, + "learning_rate": 1.9719595061559742e-05, + "loss": 0.7666940093040466, + "step": 283 + }, + { + "epoch": 0.36855445383944685, + "grad_norm": 0.7719873785972595, + "learning_rate": 1.9716222630697266e-05, + "loss": 0.8902671933174133, + "step": 284 + }, + { + "epoch": 0.36985218078958576, + "grad_norm": 0.754192054271698, + "learning_rate": 1.971283033261873e-05, + "loss": 0.8718546628952026, + "step": 285 + }, + { + "epoch": 0.3711499077397246, + "grad_norm": 0.7254419922828674, + "learning_rate": 1.9709418174260523e-05, + "loss": 0.8636943101882935, + "step": 286 + }, + { + "epoch": 0.3724476346898635, + "grad_norm": 0.7372341156005859, + "learning_rate": 1.9705986162599642e-05, + "loss": 0.8579723238945007, + "step": 287 + }, + { + "epoch": 0.37374536164000244, + "grad_norm": 0.7488671541213989, + "learning_rate": 1.9702534304653685e-05, + "loss": 0.8281093835830688, + "step": 288 + }, + { + "epoch": 0.37504308859014135, + "grad_norm": 0.8016876578330994, + "learning_rate": 1.9699062607480827e-05, + "loss": 0.8639754056930542, + "step": 289 + }, + { + "epoch": 0.3763408155402802, + "grad_norm": 0.732269823551178, + "learning_rate": 1.969557107817981e-05, + "loss": 0.8395862579345703, + "step": 290 + }, + { + "epoch": 0.3776385424904191, + "grad_norm": 0.7406111359596252, + "learning_rate": 1.9692059723889927e-05, + "loss": 0.8540798425674438, + "step": 291 + }, + { + "epoch": 0.378936269440558, + "grad_norm": 0.7769038677215576, + "learning_rate": 1.968852855179101e-05, + "loss": 0.8707680106163025, + "step": 292 + }, + { + "epoch": 0.38023399639069694, + "grad_norm": 0.7666140198707581, + "learning_rate": 1.9684977569103415e-05, + "loss": 0.8578312993049622, + "step": 293 + }, + { + "epoch": 0.3815317233408358, + "grad_norm": 0.7852650284767151, + "learning_rate": 1.9681406783087998e-05, + "loss": 0.7673178911209106, + "step": 294 + }, + { + "epoch": 0.3828294502909747, + "grad_norm": 0.6789321899414062, + "learning_rate": 1.9677816201046113e-05, + "loss": 0.7785404324531555, + "step": 295 + }, + { + "epoch": 0.3841271772411136, + "grad_norm": 0.7129622101783752, + "learning_rate": 1.9674205830319594e-05, + "loss": 0.7908732295036316, + "step": 296 + }, + { + "epoch": 0.3854249041912525, + "grad_norm": 0.7952395081520081, + "learning_rate": 1.9670575678290732e-05, + "loss": 0.905153751373291, + "step": 297 + }, + { + "epoch": 0.3867226311413914, + "grad_norm": 0.7407474517822266, + "learning_rate": 1.9666925752382275e-05, + "loss": 0.8455154895782471, + "step": 298 + }, + { + "epoch": 0.3880203580915303, + "grad_norm": 0.7149595022201538, + "learning_rate": 1.9663256060057395e-05, + "loss": 0.7669047117233276, + "step": 299 + }, + { + "epoch": 0.3893180850416692, + "grad_norm": 0.724448025226593, + "learning_rate": 1.9659566608819677e-05, + "loss": 0.827459990978241, + "step": 300 + }, + { + "epoch": 0.3906158119918081, + "grad_norm": 0.7544072270393372, + "learning_rate": 1.9655857406213124e-05, + "loss": 0.8931189775466919, + "step": 301 + }, + { + "epoch": 0.39191353894194697, + "grad_norm": 0.7281385064125061, + "learning_rate": 1.9652128459822113e-05, + "loss": 0.8091886639595032, + "step": 302 + }, + { + "epoch": 0.3932112658920859, + "grad_norm": 0.7316269874572754, + "learning_rate": 1.9648379777271397e-05, + "loss": 0.7829949855804443, + "step": 303 + }, + { + "epoch": 0.3945089928422248, + "grad_norm": 0.7421220541000366, + "learning_rate": 1.964461136622608e-05, + "loss": 0.8580082058906555, + "step": 304 + }, + { + "epoch": 0.3958067197923637, + "grad_norm": 0.7127732038497925, + "learning_rate": 1.9640823234391614e-05, + "loss": 0.7645027041435242, + "step": 305 + }, + { + "epoch": 0.39710444674250256, + "grad_norm": 0.7605704665184021, + "learning_rate": 1.9637015389513765e-05, + "loss": 0.8976550698280334, + "step": 306 + }, + { + "epoch": 0.39840217369264147, + "grad_norm": 0.7157081365585327, + "learning_rate": 1.963318783937861e-05, + "loss": 0.7898974418640137, + "step": 307 + }, + { + "epoch": 0.3996999006427804, + "grad_norm": 0.694803774356842, + "learning_rate": 1.962934059181253e-05, + "loss": 0.8454594612121582, + "step": 308 + }, + { + "epoch": 0.4009976275929193, + "grad_norm": 0.7790278792381287, + "learning_rate": 1.962547365468216e-05, + "loss": 0.8850522041320801, + "step": 309 + }, + { + "epoch": 0.40229535454305815, + "grad_norm": 0.7630907893180847, + "learning_rate": 1.962158703589442e-05, + "loss": 0.7932512760162354, + "step": 310 + }, + { + "epoch": 0.40359308149319706, + "grad_norm": 0.7254197597503662, + "learning_rate": 1.9617680743396452e-05, + "loss": 0.8825772404670715, + "step": 311 + }, + { + "epoch": 0.40489080844333597, + "grad_norm": 0.6837211847305298, + "learning_rate": 1.961375478517564e-05, + "loss": 0.787892758846283, + "step": 312 + }, + { + "epoch": 0.4061885353934749, + "grad_norm": 0.8057960867881775, + "learning_rate": 1.9609809169259573e-05, + "loss": 0.8797285556793213, + "step": 313 + }, + { + "epoch": 0.40748626234361374, + "grad_norm": 0.7656168341636658, + "learning_rate": 1.960584390371604e-05, + "loss": 0.8403958678245544, + "step": 314 + }, + { + "epoch": 0.40878398929375265, + "grad_norm": 0.7079064249992371, + "learning_rate": 1.9601858996653004e-05, + "loss": 0.8279827237129211, + "step": 315 + }, + { + "epoch": 0.41008171624389156, + "grad_norm": 0.7371337413787842, + "learning_rate": 1.9597854456218588e-05, + "loss": 0.8244680166244507, + "step": 316 + }, + { + "epoch": 0.41137944319403047, + "grad_norm": 0.7662513256072998, + "learning_rate": 1.9593830290601067e-05, + "loss": 0.8895809650421143, + "step": 317 + }, + { + "epoch": 0.4126771701441694, + "grad_norm": 0.7431499361991882, + "learning_rate": 1.9589786508028842e-05, + "loss": 0.8213914632797241, + "step": 318 + }, + { + "epoch": 0.41397489709430824, + "grad_norm": 0.7631136178970337, + "learning_rate": 1.9585723116770425e-05, + "loss": 0.8473777770996094, + "step": 319 + }, + { + "epoch": 0.41527262404444715, + "grad_norm": 0.7579299807548523, + "learning_rate": 1.9581640125134415e-05, + "loss": 0.8756963014602661, + "step": 320 + }, + { + "epoch": 0.41657035099458606, + "grad_norm": 0.75262850522995, + "learning_rate": 1.9577537541469506e-05, + "loss": 0.8210287094116211, + "step": 321 + }, + { + "epoch": 0.41786807794472497, + "grad_norm": 0.7107104063034058, + "learning_rate": 1.957341537416444e-05, + "loss": 0.7835584878921509, + "step": 322 + }, + { + "epoch": 0.4191658048948638, + "grad_norm": 0.7898051738739014, + "learning_rate": 1.9569273631648005e-05, + "loss": 0.8497559428215027, + "step": 323 + }, + { + "epoch": 0.42046353184500274, + "grad_norm": 0.7612116932868958, + "learning_rate": 1.9565112322389017e-05, + "loss": 0.8350054621696472, + "step": 324 + }, + { + "epoch": 0.42176125879514165, + "grad_norm": 0.7677422761917114, + "learning_rate": 1.95609314548963e-05, + "loss": 0.8192890286445618, + "step": 325 + }, + { + "epoch": 0.42305898574528056, + "grad_norm": 0.7246314883232117, + "learning_rate": 1.955673103771867e-05, + "loss": 0.7340703010559082, + "step": 326 + }, + { + "epoch": 0.4243567126954194, + "grad_norm": 0.7684205770492554, + "learning_rate": 1.9552511079444914e-05, + "loss": 0.8853901028633118, + "step": 327 + }, + { + "epoch": 0.4256544396455583, + "grad_norm": 0.7860892415046692, + "learning_rate": 1.9548271588703783e-05, + "loss": 0.8821452856063843, + "step": 328 + }, + { + "epoch": 0.42695216659569724, + "grad_norm": 0.6936531662940979, + "learning_rate": 1.954401257416396e-05, + "loss": 0.7570967674255371, + "step": 329 + }, + { + "epoch": 0.42824989354583615, + "grad_norm": 0.7630011439323425, + "learning_rate": 1.9539734044534057e-05, + "loss": 0.8907523155212402, + "step": 330 + }, + { + "epoch": 0.429547620495975, + "grad_norm": 0.7460386753082275, + "learning_rate": 1.9535436008562576e-05, + "loss": 0.8264608383178711, + "step": 331 + }, + { + "epoch": 0.4308453474461139, + "grad_norm": 0.6788963675498962, + "learning_rate": 1.9531118475037916e-05, + "loss": 0.7674898505210876, + "step": 332 + }, + { + "epoch": 0.4321430743962528, + "grad_norm": 0.7098816633224487, + "learning_rate": 1.9526781452788342e-05, + "loss": 0.8403605818748474, + "step": 333 + }, + { + "epoch": 0.43344080134639174, + "grad_norm": 0.7769349813461304, + "learning_rate": 1.9522424950681964e-05, + "loss": 0.8386063575744629, + "step": 334 + }, + { + "epoch": 0.4347385282965306, + "grad_norm": 0.7037668824195862, + "learning_rate": 1.951804897762673e-05, + "loss": 0.7852950096130371, + "step": 335 + }, + { + "epoch": 0.4360362552466695, + "grad_norm": 0.6976593136787415, + "learning_rate": 1.951365354257039e-05, + "loss": 0.7828155159950256, + "step": 336 + }, + { + "epoch": 0.4373339821968084, + "grad_norm": 0.6809433698654175, + "learning_rate": 1.9509238654500505e-05, + "loss": 0.7821134924888611, + "step": 337 + }, + { + "epoch": 0.4386317091469473, + "grad_norm": 0.7023005485534668, + "learning_rate": 1.95048043224444e-05, + "loss": 0.8137397766113281, + "step": 338 + }, + { + "epoch": 0.4399294360970862, + "grad_norm": 0.709460973739624, + "learning_rate": 1.9500350555469164e-05, + "loss": 0.8287125825881958, + "step": 339 + }, + { + "epoch": 0.4412271630472251, + "grad_norm": 0.7066413760185242, + "learning_rate": 1.9495877362681613e-05, + "loss": 0.7227614521980286, + "step": 340 + }, + { + "epoch": 0.442524889997364, + "grad_norm": 0.7095454335212708, + "learning_rate": 1.9491384753228308e-05, + "loss": 0.8386364579200745, + "step": 341 + }, + { + "epoch": 0.4438226169475029, + "grad_norm": 0.704826831817627, + "learning_rate": 1.948687273629549e-05, + "loss": 0.7332904934883118, + "step": 342 + }, + { + "epoch": 0.44512034389764177, + "grad_norm": 0.7315965294837952, + "learning_rate": 1.9482341321109096e-05, + "loss": 0.8262498378753662, + "step": 343 + }, + { + "epoch": 0.4464180708477807, + "grad_norm": 0.7236066460609436, + "learning_rate": 1.947779051693472e-05, + "loss": 0.8105201721191406, + "step": 344 + }, + { + "epoch": 0.4477157977979196, + "grad_norm": 0.7457305192947388, + "learning_rate": 1.9473220333077604e-05, + "loss": 0.9067633748054504, + "step": 345 + }, + { + "epoch": 0.4490135247480585, + "grad_norm": 0.7768529653549194, + "learning_rate": 1.946863077888262e-05, + "loss": 0.9473153352737427, + "step": 346 + }, + { + "epoch": 0.45031125169819736, + "grad_norm": 0.7324157357215881, + "learning_rate": 1.946402186373424e-05, + "loss": 0.8552070260047913, + "step": 347 + }, + { + "epoch": 0.45160897864833627, + "grad_norm": 0.7343083024024963, + "learning_rate": 1.9459393597056536e-05, + "loss": 0.7906739115715027, + "step": 348 + }, + { + "epoch": 0.4529067055984752, + "grad_norm": 0.7099336385726929, + "learning_rate": 1.9454745988313135e-05, + "loss": 0.7985537052154541, + "step": 349 + }, + { + "epoch": 0.4542044325486141, + "grad_norm": 0.7202642560005188, + "learning_rate": 1.945007904700723e-05, + "loss": 0.8377722501754761, + "step": 350 + }, + { + "epoch": 0.45550215949875295, + "grad_norm": 0.7456194162368774, + "learning_rate": 1.9445392782681523e-05, + "loss": 0.7578713893890381, + "step": 351 + }, + { + "epoch": 0.45679988644889186, + "grad_norm": 0.6951096653938293, + "learning_rate": 1.9440687204918245e-05, + "loss": 0.8215861320495605, + "step": 352 + }, + { + "epoch": 0.45809761339903077, + "grad_norm": 0.6824142932891846, + "learning_rate": 1.943596232333911e-05, + "loss": 0.7992759346961975, + "step": 353 + }, + { + "epoch": 0.4593953403491697, + "grad_norm": 0.7076693773269653, + "learning_rate": 1.9431218147605307e-05, + "loss": 0.889447033405304, + "step": 354 + }, + { + "epoch": 0.46069306729930853, + "grad_norm": 0.7202051877975464, + "learning_rate": 1.9426454687417474e-05, + "loss": 0.7953578233718872, + "step": 355 + }, + { + "epoch": 0.46199079424944745, + "grad_norm": 0.6777750253677368, + "learning_rate": 1.942167195251568e-05, + "loss": 0.7135353088378906, + "step": 356 + }, + { + "epoch": 0.46328852119958636, + "grad_norm": 0.7169584035873413, + "learning_rate": 1.941686995267941e-05, + "loss": 0.8654831051826477, + "step": 357 + }, + { + "epoch": 0.46458624814972527, + "grad_norm": 0.7217689752578735, + "learning_rate": 1.941204869772753e-05, + "loss": 0.8449923992156982, + "step": 358 + }, + { + "epoch": 0.4658839750998641, + "grad_norm": 0.7016704678535461, + "learning_rate": 1.9407208197518296e-05, + "loss": 0.8285680413246155, + "step": 359 + }, + { + "epoch": 0.46718170205000303, + "grad_norm": 0.7271103262901306, + "learning_rate": 1.94023484619493e-05, + "loss": 0.788341760635376, + "step": 360 + }, + { + "epoch": 0.46847942900014194, + "grad_norm": 0.7725624442100525, + "learning_rate": 1.9397469500957478e-05, + "loss": 0.8492755889892578, + "step": 361 + }, + { + "epoch": 0.46977715595028086, + "grad_norm": 0.737015962600708, + "learning_rate": 1.939257132451906e-05, + "loss": 0.8843685388565063, + "step": 362 + }, + { + "epoch": 0.4710748829004197, + "grad_norm": 0.7315338850021362, + "learning_rate": 1.9387653942649586e-05, + "loss": 0.8183721899986267, + "step": 363 + }, + { + "epoch": 0.4723726098505586, + "grad_norm": 0.7253148555755615, + "learning_rate": 1.9382717365403854e-05, + "loss": 0.8446192145347595, + "step": 364 + }, + { + "epoch": 0.47367033680069753, + "grad_norm": 0.7184107303619385, + "learning_rate": 1.9377761602875913e-05, + "loss": 0.8196067214012146, + "step": 365 + }, + { + "epoch": 0.47496806375083644, + "grad_norm": 0.7668046355247498, + "learning_rate": 1.937278666519905e-05, + "loss": 0.8784077167510986, + "step": 366 + }, + { + "epoch": 0.4762657907009753, + "grad_norm": 0.7028603553771973, + "learning_rate": 1.9367792562545744e-05, + "loss": 0.8172916769981384, + "step": 367 + }, + { + "epoch": 0.4775635176511142, + "grad_norm": 0.7071288824081421, + "learning_rate": 1.9362779305127674e-05, + "loss": 0.7726463079452515, + "step": 368 + }, + { + "epoch": 0.4788612446012531, + "grad_norm": 0.744328498840332, + "learning_rate": 1.9357746903195686e-05, + "loss": 0.8223643898963928, + "step": 369 + }, + { + "epoch": 0.48015897155139203, + "grad_norm": 0.7051971554756165, + "learning_rate": 1.9352695367039764e-05, + "loss": 0.7989709973335266, + "step": 370 + }, + { + "epoch": 0.4814566985015309, + "grad_norm": 0.6921087503433228, + "learning_rate": 1.9347624706989026e-05, + "loss": 0.8276992440223694, + "step": 371 + }, + { + "epoch": 0.4827544254516698, + "grad_norm": 0.775720477104187, + "learning_rate": 1.9342534933411683e-05, + "loss": 0.8847764730453491, + "step": 372 + }, + { + "epoch": 0.4840521524018087, + "grad_norm": 0.7056650519371033, + "learning_rate": 1.9337426056715036e-05, + "loss": 0.8185163736343384, + "step": 373 + }, + { + "epoch": 0.4853498793519476, + "grad_norm": 0.746159017086029, + "learning_rate": 1.9332298087345447e-05, + "loss": 0.8038766980171204, + "step": 374 + }, + { + "epoch": 0.4866476063020865, + "grad_norm": 0.7275370359420776, + "learning_rate": 1.932715103578831e-05, + "loss": 0.8622571229934692, + "step": 375 + }, + { + "epoch": 0.4879453332522254, + "grad_norm": 0.6875770688056946, + "learning_rate": 1.9321984912568048e-05, + "loss": 0.7297530770301819, + "step": 376 + }, + { + "epoch": 0.4892430602023643, + "grad_norm": 0.7196366190910339, + "learning_rate": 1.9316799728248074e-05, + "loss": 0.8098776340484619, + "step": 377 + }, + { + "epoch": 0.4905407871525032, + "grad_norm": 0.8017922043800354, + "learning_rate": 1.9311595493430776e-05, + "loss": 0.8927175998687744, + "step": 378 + }, + { + "epoch": 0.49183851410264207, + "grad_norm": 0.752349317073822, + "learning_rate": 1.93063722187575e-05, + "loss": 0.8595757484436035, + "step": 379 + }, + { + "epoch": 0.493136241052781, + "grad_norm": 0.7166591882705688, + "learning_rate": 1.9301129914908516e-05, + "loss": 0.8619329333305359, + "step": 380 + }, + { + "epoch": 0.4944339680029199, + "grad_norm": 0.7622588872909546, + "learning_rate": 1.9295868592603012e-05, + "loss": 0.9877883195877075, + "step": 381 + }, + { + "epoch": 0.4957316949530588, + "grad_norm": 0.738442063331604, + "learning_rate": 1.929058826259906e-05, + "loss": 0.8450830578804016, + "step": 382 + }, + { + "epoch": 0.49702942190319765, + "grad_norm": 0.7250852584838867, + "learning_rate": 1.9285288935693597e-05, + "loss": 0.8014863133430481, + "step": 383 + }, + { + "epoch": 0.49832714885333657, + "grad_norm": 0.7121626138687134, + "learning_rate": 1.9279970622722403e-05, + "loss": 0.8381094932556152, + "step": 384 + }, + { + "epoch": 0.4996248758034755, + "grad_norm": 0.7626416087150574, + "learning_rate": 1.927463333456009e-05, + "loss": 0.8965335488319397, + "step": 385 + }, + { + "epoch": 0.5009226027536143, + "grad_norm": 0.7094375491142273, + "learning_rate": 1.9269277082120053e-05, + "loss": 0.8557580709457397, + "step": 386 + }, + { + "epoch": 0.5022203297037533, + "grad_norm": 0.7018871903419495, + "learning_rate": 1.926390187635448e-05, + "loss": 0.8587688207626343, + "step": 387 + }, + { + "epoch": 0.5035180566538922, + "grad_norm": 0.7267133593559265, + "learning_rate": 1.92585077282543e-05, + "loss": 0.8346423506736755, + "step": 388 + }, + { + "epoch": 0.504815783604031, + "grad_norm": 0.7274966835975647, + "learning_rate": 1.9253094648849183e-05, + "loss": 0.8169071078300476, + "step": 389 + }, + { + "epoch": 0.50611351055417, + "grad_norm": 0.7901791334152222, + "learning_rate": 1.924766264920751e-05, + "loss": 0.9163885116577148, + "step": 390 + }, + { + "epoch": 0.5074112375043088, + "grad_norm": 0.7128793001174927, + "learning_rate": 1.9242211740436335e-05, + "loss": 0.8264936804771423, + "step": 391 + }, + { + "epoch": 0.5087089644544478, + "grad_norm": 0.7791725993156433, + "learning_rate": 1.9236741933681396e-05, + "loss": 0.830746054649353, + "step": 392 + }, + { + "epoch": 0.5100066914045867, + "grad_norm": 0.7333115339279175, + "learning_rate": 1.9231253240127062e-05, + "loss": 0.7689610719680786, + "step": 393 + }, + { + "epoch": 0.5113044183547255, + "grad_norm": 0.722161591053009, + "learning_rate": 1.922574567099632e-05, + "loss": 0.8242402076721191, + "step": 394 + }, + { + "epoch": 0.5126021453048645, + "grad_norm": 0.7445337176322937, + "learning_rate": 1.9220219237550757e-05, + "loss": 0.8102379441261292, + "step": 395 + }, + { + "epoch": 0.5138998722550033, + "grad_norm": 0.6720981001853943, + "learning_rate": 1.921467395109053e-05, + "loss": 0.7922290563583374, + "step": 396 + }, + { + "epoch": 0.5151975992051423, + "grad_norm": 0.793062686920166, + "learning_rate": 1.9209109822954345e-05, + "loss": 0.8537084460258484, + "step": 397 + }, + { + "epoch": 0.5164953261552812, + "grad_norm": 0.7766822576522827, + "learning_rate": 1.9203526864519432e-05, + "loss": 0.8576462864875793, + "step": 398 + }, + { + "epoch": 0.51779305310542, + "grad_norm": 0.7053048610687256, + "learning_rate": 1.919792508720154e-05, + "loss": 0.7955272197723389, + "step": 399 + }, + { + "epoch": 0.519090780055559, + "grad_norm": 0.7525441646575928, + "learning_rate": 1.9192304502454876e-05, + "loss": 0.7955189347267151, + "step": 400 + }, + { + "epoch": 0.5203885070056978, + "grad_norm": 0.7097117304801941, + "learning_rate": 1.918666512177211e-05, + "loss": 0.8108992576599121, + "step": 401 + }, + { + "epoch": 0.5216862339558367, + "grad_norm": 0.7281200885772705, + "learning_rate": 1.918100695668436e-05, + "loss": 0.7774943113327026, + "step": 402 + }, + { + "epoch": 0.5229839609059757, + "grad_norm": 0.6979084610939026, + "learning_rate": 1.917533001876113e-05, + "loss": 0.8288201093673706, + "step": 403 + }, + { + "epoch": 0.5242816878561145, + "grad_norm": 0.7136226892471313, + "learning_rate": 1.916963431961033e-05, + "loss": 0.8710139393806458, + "step": 404 + }, + { + "epoch": 0.5255794148062535, + "grad_norm": 0.6950761079788208, + "learning_rate": 1.916391987087822e-05, + "loss": 0.82500821352005, + "step": 405 + }, + { + "epoch": 0.5268771417563923, + "grad_norm": 0.7169130444526672, + "learning_rate": 1.9158186684249397e-05, + "loss": 0.8732189536094666, + "step": 406 + }, + { + "epoch": 0.5281748687065312, + "grad_norm": 0.71788489818573, + "learning_rate": 1.9152434771446783e-05, + "loss": 0.7809304594993591, + "step": 407 + }, + { + "epoch": 0.5294725956566702, + "grad_norm": 0.7155045866966248, + "learning_rate": 1.914666414423158e-05, + "loss": 0.7732210159301758, + "step": 408 + }, + { + "epoch": 0.530770322606809, + "grad_norm": 0.6769919991493225, + "learning_rate": 1.914087481440326e-05, + "loss": 0.8261522650718689, + "step": 409 + }, + { + "epoch": 0.5320680495569479, + "grad_norm": 0.7309243679046631, + "learning_rate": 1.9135066793799538e-05, + "loss": 0.7936241626739502, + "step": 410 + }, + { + "epoch": 0.5333657765070868, + "grad_norm": 0.6851993203163147, + "learning_rate": 1.912924009429635e-05, + "loss": 0.8394724130630493, + "step": 411 + }, + { + "epoch": 0.5346635034572257, + "grad_norm": 0.7112469673156738, + "learning_rate": 1.9123394727807816e-05, + "loss": 0.8659080862998962, + "step": 412 + }, + { + "epoch": 0.5359612304073647, + "grad_norm": 0.8407036066055298, + "learning_rate": 1.9117530706286232e-05, + "loss": 0.8815537095069885, + "step": 413 + }, + { + "epoch": 0.5372589573575035, + "grad_norm": 0.7725140452384949, + "learning_rate": 1.9111648041722044e-05, + "loss": 0.8264433741569519, + "step": 414 + }, + { + "epoch": 0.5385566843076424, + "grad_norm": 0.7106306552886963, + "learning_rate": 1.91057467461438e-05, + "loss": 0.8120384812355042, + "step": 415 + }, + { + "epoch": 0.5398544112577813, + "grad_norm": 0.7314519882202148, + "learning_rate": 1.9099826831618168e-05, + "loss": 0.7814322113990784, + "step": 416 + }, + { + "epoch": 0.5411521382079202, + "grad_norm": 0.7492959499359131, + "learning_rate": 1.909388831024987e-05, + "loss": 0.8211044669151306, + "step": 417 + }, + { + "epoch": 0.542449865158059, + "grad_norm": 0.7524264454841614, + "learning_rate": 1.908793119418168e-05, + "loss": 0.831349790096283, + "step": 418 + }, + { + "epoch": 0.543747592108198, + "grad_norm": 0.768027663230896, + "learning_rate": 1.9081955495594388e-05, + "loss": 0.777296245098114, + "step": 419 + }, + { + "epoch": 0.5450453190583369, + "grad_norm": 0.6683104038238525, + "learning_rate": 1.9075961226706784e-05, + "loss": 0.8545945882797241, + "step": 420 + }, + { + "epoch": 0.5463430460084758, + "grad_norm": 0.7471824288368225, + "learning_rate": 1.906994839977564e-05, + "loss": 0.8631961941719055, + "step": 421 + }, + { + "epoch": 0.5476407729586147, + "grad_norm": 0.7404365539550781, + "learning_rate": 1.9063917027095664e-05, + "loss": 0.8402459025382996, + "step": 422 + }, + { + "epoch": 0.5489384999087535, + "grad_norm": 0.790240466594696, + "learning_rate": 1.905786712099948e-05, + "loss": 0.8911325335502625, + "step": 423 + }, + { + "epoch": 0.5502362268588925, + "grad_norm": 0.7139849662780762, + "learning_rate": 1.9051798693857617e-05, + "loss": 0.8359181880950928, + "step": 424 + }, + { + "epoch": 0.5515339538090314, + "grad_norm": 0.7506136894226074, + "learning_rate": 1.904571175807848e-05, + "loss": 0.8717991709709167, + "step": 425 + }, + { + "epoch": 0.5528316807591702, + "grad_norm": 0.7033493518829346, + "learning_rate": 1.9039606326108297e-05, + "loss": 0.808268666267395, + "step": 426 + }, + { + "epoch": 0.5541294077093092, + "grad_norm": 0.7442057132720947, + "learning_rate": 1.903348241043114e-05, + "loss": 0.8272799849510193, + "step": 427 + }, + { + "epoch": 0.555427134659448, + "grad_norm": 0.7257173657417297, + "learning_rate": 1.902734002356887e-05, + "loss": 0.8194448947906494, + "step": 428 + }, + { + "epoch": 0.556724861609587, + "grad_norm": 0.7403514385223389, + "learning_rate": 1.9021179178081107e-05, + "loss": 0.7172797322273254, + "step": 429 + }, + { + "epoch": 0.5580225885597259, + "grad_norm": 0.7432394623756409, + "learning_rate": 1.9014999886565226e-05, + "loss": 0.7437801361083984, + "step": 430 + }, + { + "epoch": 0.5593203155098647, + "grad_norm": 0.6978660225868225, + "learning_rate": 1.9008802161656308e-05, + "loss": 0.7967916131019592, + "step": 431 + }, + { + "epoch": 0.5606180424600037, + "grad_norm": 0.7165699005126953, + "learning_rate": 1.9002586016027136e-05, + "loss": 0.8070824146270752, + "step": 432 + }, + { + "epoch": 0.5619157694101425, + "grad_norm": 0.7089285254478455, + "learning_rate": 1.8996351462388153e-05, + "loss": 0.8515596389770508, + "step": 433 + }, + { + "epoch": 0.5632134963602814, + "grad_norm": 0.7979022860527039, + "learning_rate": 1.8990098513487447e-05, + "loss": 0.8934742212295532, + "step": 434 + }, + { + "epoch": 0.5645112233104204, + "grad_norm": 0.6929235458374023, + "learning_rate": 1.898382718211071e-05, + "loss": 0.7550987601280212, + "step": 435 + }, + { + "epoch": 0.5658089502605592, + "grad_norm": 0.7286235094070435, + "learning_rate": 1.897753748108123e-05, + "loss": 0.8770807981491089, + "step": 436 + }, + { + "epoch": 0.5671066772106982, + "grad_norm": 0.7233553528785706, + "learning_rate": 1.8971229423259855e-05, + "loss": 0.7454729080200195, + "step": 437 + }, + { + "epoch": 0.568404404160837, + "grad_norm": 0.7452800869941711, + "learning_rate": 1.8964903021544964e-05, + "loss": 0.8079807758331299, + "step": 438 + }, + { + "epoch": 0.5697021311109759, + "grad_norm": 0.696835994720459, + "learning_rate": 1.895855828887245e-05, + "loss": 0.8501238226890564, + "step": 439 + }, + { + "epoch": 0.5709998580611149, + "grad_norm": 0.6924627423286438, + "learning_rate": 1.895219523821568e-05, + "loss": 0.7888904213905334, + "step": 440 + }, + { + "epoch": 0.5722975850112537, + "grad_norm": 0.764805793762207, + "learning_rate": 1.894581388258549e-05, + "loss": 0.8138964772224426, + "step": 441 + }, + { + "epoch": 0.5735953119613926, + "grad_norm": 0.8151068091392517, + "learning_rate": 1.8939414235030137e-05, + "loss": 0.8286200165748596, + "step": 442 + }, + { + "epoch": 0.5748930389115315, + "grad_norm": 0.739456832408905, + "learning_rate": 1.893299630863527e-05, + "loss": 0.7820205092430115, + "step": 443 + }, + { + "epoch": 0.5761907658616704, + "grad_norm": 0.7076554298400879, + "learning_rate": 1.892656011652393e-05, + "loss": 0.8406723737716675, + "step": 444 + }, + { + "epoch": 0.5774884928118094, + "grad_norm": 0.6758636832237244, + "learning_rate": 1.8920105671856507e-05, + "loss": 0.793111264705658, + "step": 445 + }, + { + "epoch": 0.5787862197619482, + "grad_norm": 0.7238133549690247, + "learning_rate": 1.89136329878307e-05, + "loss": 0.7582585215568542, + "step": 446 + }, + { + "epoch": 0.5800839467120871, + "grad_norm": 0.7192074656486511, + "learning_rate": 1.890714207768151e-05, + "loss": 0.7284867763519287, + "step": 447 + }, + { + "epoch": 0.581381673662226, + "grad_norm": 0.7265046834945679, + "learning_rate": 1.8900632954681203e-05, + "loss": 0.836294412612915, + "step": 448 + }, + { + "epoch": 0.5826794006123649, + "grad_norm": 0.7325915098190308, + "learning_rate": 1.8894105632139296e-05, + "loss": 0.7910576462745667, + "step": 449 + }, + { + "epoch": 0.5839771275625038, + "grad_norm": 0.7702357172966003, + "learning_rate": 1.8887560123402505e-05, + "loss": 0.8775222301483154, + "step": 450 + }, + { + "epoch": 0.5852748545126427, + "grad_norm": 0.7335582971572876, + "learning_rate": 1.888099644185474e-05, + "loss": 0.8012707829475403, + "step": 451 + }, + { + "epoch": 0.5865725814627816, + "grad_norm": 0.733706533908844, + "learning_rate": 1.887441460091707e-05, + "loss": 0.7948039174079895, + "step": 452 + }, + { + "epoch": 0.5878703084129205, + "grad_norm": 0.7587592005729675, + "learning_rate": 1.886781461404769e-05, + "loss": 0.804535448551178, + "step": 453 + }, + { + "epoch": 0.5891680353630594, + "grad_norm": 0.7819000482559204, + "learning_rate": 1.886119649474191e-05, + "loss": 0.7766174077987671, + "step": 454 + }, + { + "epoch": 0.5904657623131983, + "grad_norm": 0.69929039478302, + "learning_rate": 1.8854560256532098e-05, + "loss": 0.7503871321678162, + "step": 455 + }, + { + "epoch": 0.5917634892633372, + "grad_norm": 0.742264449596405, + "learning_rate": 1.8847905912987693e-05, + "loss": 0.7669814229011536, + "step": 456 + }, + { + "epoch": 0.5930612162134761, + "grad_norm": 0.7957385182380676, + "learning_rate": 1.8841233477715136e-05, + "loss": 0.7808370590209961, + "step": 457 + }, + { + "epoch": 0.5943589431636149, + "grad_norm": 0.7357493042945862, + "learning_rate": 1.8834542964357875e-05, + "loss": 0.8638509511947632, + "step": 458 + }, + { + "epoch": 0.5956566701137539, + "grad_norm": 0.6800391674041748, + "learning_rate": 1.8827834386596306e-05, + "loss": 0.8268325924873352, + "step": 459 + }, + { + "epoch": 0.5969543970638927, + "grad_norm": 0.6685859560966492, + "learning_rate": 1.882110775814778e-05, + "loss": 0.7641065716743469, + "step": 460 + }, + { + "epoch": 0.5969543970638927, + "eval_loss": 0.788587212562561, + "eval_runtime": 140.6113, + "eval_samples_per_second": 36.924, + "eval_steps_per_second": 9.231, + "step": 460 + }, + { + "epoch": 0.5982521240140317, + "grad_norm": 0.7249795794487, + "learning_rate": 1.881436309276655e-05, + "loss": 0.8106693625450134, + "step": 461 + }, + { + "epoch": 0.5995498509641706, + "grad_norm": 0.7279155254364014, + "learning_rate": 1.8807600404243746e-05, + "loss": 0.7669492363929749, + "step": 462 + }, + { + "epoch": 0.6008475779143094, + "grad_norm": 0.6802601218223572, + "learning_rate": 1.8800819706407355e-05, + "loss": 0.7968916296958923, + "step": 463 + }, + { + "epoch": 0.6021453048644484, + "grad_norm": 0.6981019973754883, + "learning_rate": 1.879402101312219e-05, + "loss": 0.736625075340271, + "step": 464 + }, + { + "epoch": 0.6034430318145872, + "grad_norm": 0.7771289944648743, + "learning_rate": 1.8787204338289858e-05, + "loss": 0.8314676284790039, + "step": 465 + }, + { + "epoch": 0.6047407587647261, + "grad_norm": 0.7184056043624878, + "learning_rate": 1.8780369695848733e-05, + "loss": 0.7979223132133484, + "step": 466 + }, + { + "epoch": 0.6060384857148651, + "grad_norm": 0.7473218441009521, + "learning_rate": 1.8773517099773927e-05, + "loss": 0.858469545841217, + "step": 467 + }, + { + "epoch": 0.6073362126650039, + "grad_norm": 0.683022141456604, + "learning_rate": 1.8766646564077265e-05, + "loss": 0.8193258047103882, + "step": 468 + }, + { + "epoch": 0.6086339396151429, + "grad_norm": 0.7081974148750305, + "learning_rate": 1.8759758102807253e-05, + "loss": 0.7676112055778503, + "step": 469 + }, + { + "epoch": 0.6099316665652817, + "grad_norm": 0.7614895105361938, + "learning_rate": 1.8752851730049055e-05, + "loss": 0.8635563254356384, + "step": 470 + }, + { + "epoch": 0.6112293935154206, + "grad_norm": 0.7243057489395142, + "learning_rate": 1.8745927459924454e-05, + "loss": 0.9161559343338013, + "step": 471 + }, + { + "epoch": 0.6125271204655596, + "grad_norm": 0.6948226690292358, + "learning_rate": 1.8738985306591826e-05, + "loss": 0.7749679684638977, + "step": 472 + }, + { + "epoch": 0.6138248474156984, + "grad_norm": 0.7040874361991882, + "learning_rate": 1.8732025284246122e-05, + "loss": 0.79802006483078, + "step": 473 + }, + { + "epoch": 0.6151225743658374, + "grad_norm": 0.7108686566352844, + "learning_rate": 1.8725047407118823e-05, + "loss": 0.7963647246360779, + "step": 474 + }, + { + "epoch": 0.6164203013159762, + "grad_norm": 0.6806232333183289, + "learning_rate": 1.8718051689477923e-05, + "loss": 0.8362119197845459, + "step": 475 + }, + { + "epoch": 0.6177180282661151, + "grad_norm": 0.7135924696922302, + "learning_rate": 1.8711038145627893e-05, + "loss": 0.8811363577842712, + "step": 476 + }, + { + "epoch": 0.6190157552162541, + "grad_norm": 0.7035737633705139, + "learning_rate": 1.8704006789909654e-05, + "loss": 0.839409351348877, + "step": 477 + }, + { + "epoch": 0.6203134821663929, + "grad_norm": 0.6822429299354553, + "learning_rate": 1.8696957636700555e-05, + "loss": 0.8191482424736023, + "step": 478 + }, + { + "epoch": 0.6216112091165318, + "grad_norm": 0.731574296951294, + "learning_rate": 1.868989070041432e-05, + "loss": 0.853705108165741, + "step": 479 + }, + { + "epoch": 0.6229089360666707, + "grad_norm": 0.7717382907867432, + "learning_rate": 1.8682805995501052e-05, + "loss": 0.7867730259895325, + "step": 480 + }, + { + "epoch": 0.6242066630168096, + "grad_norm": 0.7173001170158386, + "learning_rate": 1.8675703536447178e-05, + "loss": 0.8229404091835022, + "step": 481 + }, + { + "epoch": 0.6255043899669486, + "grad_norm": 0.7436506748199463, + "learning_rate": 1.866858333777543e-05, + "loss": 0.8175429105758667, + "step": 482 + }, + { + "epoch": 0.6268021169170874, + "grad_norm": 0.6823157072067261, + "learning_rate": 1.8661445414044813e-05, + "loss": 0.8235064148902893, + "step": 483 + }, + { + "epoch": 0.6280998438672263, + "grad_norm": 0.6958295702934265, + "learning_rate": 1.865428977985057e-05, + "loss": 0.8292087316513062, + "step": 484 + }, + { + "epoch": 0.6293975708173652, + "grad_norm": 0.7212422490119934, + "learning_rate": 1.8647116449824165e-05, + "loss": 0.8680652379989624, + "step": 485 + }, + { + "epoch": 0.6306952977675041, + "grad_norm": 0.692675769329071, + "learning_rate": 1.8639925438633243e-05, + "loss": 0.8230209350585938, + "step": 486 + }, + { + "epoch": 0.631993024717643, + "grad_norm": 0.7433279752731323, + "learning_rate": 1.86327167609816e-05, + "loss": 0.7730977535247803, + "step": 487 + }, + { + "epoch": 0.6332907516677819, + "grad_norm": 0.7101516723632812, + "learning_rate": 1.8625490431609154e-05, + "loss": 0.9187572002410889, + "step": 488 + }, + { + "epoch": 0.6345884786179208, + "grad_norm": 0.7050445675849915, + "learning_rate": 1.8618246465291925e-05, + "loss": 0.8063424229621887, + "step": 489 + }, + { + "epoch": 0.6358862055680597, + "grad_norm": 0.7434412240982056, + "learning_rate": 1.861098487684199e-05, + "loss": 0.7892963290214539, + "step": 490 + }, + { + "epoch": 0.6371839325181986, + "grad_norm": 0.7191323041915894, + "learning_rate": 1.8603705681107456e-05, + "loss": 0.7660176157951355, + "step": 491 + }, + { + "epoch": 0.6384816594683375, + "grad_norm": 0.7202406525611877, + "learning_rate": 1.8596408892972442e-05, + "loss": 0.8213373422622681, + "step": 492 + }, + { + "epoch": 0.6397793864184764, + "grad_norm": 0.6945679783821106, + "learning_rate": 1.858909452735703e-05, + "loss": 0.7523878216743469, + "step": 493 + }, + { + "epoch": 0.6410771133686153, + "grad_norm": 0.8023699522018433, + "learning_rate": 1.858176259921724e-05, + "loss": 0.8551954030990601, + "step": 494 + }, + { + "epoch": 0.6423748403187541, + "grad_norm": 0.728702962398529, + "learning_rate": 1.857441312354502e-05, + "loss": 0.7901893854141235, + "step": 495 + }, + { + "epoch": 0.6436725672688931, + "grad_norm": 0.7125030755996704, + "learning_rate": 1.856704611536818e-05, + "loss": 0.8292658925056458, + "step": 496 + }, + { + "epoch": 0.644970294219032, + "grad_norm": 0.748110294342041, + "learning_rate": 1.8559661589750387e-05, + "loss": 0.8110982179641724, + "step": 497 + }, + { + "epoch": 0.6462680211691709, + "grad_norm": 0.7424649000167847, + "learning_rate": 1.8552259561791133e-05, + "loss": 0.7920522689819336, + "step": 498 + }, + { + "epoch": 0.6475657481193098, + "grad_norm": 0.7908960580825806, + "learning_rate": 1.8544840046625686e-05, + "loss": 0.9255160093307495, + "step": 499 + }, + { + "epoch": 0.6488634750694486, + "grad_norm": 0.7190539240837097, + "learning_rate": 1.8537403059425082e-05, + "loss": 0.8494732975959778, + "step": 500 + }, + { + "epoch": 0.6501612020195876, + "grad_norm": 0.7224424481391907, + "learning_rate": 1.852994861539607e-05, + "loss": 0.7837664484977722, + "step": 501 + }, + { + "epoch": 0.6514589289697265, + "grad_norm": 0.7687528729438782, + "learning_rate": 1.8522476729781106e-05, + "loss": 0.8091537952423096, + "step": 502 + }, + { + "epoch": 0.6527566559198653, + "grad_norm": 0.7272804379463196, + "learning_rate": 1.8514987417858306e-05, + "loss": 0.8691030740737915, + "step": 503 + }, + { + "epoch": 0.6540543828700043, + "grad_norm": 0.7369651794433594, + "learning_rate": 1.8507480694941416e-05, + "loss": 0.8802081346511841, + "step": 504 + }, + { + "epoch": 0.6553521098201431, + "grad_norm": 0.7450799942016602, + "learning_rate": 1.849995657637978e-05, + "loss": 0.8451288342475891, + "step": 505 + }, + { + "epoch": 0.6566498367702821, + "grad_norm": 0.723861813545227, + "learning_rate": 1.8492415077558325e-05, + "loss": 0.8779444694519043, + "step": 506 + }, + { + "epoch": 0.657947563720421, + "grad_norm": 0.6959301829338074, + "learning_rate": 1.8484856213897496e-05, + "loss": 0.8489083647727966, + "step": 507 + }, + { + "epoch": 0.6592452906705598, + "grad_norm": 0.7295985817909241, + "learning_rate": 1.847728000085327e-05, + "loss": 0.8433302044868469, + "step": 508 + }, + { + "epoch": 0.6605430176206988, + "grad_norm": 0.6785035133361816, + "learning_rate": 1.8469686453917074e-05, + "loss": 0.7844301462173462, + "step": 509 + }, + { + "epoch": 0.6618407445708376, + "grad_norm": 0.7163369059562683, + "learning_rate": 1.846207558861579e-05, + "loss": 0.8518480658531189, + "step": 510 + }, + { + "epoch": 0.6631384715209765, + "grad_norm": 0.6807128190994263, + "learning_rate": 1.845444742051172e-05, + "loss": 0.8048978447914124, + "step": 511 + }, + { + "epoch": 0.6644361984711155, + "grad_norm": 0.7018458247184753, + "learning_rate": 1.8446801965202524e-05, + "loss": 0.7482452392578125, + "step": 512 + }, + { + "epoch": 0.6657339254212543, + "grad_norm": 0.7418568134307861, + "learning_rate": 1.8439139238321235e-05, + "loss": 0.8263827562332153, + "step": 513 + }, + { + "epoch": 0.6670316523713933, + "grad_norm": 0.7616980075836182, + "learning_rate": 1.8431459255536185e-05, + "loss": 0.8845346570014954, + "step": 514 + }, + { + "epoch": 0.6683293793215321, + "grad_norm": 0.7437636852264404, + "learning_rate": 1.8423762032551e-05, + "loss": 0.7848752737045288, + "step": 515 + }, + { + "epoch": 0.669627106271671, + "grad_norm": 0.6855003833770752, + "learning_rate": 1.841604758510454e-05, + "loss": 0.7946106195449829, + "step": 516 + }, + { + "epoch": 0.67092483322181, + "grad_norm": 0.7443661689758301, + "learning_rate": 1.840831592897091e-05, + "loss": 0.8530216813087463, + "step": 517 + }, + { + "epoch": 0.6722225601719488, + "grad_norm": 0.7664664387702942, + "learning_rate": 1.8400567079959383e-05, + "loss": 0.836358368396759, + "step": 518 + }, + { + "epoch": 0.6735202871220877, + "grad_norm": 0.722017228603363, + "learning_rate": 1.8392801053914396e-05, + "loss": 0.8537322878837585, + "step": 519 + }, + { + "epoch": 0.6748180140722266, + "grad_norm": 0.7312494516372681, + "learning_rate": 1.8385017866715507e-05, + "loss": 0.8338693380355835, + "step": 520 + }, + { + "epoch": 0.6761157410223655, + "grad_norm": 0.7151913642883301, + "learning_rate": 1.8377217534277365e-05, + "loss": 0.879010021686554, + "step": 521 + }, + { + "epoch": 0.6774134679725045, + "grad_norm": 0.8348478674888611, + "learning_rate": 1.8369400072549674e-05, + "loss": 0.8499034643173218, + "step": 522 + }, + { + "epoch": 0.6787111949226433, + "grad_norm": 0.7662613987922668, + "learning_rate": 1.8361565497517166e-05, + "loss": 0.8573883175849915, + "step": 523 + }, + { + "epoch": 0.6800089218727822, + "grad_norm": 0.7006996870040894, + "learning_rate": 1.835371382519956e-05, + "loss": 0.8768547773361206, + "step": 524 + }, + { + "epoch": 0.6813066488229211, + "grad_norm": 0.6807017922401428, + "learning_rate": 1.8345845071651543e-05, + "loss": 0.7412630915641785, + "step": 525 + }, + { + "epoch": 0.68260437577306, + "grad_norm": 0.7801376581192017, + "learning_rate": 1.8337959252962728e-05, + "loss": 0.7919901609420776, + "step": 526 + }, + { + "epoch": 0.6839021027231988, + "grad_norm": 0.7031033635139465, + "learning_rate": 1.8330056385257607e-05, + "loss": 0.7936250567436218, + "step": 527 + }, + { + "epoch": 0.6851998296733378, + "grad_norm": 0.67047518491745, + "learning_rate": 1.8322136484695553e-05, + "loss": 0.7688592076301575, + "step": 528 + }, + { + "epoch": 0.6864975566234767, + "grad_norm": 0.7209057211875916, + "learning_rate": 1.8314199567470755e-05, + "loss": 0.7531197667121887, + "step": 529 + }, + { + "epoch": 0.6877952835736156, + "grad_norm": 0.7783409357070923, + "learning_rate": 1.83062456498122e-05, + "loss": 0.8060978055000305, + "step": 530 + }, + { + "epoch": 0.6890930105237545, + "grad_norm": 0.7646079659461975, + "learning_rate": 1.8298274747983638e-05, + "loss": 0.9013359546661377, + "step": 531 + }, + { + "epoch": 0.6903907374738933, + "grad_norm": 0.6973395943641663, + "learning_rate": 1.8290286878283542e-05, + "loss": 0.789779543876648, + "step": 532 + }, + { + "epoch": 0.6916884644240323, + "grad_norm": 0.7242528796195984, + "learning_rate": 1.8282282057045087e-05, + "loss": 0.8460395336151123, + "step": 533 + }, + { + "epoch": 0.6929861913741712, + "grad_norm": 0.7025911211967468, + "learning_rate": 1.827426030063611e-05, + "loss": 0.7623457312583923, + "step": 534 + }, + { + "epoch": 0.69428391832431, + "grad_norm": 0.6914080381393433, + "learning_rate": 1.8266221625459064e-05, + "loss": 0.8142719864845276, + "step": 535 + }, + { + "epoch": 0.695581645274449, + "grad_norm": 0.7013720870018005, + "learning_rate": 1.825816604795101e-05, + "loss": 0.7999016642570496, + "step": 536 + }, + { + "epoch": 0.6968793722245878, + "grad_norm": 0.7201952934265137, + "learning_rate": 1.8250093584583567e-05, + "loss": 0.8158777952194214, + "step": 537 + }, + { + "epoch": 0.6981770991747268, + "grad_norm": 0.6993263363838196, + "learning_rate": 1.8242004251862872e-05, + "loss": 0.7727892994880676, + "step": 538 + }, + { + "epoch": 0.6994748261248657, + "grad_norm": 0.7411354780197144, + "learning_rate": 1.823389806632957e-05, + "loss": 0.8402857184410095, + "step": 539 + }, + { + "epoch": 0.7007725530750045, + "grad_norm": 0.717903733253479, + "learning_rate": 1.8225775044558757e-05, + "loss": 0.8313778042793274, + "step": 540 + }, + { + "epoch": 0.7020702800251435, + "grad_norm": 0.7139982581138611, + "learning_rate": 1.8217635203159957e-05, + "loss": 0.8449199795722961, + "step": 541 + }, + { + "epoch": 0.7033680069752823, + "grad_norm": 0.7448502779006958, + "learning_rate": 1.8209478558777084e-05, + "loss": 0.8782564997673035, + "step": 542 + }, + { + "epoch": 0.7046657339254212, + "grad_norm": 0.7237476110458374, + "learning_rate": 1.8201305128088412e-05, + "loss": 0.8148598670959473, + "step": 543 + }, + { + "epoch": 0.7059634608755602, + "grad_norm": 0.7190750241279602, + "learning_rate": 1.819311492780654e-05, + "loss": 0.8512831926345825, + "step": 544 + }, + { + "epoch": 0.707261187825699, + "grad_norm": 0.6827414035797119, + "learning_rate": 1.8184907974678348e-05, + "loss": 0.7911166548728943, + "step": 545 + }, + { + "epoch": 0.708558914775838, + "grad_norm": 0.7072880864143372, + "learning_rate": 1.8176684285484985e-05, + "loss": 0.7934311032295227, + "step": 546 + }, + { + "epoch": 0.7098566417259768, + "grad_norm": 0.6981719136238098, + "learning_rate": 1.816844387704181e-05, + "loss": 0.7569193840026855, + "step": 547 + }, + { + "epoch": 0.7111543686761157, + "grad_norm": 0.6892895102500916, + "learning_rate": 1.8160186766198375e-05, + "loss": 0.8187867999076843, + "step": 548 + }, + { + "epoch": 0.7124520956262547, + "grad_norm": 0.6689103245735168, + "learning_rate": 1.815191296983838e-05, + "loss": 0.8214238882064819, + "step": 549 + }, + { + "epoch": 0.7137498225763935, + "grad_norm": 0.7005360722541809, + "learning_rate": 1.8143622504879647e-05, + "loss": 0.7808399796485901, + "step": 550 + }, + { + "epoch": 0.7150475495265324, + "grad_norm": 0.6692766547203064, + "learning_rate": 1.8135315388274075e-05, + "loss": 0.8118186593055725, + "step": 551 + }, + { + "epoch": 0.7163452764766713, + "grad_norm": 0.7556451559066772, + "learning_rate": 1.8126991637007618e-05, + "loss": 0.8829076290130615, + "step": 552 + }, + { + "epoch": 0.7176430034268102, + "grad_norm": 0.7057021856307983, + "learning_rate": 1.8118651268100235e-05, + "loss": 0.8323896527290344, + "step": 553 + }, + { + "epoch": 0.7189407303769492, + "grad_norm": 0.6931277513504028, + "learning_rate": 1.811029429860588e-05, + "loss": 0.8186264038085938, + "step": 554 + }, + { + "epoch": 0.720238457327088, + "grad_norm": 0.6943070292472839, + "learning_rate": 1.810192074561243e-05, + "loss": 0.7884860634803772, + "step": 555 + }, + { + "epoch": 0.7215361842772269, + "grad_norm": 0.7362954616546631, + "learning_rate": 1.8093530626241684e-05, + "loss": 0.8730647563934326, + "step": 556 + }, + { + "epoch": 0.7228339112273658, + "grad_norm": 0.7225231528282166, + "learning_rate": 1.8085123957649315e-05, + "loss": 0.8629934787750244, + "step": 557 + }, + { + "epoch": 0.7241316381775047, + "grad_norm": 0.6993386745452881, + "learning_rate": 1.8076700757024833e-05, + "loss": 0.8742365837097168, + "step": 558 + }, + { + "epoch": 0.7254293651276437, + "grad_norm": 0.7013887166976929, + "learning_rate": 1.8068261041591548e-05, + "loss": 0.8042615056037903, + "step": 559 + }, + { + "epoch": 0.7267270920777825, + "grad_norm": 0.7084468007087708, + "learning_rate": 1.8059804828606545e-05, + "loss": 0.8460750579833984, + "step": 560 + }, + { + "epoch": 0.7280248190279214, + "grad_norm": 0.6864623427391052, + "learning_rate": 1.8051332135360637e-05, + "loss": 0.7461860179901123, + "step": 561 + }, + { + "epoch": 0.7293225459780603, + "grad_norm": 0.7570308446884155, + "learning_rate": 1.8042842979178338e-05, + "loss": 0.8015311360359192, + "step": 562 + }, + { + "epoch": 0.7306202729281992, + "grad_norm": 0.6948541402816772, + "learning_rate": 1.8034337377417826e-05, + "loss": 0.7483975887298584, + "step": 563 + }, + { + "epoch": 0.731917999878338, + "grad_norm": 0.6935976147651672, + "learning_rate": 1.80258153474709e-05, + "loss": 0.8245661854743958, + "step": 564 + }, + { + "epoch": 0.733215726828477, + "grad_norm": 0.713844895362854, + "learning_rate": 1.8017276906762955e-05, + "loss": 0.7062139511108398, + "step": 565 + }, + { + "epoch": 0.7345134537786159, + "grad_norm": 0.7592107653617859, + "learning_rate": 1.8008722072752943e-05, + "loss": 0.9009630680084229, + "step": 566 + }, + { + "epoch": 0.7358111807287548, + "grad_norm": 0.7252402901649475, + "learning_rate": 1.8000150862933335e-05, + "loss": 0.8240823745727539, + "step": 567 + }, + { + "epoch": 0.7371089076788937, + "grad_norm": 0.6888589262962341, + "learning_rate": 1.7991563294830083e-05, + "loss": 0.7797961235046387, + "step": 568 + }, + { + "epoch": 0.7384066346290326, + "grad_norm": 0.6920890808105469, + "learning_rate": 1.7982959386002592e-05, + "loss": 0.8363062739372253, + "step": 569 + }, + { + "epoch": 0.7397043615791715, + "grad_norm": 0.7188555002212524, + "learning_rate": 1.7974339154043677e-05, + "loss": 0.8217660784721375, + "step": 570 + }, + { + "epoch": 0.7410020885293104, + "grad_norm": 0.6754209995269775, + "learning_rate": 1.796570261657953e-05, + "loss": 0.8851417899131775, + "step": 571 + }, + { + "epoch": 0.7422998154794492, + "grad_norm": 0.7101492881774902, + "learning_rate": 1.7957049791269684e-05, + "loss": 0.8277086615562439, + "step": 572 + }, + { + "epoch": 0.7435975424295882, + "grad_norm": 0.7085975408554077, + "learning_rate": 1.7948380695806983e-05, + "loss": 0.8054807186126709, + "step": 573 + }, + { + "epoch": 0.744895269379727, + "grad_norm": 0.6522380709648132, + "learning_rate": 1.793969534791752e-05, + "loss": 0.749293327331543, + "step": 574 + }, + { + "epoch": 0.746192996329866, + "grad_norm": 0.753157377243042, + "learning_rate": 1.7930993765360644e-05, + "loss": 0.86817467212677, + "step": 575 + }, + { + "epoch": 0.7474907232800049, + "grad_norm": 0.6874333024024963, + "learning_rate": 1.792227596592889e-05, + "loss": 0.7839986085891724, + "step": 576 + }, + { + "epoch": 0.7487884502301437, + "grad_norm": 0.690792977809906, + "learning_rate": 1.791354196744794e-05, + "loss": 0.8275938630104065, + "step": 577 + }, + { + "epoch": 0.7500861771802827, + "grad_norm": 0.7033665180206299, + "learning_rate": 1.790479178777662e-05, + "loss": 0.8231739401817322, + "step": 578 + }, + { + "epoch": 0.7513839041304216, + "grad_norm": 0.7290453314781189, + "learning_rate": 1.7896025444806834e-05, + "loss": 0.8637040257453918, + "step": 579 + }, + { + "epoch": 0.7526816310805604, + "grad_norm": 0.7544882893562317, + "learning_rate": 1.7887242956463528e-05, + "loss": 0.8368648886680603, + "step": 580 + }, + { + "epoch": 0.7539793580306994, + "grad_norm": 0.6997877955436707, + "learning_rate": 1.7878444340704666e-05, + "loss": 0.8118851184844971, + "step": 581 + }, + { + "epoch": 0.7552770849808382, + "grad_norm": 0.6926761269569397, + "learning_rate": 1.78696296155212e-05, + "loss": 0.7650015354156494, + "step": 582 + }, + { + "epoch": 0.7565748119309772, + "grad_norm": 0.7061843872070312, + "learning_rate": 1.7860798798937e-05, + "loss": 0.7908979654312134, + "step": 583 + }, + { + "epoch": 0.757872538881116, + "grad_norm": 0.687125563621521, + "learning_rate": 1.7851951909008864e-05, + "loss": 0.7617890238761902, + "step": 584 + }, + { + "epoch": 0.7591702658312549, + "grad_norm": 0.7391111254692078, + "learning_rate": 1.7843088963826437e-05, + "loss": 0.7612465023994446, + "step": 585 + }, + { + "epoch": 0.7604679927813939, + "grad_norm": 0.7583956122398376, + "learning_rate": 1.783420998151219e-05, + "loss": 0.8573638200759888, + "step": 586 + }, + { + "epoch": 0.7617657197315327, + "grad_norm": 0.721450686454773, + "learning_rate": 1.782531498022141e-05, + "loss": 0.7986845970153809, + "step": 587 + }, + { + "epoch": 0.7630634466816716, + "grad_norm": 0.7499017119407654, + "learning_rate": 1.781640397814211e-05, + "loss": 0.8502310514450073, + "step": 588 + }, + { + "epoch": 0.7643611736318106, + "grad_norm": 0.705142617225647, + "learning_rate": 1.7807476993495047e-05, + "loss": 0.8705092668533325, + "step": 589 + }, + { + "epoch": 0.7656589005819494, + "grad_norm": 0.689218282699585, + "learning_rate": 1.779853404453363e-05, + "loss": 0.8186284899711609, + "step": 590 + }, + { + "epoch": 0.7669566275320884, + "grad_norm": 0.6828286647796631, + "learning_rate": 1.7789575149543936e-05, + "loss": 0.7887763381004333, + "step": 591 + }, + { + "epoch": 0.7682543544822272, + "grad_norm": 0.7451944351196289, + "learning_rate": 1.7780600326844638e-05, + "loss": 0.8204880952835083, + "step": 592 + }, + { + "epoch": 0.7695520814323661, + "grad_norm": 0.7414618730545044, + "learning_rate": 1.7771609594786968e-05, + "loss": 0.8183786869049072, + "step": 593 + }, + { + "epoch": 0.770849808382505, + "grad_norm": 0.7165583968162537, + "learning_rate": 1.776260297175471e-05, + "loss": 0.860834002494812, + "step": 594 + }, + { + "epoch": 0.7721475353326439, + "grad_norm": 0.6954268217086792, + "learning_rate": 1.775358047616412e-05, + "loss": 0.7466313242912292, + "step": 595 + }, + { + "epoch": 0.7734452622827828, + "grad_norm": 0.7495166063308716, + "learning_rate": 1.774454212646392e-05, + "loss": 0.8352164626121521, + "step": 596 + }, + { + "epoch": 0.7747429892329217, + "grad_norm": 0.7836682796478271, + "learning_rate": 1.773548794113525e-05, + "loss": 0.7596052885055542, + "step": 597 + }, + { + "epoch": 0.7760407161830606, + "grad_norm": 0.7165281176567078, + "learning_rate": 1.772641793869162e-05, + "loss": 0.8770286440849304, + "step": 598 + }, + { + "epoch": 0.7773384431331996, + "grad_norm": 0.7152581810951233, + "learning_rate": 1.7717332137678895e-05, + "loss": 0.7514005899429321, + "step": 599 + }, + { + "epoch": 0.7786361700833384, + "grad_norm": 0.7103503942489624, + "learning_rate": 1.770823055667524e-05, + "loss": 0.8051580190658569, + "step": 600 + }, + { + "epoch": 0.7799338970334773, + "grad_norm": 0.6738602519035339, + "learning_rate": 1.7699113214291082e-05, + "loss": 0.7153568267822266, + "step": 601 + }, + { + "epoch": 0.7812316239836162, + "grad_norm": 0.7472966909408569, + "learning_rate": 1.768998012916908e-05, + "loss": 0.8714797496795654, + "step": 602 + }, + { + "epoch": 0.7825293509337551, + "grad_norm": 0.6653077602386475, + "learning_rate": 1.7680831319984077e-05, + "loss": 0.7944467663764954, + "step": 603 + }, + { + "epoch": 0.7838270778838939, + "grad_norm": 0.6959863305091858, + "learning_rate": 1.7671666805443076e-05, + "loss": 0.8018844127655029, + "step": 604 + }, + { + "epoch": 0.7851248048340329, + "grad_norm": 0.7507782578468323, + "learning_rate": 1.766248660428519e-05, + "loss": 0.8342332243919373, + "step": 605 + }, + { + "epoch": 0.7864225317841718, + "grad_norm": 0.685041069984436, + "learning_rate": 1.7653290735281605e-05, + "loss": 0.8430291414260864, + "step": 606 + }, + { + "epoch": 0.7877202587343107, + "grad_norm": 0.7120122313499451, + "learning_rate": 1.7644079217235547e-05, + "loss": 0.8382185697555542, + "step": 607 + }, + { + "epoch": 0.7890179856844496, + "grad_norm": 0.6778322458267212, + "learning_rate": 1.763485206898224e-05, + "loss": 0.7327848672866821, + "step": 608 + }, + { + "epoch": 0.7903157126345884, + "grad_norm": 0.6974225044250488, + "learning_rate": 1.762560930938886e-05, + "loss": 0.8788211941719055, + "step": 609 + }, + { + "epoch": 0.7916134395847274, + "grad_norm": 0.7211731672286987, + "learning_rate": 1.7616350957354523e-05, + "loss": 0.788176417350769, + "step": 610 + }, + { + "epoch": 0.7929111665348663, + "grad_norm": 0.7053602337837219, + "learning_rate": 1.7607077031810204e-05, + "loss": 0.7817824482917786, + "step": 611 + }, + { + "epoch": 0.7942088934850051, + "grad_norm": 0.7248443961143494, + "learning_rate": 1.759778755171874e-05, + "loss": 0.8502725958824158, + "step": 612 + }, + { + "epoch": 0.7955066204351441, + "grad_norm": 0.7390884160995483, + "learning_rate": 1.758848253607476e-05, + "loss": 0.8086085319519043, + "step": 613 + }, + { + "epoch": 0.7968043473852829, + "grad_norm": 0.7250061631202698, + "learning_rate": 1.7579162003904678e-05, + "loss": 0.8245308995246887, + "step": 614 + }, + { + "epoch": 0.7981020743354219, + "grad_norm": 0.7186241149902344, + "learning_rate": 1.756982597426661e-05, + "loss": 0.8296452760696411, + "step": 615 + }, + { + "epoch": 0.7993998012855608, + "grad_norm": 0.6929823160171509, + "learning_rate": 1.756047446625038e-05, + "loss": 0.804393470287323, + "step": 616 + }, + { + "epoch": 0.8006975282356996, + "grad_norm": 0.6644824743270874, + "learning_rate": 1.7551107498977458e-05, + "loss": 0.7272558808326721, + "step": 617 + }, + { + "epoch": 0.8019952551858386, + "grad_norm": 0.6946169137954712, + "learning_rate": 1.7541725091600918e-05, + "loss": 0.7725887894630432, + "step": 618 + }, + { + "epoch": 0.8032929821359774, + "grad_norm": 0.7124983668327332, + "learning_rate": 1.7532327263305405e-05, + "loss": 0.8026424646377563, + "step": 619 + }, + { + "epoch": 0.8045907090861163, + "grad_norm": 0.7041330337524414, + "learning_rate": 1.75229140333071e-05, + "loss": 0.8723938465118408, + "step": 620 + }, + { + "epoch": 0.8058884360362553, + "grad_norm": 0.7211349606513977, + "learning_rate": 1.7513485420853683e-05, + "loss": 0.7833378911018372, + "step": 621 + }, + { + "epoch": 0.8071861629863941, + "grad_norm": 0.7067847847938538, + "learning_rate": 1.750404144522427e-05, + "loss": 0.8030161261558533, + "step": 622 + }, + { + "epoch": 0.8084838899365331, + "grad_norm": 0.7632414102554321, + "learning_rate": 1.7494582125729408e-05, + "loss": 0.8390699625015259, + "step": 623 + }, + { + "epoch": 0.8097816168866719, + "grad_norm": 0.6906073689460754, + "learning_rate": 1.7485107481711014e-05, + "loss": 0.7584885954856873, + "step": 624 + }, + { + "epoch": 0.8110793438368108, + "grad_norm": 0.7074705362319946, + "learning_rate": 1.7475617532542325e-05, + "loss": 0.7802140116691589, + "step": 625 + }, + { + "epoch": 0.8123770707869498, + "grad_norm": 0.7454720735549927, + "learning_rate": 1.7466112297627894e-05, + "loss": 0.8060036897659302, + "step": 626 + }, + { + "epoch": 0.8136747977370886, + "grad_norm": 0.720340371131897, + "learning_rate": 1.7456591796403525e-05, + "loss": 0.8245412707328796, + "step": 627 + }, + { + "epoch": 0.8149725246872275, + "grad_norm": 0.6765140295028687, + "learning_rate": 1.744705604833622e-05, + "loss": 0.7529839277267456, + "step": 628 + }, + { + "epoch": 0.8162702516373664, + "grad_norm": 0.7487897872924805, + "learning_rate": 1.7437505072924177e-05, + "loss": 0.8539460897445679, + "step": 629 + }, + { + "epoch": 0.8175679785875053, + "grad_norm": 0.819340169429779, + "learning_rate": 1.742793888969673e-05, + "loss": 0.9023832082748413, + "step": 630 + }, + { + "epoch": 0.8188657055376443, + "grad_norm": 0.6978700757026672, + "learning_rate": 1.741835751821429e-05, + "loss": 0.8347563743591309, + "step": 631 + }, + { + "epoch": 0.8201634324877831, + "grad_norm": 0.6633133888244629, + "learning_rate": 1.7408760978068343e-05, + "loss": 0.7656944394111633, + "step": 632 + }, + { + "epoch": 0.821461159437922, + "grad_norm": 0.779058039188385, + "learning_rate": 1.739914928888139e-05, + "loss": 0.8407497406005859, + "step": 633 + }, + { + "epoch": 0.8227588863880609, + "grad_norm": 0.7178354263305664, + "learning_rate": 1.7389522470306892e-05, + "loss": 0.8489883542060852, + "step": 634 + }, + { + "epoch": 0.8240566133381998, + "grad_norm": 0.6867073774337769, + "learning_rate": 1.7379880542029263e-05, + "loss": 0.8083344101905823, + "step": 635 + }, + { + "epoch": 0.8253543402883388, + "grad_norm": 0.7312392592430115, + "learning_rate": 1.7370223523763804e-05, + "loss": 0.8478159308433533, + "step": 636 + }, + { + "epoch": 0.8266520672384776, + "grad_norm": 0.7201517224311829, + "learning_rate": 1.7360551435256673e-05, + "loss": 0.8310608863830566, + "step": 637 + }, + { + "epoch": 0.8279497941886165, + "grad_norm": 0.7189190983772278, + "learning_rate": 1.7350864296284846e-05, + "loss": 0.8333780765533447, + "step": 638 + }, + { + "epoch": 0.8292475211387554, + "grad_norm": 0.686372697353363, + "learning_rate": 1.7341162126656063e-05, + "loss": 0.774347722530365, + "step": 639 + }, + { + "epoch": 0.8305452480888943, + "grad_norm": 0.7090693712234497, + "learning_rate": 1.7331444946208815e-05, + "loss": 0.7772883772850037, + "step": 640 + }, + { + "epoch": 0.8318429750390332, + "grad_norm": 0.7179540991783142, + "learning_rate": 1.732171277481227e-05, + "loss": 0.8045225739479065, + "step": 641 + }, + { + "epoch": 0.8331407019891721, + "grad_norm": 0.7238140106201172, + "learning_rate": 1.7311965632366254e-05, + "loss": 0.816831648349762, + "step": 642 + }, + { + "epoch": 0.834438428939311, + "grad_norm": 0.7198631167411804, + "learning_rate": 1.7302203538801212e-05, + "loss": 0.8121675252914429, + "step": 643 + }, + { + "epoch": 0.8357361558894499, + "grad_norm": 0.743016242980957, + "learning_rate": 1.729242651407815e-05, + "loss": 0.8649178743362427, + "step": 644 + }, + { + "epoch": 0.8370338828395888, + "grad_norm": 0.7449317574501038, + "learning_rate": 1.7282634578188612e-05, + "loss": 0.823853611946106, + "step": 645 + }, + { + "epoch": 0.8383316097897276, + "grad_norm": 0.725826621055603, + "learning_rate": 1.7272827751154627e-05, + "loss": 0.8356031179428101, + "step": 646 + }, + { + "epoch": 0.8396293367398666, + "grad_norm": 0.7286955118179321, + "learning_rate": 1.7263006053028674e-05, + "loss": 0.7678595781326294, + "step": 647 + }, + { + "epoch": 0.8409270636900055, + "grad_norm": 0.7141085863113403, + "learning_rate": 1.7253169503893637e-05, + "loss": 0.819695770740509, + "step": 648 + }, + { + "epoch": 0.8422247906401443, + "grad_norm": 0.7320179343223572, + "learning_rate": 1.7243318123862777e-05, + "loss": 0.7937145233154297, + "step": 649 + }, + { + "epoch": 0.8435225175902833, + "grad_norm": 0.677760124206543, + "learning_rate": 1.7233451933079663e-05, + "loss": 0.7791966199874878, + "step": 650 + }, + { + "epoch": 0.8448202445404221, + "grad_norm": 0.7462013363838196, + "learning_rate": 1.7223570951718166e-05, + "loss": 0.7947529554367065, + "step": 651 + }, + { + "epoch": 0.8461179714905611, + "grad_norm": 0.7482285499572754, + "learning_rate": 1.7213675199982388e-05, + "loss": 0.8657369613647461, + "step": 652 + }, + { + "epoch": 0.8474156984407, + "grad_norm": 0.7175538539886475, + "learning_rate": 1.7203764698106636e-05, + "loss": 0.8233255743980408, + "step": 653 + }, + { + "epoch": 0.8487134253908388, + "grad_norm": 0.687630295753479, + "learning_rate": 1.7193839466355383e-05, + "loss": 0.730807363986969, + "step": 654 + }, + { + "epoch": 0.8500111523409778, + "grad_norm": 0.7357272505760193, + "learning_rate": 1.7183899525023212e-05, + "loss": 0.7798961997032166, + "step": 655 + }, + { + "epoch": 0.8513088792911166, + "grad_norm": 0.7003471851348877, + "learning_rate": 1.7173944894434783e-05, + "loss": 0.752636730670929, + "step": 656 + }, + { + "epoch": 0.8526066062412555, + "grad_norm": 0.72862708568573, + "learning_rate": 1.7163975594944807e-05, + "loss": 0.8516281247138977, + "step": 657 + }, + { + "epoch": 0.8539043331913945, + "grad_norm": 0.7155880928039551, + "learning_rate": 1.715399164693797e-05, + "loss": 0.8015654683113098, + "step": 658 + }, + { + "epoch": 0.8552020601415333, + "grad_norm": 0.6752951145172119, + "learning_rate": 1.7143993070828913e-05, + "loss": 0.7704746127128601, + "step": 659 + }, + { + "epoch": 0.8564997870916723, + "grad_norm": 0.7284151315689087, + "learning_rate": 1.713397988706221e-05, + "loss": 0.8053057789802551, + "step": 660 + }, + { + "epoch": 0.8577975140418111, + "grad_norm": 0.7367468476295471, + "learning_rate": 1.7123952116112275e-05, + "loss": 0.8107625246047974, + "step": 661 + }, + { + "epoch": 0.85909524099195, + "grad_norm": 0.7247380018234253, + "learning_rate": 1.7113909778483364e-05, + "loss": 0.8168917894363403, + "step": 662 + }, + { + "epoch": 0.860392967942089, + "grad_norm": 0.708310067653656, + "learning_rate": 1.7103852894709517e-05, + "loss": 0.765848696231842, + "step": 663 + }, + { + "epoch": 0.8616906948922278, + "grad_norm": 0.746276319026947, + "learning_rate": 1.7093781485354517e-05, + "loss": 0.7557209730148315, + "step": 664 + }, + { + "epoch": 0.8629884218423667, + "grad_norm": 0.7245295643806458, + "learning_rate": 1.7083695571011842e-05, + "loss": 0.8230986595153809, + "step": 665 + }, + { + "epoch": 0.8642861487925056, + "grad_norm": 0.6767184734344482, + "learning_rate": 1.707359517230464e-05, + "loss": 0.7791951894760132, + "step": 666 + }, + { + "epoch": 0.8655838757426445, + "grad_norm": 0.7177157402038574, + "learning_rate": 1.7063480309885668e-05, + "loss": 0.7597481608390808, + "step": 667 + }, + { + "epoch": 0.8668816026927835, + "grad_norm": 0.7185314297676086, + "learning_rate": 1.7053351004437258e-05, + "loss": 0.7932897210121155, + "step": 668 + }, + { + "epoch": 0.8681793296429223, + "grad_norm": 0.6925249695777893, + "learning_rate": 1.7043207276671276e-05, + "loss": 0.8076404333114624, + "step": 669 + }, + { + "epoch": 0.8694770565930612, + "grad_norm": 0.6706543564796448, + "learning_rate": 1.7033049147329077e-05, + "loss": 0.8299864530563354, + "step": 670 + }, + { + "epoch": 0.8707747835432001, + "grad_norm": 0.6854607462882996, + "learning_rate": 1.702287663718147e-05, + "loss": 0.7249770760536194, + "step": 671 + }, + { + "epoch": 0.872072510493339, + "grad_norm": 0.6870327591896057, + "learning_rate": 1.7012689767028656e-05, + "loss": 0.770750880241394, + "step": 672 + }, + { + "epoch": 0.8733702374434779, + "grad_norm": 0.7077570557594299, + "learning_rate": 1.700248855770021e-05, + "loss": 0.887006402015686, + "step": 673 + }, + { + "epoch": 0.8746679643936168, + "grad_norm": 0.7156735062599182, + "learning_rate": 1.6992273030055022e-05, + "loss": 0.793735921382904, + "step": 674 + }, + { + "epoch": 0.8759656913437557, + "grad_norm": 0.7201855182647705, + "learning_rate": 1.6982043204981264e-05, + "loss": 0.7955703139305115, + "step": 675 + }, + { + "epoch": 0.8772634182938946, + "grad_norm": 0.7118475437164307, + "learning_rate": 1.6971799103396332e-05, + "loss": 0.7845295667648315, + "step": 676 + }, + { + "epoch": 0.8785611452440335, + "grad_norm": 0.7221444845199585, + "learning_rate": 1.696154074624683e-05, + "loss": 0.824984610080719, + "step": 677 + }, + { + "epoch": 0.8798588721941724, + "grad_norm": 0.6542083621025085, + "learning_rate": 1.6951268154508497e-05, + "loss": 0.8094558119773865, + "step": 678 + }, + { + "epoch": 0.8811565991443113, + "grad_norm": 0.7080230116844177, + "learning_rate": 1.6940981349186182e-05, + "loss": 0.8446075916290283, + "step": 679 + }, + { + "epoch": 0.8824543260944502, + "grad_norm": 0.7394174933433533, + "learning_rate": 1.69306803513138e-05, + "loss": 0.8166599273681641, + "step": 680 + }, + { + "epoch": 0.883752053044589, + "grad_norm": 0.6939387321472168, + "learning_rate": 1.6920365181954284e-05, + "loss": 0.8320161700248718, + "step": 681 + }, + { + "epoch": 0.885049779994728, + "grad_norm": 0.7184001803398132, + "learning_rate": 1.6910035862199545e-05, + "loss": 0.7950330376625061, + "step": 682 + }, + { + "epoch": 0.8863475069448669, + "grad_norm": 0.6943792700767517, + "learning_rate": 1.6899692413170422e-05, + "loss": 0.8061293363571167, + "step": 683 + }, + { + "epoch": 0.8876452338950058, + "grad_norm": 0.7242916822433472, + "learning_rate": 1.688933485601666e-05, + "loss": 0.799871563911438, + "step": 684 + }, + { + "epoch": 0.8889429608451447, + "grad_norm": 0.6802281141281128, + "learning_rate": 1.6878963211916833e-05, + "loss": 0.8111347556114197, + "step": 685 + }, + { + "epoch": 0.8902406877952835, + "grad_norm": 0.7344982028007507, + "learning_rate": 1.6868577502078336e-05, + "loss": 0.818919837474823, + "step": 686 + }, + { + "epoch": 0.8915384147454225, + "grad_norm": 0.7255212664604187, + "learning_rate": 1.6858177747737312e-05, + "loss": 0.8595883846282959, + "step": 687 + }, + { + "epoch": 0.8928361416955614, + "grad_norm": 0.6713986992835999, + "learning_rate": 1.684776397015863e-05, + "loss": 0.7319802045822144, + "step": 688 + }, + { + "epoch": 0.8941338686457002, + "grad_norm": 0.6508772373199463, + "learning_rate": 1.6837336190635824e-05, + "loss": 0.7525233626365662, + "step": 689 + }, + { + "epoch": 0.8954315955958392, + "grad_norm": 0.6712636947631836, + "learning_rate": 1.682689443049107e-05, + "loss": 0.785638689994812, + "step": 690 + }, + { + "epoch": 0.8954315955958392, + "eval_loss": 0.7686871290206909, + "eval_runtime": 143.0434, + "eval_samples_per_second": 36.297, + "eval_steps_per_second": 9.074, + "step": 690 + }, + { + "epoch": 0.896729322545978, + "grad_norm": 0.7040373682975769, + "learning_rate": 1.6816438711075114e-05, + "loss": 0.8052287101745605, + "step": 691 + }, + { + "epoch": 0.898027049496117, + "grad_norm": 0.6796557903289795, + "learning_rate": 1.680596905376727e-05, + "loss": 0.8128867745399475, + "step": 692 + }, + { + "epoch": 0.8993247764462559, + "grad_norm": 0.689491868019104, + "learning_rate": 1.6795485479975327e-05, + "loss": 0.7731098532676697, + "step": 693 + }, + { + "epoch": 0.9006225033963947, + "grad_norm": 0.6846652030944824, + "learning_rate": 1.6784988011135546e-05, + "loss": 0.8001493811607361, + "step": 694 + }, + { + "epoch": 0.9019202303465337, + "grad_norm": 0.7072511911392212, + "learning_rate": 1.6774476668712587e-05, + "loss": 0.7856433391571045, + "step": 695 + }, + { + "epoch": 0.9032179572966725, + "grad_norm": 0.6913763880729675, + "learning_rate": 1.676395147419949e-05, + "loss": 0.8246166110038757, + "step": 696 + }, + { + "epoch": 0.9045156842468114, + "grad_norm": 0.7131518721580505, + "learning_rate": 1.6753412449117615e-05, + "loss": 0.8256362080574036, + "step": 697 + }, + { + "epoch": 0.9058134111969504, + "grad_norm": 0.6939201951026917, + "learning_rate": 1.67428596150166e-05, + "loss": 0.8615972399711609, + "step": 698 + }, + { + "epoch": 0.9071111381470892, + "grad_norm": 0.7194769382476807, + "learning_rate": 1.6732292993474316e-05, + "loss": 0.7931585907936096, + "step": 699 + }, + { + "epoch": 0.9084088650972282, + "grad_norm": 0.6878808736801147, + "learning_rate": 1.6721712606096833e-05, + "loss": 0.7722562551498413, + "step": 700 + }, + { + "epoch": 0.909706592047367, + "grad_norm": 0.6713901162147522, + "learning_rate": 1.6711118474518363e-05, + "loss": 0.7399365901947021, + "step": 701 + }, + { + "epoch": 0.9110043189975059, + "grad_norm": 0.6830242872238159, + "learning_rate": 1.6700510620401223e-05, + "loss": 0.7681128978729248, + "step": 702 + }, + { + "epoch": 0.9123020459476449, + "grad_norm": 0.7052934169769287, + "learning_rate": 1.6689889065435796e-05, + "loss": 0.8287486433982849, + "step": 703 + }, + { + "epoch": 0.9135997728977837, + "grad_norm": 0.7426304817199707, + "learning_rate": 1.667925383134047e-05, + "loss": 0.7236632108688354, + "step": 704 + }, + { + "epoch": 0.9148974998479226, + "grad_norm": 0.7284197807312012, + "learning_rate": 1.66686049398616e-05, + "loss": 0.8001005053520203, + "step": 705 + }, + { + "epoch": 0.9161952267980615, + "grad_norm": 0.7305144667625427, + "learning_rate": 1.6657942412773484e-05, + "loss": 0.816078245639801, + "step": 706 + }, + { + "epoch": 0.9174929537482004, + "grad_norm": 0.7396757006645203, + "learning_rate": 1.664726627187829e-05, + "loss": 0.8432518243789673, + "step": 707 + }, + { + "epoch": 0.9187906806983394, + "grad_norm": 0.7043930292129517, + "learning_rate": 1.6636576539006015e-05, + "loss": 0.8011447787284851, + "step": 708 + }, + { + "epoch": 0.9200884076484782, + "grad_norm": 0.6750434637069702, + "learning_rate": 1.6625873236014464e-05, + "loss": 0.8111026883125305, + "step": 709 + }, + { + "epoch": 0.9213861345986171, + "grad_norm": 0.6968750953674316, + "learning_rate": 1.6615156384789185e-05, + "loss": 0.7856196165084839, + "step": 710 + }, + { + "epoch": 0.922683861548756, + "grad_norm": 0.6756315231323242, + "learning_rate": 1.660442600724342e-05, + "loss": 0.7796693444252014, + "step": 711 + }, + { + "epoch": 0.9239815884988949, + "grad_norm": 0.7280746102333069, + "learning_rate": 1.659368212531808e-05, + "loss": 0.8190441131591797, + "step": 712 + }, + { + "epoch": 0.9252793154490339, + "grad_norm": 0.6771341562271118, + "learning_rate": 1.6582924760981683e-05, + "loss": 0.7919082641601562, + "step": 713 + }, + { + "epoch": 0.9265770423991727, + "grad_norm": 0.7019714713096619, + "learning_rate": 1.6572153936230316e-05, + "loss": 0.7387243509292603, + "step": 714 + }, + { + "epoch": 0.9278747693493116, + "grad_norm": 0.7424118518829346, + "learning_rate": 1.6561369673087588e-05, + "loss": 0.8694776892662048, + "step": 715 + }, + { + "epoch": 0.9291724962994505, + "grad_norm": 0.6909191012382507, + "learning_rate": 1.6550571993604587e-05, + "loss": 0.8239873647689819, + "step": 716 + }, + { + "epoch": 0.9304702232495894, + "grad_norm": 0.7481014728546143, + "learning_rate": 1.6539760919859838e-05, + "loss": 0.8004978895187378, + "step": 717 + }, + { + "epoch": 0.9317679501997282, + "grad_norm": 0.6954971551895142, + "learning_rate": 1.6528936473959253e-05, + "loss": 0.8122729659080505, + "step": 718 + }, + { + "epoch": 0.9330656771498672, + "grad_norm": 0.7150570154190063, + "learning_rate": 1.6518098678036073e-05, + "loss": 0.8382218480110168, + "step": 719 + }, + { + "epoch": 0.9343634041000061, + "grad_norm": 0.7469287514686584, + "learning_rate": 1.650724755425086e-05, + "loss": 0.8599920868873596, + "step": 720 + }, + { + "epoch": 0.935661131050145, + "grad_norm": 0.7064406275749207, + "learning_rate": 1.6496383124791406e-05, + "loss": 0.7755042314529419, + "step": 721 + }, + { + "epoch": 0.9369588580002839, + "grad_norm": 0.7173776626586914, + "learning_rate": 1.6485505411872725e-05, + "loss": 0.8066536784172058, + "step": 722 + }, + { + "epoch": 0.9382565849504227, + "grad_norm": 0.717430591583252, + "learning_rate": 1.6474614437736986e-05, + "loss": 0.8112089037895203, + "step": 723 + }, + { + "epoch": 0.9395543119005617, + "grad_norm": 0.696087658405304, + "learning_rate": 1.6463710224653477e-05, + "loss": 0.7918620705604553, + "step": 724 + }, + { + "epoch": 0.9408520388507006, + "grad_norm": 0.6923975944519043, + "learning_rate": 1.6452792794918545e-05, + "loss": 0.8037642240524292, + "step": 725 + }, + { + "epoch": 0.9421497658008394, + "grad_norm": 0.7063742280006409, + "learning_rate": 1.644186217085558e-05, + "loss": 0.7934796810150146, + "step": 726 + }, + { + "epoch": 0.9434474927509784, + "grad_norm": 0.6965203881263733, + "learning_rate": 1.6430918374814937e-05, + "loss": 0.8489659428596497, + "step": 727 + }, + { + "epoch": 0.9447452197011172, + "grad_norm": 0.7392389178276062, + "learning_rate": 1.641996142917391e-05, + "loss": 0.8604154586791992, + "step": 728 + }, + { + "epoch": 0.9460429466512562, + "grad_norm": 0.7131820321083069, + "learning_rate": 1.640899135633668e-05, + "loss": 0.8199344277381897, + "step": 729 + }, + { + "epoch": 0.9473406736013951, + "grad_norm": 0.7163403630256653, + "learning_rate": 1.6398008178734272e-05, + "loss": 0.8697142004966736, + "step": 730 + }, + { + "epoch": 0.9486384005515339, + "grad_norm": 0.6607118844985962, + "learning_rate": 1.6387011918824493e-05, + "loss": 0.7900056838989258, + "step": 731 + }, + { + "epoch": 0.9499361275016729, + "grad_norm": 0.669420599937439, + "learning_rate": 1.6376002599091925e-05, + "loss": 0.8032844066619873, + "step": 732 + }, + { + "epoch": 0.9512338544518117, + "grad_norm": 0.7059581279754639, + "learning_rate": 1.6364980242047835e-05, + "loss": 0.8048977851867676, + "step": 733 + }, + { + "epoch": 0.9525315814019506, + "grad_norm": 0.7329293489456177, + "learning_rate": 1.635394487023015e-05, + "loss": 0.8311731815338135, + "step": 734 + }, + { + "epoch": 0.9538293083520896, + "grad_norm": 0.7057397961616516, + "learning_rate": 1.634289650620342e-05, + "loss": 0.8411611318588257, + "step": 735 + }, + { + "epoch": 0.9551270353022284, + "grad_norm": 0.653426468372345, + "learning_rate": 1.633183517255875e-05, + "loss": 0.758813738822937, + "step": 736 + }, + { + "epoch": 0.9564247622523674, + "grad_norm": 0.7300577759742737, + "learning_rate": 1.632076089191376e-05, + "loss": 0.8028651475906372, + "step": 737 + }, + { + "epoch": 0.9577224892025062, + "grad_norm": 0.6757684350013733, + "learning_rate": 1.630967368691256e-05, + "loss": 0.8133585453033447, + "step": 738 + }, + { + "epoch": 0.9590202161526451, + "grad_norm": 0.6894041299819946, + "learning_rate": 1.6298573580225676e-05, + "loss": 0.766591489315033, + "step": 739 + }, + { + "epoch": 0.9603179431027841, + "grad_norm": 0.7034198641777039, + "learning_rate": 1.6287460594550017e-05, + "loss": 0.778566837310791, + "step": 740 + }, + { + "epoch": 0.9616156700529229, + "grad_norm": 0.6629794239997864, + "learning_rate": 1.6276334752608823e-05, + "loss": 0.7911474704742432, + "step": 741 + }, + { + "epoch": 0.9629133970030618, + "grad_norm": 0.6980583667755127, + "learning_rate": 1.6265196077151627e-05, + "loss": 0.7445369958877563, + "step": 742 + }, + { + "epoch": 0.9642111239532007, + "grad_norm": 0.7294824719429016, + "learning_rate": 1.62540445909542e-05, + "loss": 0.86620032787323, + "step": 743 + }, + { + "epoch": 0.9655088509033396, + "grad_norm": 0.7365493774414062, + "learning_rate": 1.624288031681851e-05, + "loss": 0.810501754283905, + "step": 744 + }, + { + "epoch": 0.9668065778534786, + "grad_norm": 0.737711489200592, + "learning_rate": 1.623170327757267e-05, + "loss": 0.8520309329032898, + "step": 745 + }, + { + "epoch": 0.9681043048036174, + "grad_norm": 0.683699905872345, + "learning_rate": 1.62205134960709e-05, + "loss": 0.7950071096420288, + "step": 746 + }, + { + "epoch": 0.9694020317537563, + "grad_norm": 0.7092661261558533, + "learning_rate": 1.620931099519347e-05, + "loss": 0.8340073823928833, + "step": 747 + }, + { + "epoch": 0.9706997587038952, + "grad_norm": 0.7204828262329102, + "learning_rate": 1.619809579784665e-05, + "loss": 0.7778469324111938, + "step": 748 + }, + { + "epoch": 0.9719974856540341, + "grad_norm": 0.6977567076683044, + "learning_rate": 1.6186867926962695e-05, + "loss": 0.797735869884491, + "step": 749 + }, + { + "epoch": 0.973295212604173, + "grad_norm": 0.6957900524139404, + "learning_rate": 1.6175627405499746e-05, + "loss": 0.7967561483383179, + "step": 750 + }, + { + "epoch": 0.9745929395543119, + "grad_norm": 0.6862889528274536, + "learning_rate": 1.6164374256441837e-05, + "loss": 0.8016502261161804, + "step": 751 + }, + { + "epoch": 0.9758906665044508, + "grad_norm": 0.6969533562660217, + "learning_rate": 1.6153108502798796e-05, + "loss": 0.8099682331085205, + "step": 752 + }, + { + "epoch": 0.9771883934545897, + "grad_norm": 0.6920532584190369, + "learning_rate": 1.614183016760625e-05, + "loss": 0.8007751107215881, + "step": 753 + }, + { + "epoch": 0.9784861204047286, + "grad_norm": 0.6825345158576965, + "learning_rate": 1.613053927392553e-05, + "loss": 0.8570786118507385, + "step": 754 + }, + { + "epoch": 0.9797838473548675, + "grad_norm": 0.7230255603790283, + "learning_rate": 1.6119235844843664e-05, + "loss": 0.7779375910758972, + "step": 755 + }, + { + "epoch": 0.9810815743050064, + "grad_norm": 0.68338543176651, + "learning_rate": 1.6107919903473294e-05, + "loss": 0.7894657850265503, + "step": 756 + }, + { + "epoch": 0.9823793012551453, + "grad_norm": 0.7132012248039246, + "learning_rate": 1.6096591472952664e-05, + "loss": 0.8401795625686646, + "step": 757 + }, + { + "epoch": 0.9836770282052841, + "grad_norm": 0.681077241897583, + "learning_rate": 1.6085250576445548e-05, + "loss": 0.7692939043045044, + "step": 758 + }, + { + "epoch": 0.9849747551554231, + "grad_norm": 0.6817126870155334, + "learning_rate": 1.6073897237141203e-05, + "loss": 0.7555439472198486, + "step": 759 + }, + { + "epoch": 0.986272482105562, + "grad_norm": 0.6702454090118408, + "learning_rate": 1.6062531478254333e-05, + "loss": 0.7115926742553711, + "step": 760 + }, + { + "epoch": 0.9875702090557009, + "grad_norm": 0.6700429320335388, + "learning_rate": 1.605115332302505e-05, + "loss": 0.7557807564735413, + "step": 761 + }, + { + "epoch": 0.9888679360058398, + "grad_norm": 0.6891334652900696, + "learning_rate": 1.603976279471879e-05, + "loss": 0.8077662587165833, + "step": 762 + }, + { + "epoch": 0.9901656629559786, + "grad_norm": 0.767073929309845, + "learning_rate": 1.6028359916626308e-05, + "loss": 0.7964708805084229, + "step": 763 + }, + { + "epoch": 0.9914633899061176, + "grad_norm": 0.660102367401123, + "learning_rate": 1.601694471206359e-05, + "loss": 0.7086456418037415, + "step": 764 + }, + { + "epoch": 0.9927611168562565, + "grad_norm": 0.6949501037597656, + "learning_rate": 1.600551720437186e-05, + "loss": 0.7723450660705566, + "step": 765 + }, + { + "epoch": 0.9940588438063953, + "grad_norm": 0.7149574756622314, + "learning_rate": 1.599407741691746e-05, + "loss": 0.8286278247833252, + "step": 766 + }, + { + "epoch": 0.9953565707565343, + "grad_norm": 0.6776000261306763, + "learning_rate": 1.5982625373091877e-05, + "loss": 0.7701430320739746, + "step": 767 + }, + { + "epoch": 0.9966542977066731, + "grad_norm": 0.7129999399185181, + "learning_rate": 1.5971161096311628e-05, + "loss": 0.8104744553565979, + "step": 768 + }, + { + "epoch": 0.9979520246568121, + "grad_norm": 0.6826761960983276, + "learning_rate": 1.5959684610018267e-05, + "loss": 0.7398239970207214, + "step": 769 + }, + { + "epoch": 0.999249751606951, + "grad_norm": 0.7236920595169067, + "learning_rate": 1.5948195937678297e-05, + "loss": 0.7627758383750916, + "step": 770 + }, + { + "epoch": 1.0, + "grad_norm": 0.9062820672988892, + "learning_rate": 1.5936695102783148e-05, + "loss": 0.7684851288795471, + "step": 771 + }, + { + "epoch": 1.0012977269501389, + "grad_norm": 1.0222225189208984, + "learning_rate": 1.5925182128849116e-05, + "loss": 0.7260036468505859, + "step": 772 + }, + { + "epoch": 1.0025954539002777, + "grad_norm": 0.8933354020118713, + "learning_rate": 1.591365703941732e-05, + "loss": 0.6952782869338989, + "step": 773 + }, + { + "epoch": 1.0038931808504168, + "grad_norm": 0.8150500059127808, + "learning_rate": 1.5902119858053652e-05, + "loss": 0.708466649055481, + "step": 774 + }, + { + "epoch": 1.0051909078005556, + "grad_norm": 0.677733838558197, + "learning_rate": 1.589057060834872e-05, + "loss": 0.714854896068573, + "step": 775 + }, + { + "epoch": 1.0064886347506945, + "grad_norm": 0.8115158677101135, + "learning_rate": 1.5879009313917826e-05, + "loss": 0.7126277089118958, + "step": 776 + }, + { + "epoch": 1.0077863617008334, + "grad_norm": 0.9660588502883911, + "learning_rate": 1.5867435998400885e-05, + "loss": 0.8123319149017334, + "step": 777 + }, + { + "epoch": 1.0090840886509722, + "grad_norm": 0.8912333846092224, + "learning_rate": 1.5855850685462404e-05, + "loss": 0.7480561137199402, + "step": 778 + }, + { + "epoch": 1.0103818156011113, + "grad_norm": 0.9120140075683594, + "learning_rate": 1.584425339879141e-05, + "loss": 0.7480191588401794, + "step": 779 + }, + { + "epoch": 1.0116795425512501, + "grad_norm": 0.8324950337409973, + "learning_rate": 1.5832644162101417e-05, + "loss": 0.7069035172462463, + "step": 780 + }, + { + "epoch": 1.012977269501389, + "grad_norm": 0.7601868510246277, + "learning_rate": 1.5821022999130385e-05, + "loss": 0.646752655506134, + "step": 781 + }, + { + "epoch": 1.0142749964515279, + "grad_norm": 0.7213713526725769, + "learning_rate": 1.580938993364064e-05, + "loss": 0.6728400588035583, + "step": 782 + }, + { + "epoch": 1.0155727234016667, + "grad_norm": 0.8234879374504089, + "learning_rate": 1.579774498941886e-05, + "loss": 0.6997194886207581, + "step": 783 + }, + { + "epoch": 1.0168704503518056, + "grad_norm": 0.794476330280304, + "learning_rate": 1.578608819027602e-05, + "loss": 0.6844808459281921, + "step": 784 + }, + { + "epoch": 1.0181681773019446, + "grad_norm": 0.8356218338012695, + "learning_rate": 1.5774419560047303e-05, + "loss": 0.7501406073570251, + "step": 785 + }, + { + "epoch": 1.0194659042520835, + "grad_norm": 0.7794895172119141, + "learning_rate": 1.5762739122592123e-05, + "loss": 0.7650024890899658, + "step": 786 + }, + { + "epoch": 1.0207636312022224, + "grad_norm": 0.7471200227737427, + "learning_rate": 1.5751046901794008e-05, + "loss": 0.7121275067329407, + "step": 787 + }, + { + "epoch": 1.0220613581523612, + "grad_norm": 0.7541830539703369, + "learning_rate": 1.5739342921560593e-05, + "loss": 0.7205899357795715, + "step": 788 + }, + { + "epoch": 1.0233590851025, + "grad_norm": 0.8261748552322388, + "learning_rate": 1.5727627205823554e-05, + "loss": 0.6890494227409363, + "step": 789 + }, + { + "epoch": 1.0246568120526391, + "grad_norm": 0.7363404035568237, + "learning_rate": 1.571589977853857e-05, + "loss": 0.7250495553016663, + "step": 790 + }, + { + "epoch": 1.025954539002778, + "grad_norm": 0.7666418552398682, + "learning_rate": 1.5704160663685254e-05, + "loss": 0.6565474271774292, + "step": 791 + }, + { + "epoch": 1.0272522659529169, + "grad_norm": 0.707535982131958, + "learning_rate": 1.5692409885267127e-05, + "loss": 0.8307659029960632, + "step": 792 + }, + { + "epoch": 1.0285499929030557, + "grad_norm": 0.7528367638587952, + "learning_rate": 1.568064746731156e-05, + "loss": 0.734372615814209, + "step": 793 + }, + { + "epoch": 1.0298477198531946, + "grad_norm": 0.7138853073120117, + "learning_rate": 1.5668873433869718e-05, + "loss": 0.6305298805236816, + "step": 794 + }, + { + "epoch": 1.0311454468033336, + "grad_norm": 0.7478009462356567, + "learning_rate": 1.5657087809016517e-05, + "loss": 0.6923752427101135, + "step": 795 + }, + { + "epoch": 1.0324431737534725, + "grad_norm": 0.7364891171455383, + "learning_rate": 1.564529061685058e-05, + "loss": 0.7163046598434448, + "step": 796 + }, + { + "epoch": 1.0337409007036114, + "grad_norm": 0.7004992365837097, + "learning_rate": 1.5633481881494178e-05, + "loss": 0.6700119972229004, + "step": 797 + }, + { + "epoch": 1.0350386276537502, + "grad_norm": 0.749292254447937, + "learning_rate": 1.562166162709319e-05, + "loss": 0.6811234951019287, + "step": 798 + }, + { + "epoch": 1.036336354603889, + "grad_norm": 0.7418084740638733, + "learning_rate": 1.560982987781704e-05, + "loss": 0.7332763075828552, + "step": 799 + }, + { + "epoch": 1.037634081554028, + "grad_norm": 0.6867294907569885, + "learning_rate": 1.5597986657858656e-05, + "loss": 0.7094939351081848, + "step": 800 + }, + { + "epoch": 1.038931808504167, + "grad_norm": 0.6801954507827759, + "learning_rate": 1.5586131991434434e-05, + "loss": 0.7229615449905396, + "step": 801 + }, + { + "epoch": 1.0402295354543059, + "grad_norm": 0.6919074654579163, + "learning_rate": 1.5574265902784163e-05, + "loss": 0.6745041012763977, + "step": 802 + }, + { + "epoch": 1.0415272624044447, + "grad_norm": 0.7064636945724487, + "learning_rate": 1.556238841617099e-05, + "loss": 0.7311556935310364, + "step": 803 + }, + { + "epoch": 1.0428249893545836, + "grad_norm": 0.7400867938995361, + "learning_rate": 1.555049955588137e-05, + "loss": 0.7360319495201111, + "step": 804 + }, + { + "epoch": 1.0441227163047224, + "grad_norm": 0.7186093330383301, + "learning_rate": 1.5538599346225013e-05, + "loss": 0.6791881918907166, + "step": 805 + }, + { + "epoch": 1.0454204432548615, + "grad_norm": 0.7080870866775513, + "learning_rate": 1.552668781153484e-05, + "loss": 0.6935555338859558, + "step": 806 + }, + { + "epoch": 1.0467181702050004, + "grad_norm": 0.7288933396339417, + "learning_rate": 1.5514764976166916e-05, + "loss": 0.7893433570861816, + "step": 807 + }, + { + "epoch": 1.0480158971551392, + "grad_norm": 0.7090301513671875, + "learning_rate": 1.5502830864500426e-05, + "loss": 0.7087657451629639, + "step": 808 + }, + { + "epoch": 1.049313624105278, + "grad_norm": 0.7548444271087646, + "learning_rate": 1.5490885500937606e-05, + "loss": 0.72869473695755, + "step": 809 + }, + { + "epoch": 1.050611351055417, + "grad_norm": 0.7161403894424438, + "learning_rate": 1.5478928909903705e-05, + "loss": 0.7281824946403503, + "step": 810 + }, + { + "epoch": 1.051909078005556, + "grad_norm": 0.6805386543273926, + "learning_rate": 1.5466961115846927e-05, + "loss": 0.6523677110671997, + "step": 811 + }, + { + "epoch": 1.0532068049556949, + "grad_norm": 0.7339995503425598, + "learning_rate": 1.545498214323837e-05, + "loss": 0.7160875797271729, + "step": 812 + }, + { + "epoch": 1.0545045319058337, + "grad_norm": 0.6826195120811462, + "learning_rate": 1.544299201657202e-05, + "loss": 0.7368515133857727, + "step": 813 + }, + { + "epoch": 1.0558022588559726, + "grad_norm": 0.7545201182365417, + "learning_rate": 1.543099076036463e-05, + "loss": 0.7098448276519775, + "step": 814 + }, + { + "epoch": 1.0570999858061114, + "grad_norm": 0.6874995827674866, + "learning_rate": 1.5418978399155748e-05, + "loss": 0.6643248200416565, + "step": 815 + }, + { + "epoch": 1.0583977127562503, + "grad_norm": 0.7067052125930786, + "learning_rate": 1.54069549575076e-05, + "loss": 0.7022271752357483, + "step": 816 + }, + { + "epoch": 1.0596954397063894, + "grad_norm": 0.7168053388595581, + "learning_rate": 1.539492046000509e-05, + "loss": 0.6977633237838745, + "step": 817 + }, + { + "epoch": 1.0609931666565282, + "grad_norm": 0.7110093235969543, + "learning_rate": 1.5382874931255717e-05, + "loss": 0.7410083413124084, + "step": 818 + }, + { + "epoch": 1.062290893606667, + "grad_norm": 0.6772004961967468, + "learning_rate": 1.5370818395889536e-05, + "loss": 0.6744326949119568, + "step": 819 + }, + { + "epoch": 1.063588620556806, + "grad_norm": 0.7344289422035217, + "learning_rate": 1.5358750878559113e-05, + "loss": 0.7128704190254211, + "step": 820 + }, + { + "epoch": 1.0648863475069448, + "grad_norm": 0.7206461429595947, + "learning_rate": 1.5346672403939465e-05, + "loss": 0.7533354759216309, + "step": 821 + }, + { + "epoch": 1.0661840744570839, + "grad_norm": 0.7541556358337402, + "learning_rate": 1.5334582996728017e-05, + "loss": 0.7774013876914978, + "step": 822 + }, + { + "epoch": 1.0674818014072227, + "grad_norm": 0.7579377293586731, + "learning_rate": 1.532248268164455e-05, + "loss": 0.7790758609771729, + "step": 823 + }, + { + "epoch": 1.0687795283573616, + "grad_norm": 0.7289340496063232, + "learning_rate": 1.5310371483431138e-05, + "loss": 0.7054307460784912, + "step": 824 + }, + { + "epoch": 1.0700772553075004, + "grad_norm": 0.7037842869758606, + "learning_rate": 1.529824942685212e-05, + "loss": 0.7457549571990967, + "step": 825 + }, + { + "epoch": 1.0713749822576393, + "grad_norm": 0.7253069877624512, + "learning_rate": 1.528611653669403e-05, + "loss": 0.7203331589698792, + "step": 826 + }, + { + "epoch": 1.0726727092077784, + "grad_norm": 0.7243335247039795, + "learning_rate": 1.5273972837765566e-05, + "loss": 0.7370164394378662, + "step": 827 + }, + { + "epoch": 1.0739704361579172, + "grad_norm": 0.6802127957344055, + "learning_rate": 1.526181835489751e-05, + "loss": 0.7022003531455994, + "step": 828 + }, + { + "epoch": 1.075268163108056, + "grad_norm": 0.7470188736915588, + "learning_rate": 1.5249653112942708e-05, + "loss": 0.7355238795280457, + "step": 829 + }, + { + "epoch": 1.076565890058195, + "grad_norm": 0.7139303684234619, + "learning_rate": 1.5237477136776e-05, + "loss": 0.6995757222175598, + "step": 830 + }, + { + "epoch": 1.0778636170083338, + "grad_norm": 0.6893638372421265, + "learning_rate": 1.5225290451294173e-05, + "loss": 0.6514896750450134, + "step": 831 + }, + { + "epoch": 1.0791613439584729, + "grad_norm": 0.7205830812454224, + "learning_rate": 1.521309308141592e-05, + "loss": 0.6881433725357056, + "step": 832 + }, + { + "epoch": 1.0804590709086117, + "grad_norm": 0.7569621205329895, + "learning_rate": 1.5200885052081767e-05, + "loss": 0.7357972264289856, + "step": 833 + }, + { + "epoch": 1.0817567978587506, + "grad_norm": 0.7436279654502869, + "learning_rate": 1.518866638825405e-05, + "loss": 0.758313775062561, + "step": 834 + }, + { + "epoch": 1.0830545248088894, + "grad_norm": 0.7273634076118469, + "learning_rate": 1.517643711491684e-05, + "loss": 0.6798244714736938, + "step": 835 + }, + { + "epoch": 1.0843522517590283, + "grad_norm": 0.6966442465782166, + "learning_rate": 1.516419725707591e-05, + "loss": 0.7077891826629639, + "step": 836 + }, + { + "epoch": 1.0856499787091671, + "grad_norm": 0.6794623732566833, + "learning_rate": 1.5151946839758673e-05, + "loss": 0.6736932992935181, + "step": 837 + }, + { + "epoch": 1.0869477056593062, + "grad_norm": 0.7189822196960449, + "learning_rate": 1.5139685888014123e-05, + "loss": 0.7594777345657349, + "step": 838 + }, + { + "epoch": 1.088245432609445, + "grad_norm": 0.7691319584846497, + "learning_rate": 1.512741442691281e-05, + "loss": 0.7986084818840027, + "step": 839 + }, + { + "epoch": 1.089543159559584, + "grad_norm": 0.7428483366966248, + "learning_rate": 1.5115132481546763e-05, + "loss": 0.7112255096435547, + "step": 840 + }, + { + "epoch": 1.0908408865097228, + "grad_norm": 0.7567489743232727, + "learning_rate": 1.5102840077029452e-05, + "loss": 0.647540807723999, + "step": 841 + }, + { + "epoch": 1.0921386134598616, + "grad_norm": 0.7548873424530029, + "learning_rate": 1.509053723849574e-05, + "loss": 0.776237428188324, + "step": 842 + }, + { + "epoch": 1.0934363404100007, + "grad_norm": 0.7588720917701721, + "learning_rate": 1.5078223991101805e-05, + "loss": 0.6855933666229248, + "step": 843 + }, + { + "epoch": 1.0947340673601396, + "grad_norm": 0.7549242973327637, + "learning_rate": 1.5065900360025128e-05, + "loss": 0.7288146615028381, + "step": 844 + }, + { + "epoch": 1.0960317943102784, + "grad_norm": 0.7281069755554199, + "learning_rate": 1.5053566370464416e-05, + "loss": 0.7359070777893066, + "step": 845 + }, + { + "epoch": 1.0973295212604173, + "grad_norm": 0.709331750869751, + "learning_rate": 1.5041222047639558e-05, + "loss": 0.718718945980072, + "step": 846 + }, + { + "epoch": 1.0986272482105561, + "grad_norm": 0.684161365032196, + "learning_rate": 1.5028867416791566e-05, + "loss": 0.6832801699638367, + "step": 847 + }, + { + "epoch": 1.099924975160695, + "grad_norm": 0.7570529580116272, + "learning_rate": 1.5016502503182533e-05, + "loss": 0.712772786617279, + "step": 848 + }, + { + "epoch": 1.101222702110834, + "grad_norm": 0.7224586606025696, + "learning_rate": 1.5004127332095579e-05, + "loss": 0.72933429479599, + "step": 849 + }, + { + "epoch": 1.102520429060973, + "grad_norm": 0.7530233263969421, + "learning_rate": 1.49917419288348e-05, + "loss": 0.7607170343399048, + "step": 850 + }, + { + "epoch": 1.1038181560111118, + "grad_norm": 0.7433916926383972, + "learning_rate": 1.4979346318725203e-05, + "loss": 0.7284337282180786, + "step": 851 + }, + { + "epoch": 1.1051158829612506, + "grad_norm": 0.7271002531051636, + "learning_rate": 1.4966940527112679e-05, + "loss": 0.7452124357223511, + "step": 852 + }, + { + "epoch": 1.1064136099113895, + "grad_norm": 0.7177510857582092, + "learning_rate": 1.4954524579363932e-05, + "loss": 0.7781730890274048, + "step": 853 + }, + { + "epoch": 1.1077113368615286, + "grad_norm": 0.7278553247451782, + "learning_rate": 1.4942098500866428e-05, + "loss": 0.760970413684845, + "step": 854 + }, + { + "epoch": 1.1090090638116674, + "grad_norm": 0.7369382977485657, + "learning_rate": 1.4929662317028359e-05, + "loss": 0.7270724177360535, + "step": 855 + }, + { + "epoch": 1.1103067907618063, + "grad_norm": 0.7529125213623047, + "learning_rate": 1.491721605327857e-05, + "loss": 0.6972394585609436, + "step": 856 + }, + { + "epoch": 1.1116045177119451, + "grad_norm": 0.8102325201034546, + "learning_rate": 1.490475973506652e-05, + "loss": 0.7593643069267273, + "step": 857 + }, + { + "epoch": 1.112902244662084, + "grad_norm": 0.7033381462097168, + "learning_rate": 1.4892293387862221e-05, + "loss": 0.750421404838562, + "step": 858 + }, + { + "epoch": 1.114199971612223, + "grad_norm": 0.7504622340202332, + "learning_rate": 1.487981703715621e-05, + "loss": 0.7422147989273071, + "step": 859 + }, + { + "epoch": 1.115497698562362, + "grad_norm": 0.7424933910369873, + "learning_rate": 1.4867330708459463e-05, + "loss": 0.7375016212463379, + "step": 860 + }, + { + "epoch": 1.1167954255125008, + "grad_norm": 0.73978191614151, + "learning_rate": 1.4854834427303353e-05, + "loss": 0.7315906286239624, + "step": 861 + }, + { + "epoch": 1.1180931524626396, + "grad_norm": 0.7480568289756775, + "learning_rate": 1.4842328219239618e-05, + "loss": 0.7146769762039185, + "step": 862 + }, + { + "epoch": 1.1193908794127785, + "grad_norm": 0.6838370561599731, + "learning_rate": 1.4829812109840291e-05, + "loss": 0.6863071918487549, + "step": 863 + }, + { + "epoch": 1.1206886063629176, + "grad_norm": 0.69765305519104, + "learning_rate": 1.4817286124697647e-05, + "loss": 0.6740079522132874, + "step": 864 + }, + { + "epoch": 1.1219863333130564, + "grad_norm": 0.7375463843345642, + "learning_rate": 1.480475028942415e-05, + "loss": 0.7721714973449707, + "step": 865 + }, + { + "epoch": 1.1232840602631953, + "grad_norm": 0.7765669226646423, + "learning_rate": 1.4792204629652414e-05, + "loss": 0.6988716125488281, + "step": 866 + }, + { + "epoch": 1.1245817872133341, + "grad_norm": 0.6921293139457703, + "learning_rate": 1.4779649171035138e-05, + "loss": 0.7338443398475647, + "step": 867 + }, + { + "epoch": 1.125879514163473, + "grad_norm": 0.7645788192749023, + "learning_rate": 1.4767083939245055e-05, + "loss": 0.7597560882568359, + "step": 868 + }, + { + "epoch": 1.1271772411136118, + "grad_norm": 0.7806273698806763, + "learning_rate": 1.475450895997489e-05, + "loss": 0.7360360026359558, + "step": 869 + }, + { + "epoch": 1.128474968063751, + "grad_norm": 0.7329487204551697, + "learning_rate": 1.4741924258937283e-05, + "loss": 0.694042980670929, + "step": 870 + }, + { + "epoch": 1.1297726950138898, + "grad_norm": 0.7490030527114868, + "learning_rate": 1.472932986186477e-05, + "loss": 0.771519660949707, + "step": 871 + }, + { + "epoch": 1.1310704219640286, + "grad_norm": 0.7821305990219116, + "learning_rate": 1.47167257945097e-05, + "loss": 0.7572095990180969, + "step": 872 + }, + { + "epoch": 1.1323681489141675, + "grad_norm": 0.745883584022522, + "learning_rate": 1.4704112082644207e-05, + "loss": 0.7173527479171753, + "step": 873 + }, + { + "epoch": 1.1336658758643063, + "grad_norm": 0.7457818984985352, + "learning_rate": 1.4691488752060132e-05, + "loss": 0.7411136031150818, + "step": 874 + }, + { + "epoch": 1.1349636028144454, + "grad_norm": 0.7116679549217224, + "learning_rate": 1.4678855828568996e-05, + "loss": 0.6630608439445496, + "step": 875 + }, + { + "epoch": 1.1362613297645843, + "grad_norm": 0.7429471611976624, + "learning_rate": 1.4666213338001929e-05, + "loss": 0.6890819668769836, + "step": 876 + }, + { + "epoch": 1.1375590567147231, + "grad_norm": 0.7173399925231934, + "learning_rate": 1.4653561306209625e-05, + "loss": 0.7061414122581482, + "step": 877 + }, + { + "epoch": 1.138856783664862, + "grad_norm": 0.7341779470443726, + "learning_rate": 1.4640899759062285e-05, + "loss": 0.7564276456832886, + "step": 878 + }, + { + "epoch": 1.1401545106150008, + "grad_norm": 0.73567795753479, + "learning_rate": 1.462822872244957e-05, + "loss": 0.7193140983581543, + "step": 879 + }, + { + "epoch": 1.1414522375651397, + "grad_norm": 0.7359784841537476, + "learning_rate": 1.461554822228054e-05, + "loss": 0.724113941192627, + "step": 880 + }, + { + "epoch": 1.1427499645152788, + "grad_norm": 0.6934400200843811, + "learning_rate": 1.460285828448361e-05, + "loss": 0.6648344397544861, + "step": 881 + }, + { + "epoch": 1.1440476914654176, + "grad_norm": 0.6720191836357117, + "learning_rate": 1.4590158935006494e-05, + "loss": 0.6355569362640381, + "step": 882 + }, + { + "epoch": 1.1453454184155565, + "grad_norm": 0.7342029809951782, + "learning_rate": 1.4577450199816142e-05, + "loss": 0.7470182180404663, + "step": 883 + }, + { + "epoch": 1.1466431453656953, + "grad_norm": 0.7566630244255066, + "learning_rate": 1.4564732104898702e-05, + "loss": 0.7848218679428101, + "step": 884 + }, + { + "epoch": 1.1479408723158344, + "grad_norm": 0.6953855752944946, + "learning_rate": 1.4552004676259462e-05, + "loss": 0.7087516784667969, + "step": 885 + }, + { + "epoch": 1.1492385992659733, + "grad_norm": 0.7306509613990784, + "learning_rate": 1.453926793992279e-05, + "loss": 0.7669079303741455, + "step": 886 + }, + { + "epoch": 1.1505363262161121, + "grad_norm": 0.7278076410293579, + "learning_rate": 1.4526521921932091e-05, + "loss": 0.7629184722900391, + "step": 887 + }, + { + "epoch": 1.151834053166251, + "grad_norm": 0.7405791878700256, + "learning_rate": 1.4513766648349742e-05, + "loss": 0.6739349961280823, + "step": 888 + }, + { + "epoch": 1.1531317801163898, + "grad_norm": 0.7238565683364868, + "learning_rate": 1.4501002145257048e-05, + "loss": 0.7271534204483032, + "step": 889 + }, + { + "epoch": 1.1544295070665287, + "grad_norm": 0.6887433528900146, + "learning_rate": 1.4488228438754191e-05, + "loss": 0.7166074514389038, + "step": 890 + }, + { + "epoch": 1.1557272340166678, + "grad_norm": 0.7274357676506042, + "learning_rate": 1.4475445554960166e-05, + "loss": 0.7644513845443726, + "step": 891 + }, + { + "epoch": 1.1570249609668066, + "grad_norm": 0.7332258224487305, + "learning_rate": 1.4462653520012736e-05, + "loss": 0.7806090116500854, + "step": 892 + }, + { + "epoch": 1.1583226879169455, + "grad_norm": 0.7651371359825134, + "learning_rate": 1.4449852360068372e-05, + "loss": 0.774925947189331, + "step": 893 + }, + { + "epoch": 1.1596204148670843, + "grad_norm": 0.718445897102356, + "learning_rate": 1.4437042101302212e-05, + "loss": 0.7388082146644592, + "step": 894 + }, + { + "epoch": 1.1609181418172232, + "grad_norm": 0.7201905250549316, + "learning_rate": 1.4424222769907985e-05, + "loss": 0.6872411966323853, + "step": 895 + }, + { + "epoch": 1.1622158687673623, + "grad_norm": 0.7322660088539124, + "learning_rate": 1.4411394392097985e-05, + "loss": 0.7020053267478943, + "step": 896 + }, + { + "epoch": 1.1635135957175011, + "grad_norm": 0.7322126626968384, + "learning_rate": 1.4398556994102996e-05, + "loss": 0.746367335319519, + "step": 897 + }, + { + "epoch": 1.16481132266764, + "grad_norm": 0.7316040992736816, + "learning_rate": 1.4385710602172245e-05, + "loss": 0.7530633807182312, + "step": 898 + }, + { + "epoch": 1.1661090496177788, + "grad_norm": 0.7623510360717773, + "learning_rate": 1.4372855242573356e-05, + "loss": 0.7122158408164978, + "step": 899 + }, + { + "epoch": 1.1674067765679177, + "grad_norm": 0.7587069869041443, + "learning_rate": 1.4359990941592283e-05, + "loss": 0.7452347278594971, + "step": 900 + }, + { + "epoch": 1.1687045035180565, + "grad_norm": 0.7146732807159424, + "learning_rate": 1.4347117725533269e-05, + "loss": 0.670911431312561, + "step": 901 + }, + { + "epoch": 1.1700022304681956, + "grad_norm": 0.6925002932548523, + "learning_rate": 1.4334235620718774e-05, + "loss": 0.6600379943847656, + "step": 902 + }, + { + "epoch": 1.1712999574183345, + "grad_norm": 0.7344015836715698, + "learning_rate": 1.4321344653489453e-05, + "loss": 0.7038690447807312, + "step": 903 + }, + { + "epoch": 1.1725976843684733, + "grad_norm": 0.7387973070144653, + "learning_rate": 1.4308444850204066e-05, + "loss": 0.7008363604545593, + "step": 904 + }, + { + "epoch": 1.1738954113186122, + "grad_norm": 0.7728487849235535, + "learning_rate": 1.4295536237239445e-05, + "loss": 0.7336927652359009, + "step": 905 + }, + { + "epoch": 1.175193138268751, + "grad_norm": 0.7491990923881531, + "learning_rate": 1.4282618840990438e-05, + "loss": 0.7324055433273315, + "step": 906 + }, + { + "epoch": 1.1764908652188901, + "grad_norm": 0.723862886428833, + "learning_rate": 1.4269692687869849e-05, + "loss": 0.7677553296089172, + "step": 907 + }, + { + "epoch": 1.177788592169029, + "grad_norm": 0.7578226923942566, + "learning_rate": 1.425675780430839e-05, + "loss": 0.7772313356399536, + "step": 908 + }, + { + "epoch": 1.1790863191191678, + "grad_norm": 0.7269909977912903, + "learning_rate": 1.4243814216754626e-05, + "loss": 0.7330427765846252, + "step": 909 + }, + { + "epoch": 1.1803840460693067, + "grad_norm": 0.7582956552505493, + "learning_rate": 1.4230861951674914e-05, + "loss": 0.7717634439468384, + "step": 910 + }, + { + "epoch": 1.1816817730194455, + "grad_norm": 0.7162467837333679, + "learning_rate": 1.421790103555336e-05, + "loss": 0.7092885375022888, + "step": 911 + }, + { + "epoch": 1.1829794999695844, + "grad_norm": 0.743224024772644, + "learning_rate": 1.4204931494891759e-05, + "loss": 0.7082977294921875, + "step": 912 + }, + { + "epoch": 1.1842772269197235, + "grad_norm": 0.7687066197395325, + "learning_rate": 1.4191953356209535e-05, + "loss": 0.7173585295677185, + "step": 913 + }, + { + "epoch": 1.1855749538698623, + "grad_norm": 0.7276656627655029, + "learning_rate": 1.4178966646043702e-05, + "loss": 0.6923103928565979, + "step": 914 + }, + { + "epoch": 1.1868726808200012, + "grad_norm": 0.7307775020599365, + "learning_rate": 1.4165971390948787e-05, + "loss": 0.7817268967628479, + "step": 915 + }, + { + "epoch": 1.18817040777014, + "grad_norm": 0.7706684470176697, + "learning_rate": 1.4152967617496805e-05, + "loss": 0.7029048800468445, + "step": 916 + }, + { + "epoch": 1.1894681347202791, + "grad_norm": 0.7382630705833435, + "learning_rate": 1.4139955352277176e-05, + "loss": 0.6833078265190125, + "step": 917 + }, + { + "epoch": 1.190765861670418, + "grad_norm": 0.6961492300033569, + "learning_rate": 1.4126934621896692e-05, + "loss": 0.6633516550064087, + "step": 918 + }, + { + "epoch": 1.1920635886205568, + "grad_norm": 0.7289763689041138, + "learning_rate": 1.4113905452979455e-05, + "loss": 0.7273116707801819, + "step": 919 + }, + { + "epoch": 1.1933613155706957, + "grad_norm": 0.6953696608543396, + "learning_rate": 1.410086787216681e-05, + "loss": 0.6880172491073608, + "step": 920 + }, + { + "epoch": 1.1933613155706957, + "eval_loss": 0.7621704339981079, + "eval_runtime": 143.9146, + "eval_samples_per_second": 36.077, + "eval_steps_per_second": 9.019, + "step": 920 + }, + { + "epoch": 1.1946590425208345, + "grad_norm": 0.6652716398239136, + "learning_rate": 1.4087821906117314e-05, + "loss": 0.6670587658882141, + "step": 921 + }, + { + "epoch": 1.1959567694709734, + "grad_norm": 0.7497081756591797, + "learning_rate": 1.4074767581506666e-05, + "loss": 0.7381057739257812, + "step": 922 + }, + { + "epoch": 1.1972544964211125, + "grad_norm": 0.710457444190979, + "learning_rate": 1.4061704925027653e-05, + "loss": 0.6957287192344666, + "step": 923 + }, + { + "epoch": 1.1985522233712513, + "grad_norm": 0.7493513226509094, + "learning_rate": 1.4048633963390105e-05, + "loss": 0.6821112036705017, + "step": 924 + }, + { + "epoch": 1.1998499503213902, + "grad_norm": 0.7443753480911255, + "learning_rate": 1.4035554723320828e-05, + "loss": 0.7110794186592102, + "step": 925 + }, + { + "epoch": 1.201147677271529, + "grad_norm": 0.6964433789253235, + "learning_rate": 1.4022467231563554e-05, + "loss": 0.6899577379226685, + "step": 926 + }, + { + "epoch": 1.202445404221668, + "grad_norm": 0.718528687953949, + "learning_rate": 1.4009371514878898e-05, + "loss": 0.7851035594940186, + "step": 927 + }, + { + "epoch": 1.203743131171807, + "grad_norm": 0.7249849438667297, + "learning_rate": 1.399626760004428e-05, + "loss": 0.7298780679702759, + "step": 928 + }, + { + "epoch": 1.2050408581219458, + "grad_norm": 0.6934380531311035, + "learning_rate": 1.3983155513853897e-05, + "loss": 0.7791250944137573, + "step": 929 + }, + { + "epoch": 1.2063385850720847, + "grad_norm": 0.704552173614502, + "learning_rate": 1.3970035283118639e-05, + "loss": 0.7045942544937134, + "step": 930 + }, + { + "epoch": 1.2076363120222235, + "grad_norm": 0.748252809047699, + "learning_rate": 1.3956906934666056e-05, + "loss": 0.7210633158683777, + "step": 931 + }, + { + "epoch": 1.2089340389723624, + "grad_norm": 0.7162604331970215, + "learning_rate": 1.3943770495340307e-05, + "loss": 0.7707422375679016, + "step": 932 + }, + { + "epoch": 1.2102317659225013, + "grad_norm": 0.6919230222702026, + "learning_rate": 1.3930625992002076e-05, + "loss": 0.7039645910263062, + "step": 933 + }, + { + "epoch": 1.2115294928726403, + "grad_norm": 0.7416049242019653, + "learning_rate": 1.391747345152855e-05, + "loss": 0.7351235747337341, + "step": 934 + }, + { + "epoch": 1.2128272198227792, + "grad_norm": 0.7046512961387634, + "learning_rate": 1.3904312900813345e-05, + "loss": 0.659813642501831, + "step": 935 + }, + { + "epoch": 1.214124946772918, + "grad_norm": 0.6865445971488953, + "learning_rate": 1.3891144366766457e-05, + "loss": 0.6879123449325562, + "step": 936 + }, + { + "epoch": 1.215422673723057, + "grad_norm": 0.7112798094749451, + "learning_rate": 1.3877967876314205e-05, + "loss": 0.745692789554596, + "step": 937 + }, + { + "epoch": 1.216720400673196, + "grad_norm": 0.7131559252738953, + "learning_rate": 1.3864783456399174e-05, + "loss": 0.7047199010848999, + "step": 938 + }, + { + "epoch": 1.2180181276233348, + "grad_norm": 0.7183334231376648, + "learning_rate": 1.3851591133980167e-05, + "loss": 0.7335140109062195, + "step": 939 + }, + { + "epoch": 1.2193158545734737, + "grad_norm": 0.7161308526992798, + "learning_rate": 1.3838390936032146e-05, + "loss": 0.6805643439292908, + "step": 940 + }, + { + "epoch": 1.2206135815236125, + "grad_norm": 0.6899462938308716, + "learning_rate": 1.3825182889546173e-05, + "loss": 0.6711665391921997, + "step": 941 + }, + { + "epoch": 1.2219113084737514, + "grad_norm": 0.7179728150367737, + "learning_rate": 1.3811967021529362e-05, + "loss": 0.730987012386322, + "step": 942 + }, + { + "epoch": 1.2232090354238903, + "grad_norm": 0.7028578519821167, + "learning_rate": 1.3798743359004816e-05, + "loss": 0.7164129614830017, + "step": 943 + }, + { + "epoch": 1.2245067623740293, + "grad_norm": 0.7241238355636597, + "learning_rate": 1.378551192901158e-05, + "loss": 0.6604956984519958, + "step": 944 + }, + { + "epoch": 1.2258044893241682, + "grad_norm": 0.6871349215507507, + "learning_rate": 1.3772272758604576e-05, + "loss": 0.705906093120575, + "step": 945 + }, + { + "epoch": 1.227102216274307, + "grad_norm": 0.7182629108428955, + "learning_rate": 1.375902587485456e-05, + "loss": 0.6978931427001953, + "step": 946 + }, + { + "epoch": 1.228399943224446, + "grad_norm": 0.7523950934410095, + "learning_rate": 1.3745771304848056e-05, + "loss": 0.669691264629364, + "step": 947 + }, + { + "epoch": 1.2296976701745848, + "grad_norm": 0.736535906791687, + "learning_rate": 1.3732509075687302e-05, + "loss": 0.6971163749694824, + "step": 948 + }, + { + "epoch": 1.2309953971247238, + "grad_norm": 0.773280143737793, + "learning_rate": 1.3719239214490203e-05, + "loss": 0.7307339906692505, + "step": 949 + }, + { + "epoch": 1.2322931240748627, + "grad_norm": 0.7597857713699341, + "learning_rate": 1.3705961748390264e-05, + "loss": 0.6916163563728333, + "step": 950 + }, + { + "epoch": 1.2335908510250015, + "grad_norm": 0.7426233291625977, + "learning_rate": 1.3692676704536547e-05, + "loss": 0.7779046297073364, + "step": 951 + }, + { + "epoch": 1.2348885779751404, + "grad_norm": 0.7428677082061768, + "learning_rate": 1.3679384110093601e-05, + "loss": 0.7056743502616882, + "step": 952 + }, + { + "epoch": 1.2361863049252793, + "grad_norm": 0.7308823466300964, + "learning_rate": 1.3666083992241414e-05, + "loss": 0.7445065379142761, + "step": 953 + }, + { + "epoch": 1.2374840318754181, + "grad_norm": 0.7000466585159302, + "learning_rate": 1.3652776378175366e-05, + "loss": 0.7621708512306213, + "step": 954 + }, + { + "epoch": 1.2387817588255572, + "grad_norm": 0.7069138288497925, + "learning_rate": 1.3639461295106157e-05, + "loss": 0.6963789463043213, + "step": 955 + }, + { + "epoch": 1.240079485775696, + "grad_norm": 0.7114101052284241, + "learning_rate": 1.3626138770259765e-05, + "loss": 0.6562871932983398, + "step": 956 + }, + { + "epoch": 1.241377212725835, + "grad_norm": 0.7246086597442627, + "learning_rate": 1.3612808830877377e-05, + "loss": 0.6914277672767639, + "step": 957 + }, + { + "epoch": 1.2426749396759738, + "grad_norm": 0.7212405800819397, + "learning_rate": 1.3599471504215347e-05, + "loss": 0.7332183122634888, + "step": 958 + }, + { + "epoch": 1.2439726666261126, + "grad_norm": 0.725243866443634, + "learning_rate": 1.358612681754513e-05, + "loss": 0.7095848321914673, + "step": 959 + }, + { + "epoch": 1.2452703935762517, + "grad_norm": 0.7690359354019165, + "learning_rate": 1.357277479815324e-05, + "loss": 0.7376914024353027, + "step": 960 + }, + { + "epoch": 1.2465681205263905, + "grad_norm": 0.7036330699920654, + "learning_rate": 1.355941547334117e-05, + "loss": 0.6845636367797852, + "step": 961 + }, + { + "epoch": 1.2478658474765294, + "grad_norm": 0.7338976860046387, + "learning_rate": 1.3546048870425356e-05, + "loss": 0.6979953050613403, + "step": 962 + }, + { + "epoch": 1.2491635744266683, + "grad_norm": 0.7343106865882874, + "learning_rate": 1.3532675016737127e-05, + "loss": 0.7461492419242859, + "step": 963 + }, + { + "epoch": 1.250461301376807, + "grad_norm": 0.7208863496780396, + "learning_rate": 1.3519293939622622e-05, + "loss": 0.8038127422332764, + "step": 964 + }, + { + "epoch": 1.251759028326946, + "grad_norm": 0.7410427331924438, + "learning_rate": 1.3505905666442757e-05, + "loss": 0.7741251587867737, + "step": 965 + }, + { + "epoch": 1.253056755277085, + "grad_norm": 0.711874783039093, + "learning_rate": 1.3492510224573165e-05, + "loss": 0.6908672451972961, + "step": 966 + }, + { + "epoch": 1.254354482227224, + "grad_norm": 0.6897700428962708, + "learning_rate": 1.3479107641404134e-05, + "loss": 0.6856587529182434, + "step": 967 + }, + { + "epoch": 1.2556522091773628, + "grad_norm": 0.6764082908630371, + "learning_rate": 1.3465697944340552e-05, + "loss": 0.6477972865104675, + "step": 968 + }, + { + "epoch": 1.2569499361275016, + "grad_norm": 0.7004117965698242, + "learning_rate": 1.3452281160801856e-05, + "loss": 0.7135658264160156, + "step": 969 + }, + { + "epoch": 1.2582476630776407, + "grad_norm": 0.7178849577903748, + "learning_rate": 1.3438857318221974e-05, + "loss": 0.7354244589805603, + "step": 970 + }, + { + "epoch": 1.2595453900277795, + "grad_norm": 0.7121056318283081, + "learning_rate": 1.3425426444049265e-05, + "loss": 0.7121109962463379, + "step": 971 + }, + { + "epoch": 1.2608431169779184, + "grad_norm": 0.8285553455352783, + "learning_rate": 1.3411988565746467e-05, + "loss": 0.7759053111076355, + "step": 972 + }, + { + "epoch": 1.2621408439280573, + "grad_norm": 0.6977941989898682, + "learning_rate": 1.3398543710790642e-05, + "loss": 0.7189201712608337, + "step": 973 + }, + { + "epoch": 1.263438570878196, + "grad_norm": 0.7547982931137085, + "learning_rate": 1.3385091906673115e-05, + "loss": 0.7352871298789978, + "step": 974 + }, + { + "epoch": 1.264736297828335, + "grad_norm": 0.7178804278373718, + "learning_rate": 1.3371633180899417e-05, + "loss": 0.7920108437538147, + "step": 975 + }, + { + "epoch": 1.2660340247784738, + "grad_norm": 0.7035505771636963, + "learning_rate": 1.335816756098924e-05, + "loss": 0.7362672090530396, + "step": 976 + }, + { + "epoch": 1.267331751728613, + "grad_norm": 0.7581067681312561, + "learning_rate": 1.3344695074476365e-05, + "loss": 0.7702075839042664, + "step": 977 + }, + { + "epoch": 1.2686294786787518, + "grad_norm": 0.7533540725708008, + "learning_rate": 1.3331215748908622e-05, + "loss": 0.7555018067359924, + "step": 978 + }, + { + "epoch": 1.2699272056288906, + "grad_norm": 0.7056939601898193, + "learning_rate": 1.3317729611847818e-05, + "loss": 0.7297285795211792, + "step": 979 + }, + { + "epoch": 1.2712249325790295, + "grad_norm": 0.7933931946754456, + "learning_rate": 1.3304236690869688e-05, + "loss": 0.7637395262718201, + "step": 980 + }, + { + "epoch": 1.2725226595291685, + "grad_norm": 0.7511240243911743, + "learning_rate": 1.329073701356384e-05, + "loss": 0.7278518676757812, + "step": 981 + }, + { + "epoch": 1.2738203864793074, + "grad_norm": 0.6915922164916992, + "learning_rate": 1.3277230607533698e-05, + "loss": 0.6694924831390381, + "step": 982 + }, + { + "epoch": 1.2751181134294463, + "grad_norm": 0.7327374219894409, + "learning_rate": 1.3263717500396446e-05, + "loss": 0.714762806892395, + "step": 983 + }, + { + "epoch": 1.276415840379585, + "grad_norm": 0.7382856607437134, + "learning_rate": 1.3250197719782966e-05, + "loss": 0.7134686708450317, + "step": 984 + }, + { + "epoch": 1.277713567329724, + "grad_norm": 0.7472854256629944, + "learning_rate": 1.3236671293337788e-05, + "loss": 0.7220948934555054, + "step": 985 + }, + { + "epoch": 1.2790112942798628, + "grad_norm": 0.7201051712036133, + "learning_rate": 1.3223138248719032e-05, + "loss": 0.7394418120384216, + "step": 986 + }, + { + "epoch": 1.280309021230002, + "grad_norm": 0.7629786133766174, + "learning_rate": 1.3209598613598344e-05, + "loss": 0.7015069127082825, + "step": 987 + }, + { + "epoch": 1.2816067481801408, + "grad_norm": 0.7126546502113342, + "learning_rate": 1.3196052415660856e-05, + "loss": 0.7289220690727234, + "step": 988 + }, + { + "epoch": 1.2829044751302796, + "grad_norm": 0.7296859622001648, + "learning_rate": 1.318249968260511e-05, + "loss": 0.7893659472465515, + "step": 989 + }, + { + "epoch": 1.2842022020804185, + "grad_norm": 0.7498401403427124, + "learning_rate": 1.316894044214302e-05, + "loss": 0.7200069427490234, + "step": 990 + }, + { + "epoch": 1.2854999290305575, + "grad_norm": 0.7126410603523254, + "learning_rate": 1.3155374721999797e-05, + "loss": 0.7033067345619202, + "step": 991 + }, + { + "epoch": 1.2867976559806964, + "grad_norm": 0.7097041606903076, + "learning_rate": 1.3141802549913907e-05, + "loss": 0.7358456254005432, + "step": 992 + }, + { + "epoch": 1.2880953829308353, + "grad_norm": 0.6961123943328857, + "learning_rate": 1.3128223953637003e-05, + "loss": 0.6741704940795898, + "step": 993 + }, + { + "epoch": 1.289393109880974, + "grad_norm": 0.7323908805847168, + "learning_rate": 1.3114638960933883e-05, + "loss": 0.8081434965133667, + "step": 994 + }, + { + "epoch": 1.290690836831113, + "grad_norm": 0.713190495967865, + "learning_rate": 1.3101047599582415e-05, + "loss": 0.7475412487983704, + "step": 995 + }, + { + "epoch": 1.2919885637812518, + "grad_norm": 0.7204756140708923, + "learning_rate": 1.3087449897373494e-05, + "loss": 0.7166237831115723, + "step": 996 + }, + { + "epoch": 1.2932862907313907, + "grad_norm": 0.7209048271179199, + "learning_rate": 1.307384588211098e-05, + "loss": 0.7091537117958069, + "step": 997 + }, + { + "epoch": 1.2945840176815298, + "grad_norm": 0.7139458656311035, + "learning_rate": 1.306023558161164e-05, + "loss": 0.7146654725074768, + "step": 998 + }, + { + "epoch": 1.2958817446316686, + "grad_norm": 0.7128956317901611, + "learning_rate": 1.3046619023705095e-05, + "loss": 0.821353018283844, + "step": 999 + }, + { + "epoch": 1.2971794715818075, + "grad_norm": 0.7287904620170593, + "learning_rate": 1.3032996236233756e-05, + "loss": 0.7813044786453247, + "step": 1000 + }, + { + "epoch": 1.2984771985319463, + "grad_norm": 0.7277258038520813, + "learning_rate": 1.3019367247052781e-05, + "loss": 0.7448681592941284, + "step": 1001 + }, + { + "epoch": 1.2997749254820854, + "grad_norm": 0.7179688811302185, + "learning_rate": 1.300573208403e-05, + "loss": 0.6965285539627075, + "step": 1002 + }, + { + "epoch": 1.3010726524322243, + "grad_norm": 0.7211664319038391, + "learning_rate": 1.2992090775045868e-05, + "loss": 0.7049282789230347, + "step": 1003 + }, + { + "epoch": 1.302370379382363, + "grad_norm": 0.6898071765899658, + "learning_rate": 1.2978443347993415e-05, + "loss": 0.6415733695030212, + "step": 1004 + }, + { + "epoch": 1.303668106332502, + "grad_norm": 0.7255175709724426, + "learning_rate": 1.296478983077817e-05, + "loss": 0.708603024482727, + "step": 1005 + }, + { + "epoch": 1.3049658332826408, + "grad_norm": 0.7339725494384766, + "learning_rate": 1.2951130251318125e-05, + "loss": 0.73588627576828, + "step": 1006 + }, + { + "epoch": 1.3062635602327797, + "grad_norm": 0.6914424300193787, + "learning_rate": 1.2937464637543655e-05, + "loss": 0.7236727476119995, + "step": 1007 + }, + { + "epoch": 1.3075612871829188, + "grad_norm": 0.6850101351737976, + "learning_rate": 1.2923793017397488e-05, + "loss": 0.6565558910369873, + "step": 1008 + }, + { + "epoch": 1.3088590141330576, + "grad_norm": 0.6893193125724792, + "learning_rate": 1.2910115418834624e-05, + "loss": 0.6460487246513367, + "step": 1009 + }, + { + "epoch": 1.3101567410831965, + "grad_norm": 0.7375558018684387, + "learning_rate": 1.289643186982229e-05, + "loss": 0.8016327619552612, + "step": 1010 + }, + { + "epoch": 1.3114544680333353, + "grad_norm": 0.7113102078437805, + "learning_rate": 1.2882742398339884e-05, + "loss": 0.6883566975593567, + "step": 1011 + }, + { + "epoch": 1.3127521949834744, + "grad_norm": 0.7452290058135986, + "learning_rate": 1.2869047032378905e-05, + "loss": 0.7325704097747803, + "step": 1012 + }, + { + "epoch": 1.3140499219336133, + "grad_norm": 0.6935728192329407, + "learning_rate": 1.2855345799942915e-05, + "loss": 0.689193606376648, + "step": 1013 + }, + { + "epoch": 1.315347648883752, + "grad_norm": 0.7144383192062378, + "learning_rate": 1.2841638729047463e-05, + "loss": 0.6948485374450684, + "step": 1014 + }, + { + "epoch": 1.316645375833891, + "grad_norm": 0.6706473231315613, + "learning_rate": 1.2827925847720041e-05, + "loss": 0.7062092423439026, + "step": 1015 + }, + { + "epoch": 1.3179431027840298, + "grad_norm": 0.7125740051269531, + "learning_rate": 1.2814207184000018e-05, + "loss": 0.6752945780754089, + "step": 1016 + }, + { + "epoch": 1.3192408297341687, + "grad_norm": 0.7221876978874207, + "learning_rate": 1.2800482765938594e-05, + "loss": 0.7700286507606506, + "step": 1017 + }, + { + "epoch": 1.3205385566843075, + "grad_norm": 0.6877630949020386, + "learning_rate": 1.2786752621598726e-05, + "loss": 0.7289664149284363, + "step": 1018 + }, + { + "epoch": 1.3218362836344466, + "grad_norm": 0.7257193922996521, + "learning_rate": 1.2773016779055089e-05, + "loss": 0.6938936710357666, + "step": 1019 + }, + { + "epoch": 1.3231340105845855, + "grad_norm": 0.6880965828895569, + "learning_rate": 1.2759275266393998e-05, + "loss": 0.6982592344284058, + "step": 1020 + }, + { + "epoch": 1.3244317375347243, + "grad_norm": 0.683870792388916, + "learning_rate": 1.2745528111713373e-05, + "loss": 0.6983235478401184, + "step": 1021 + }, + { + "epoch": 1.3257294644848632, + "grad_norm": 0.7127654552459717, + "learning_rate": 1.2731775343122663e-05, + "loss": 0.7544030547142029, + "step": 1022 + }, + { + "epoch": 1.3270271914350023, + "grad_norm": 0.7284364104270935, + "learning_rate": 1.2718016988742799e-05, + "loss": 0.7375183701515198, + "step": 1023 + }, + { + "epoch": 1.328324918385141, + "grad_norm": 0.6857113838195801, + "learning_rate": 1.270425307670614e-05, + "loss": 0.6983596682548523, + "step": 1024 + }, + { + "epoch": 1.32962264533528, + "grad_norm": 0.7102038860321045, + "learning_rate": 1.2690483635156392e-05, + "loss": 0.7385768294334412, + "step": 1025 + }, + { + "epoch": 1.3309203722854188, + "grad_norm": 0.7345147728919983, + "learning_rate": 1.2676708692248583e-05, + "loss": 0.6854493618011475, + "step": 1026 + }, + { + "epoch": 1.3322180992355577, + "grad_norm": 0.7039386630058289, + "learning_rate": 1.2662928276148985e-05, + "loss": 0.7170513868331909, + "step": 1027 + }, + { + "epoch": 1.3335158261856965, + "grad_norm": 0.6941388845443726, + "learning_rate": 1.264914241503506e-05, + "loss": 0.7566976547241211, + "step": 1028 + }, + { + "epoch": 1.3348135531358354, + "grad_norm": 0.6874922513961792, + "learning_rate": 1.2635351137095408e-05, + "loss": 0.6834582686424255, + "step": 1029 + }, + { + "epoch": 1.3361112800859745, + "grad_norm": 0.7201216220855713, + "learning_rate": 1.2621554470529698e-05, + "loss": 0.734821617603302, + "step": 1030 + }, + { + "epoch": 1.3374090070361133, + "grad_norm": 0.7032731175422668, + "learning_rate": 1.2607752443548622e-05, + "loss": 0.7255396842956543, + "step": 1031 + }, + { + "epoch": 1.3387067339862522, + "grad_norm": 0.7893847823143005, + "learning_rate": 1.259394508437383e-05, + "loss": 0.7393696308135986, + "step": 1032 + }, + { + "epoch": 1.340004460936391, + "grad_norm": 0.7231351137161255, + "learning_rate": 1.2580132421237883e-05, + "loss": 0.7424145340919495, + "step": 1033 + }, + { + "epoch": 1.34130218788653, + "grad_norm": 0.7326940298080444, + "learning_rate": 1.2566314482384174e-05, + "loss": 0.7439311742782593, + "step": 1034 + }, + { + "epoch": 1.342599914836669, + "grad_norm": 0.775790810585022, + "learning_rate": 1.2552491296066895e-05, + "loss": 0.7325758934020996, + "step": 1035 + }, + { + "epoch": 1.3438976417868078, + "grad_norm": 0.7467171549797058, + "learning_rate": 1.2538662890550959e-05, + "loss": 0.7975653409957886, + "step": 1036 + }, + { + "epoch": 1.3451953687369467, + "grad_norm": 0.762482225894928, + "learning_rate": 1.252482929411196e-05, + "loss": 0.7613498568534851, + "step": 1037 + }, + { + "epoch": 1.3464930956870855, + "grad_norm": 0.6938416957855225, + "learning_rate": 1.25109905350361e-05, + "loss": 0.691423773765564, + "step": 1038 + }, + { + "epoch": 1.3477908226372244, + "grad_norm": 0.7459502816200256, + "learning_rate": 1.249714664162014e-05, + "loss": 0.7226969003677368, + "step": 1039 + }, + { + "epoch": 1.3490885495873635, + "grad_norm": 0.7236127853393555, + "learning_rate": 1.2483297642171332e-05, + "loss": 0.7204033732414246, + "step": 1040 + }, + { + "epoch": 1.3503862765375023, + "grad_norm": 0.7287815809249878, + "learning_rate": 1.246944356500738e-05, + "loss": 0.7803208231925964, + "step": 1041 + }, + { + "epoch": 1.3516840034876412, + "grad_norm": 0.7607238292694092, + "learning_rate": 1.2455584438456366e-05, + "loss": 0.7617399096488953, + "step": 1042 + }, + { + "epoch": 1.35298173043778, + "grad_norm": 0.707085907459259, + "learning_rate": 1.2441720290856694e-05, + "loss": 0.7277243733406067, + "step": 1043 + }, + { + "epoch": 1.354279457387919, + "grad_norm": 0.7148833274841309, + "learning_rate": 1.2427851150557036e-05, + "loss": 0.7467551231384277, + "step": 1044 + }, + { + "epoch": 1.355577184338058, + "grad_norm": 0.7209689617156982, + "learning_rate": 1.241397704591627e-05, + "loss": 0.6694290637969971, + "step": 1045 + }, + { + "epoch": 1.3568749112881968, + "grad_norm": 0.7720620036125183, + "learning_rate": 1.2400098005303436e-05, + "loss": 0.7658464312553406, + "step": 1046 + }, + { + "epoch": 1.3581726382383357, + "grad_norm": 0.68074631690979, + "learning_rate": 1.238621405709766e-05, + "loss": 0.6357854008674622, + "step": 1047 + }, + { + "epoch": 1.3594703651884745, + "grad_norm": 0.7629329562187195, + "learning_rate": 1.2372325229688093e-05, + "loss": 0.7309067249298096, + "step": 1048 + }, + { + "epoch": 1.3607680921386134, + "grad_norm": 0.7004507184028625, + "learning_rate": 1.235843155147388e-05, + "loss": 0.6715525388717651, + "step": 1049 + }, + { + "epoch": 1.3620658190887522, + "grad_norm": 0.6997591853141785, + "learning_rate": 1.2344533050864071e-05, + "loss": 0.6700186729431152, + "step": 1050 + }, + { + "epoch": 1.3633635460388913, + "grad_norm": 0.7181966304779053, + "learning_rate": 1.2330629756277588e-05, + "loss": 0.6444705724716187, + "step": 1051 + }, + { + "epoch": 1.3646612729890302, + "grad_norm": 0.780085563659668, + "learning_rate": 1.2316721696143141e-05, + "loss": 0.7659810185432434, + "step": 1052 + }, + { + "epoch": 1.365958999939169, + "grad_norm": 0.690724790096283, + "learning_rate": 1.23028088988992e-05, + "loss": 0.6315090656280518, + "step": 1053 + }, + { + "epoch": 1.3672567268893079, + "grad_norm": 0.7686077356338501, + "learning_rate": 1.228889139299391e-05, + "loss": 0.8060528039932251, + "step": 1054 + }, + { + "epoch": 1.368554453839447, + "grad_norm": 0.7056965827941895, + "learning_rate": 1.2274969206885048e-05, + "loss": 0.6794640421867371, + "step": 1055 + }, + { + "epoch": 1.3698521807895858, + "grad_norm": 0.7886383533477783, + "learning_rate": 1.2261042369039966e-05, + "loss": 0.7453962564468384, + "step": 1056 + }, + { + "epoch": 1.3711499077397247, + "grad_norm": 0.6753075122833252, + "learning_rate": 1.2247110907935518e-05, + "loss": 0.6878754496574402, + "step": 1057 + }, + { + "epoch": 1.3724476346898635, + "grad_norm": 0.670427143573761, + "learning_rate": 1.2233174852058015e-05, + "loss": 0.6822103261947632, + "step": 1058 + }, + { + "epoch": 1.3737453616400024, + "grad_norm": 0.725235641002655, + "learning_rate": 1.2219234229903163e-05, + "loss": 0.7130811810493469, + "step": 1059 + }, + { + "epoch": 1.3750430885901412, + "grad_norm": 0.7341755032539368, + "learning_rate": 1.2205289069976012e-05, + "loss": 0.6956161856651306, + "step": 1060 + }, + { + "epoch": 1.37634081554028, + "grad_norm": 0.7005776166915894, + "learning_rate": 1.2191339400790881e-05, + "loss": 0.6915519833564758, + "step": 1061 + }, + { + "epoch": 1.3776385424904192, + "grad_norm": 0.7250275015830994, + "learning_rate": 1.2177385250871312e-05, + "loss": 0.7210217118263245, + "step": 1062 + }, + { + "epoch": 1.378936269440558, + "grad_norm": 0.7169617414474487, + "learning_rate": 1.2163426648750009e-05, + "loss": 0.7050390839576721, + "step": 1063 + }, + { + "epoch": 1.3802339963906969, + "grad_norm": 0.7458826303482056, + "learning_rate": 1.2149463622968782e-05, + "loss": 0.7116800546646118, + "step": 1064 + }, + { + "epoch": 1.3815317233408357, + "grad_norm": 0.7212430834770203, + "learning_rate": 1.2135496202078487e-05, + "loss": 0.658031165599823, + "step": 1065 + }, + { + "epoch": 1.3828294502909748, + "grad_norm": 0.7072278261184692, + "learning_rate": 1.2121524414638958e-05, + "loss": 0.7117524147033691, + "step": 1066 + }, + { + "epoch": 1.3841271772411137, + "grad_norm": 0.7267945408821106, + "learning_rate": 1.2107548289218968e-05, + "loss": 0.690047025680542, + "step": 1067 + }, + { + "epoch": 1.3854249041912525, + "grad_norm": 0.7326766848564148, + "learning_rate": 1.2093567854396158e-05, + "loss": 0.7240371704101562, + "step": 1068 + }, + { + "epoch": 1.3867226311413914, + "grad_norm": 0.6955649256706238, + "learning_rate": 1.2079583138756976e-05, + "loss": 0.7229723334312439, + "step": 1069 + }, + { + "epoch": 1.3880203580915302, + "grad_norm": 0.6991240978240967, + "learning_rate": 1.206559417089663e-05, + "loss": 0.7131638526916504, + "step": 1070 + }, + { + "epoch": 1.389318085041669, + "grad_norm": 0.7009238600730896, + "learning_rate": 1.205160097941901e-05, + "loss": 0.7577610611915588, + "step": 1071 + }, + { + "epoch": 1.3906158119918082, + "grad_norm": 0.7368999719619751, + "learning_rate": 1.2037603592936656e-05, + "loss": 0.7876178026199341, + "step": 1072 + }, + { + "epoch": 1.391913538941947, + "grad_norm": 0.7627021670341492, + "learning_rate": 1.2023602040070679e-05, + "loss": 0.8456990718841553, + "step": 1073 + }, + { + "epoch": 1.3932112658920859, + "grad_norm": 0.7341564893722534, + "learning_rate": 1.2009596349450717e-05, + "loss": 0.7692890167236328, + "step": 1074 + }, + { + "epoch": 1.3945089928422247, + "grad_norm": 0.706305205821991, + "learning_rate": 1.1995586549714855e-05, + "loss": 0.7290987372398376, + "step": 1075 + }, + { + "epoch": 1.3958067197923638, + "grad_norm": 0.7150030136108398, + "learning_rate": 1.198157266950959e-05, + "loss": 0.7904977202415466, + "step": 1076 + }, + { + "epoch": 1.3971044467425027, + "grad_norm": 0.6936087608337402, + "learning_rate": 1.1967554737489762e-05, + "loss": 0.7233096361160278, + "step": 1077 + }, + { + "epoch": 1.3984021736926415, + "grad_norm": 0.705502450466156, + "learning_rate": 1.1953532782318491e-05, + "loss": 0.6974169015884399, + "step": 1078 + }, + { + "epoch": 1.3996999006427804, + "grad_norm": 0.7046432495117188, + "learning_rate": 1.1939506832667129e-05, + "loss": 0.7049128413200378, + "step": 1079 + }, + { + "epoch": 1.4009976275929192, + "grad_norm": 0.7448377013206482, + "learning_rate": 1.1925476917215191e-05, + "loss": 0.7288391590118408, + "step": 1080 + }, + { + "epoch": 1.402295354543058, + "grad_norm": 0.7215666174888611, + "learning_rate": 1.1911443064650301e-05, + "loss": 0.7517431974411011, + "step": 1081 + }, + { + "epoch": 1.403593081493197, + "grad_norm": 0.7152860164642334, + "learning_rate": 1.189740530366814e-05, + "loss": 0.7353943586349487, + "step": 1082 + }, + { + "epoch": 1.404890808443336, + "grad_norm": 0.7322341203689575, + "learning_rate": 1.1883363662972375e-05, + "loss": 0.7282765507698059, + "step": 1083 + }, + { + "epoch": 1.4061885353934749, + "grad_norm": 0.7007766962051392, + "learning_rate": 1.1869318171274606e-05, + "loss": 0.6773781776428223, + "step": 1084 + }, + { + "epoch": 1.4074862623436137, + "grad_norm": 0.6969038248062134, + "learning_rate": 1.1855268857294308e-05, + "loss": 0.7106554508209229, + "step": 1085 + }, + { + "epoch": 1.4087839892937526, + "grad_norm": 0.7315483093261719, + "learning_rate": 1.1841215749758774e-05, + "loss": 0.7127244472503662, + "step": 1086 + }, + { + "epoch": 1.4100817162438917, + "grad_norm": 0.7427330613136292, + "learning_rate": 1.182715887740305e-05, + "loss": 0.7914733290672302, + "step": 1087 + }, + { + "epoch": 1.4113794431940305, + "grad_norm": 0.7135612964630127, + "learning_rate": 1.1813098268969886e-05, + "loss": 0.7351382374763489, + "step": 1088 + }, + { + "epoch": 1.4126771701441694, + "grad_norm": 0.6763968467712402, + "learning_rate": 1.1799033953209664e-05, + "loss": 0.7243238687515259, + "step": 1089 + }, + { + "epoch": 1.4139748970943082, + "grad_norm": 0.6963580250740051, + "learning_rate": 1.178496595888035e-05, + "loss": 0.718358039855957, + "step": 1090 + }, + { + "epoch": 1.415272624044447, + "grad_norm": 0.7186612486839294, + "learning_rate": 1.1770894314747433e-05, + "loss": 0.7567769885063171, + "step": 1091 + }, + { + "epoch": 1.416570350994586, + "grad_norm": 0.7769639492034912, + "learning_rate": 1.1756819049583861e-05, + "loss": 0.6931068301200867, + "step": 1092 + }, + { + "epoch": 1.417868077944725, + "grad_norm": 0.6902489066123962, + "learning_rate": 1.1742740192169995e-05, + "loss": 0.7427462339401245, + "step": 1093 + }, + { + "epoch": 1.4191658048948639, + "grad_norm": 0.7374582886695862, + "learning_rate": 1.1728657771293529e-05, + "loss": 0.7023187279701233, + "step": 1094 + }, + { + "epoch": 1.4204635318450027, + "grad_norm": 0.7119615077972412, + "learning_rate": 1.171457181574945e-05, + "loss": 0.7274259328842163, + "step": 1095 + }, + { + "epoch": 1.4217612587951416, + "grad_norm": 0.7346155047416687, + "learning_rate": 1.1700482354339972e-05, + "loss": 0.7683991193771362, + "step": 1096 + }, + { + "epoch": 1.4230589857452807, + "grad_norm": 0.7501071095466614, + "learning_rate": 1.168638941587448e-05, + "loss": 0.7191241979598999, + "step": 1097 + }, + { + "epoch": 1.4243567126954195, + "grad_norm": 0.7470526695251465, + "learning_rate": 1.1672293029169466e-05, + "loss": 0.6885469555854797, + "step": 1098 + }, + { + "epoch": 1.4256544396455584, + "grad_norm": 0.7323938608169556, + "learning_rate": 1.165819322304847e-05, + "loss": 0.7280178666114807, + "step": 1099 + }, + { + "epoch": 1.4269521665956972, + "grad_norm": 0.735260546207428, + "learning_rate": 1.164409002634203e-05, + "loss": 0.7417027354240417, + "step": 1100 + }, + { + "epoch": 1.428249893545836, + "grad_norm": 0.6863338351249695, + "learning_rate": 1.162998346788761e-05, + "loss": 0.7153418660163879, + "step": 1101 + }, + { + "epoch": 1.429547620495975, + "grad_norm": 0.6918323636054993, + "learning_rate": 1.1615873576529556e-05, + "loss": 0.7203163504600525, + "step": 1102 + }, + { + "epoch": 1.4308453474461138, + "grad_norm": 0.6796247363090515, + "learning_rate": 1.1601760381119022e-05, + "loss": 0.6820694208145142, + "step": 1103 + }, + { + "epoch": 1.4321430743962529, + "grad_norm": 0.7495130896568298, + "learning_rate": 1.158764391051392e-05, + "loss": 0.8182595372200012, + "step": 1104 + }, + { + "epoch": 1.4334408013463917, + "grad_norm": 0.702680766582489, + "learning_rate": 1.1573524193578863e-05, + "loss": 0.6952674984931946, + "step": 1105 + }, + { + "epoch": 1.4347385282965306, + "grad_norm": 0.7394551634788513, + "learning_rate": 1.1559401259185095e-05, + "loss": 0.7986393570899963, + "step": 1106 + }, + { + "epoch": 1.4360362552466694, + "grad_norm": 0.7024036049842834, + "learning_rate": 1.1545275136210441e-05, + "loss": 0.7037473917007446, + "step": 1107 + }, + { + "epoch": 1.4373339821968085, + "grad_norm": 0.7654225826263428, + "learning_rate": 1.153114585353925e-05, + "loss": 0.788162350654602, + "step": 1108 + }, + { + "epoch": 1.4386317091469474, + "grad_norm": 0.7220718264579773, + "learning_rate": 1.1517013440062326e-05, + "loss": 0.677041232585907, + "step": 1109 + }, + { + "epoch": 1.4399294360970862, + "grad_norm": 0.636647641658783, + "learning_rate": 1.1502877924676881e-05, + "loss": 0.6478151679039001, + "step": 1110 + }, + { + "epoch": 1.441227163047225, + "grad_norm": 0.7449962496757507, + "learning_rate": 1.1488739336286467e-05, + "loss": 0.7527351975440979, + "step": 1111 + }, + { + "epoch": 1.442524889997364, + "grad_norm": 0.6970670819282532, + "learning_rate": 1.1474597703800915e-05, + "loss": 0.7169626951217651, + "step": 1112 + }, + { + "epoch": 1.4438226169475028, + "grad_norm": 0.7441650032997131, + "learning_rate": 1.1460453056136285e-05, + "loss": 0.750106930732727, + "step": 1113 + }, + { + "epoch": 1.4451203438976417, + "grad_norm": 0.7144120335578918, + "learning_rate": 1.14463054222148e-05, + "loss": 0.7835033535957336, + "step": 1114 + }, + { + "epoch": 1.4464180708477807, + "grad_norm": 0.7178052663803101, + "learning_rate": 1.1432154830964796e-05, + "loss": 0.755246639251709, + "step": 1115 + }, + { + "epoch": 1.4477157977979196, + "grad_norm": 0.7312644720077515, + "learning_rate": 1.1418001311320649e-05, + "loss": 0.7156558632850647, + "step": 1116 + }, + { + "epoch": 1.4490135247480584, + "grad_norm": 0.6545835137367249, + "learning_rate": 1.1403844892222717e-05, + "loss": 0.6448360085487366, + "step": 1117 + }, + { + "epoch": 1.4503112516981973, + "grad_norm": 0.7543350458145142, + "learning_rate": 1.1389685602617302e-05, + "loss": 0.7119275331497192, + "step": 1118 + }, + { + "epoch": 1.4516089786483364, + "grad_norm": 0.6919403672218323, + "learning_rate": 1.1375523471456564e-05, + "loss": 0.6998506188392639, + "step": 1119 + }, + { + "epoch": 1.4529067055984752, + "grad_norm": 0.7320676445960999, + "learning_rate": 1.1361358527698481e-05, + "loss": 0.7184922099113464, + "step": 1120 + }, + { + "epoch": 1.454204432548614, + "grad_norm": 0.672732949256897, + "learning_rate": 1.134719080030677e-05, + "loss": 0.6867491006851196, + "step": 1121 + }, + { + "epoch": 1.455502159498753, + "grad_norm": 0.6875948309898376, + "learning_rate": 1.1333020318250854e-05, + "loss": 0.7337048053741455, + "step": 1122 + }, + { + "epoch": 1.4567998864488918, + "grad_norm": 0.6922927498817444, + "learning_rate": 1.131884711050578e-05, + "loss": 0.6915356516838074, + "step": 1123 + }, + { + "epoch": 1.4580976133990307, + "grad_norm": 0.6755322217941284, + "learning_rate": 1.1304671206052168e-05, + "loss": 0.6491101980209351, + "step": 1124 + }, + { + "epoch": 1.4593953403491697, + "grad_norm": 0.698635995388031, + "learning_rate": 1.1290492633876164e-05, + "loss": 0.7431061267852783, + "step": 1125 + }, + { + "epoch": 1.4606930672993086, + "grad_norm": 0.6657348871231079, + "learning_rate": 1.1276311422969349e-05, + "loss": 0.7039294838905334, + "step": 1126 + }, + { + "epoch": 1.4619907942494474, + "grad_norm": 0.7172051072120667, + "learning_rate": 1.1262127602328712e-05, + "loss": 0.7308294773101807, + "step": 1127 + }, + { + "epoch": 1.4632885211995863, + "grad_norm": 0.6960781812667847, + "learning_rate": 1.124794120095658e-05, + "loss": 0.693443238735199, + "step": 1128 + }, + { + "epoch": 1.4645862481497254, + "grad_norm": 0.759774386882782, + "learning_rate": 1.1233752247860549e-05, + "loss": 0.7438464760780334, + "step": 1129 + }, + { + "epoch": 1.4658839750998642, + "grad_norm": 0.7278202772140503, + "learning_rate": 1.1219560772053442e-05, + "loss": 0.7231059074401855, + "step": 1130 + }, + { + "epoch": 1.467181702050003, + "grad_norm": 0.7277034521102905, + "learning_rate": 1.1205366802553231e-05, + "loss": 0.6796480417251587, + "step": 1131 + }, + { + "epoch": 1.468479429000142, + "grad_norm": 0.7773372530937195, + "learning_rate": 1.1191170368382992e-05, + "loss": 0.7957556247711182, + "step": 1132 + }, + { + "epoch": 1.4697771559502808, + "grad_norm": 0.7063891887664795, + "learning_rate": 1.117697149857084e-05, + "loss": 0.7295725345611572, + "step": 1133 + }, + { + "epoch": 1.4710748829004197, + "grad_norm": 0.7076992988586426, + "learning_rate": 1.1162770222149873e-05, + "loss": 0.7353643178939819, + "step": 1134 + }, + { + "epoch": 1.4723726098505585, + "grad_norm": 0.7097960710525513, + "learning_rate": 1.1148566568158099e-05, + "loss": 0.6855234503746033, + "step": 1135 + }, + { + "epoch": 1.4736703368006976, + "grad_norm": 0.7133991122245789, + "learning_rate": 1.1134360565638402e-05, + "loss": 0.7381144762039185, + "step": 1136 + }, + { + "epoch": 1.4749680637508364, + "grad_norm": 0.6666829586029053, + "learning_rate": 1.1120152243638457e-05, + "loss": 0.7571398019790649, + "step": 1137 + }, + { + "epoch": 1.4762657907009753, + "grad_norm": 0.7138345837593079, + "learning_rate": 1.1105941631210694e-05, + "loss": 0.7363887429237366, + "step": 1138 + }, + { + "epoch": 1.4775635176511142, + "grad_norm": 0.6881229877471924, + "learning_rate": 1.1091728757412212e-05, + "loss": 0.6838353276252747, + "step": 1139 + }, + { + "epoch": 1.4788612446012532, + "grad_norm": 0.6954206824302673, + "learning_rate": 1.107751365130474e-05, + "loss": 0.6892279386520386, + "step": 1140 + }, + { + "epoch": 1.480158971551392, + "grad_norm": 0.7325204014778137, + "learning_rate": 1.1063296341954577e-05, + "loss": 0.7068898677825928, + "step": 1141 + }, + { + "epoch": 1.481456698501531, + "grad_norm": 0.7389767169952393, + "learning_rate": 1.1049076858432517e-05, + "loss": 0.7737511396408081, + "step": 1142 + }, + { + "epoch": 1.4827544254516698, + "grad_norm": 0.7286487817764282, + "learning_rate": 1.1034855229813812e-05, + "loss": 0.7521780729293823, + "step": 1143 + }, + { + "epoch": 1.4840521524018087, + "grad_norm": 0.7211914658546448, + "learning_rate": 1.1020631485178084e-05, + "loss": 0.7648857831954956, + "step": 1144 + }, + { + "epoch": 1.4853498793519475, + "grad_norm": 0.6989269852638245, + "learning_rate": 1.1006405653609295e-05, + "loss": 0.7818325161933899, + "step": 1145 + }, + { + "epoch": 1.4866476063020864, + "grad_norm": 0.7269567251205444, + "learning_rate": 1.0992177764195671e-05, + "loss": 0.7369544506072998, + "step": 1146 + }, + { + "epoch": 1.4879453332522254, + "grad_norm": 0.7193188071250916, + "learning_rate": 1.0977947846029642e-05, + "loss": 0.7326228022575378, + "step": 1147 + }, + { + "epoch": 1.4892430602023643, + "grad_norm": 0.6688587665557861, + "learning_rate": 1.0963715928207795e-05, + "loss": 0.6900015473365784, + "step": 1148 + }, + { + "epoch": 1.4905407871525032, + "grad_norm": 0.7130873203277588, + "learning_rate": 1.094948203983079e-05, + "loss": 0.7647519707679749, + "step": 1149 + }, + { + "epoch": 1.491838514102642, + "grad_norm": 0.7038359642028809, + "learning_rate": 1.0935246210003334e-05, + "loss": 0.7078969478607178, + "step": 1150 + }, + { + "epoch": 1.491838514102642, + "eval_loss": 0.7540779113769531, + "eval_runtime": 144.3473, + "eval_samples_per_second": 35.969, + "eval_steps_per_second": 8.992, + "step": 1150 + }, + { + "epoch": 1.493136241052781, + "grad_norm": 0.7373347878456116, + "learning_rate": 1.0921008467834094e-05, + "loss": 0.7495899200439453, + "step": 1151 + }, + { + "epoch": 1.49443396800292, + "grad_norm": 0.7285864949226379, + "learning_rate": 1.0906768842435647e-05, + "loss": 0.7451608777046204, + "step": 1152 + }, + { + "epoch": 1.4957316949530588, + "grad_norm": 0.7112108469009399, + "learning_rate": 1.0892527362924426e-05, + "loss": 0.6732929944992065, + "step": 1153 + }, + { + "epoch": 1.4970294219031977, + "grad_norm": 0.7155210971832275, + "learning_rate": 1.0878284058420647e-05, + "loss": 0.7473354339599609, + "step": 1154 + }, + { + "epoch": 1.4983271488533365, + "grad_norm": 0.7318425178527832, + "learning_rate": 1.0864038958048267e-05, + "loss": 0.6648399829864502, + "step": 1155 + }, + { + "epoch": 1.4996248758034754, + "grad_norm": 0.6885069012641907, + "learning_rate": 1.084979209093491e-05, + "loss": 0.7034338712692261, + "step": 1156 + }, + { + "epoch": 1.5009226027536142, + "grad_norm": 0.7019109129905701, + "learning_rate": 1.0835543486211815e-05, + "loss": 0.7674492001533508, + "step": 1157 + }, + { + "epoch": 1.5022203297037533, + "grad_norm": 0.7262328267097473, + "learning_rate": 1.0821293173013769e-05, + "loss": 0.7348574995994568, + "step": 1158 + }, + { + "epoch": 1.5035180566538922, + "grad_norm": 0.6678932905197144, + "learning_rate": 1.0807041180479054e-05, + "loss": 0.6102491617202759, + "step": 1159 + }, + { + "epoch": 1.504815783604031, + "grad_norm": 0.7315651178359985, + "learning_rate": 1.0792787537749392e-05, + "loss": 0.7893344163894653, + "step": 1160 + }, + { + "epoch": 1.50611351055417, + "grad_norm": 0.7274885773658752, + "learning_rate": 1.0778532273969877e-05, + "loss": 0.6995629072189331, + "step": 1161 + }, + { + "epoch": 1.507411237504309, + "grad_norm": 0.6988937258720398, + "learning_rate": 1.0764275418288908e-05, + "loss": 0.753483772277832, + "step": 1162 + }, + { + "epoch": 1.5087089644544478, + "grad_norm": 0.714952290058136, + "learning_rate": 1.0750016999858151e-05, + "loss": 0.7254124283790588, + "step": 1163 + }, + { + "epoch": 1.5100066914045867, + "grad_norm": 0.7081964015960693, + "learning_rate": 1.0735757047832461e-05, + "loss": 0.7344964146614075, + "step": 1164 + }, + { + "epoch": 1.5113044183547255, + "grad_norm": 0.6843774914741516, + "learning_rate": 1.0721495591369832e-05, + "loss": 0.6407060623168945, + "step": 1165 + }, + { + "epoch": 1.5126021453048644, + "grad_norm": 0.7179701924324036, + "learning_rate": 1.0707232659631333e-05, + "loss": 0.7781057357788086, + "step": 1166 + }, + { + "epoch": 1.5138998722550032, + "grad_norm": 0.7363991141319275, + "learning_rate": 1.0692968281781046e-05, + "loss": 0.6866899132728577, + "step": 1167 + }, + { + "epoch": 1.5151975992051423, + "grad_norm": 0.6679601669311523, + "learning_rate": 1.0678702486986016e-05, + "loss": 0.6717002391815186, + "step": 1168 + }, + { + "epoch": 1.5164953261552812, + "grad_norm": 0.6931522488594055, + "learning_rate": 1.0664435304416185e-05, + "loss": 0.6953310966491699, + "step": 1169 + }, + { + "epoch": 1.51779305310542, + "grad_norm": 0.738691508769989, + "learning_rate": 1.065016676324433e-05, + "loss": 0.7797529101371765, + "step": 1170 + }, + { + "epoch": 1.519090780055559, + "grad_norm": 0.6795670390129089, + "learning_rate": 1.0635896892645998e-05, + "loss": 0.652160108089447, + "step": 1171 + }, + { + "epoch": 1.520388507005698, + "grad_norm": 0.7034809589385986, + "learning_rate": 1.0621625721799473e-05, + "loss": 0.7155415415763855, + "step": 1172 + }, + { + "epoch": 1.5216862339558368, + "grad_norm": 0.7075764536857605, + "learning_rate": 1.0607353279885682e-05, + "loss": 0.6893566846847534, + "step": 1173 + }, + { + "epoch": 1.5229839609059757, + "grad_norm": 0.696140468120575, + "learning_rate": 1.0593079596088155e-05, + "loss": 0.6836467981338501, + "step": 1174 + }, + { + "epoch": 1.5242816878561145, + "grad_norm": 0.7141397595405579, + "learning_rate": 1.0578804699592968e-05, + "loss": 0.7246308326721191, + "step": 1175 + }, + { + "epoch": 1.5255794148062534, + "grad_norm": 0.6880807280540466, + "learning_rate": 1.0564528619588668e-05, + "loss": 0.6564866304397583, + "step": 1176 + }, + { + "epoch": 1.5268771417563922, + "grad_norm": 0.6661361455917358, + "learning_rate": 1.0550251385266223e-05, + "loss": 0.6993754506111145, + "step": 1177 + }, + { + "epoch": 1.528174868706531, + "grad_norm": 0.7442536950111389, + "learning_rate": 1.0535973025818969e-05, + "loss": 0.7055092453956604, + "step": 1178 + }, + { + "epoch": 1.5294725956566702, + "grad_norm": 0.7330362200737, + "learning_rate": 1.0521693570442533e-05, + "loss": 0.7582162022590637, + "step": 1179 + }, + { + "epoch": 1.530770322606809, + "grad_norm": 0.722161591053009, + "learning_rate": 1.050741304833479e-05, + "loss": 0.7415435314178467, + "step": 1180 + }, + { + "epoch": 1.5320680495569479, + "grad_norm": 0.69851154088974, + "learning_rate": 1.0493131488695789e-05, + "loss": 0.6807332038879395, + "step": 1181 + }, + { + "epoch": 1.533365776507087, + "grad_norm": 0.7059313654899597, + "learning_rate": 1.0478848920727707e-05, + "loss": 0.7028640508651733, + "step": 1182 + }, + { + "epoch": 1.5346635034572258, + "grad_norm": 0.6546805500984192, + "learning_rate": 1.0464565373634784e-05, + "loss": 0.6459164619445801, + "step": 1183 + }, + { + "epoch": 1.5359612304073647, + "grad_norm": 0.6890950202941895, + "learning_rate": 1.0450280876623253e-05, + "loss": 0.7195508480072021, + "step": 1184 + }, + { + "epoch": 1.5372589573575035, + "grad_norm": 0.6886339783668518, + "learning_rate": 1.0435995458901298e-05, + "loss": 0.7041788697242737, + "step": 1185 + }, + { + "epoch": 1.5385566843076424, + "grad_norm": 0.7007988095283508, + "learning_rate": 1.042170914967898e-05, + "loss": 0.6726493835449219, + "step": 1186 + }, + { + "epoch": 1.5398544112577812, + "grad_norm": 0.7152829766273499, + "learning_rate": 1.0407421978168186e-05, + "loss": 0.7684251666069031, + "step": 1187 + }, + { + "epoch": 1.54115213820792, + "grad_norm": 0.7171955108642578, + "learning_rate": 1.0393133973582572e-05, + "loss": 0.7586410045623779, + "step": 1188 + }, + { + "epoch": 1.542449865158059, + "grad_norm": 0.7011827230453491, + "learning_rate": 1.0378845165137483e-05, + "loss": 0.6837091445922852, + "step": 1189 + }, + { + "epoch": 1.543747592108198, + "grad_norm": 0.7295593619346619, + "learning_rate": 1.0364555582049917e-05, + "loss": 0.7210373282432556, + "step": 1190 + }, + { + "epoch": 1.5450453190583369, + "grad_norm": 0.7250920534133911, + "learning_rate": 1.0350265253538458e-05, + "loss": 0.7209242582321167, + "step": 1191 + }, + { + "epoch": 1.546343046008476, + "grad_norm": 0.7172147631645203, + "learning_rate": 1.033597420882321e-05, + "loss": 0.771355390548706, + "step": 1192 + }, + { + "epoch": 1.5476407729586148, + "grad_norm": 0.7234722375869751, + "learning_rate": 1.0321682477125743e-05, + "loss": 0.7173848152160645, + "step": 1193 + }, + { + "epoch": 1.5489384999087537, + "grad_norm": 0.7182676792144775, + "learning_rate": 1.0307390087669026e-05, + "loss": 0.6971171498298645, + "step": 1194 + }, + { + "epoch": 1.5502362268588925, + "grad_norm": 0.711088240146637, + "learning_rate": 1.0293097069677382e-05, + "loss": 0.7250340580940247, + "step": 1195 + }, + { + "epoch": 1.5515339538090314, + "grad_norm": 0.7057585120201111, + "learning_rate": 1.0278803452376416e-05, + "loss": 0.6538138389587402, + "step": 1196 + }, + { + "epoch": 1.5528316807591702, + "grad_norm": 0.7198209166526794, + "learning_rate": 1.0264509264992954e-05, + "loss": 0.7397878170013428, + "step": 1197 + }, + { + "epoch": 1.554129407709309, + "grad_norm": 0.7141586542129517, + "learning_rate": 1.0250214536754996e-05, + "loss": 0.7416911125183105, + "step": 1198 + }, + { + "epoch": 1.555427134659448, + "grad_norm": 0.6700720191001892, + "learning_rate": 1.0235919296891641e-05, + "loss": 0.6646735072135925, + "step": 1199 + }, + { + "epoch": 1.556724861609587, + "grad_norm": 0.7620872855186462, + "learning_rate": 1.0221623574633035e-05, + "loss": 0.7746062874794006, + "step": 1200 + }, + { + "epoch": 1.5580225885597259, + "grad_norm": 0.7291470766067505, + "learning_rate": 1.0207327399210311e-05, + "loss": 0.7022420167922974, + "step": 1201 + }, + { + "epoch": 1.5593203155098647, + "grad_norm": 0.7325419783592224, + "learning_rate": 1.0193030799855534e-05, + "loss": 0.6780503988265991, + "step": 1202 + }, + { + "epoch": 1.5606180424600038, + "grad_norm": 0.7144452929496765, + "learning_rate": 1.0178733805801626e-05, + "loss": 0.7393384575843811, + "step": 1203 + }, + { + "epoch": 1.5619157694101427, + "grad_norm": 0.7362129092216492, + "learning_rate": 1.0164436446282324e-05, + "loss": 0.7512763142585754, + "step": 1204 + }, + { + "epoch": 1.5632134963602815, + "grad_norm": 0.6746947765350342, + "learning_rate": 1.015013875053211e-05, + "loss": 0.6646847128868103, + "step": 1205 + }, + { + "epoch": 1.5645112233104204, + "grad_norm": 0.6834600567817688, + "learning_rate": 1.013584074778615e-05, + "loss": 0.6130549311637878, + "step": 1206 + }, + { + "epoch": 1.5658089502605592, + "grad_norm": 0.7684876918792725, + "learning_rate": 1.0121542467280245e-05, + "loss": 0.7241174578666687, + "step": 1207 + }, + { + "epoch": 1.567106677210698, + "grad_norm": 0.7327429056167603, + "learning_rate": 1.0107243938250755e-05, + "loss": 0.6390076875686646, + "step": 1208 + }, + { + "epoch": 1.568404404160837, + "grad_norm": 0.6959134340286255, + "learning_rate": 1.0092945189934558e-05, + "loss": 0.7467840909957886, + "step": 1209 + }, + { + "epoch": 1.5697021311109758, + "grad_norm": 0.7259625792503357, + "learning_rate": 1.007864625156897e-05, + "loss": 0.7787569165229797, + "step": 1210 + }, + { + "epoch": 1.5709998580611149, + "grad_norm": 0.7313428521156311, + "learning_rate": 1.0064347152391703e-05, + "loss": 0.7091028690338135, + "step": 1211 + }, + { + "epoch": 1.5722975850112537, + "grad_norm": 0.7232116460800171, + "learning_rate": 1.0050047921640797e-05, + "loss": 0.6815755367279053, + "step": 1212 + }, + { + "epoch": 1.5735953119613926, + "grad_norm": 0.7286602854728699, + "learning_rate": 1.003574858855456e-05, + "loss": 0.72878098487854, + "step": 1213 + }, + { + "epoch": 1.5748930389115317, + "grad_norm": 0.6926529407501221, + "learning_rate": 1.0021449182371504e-05, + "loss": 0.6855754852294922, + "step": 1214 + }, + { + "epoch": 1.5761907658616705, + "grad_norm": 0.7037234306335449, + "learning_rate": 1.0007149732330299e-05, + "loss": 0.6827071309089661, + "step": 1215 + }, + { + "epoch": 1.5774884928118094, + "grad_norm": 0.7107639908790588, + "learning_rate": 9.992850267669703e-06, + "loss": 0.7649542093276978, + "step": 1216 + }, + { + "epoch": 1.5787862197619482, + "grad_norm": 0.737821102142334, + "learning_rate": 9.978550817628501e-06, + "loss": 0.6636335849761963, + "step": 1217 + }, + { + "epoch": 1.580083946712087, + "grad_norm": 0.7441766858100891, + "learning_rate": 9.964251411445444e-06, + "loss": 0.7413192391395569, + "step": 1218 + }, + { + "epoch": 1.581381673662226, + "grad_norm": 0.750579833984375, + "learning_rate": 9.949952078359208e-06, + "loss": 0.7131896018981934, + "step": 1219 + }, + { + "epoch": 1.5826794006123648, + "grad_norm": 0.7051860690116882, + "learning_rate": 9.935652847608302e-06, + "loss": 0.7157960534095764, + "step": 1220 + }, + { + "epoch": 1.5839771275625036, + "grad_norm": 0.6900631785392761, + "learning_rate": 9.921353748431036e-06, + "loss": 0.6898972392082214, + "step": 1221 + }, + { + "epoch": 1.5852748545126427, + "grad_norm": 0.7243295907974243, + "learning_rate": 9.907054810065446e-06, + "loss": 0.6597715616226196, + "step": 1222 + }, + { + "epoch": 1.5865725814627816, + "grad_norm": 0.6974424123764038, + "learning_rate": 9.89275606174925e-06, + "loss": 0.6871618032455444, + "step": 1223 + }, + { + "epoch": 1.5878703084129207, + "grad_norm": 0.6947103142738342, + "learning_rate": 9.878457532719757e-06, + "loss": 0.680080235004425, + "step": 1224 + }, + { + "epoch": 1.5891680353630595, + "grad_norm": 0.7873682975769043, + "learning_rate": 9.864159252213852e-06, + "loss": 0.7676745057106018, + "step": 1225 + }, + { + "epoch": 1.5904657623131984, + "grad_norm": 0.7117084860801697, + "learning_rate": 9.849861249467893e-06, + "loss": 0.7582260370254517, + "step": 1226 + }, + { + "epoch": 1.5917634892633372, + "grad_norm": 0.7120140194892883, + "learning_rate": 9.83556355371768e-06, + "loss": 0.7325617074966431, + "step": 1227 + }, + { + "epoch": 1.593061216213476, + "grad_norm": 0.8112825155258179, + "learning_rate": 9.821266194198375e-06, + "loss": 0.704188883304596, + "step": 1228 + }, + { + "epoch": 1.594358943163615, + "grad_norm": 0.6812202334403992, + "learning_rate": 9.806969200144471e-06, + "loss": 0.6495468616485596, + "step": 1229 + }, + { + "epoch": 1.5956566701137538, + "grad_norm": 0.672173261642456, + "learning_rate": 9.79267260078969e-06, + "loss": 0.7104700207710266, + "step": 1230 + }, + { + "epoch": 1.5969543970638926, + "grad_norm": 0.7402030229568481, + "learning_rate": 9.778376425366967e-06, + "loss": 0.7161640524864197, + "step": 1231 + }, + { + "epoch": 1.5982521240140317, + "grad_norm": 0.7105618119239807, + "learning_rate": 9.764080703108362e-06, + "loss": 0.7429479956626892, + "step": 1232 + }, + { + "epoch": 1.5995498509641706, + "grad_norm": 0.7068690657615662, + "learning_rate": 9.749785463245006e-06, + "loss": 0.7453438639640808, + "step": 1233 + }, + { + "epoch": 1.6008475779143094, + "grad_norm": 0.7170218825340271, + "learning_rate": 9.735490735007047e-06, + "loss": 0.7229534387588501, + "step": 1234 + }, + { + "epoch": 1.6021453048644485, + "grad_norm": 0.6783753633499146, + "learning_rate": 9.721196547623585e-06, + "loss": 0.7175101041793823, + "step": 1235 + }, + { + "epoch": 1.6034430318145874, + "grad_norm": 0.7113945484161377, + "learning_rate": 9.706902930322621e-06, + "loss": 0.7054000496864319, + "step": 1236 + }, + { + "epoch": 1.6047407587647262, + "grad_norm": 0.7143622636795044, + "learning_rate": 9.692609912330975e-06, + "loss": 0.7337828278541565, + "step": 1237 + }, + { + "epoch": 1.606038485714865, + "grad_norm": 0.7191219329833984, + "learning_rate": 9.67831752287426e-06, + "loss": 0.7462741136550903, + "step": 1238 + }, + { + "epoch": 1.607336212665004, + "grad_norm": 0.6787925362586975, + "learning_rate": 9.66402579117679e-06, + "loss": 0.6983505487442017, + "step": 1239 + }, + { + "epoch": 1.6086339396151428, + "grad_norm": 0.7183864712715149, + "learning_rate": 9.649734746461544e-06, + "loss": 0.7454296350479126, + "step": 1240 + }, + { + "epoch": 1.6099316665652816, + "grad_norm": 0.7119743227958679, + "learning_rate": 9.635444417950083e-06, + "loss": 0.6732832193374634, + "step": 1241 + }, + { + "epoch": 1.6112293935154205, + "grad_norm": 0.7184067368507385, + "learning_rate": 9.62115483486252e-06, + "loss": 0.6472535729408264, + "step": 1242 + }, + { + "epoch": 1.6125271204655596, + "grad_norm": 0.693452000617981, + "learning_rate": 9.606866026417431e-06, + "loss": 0.7115393877029419, + "step": 1243 + }, + { + "epoch": 1.6138248474156984, + "grad_norm": 0.749234139919281, + "learning_rate": 9.592578021831817e-06, + "loss": 0.775533139705658, + "step": 1244 + }, + { + "epoch": 1.6151225743658375, + "grad_norm": 0.7310823798179626, + "learning_rate": 9.578290850321023e-06, + "loss": 0.7301318645477295, + "step": 1245 + }, + { + "epoch": 1.6164203013159764, + "grad_norm": 0.7240172028541565, + "learning_rate": 9.564004541098709e-06, + "loss": 0.6760499477386475, + "step": 1246 + }, + { + "epoch": 1.6177180282661152, + "grad_norm": 0.7192076444625854, + "learning_rate": 9.549719123376749e-06, + "loss": 0.8106221556663513, + "step": 1247 + }, + { + "epoch": 1.619015755216254, + "grad_norm": 0.763373851776123, + "learning_rate": 9.535434626365221e-06, + "loss": 0.7758911848068237, + "step": 1248 + }, + { + "epoch": 1.620313482166393, + "grad_norm": 0.766298234462738, + "learning_rate": 9.521151079272295e-06, + "loss": 0.8113157749176025, + "step": 1249 + }, + { + "epoch": 1.6216112091165318, + "grad_norm": 0.7206328511238098, + "learning_rate": 9.506868511304216e-06, + "loss": 0.7105867266654968, + "step": 1250 + }, + { + "epoch": 1.6229089360666706, + "grad_norm": 0.7417821288108826, + "learning_rate": 9.492586951665214e-06, + "loss": 0.7875233888626099, + "step": 1251 + }, + { + "epoch": 1.6242066630168095, + "grad_norm": 0.6913713812828064, + "learning_rate": 9.47830642955747e-06, + "loss": 0.6810760498046875, + "step": 1252 + }, + { + "epoch": 1.6255043899669486, + "grad_norm": 0.7151052355766296, + "learning_rate": 9.464026974181035e-06, + "loss": 0.7549710869789124, + "step": 1253 + }, + { + "epoch": 1.6268021169170874, + "grad_norm": 0.6772926449775696, + "learning_rate": 9.44974861473378e-06, + "loss": 0.6992902159690857, + "step": 1254 + }, + { + "epoch": 1.6280998438672263, + "grad_norm": 0.734398365020752, + "learning_rate": 9.435471380411335e-06, + "loss": 0.7508738040924072, + "step": 1255 + }, + { + "epoch": 1.6293975708173654, + "grad_norm": 0.6922202706336975, + "learning_rate": 9.421195300407035e-06, + "loss": 0.6657233834266663, + "step": 1256 + }, + { + "epoch": 1.6306952977675042, + "grad_norm": 0.6931065917015076, + "learning_rate": 9.406920403911848e-06, + "loss": 0.7156346440315247, + "step": 1257 + }, + { + "epoch": 1.631993024717643, + "grad_norm": 0.6905820369720459, + "learning_rate": 9.392646720114325e-06, + "loss": 0.7550724744796753, + "step": 1258 + }, + { + "epoch": 1.633290751667782, + "grad_norm": 0.6891010403633118, + "learning_rate": 9.37837427820053e-06, + "loss": 0.7689525485038757, + "step": 1259 + }, + { + "epoch": 1.6345884786179208, + "grad_norm": 0.6997367739677429, + "learning_rate": 9.364103107354002e-06, + "loss": 0.6940702795982361, + "step": 1260 + }, + { + "epoch": 1.6358862055680596, + "grad_norm": 0.7232581973075867, + "learning_rate": 9.349833236755675e-06, + "loss": 0.708733856678009, + "step": 1261 + }, + { + "epoch": 1.6371839325181985, + "grad_norm": 0.7156563997268677, + "learning_rate": 9.335564695583816e-06, + "loss": 0.7080838680267334, + "step": 1262 + }, + { + "epoch": 1.6384816594683373, + "grad_norm": 0.7129452228546143, + "learning_rate": 9.321297513013987e-06, + "loss": 0.7160661816596985, + "step": 1263 + }, + { + "epoch": 1.6397793864184764, + "grad_norm": 0.7260149717330933, + "learning_rate": 9.307031718218956e-06, + "loss": 0.7261675000190735, + "step": 1264 + }, + { + "epoch": 1.6410771133686153, + "grad_norm": 0.7252016067504883, + "learning_rate": 9.292767340368672e-06, + "loss": 0.7626814842224121, + "step": 1265 + }, + { + "epoch": 1.6423748403187541, + "grad_norm": 0.7192304134368896, + "learning_rate": 9.278504408630171e-06, + "loss": 0.7479438781738281, + "step": 1266 + }, + { + "epoch": 1.6436725672688932, + "grad_norm": 0.7067307829856873, + "learning_rate": 9.264242952167544e-06, + "loss": 0.7229454517364502, + "step": 1267 + }, + { + "epoch": 1.644970294219032, + "grad_norm": 0.69132000207901, + "learning_rate": 9.24998300014185e-06, + "loss": 0.7404082417488098, + "step": 1268 + }, + { + "epoch": 1.646268021169171, + "grad_norm": 0.7199667096138, + "learning_rate": 9.235724581711096e-06, + "loss": 0.6846930384635925, + "step": 1269 + }, + { + "epoch": 1.6475657481193098, + "grad_norm": 2.456246852874756, + "learning_rate": 9.221467726030126e-06, + "loss": 0.7993893623352051, + "step": 1270 + }, + { + "epoch": 1.6488634750694486, + "grad_norm": 0.6726557016372681, + "learning_rate": 9.207212462250611e-06, + "loss": 0.6635693311691284, + "step": 1271 + }, + { + "epoch": 1.6501612020195875, + "grad_norm": 0.6767668128013611, + "learning_rate": 9.192958819520948e-06, + "loss": 0.6265630722045898, + "step": 1272 + }, + { + "epoch": 1.6514589289697263, + "grad_norm": 0.660176157951355, + "learning_rate": 9.178706826986236e-06, + "loss": 0.7039428353309631, + "step": 1273 + }, + { + "epoch": 1.6527566559198652, + "grad_norm": 0.710209846496582, + "learning_rate": 9.164456513788186e-06, + "loss": 0.712166965007782, + "step": 1274 + }, + { + "epoch": 1.6540543828700043, + "grad_norm": 0.7239776849746704, + "learning_rate": 9.150207909065093e-06, + "loss": 0.7487761378288269, + "step": 1275 + }, + { + "epoch": 1.6553521098201431, + "grad_norm": 0.6918028593063354, + "learning_rate": 9.135961041951735e-06, + "loss": 0.6682979464530945, + "step": 1276 + }, + { + "epoch": 1.6566498367702822, + "grad_norm": 0.7262064218521118, + "learning_rate": 9.121715941579358e-06, + "loss": 0.6650745868682861, + "step": 1277 + }, + { + "epoch": 1.657947563720421, + "grad_norm": 0.6805858612060547, + "learning_rate": 9.107472637075578e-06, + "loss": 0.7332329750061035, + "step": 1278 + }, + { + "epoch": 1.65924529067056, + "grad_norm": 0.7414560914039612, + "learning_rate": 9.093231157564357e-06, + "loss": 0.7112785577774048, + "step": 1279 + }, + { + "epoch": 1.6605430176206988, + "grad_norm": 0.6898860335350037, + "learning_rate": 9.078991532165911e-06, + "loss": 0.6940746307373047, + "step": 1280 + }, + { + "epoch": 1.6618407445708376, + "grad_norm": 0.734137773513794, + "learning_rate": 9.06475378999667e-06, + "loss": 0.7100757956504822, + "step": 1281 + }, + { + "epoch": 1.6631384715209765, + "grad_norm": 0.7733497023582458, + "learning_rate": 9.050517960169211e-06, + "loss": 0.7518686056137085, + "step": 1282 + }, + { + "epoch": 1.6644361984711153, + "grad_norm": 0.709705650806427, + "learning_rate": 9.036284071792212e-06, + "loss": 0.7964266538619995, + "step": 1283 + }, + { + "epoch": 1.6657339254212542, + "grad_norm": 0.711685836315155, + "learning_rate": 9.022052153970361e-06, + "loss": 0.7170289158821106, + "step": 1284 + }, + { + "epoch": 1.6670316523713933, + "grad_norm": 0.7108113169670105, + "learning_rate": 9.007822235804334e-06, + "loss": 0.7257951498031616, + "step": 1285 + }, + { + "epoch": 1.6683293793215321, + "grad_norm": 0.727200984954834, + "learning_rate": 8.993594346390709e-06, + "loss": 0.7011697888374329, + "step": 1286 + }, + { + "epoch": 1.669627106271671, + "grad_norm": 0.682969868183136, + "learning_rate": 8.979368514821917e-06, + "loss": 0.6846626996994019, + "step": 1287 + }, + { + "epoch": 1.67092483322181, + "grad_norm": 0.7197726964950562, + "learning_rate": 8.965144770186192e-06, + "loss": 0.7460110783576965, + "step": 1288 + }, + { + "epoch": 1.672222560171949, + "grad_norm": 0.7024762630462646, + "learning_rate": 8.950923141567482e-06, + "loss": 0.6903531551361084, + "step": 1289 + }, + { + "epoch": 1.6735202871220878, + "grad_norm": 0.7416940927505493, + "learning_rate": 8.936703658045426e-06, + "loss": 0.8462705612182617, + "step": 1290 + }, + { + "epoch": 1.6748180140722266, + "grad_norm": 0.749668538570404, + "learning_rate": 8.92248634869526e-06, + "loss": 0.7686569690704346, + "step": 1291 + }, + { + "epoch": 1.6761157410223655, + "grad_norm": 0.6500091552734375, + "learning_rate": 8.90827124258779e-06, + "loss": 0.7148120403289795, + "step": 1292 + }, + { + "epoch": 1.6774134679725043, + "grad_norm": 0.6878598928451538, + "learning_rate": 8.894058368789308e-06, + "loss": 0.5954074263572693, + "step": 1293 + }, + { + "epoch": 1.6787111949226432, + "grad_norm": 0.687202513217926, + "learning_rate": 8.879847756361544e-06, + "loss": 0.6912335753440857, + "step": 1294 + }, + { + "epoch": 1.680008921872782, + "grad_norm": 0.7027560472488403, + "learning_rate": 8.8656394343616e-06, + "loss": 0.6989542245864868, + "step": 1295 + }, + { + "epoch": 1.6813066488229211, + "grad_norm": 0.6999865770339966, + "learning_rate": 8.851433431841904e-06, + "loss": 0.7319304347038269, + "step": 1296 + }, + { + "epoch": 1.68260437577306, + "grad_norm": 0.7822436690330505, + "learning_rate": 8.837229777850129e-06, + "loss": 0.7571746110916138, + "step": 1297 + }, + { + "epoch": 1.6839021027231988, + "grad_norm": 0.6928126215934753, + "learning_rate": 8.823028501429161e-06, + "loss": 0.7471798062324524, + "step": 1298 + }, + { + "epoch": 1.685199829673338, + "grad_norm": 0.6795255541801453, + "learning_rate": 8.808829631617009e-06, + "loss": 0.6901456117630005, + "step": 1299 + }, + { + "epoch": 1.6864975566234768, + "grad_norm": 0.7609167695045471, + "learning_rate": 8.79463319744677e-06, + "loss": 0.782101035118103, + "step": 1300 + }, + { + "epoch": 1.6877952835736156, + "grad_norm": 0.7111324667930603, + "learning_rate": 8.78043922794656e-06, + "loss": 0.7500295042991638, + "step": 1301 + }, + { + "epoch": 1.6890930105237545, + "grad_norm": 0.7332251667976379, + "learning_rate": 8.766247752139453e-06, + "loss": 0.7808182835578918, + "step": 1302 + }, + { + "epoch": 1.6903907374738933, + "grad_norm": 0.7156122922897339, + "learning_rate": 8.752058799043422e-06, + "loss": 0.748470664024353, + "step": 1303 + }, + { + "epoch": 1.6916884644240322, + "grad_norm": 0.7189647555351257, + "learning_rate": 8.737872397671293e-06, + "loss": 0.7072033882141113, + "step": 1304 + }, + { + "epoch": 1.692986191374171, + "grad_norm": 0.719592809677124, + "learning_rate": 8.723688577030655e-06, + "loss": 0.7256566286087036, + "step": 1305 + }, + { + "epoch": 1.69428391832431, + "grad_norm": 0.7151191234588623, + "learning_rate": 8.709507366123841e-06, + "loss": 0.7216327786445618, + "step": 1306 + }, + { + "epoch": 1.695581645274449, + "grad_norm": 0.6929178833961487, + "learning_rate": 8.695328793947833e-06, + "loss": 0.6505569815635681, + "step": 1307 + }, + { + "epoch": 1.6968793722245878, + "grad_norm": 0.7117684483528137, + "learning_rate": 8.681152889494227e-06, + "loss": 0.750861644744873, + "step": 1308 + }, + { + "epoch": 1.698177099174727, + "grad_norm": 0.8048399090766907, + "learning_rate": 8.66697968174915e-06, + "loss": 0.7125011682510376, + "step": 1309 + }, + { + "epoch": 1.6994748261248658, + "grad_norm": 0.7594026923179626, + "learning_rate": 8.652809199693236e-06, + "loss": 0.6821706295013428, + "step": 1310 + }, + { + "epoch": 1.7007725530750046, + "grad_norm": 0.696814775466919, + "learning_rate": 8.638641472301524e-06, + "loss": 0.7341318726539612, + "step": 1311 + }, + { + "epoch": 1.7020702800251435, + "grad_norm": 0.6953744292259216, + "learning_rate": 8.624476528543439e-06, + "loss": 0.7471984028816223, + "step": 1312 + }, + { + "epoch": 1.7033680069752823, + "grad_norm": 0.7624510526657104, + "learning_rate": 8.610314397382701e-06, + "loss": 0.7660402655601501, + "step": 1313 + }, + { + "epoch": 1.7046657339254212, + "grad_norm": 0.7193018198013306, + "learning_rate": 8.596155107777288e-06, + "loss": 0.7213659882545471, + "step": 1314 + }, + { + "epoch": 1.70596346087556, + "grad_norm": 0.703834593296051, + "learning_rate": 8.581998688679356e-06, + "loss": 0.7187014818191528, + "step": 1315 + }, + { + "epoch": 1.707261187825699, + "grad_norm": 0.7352998852729797, + "learning_rate": 8.567845169035205e-06, + "loss": 0.7381072044372559, + "step": 1316 + }, + { + "epoch": 1.708558914775838, + "grad_norm": 0.7008899450302124, + "learning_rate": 8.553694577785201e-06, + "loss": 0.6953420639038086, + "step": 1317 + }, + { + "epoch": 1.7098566417259768, + "grad_norm": 0.6997075080871582, + "learning_rate": 8.539546943863717e-06, + "loss": 0.721794605255127, + "step": 1318 + }, + { + "epoch": 1.7111543686761157, + "grad_norm": 0.7531685829162598, + "learning_rate": 8.525402296199089e-06, + "loss": 0.763767421245575, + "step": 1319 + }, + { + "epoch": 1.7124520956262548, + "grad_norm": 0.686306357383728, + "learning_rate": 8.511260663713537e-06, + "loss": 0.6505174040794373, + "step": 1320 + }, + { + "epoch": 1.7137498225763936, + "grad_norm": 0.6891371607780457, + "learning_rate": 8.497122075323122e-06, + "loss": 0.6535521745681763, + "step": 1321 + }, + { + "epoch": 1.7150475495265325, + "grad_norm": 0.6797356009483337, + "learning_rate": 8.482986559937676e-06, + "loss": 0.711966872215271, + "step": 1322 + }, + { + "epoch": 1.7163452764766713, + "grad_norm": 0.6834943890571594, + "learning_rate": 8.468854146460754e-06, + "loss": 0.6898146271705627, + "step": 1323 + }, + { + "epoch": 1.7176430034268102, + "grad_norm": 0.6787711381912231, + "learning_rate": 8.45472486378956e-06, + "loss": 0.7132437825202942, + "step": 1324 + }, + { + "epoch": 1.718940730376949, + "grad_norm": 0.731886088848114, + "learning_rate": 8.440598740814909e-06, + "loss": 0.767355740070343, + "step": 1325 + }, + { + "epoch": 1.720238457327088, + "grad_norm": 0.6801634430885315, + "learning_rate": 8.426475806421139e-06, + "loss": 0.728312611579895, + "step": 1326 + }, + { + "epoch": 1.7215361842772268, + "grad_norm": 0.6922846436500549, + "learning_rate": 8.412356089486082e-06, + "loss": 0.6810072064399719, + "step": 1327 + }, + { + "epoch": 1.7228339112273658, + "grad_norm": 0.7422820329666138, + "learning_rate": 8.39823961888098e-06, + "loss": 0.7293540835380554, + "step": 1328 + }, + { + "epoch": 1.7241316381775047, + "grad_norm": 0.9656670689582825, + "learning_rate": 8.384126423470447e-06, + "loss": 0.7158606648445129, + "step": 1329 + }, + { + "epoch": 1.7254293651276438, + "grad_norm": 0.704413652420044, + "learning_rate": 8.37001653211239e-06, + "loss": 0.6522120833396912, + "step": 1330 + }, + { + "epoch": 1.7267270920777826, + "grad_norm": 0.7198591828346252, + "learning_rate": 8.355909973657975e-06, + "loss": 0.7289344072341919, + "step": 1331 + }, + { + "epoch": 1.7280248190279215, + "grad_norm": 0.7069032192230225, + "learning_rate": 8.341806776951532e-06, + "loss": 0.7365983724594116, + "step": 1332 + }, + { + "epoch": 1.7293225459780603, + "grad_norm": 0.7014702558517456, + "learning_rate": 8.327706970830537e-06, + "loss": 0.7173565030097961, + "step": 1333 + }, + { + "epoch": 1.7306202729281992, + "grad_norm": 0.7151576280593872, + "learning_rate": 8.313610584125523e-06, + "loss": 0.7827293872833252, + "step": 1334 + }, + { + "epoch": 1.731917999878338, + "grad_norm": 0.7050095796585083, + "learning_rate": 8.299517645660033e-06, + "loss": 0.681469202041626, + "step": 1335 + }, + { + "epoch": 1.733215726828477, + "grad_norm": 0.6885892152786255, + "learning_rate": 8.285428184250554e-06, + "loss": 0.6469728946685791, + "step": 1336 + }, + { + "epoch": 1.7345134537786158, + "grad_norm": 0.7026622891426086, + "learning_rate": 8.271342228706478e-06, + "loss": 0.76534104347229, + "step": 1337 + }, + { + "epoch": 1.7358111807287548, + "grad_norm": 0.6556008458137512, + "learning_rate": 8.257259807830009e-06, + "loss": 0.6358019113540649, + "step": 1338 + }, + { + "epoch": 1.7371089076788937, + "grad_norm": 0.6949118971824646, + "learning_rate": 8.243180950416142e-06, + "loss": 0.7216454148292542, + "step": 1339 + }, + { + "epoch": 1.7384066346290326, + "grad_norm": 0.6842135190963745, + "learning_rate": 8.22910568525257e-06, + "loss": 0.7009142637252808, + "step": 1340 + }, + { + "epoch": 1.7397043615791716, + "grad_norm": 0.7473326921463013, + "learning_rate": 8.215034041119655e-06, + "loss": 0.7074841856956482, + "step": 1341 + }, + { + "epoch": 1.7410020885293105, + "grad_norm": 0.6532716751098633, + "learning_rate": 8.200966046790339e-06, + "loss": 0.7174238562583923, + "step": 1342 + }, + { + "epoch": 1.7422998154794493, + "grad_norm": 0.672916829586029, + "learning_rate": 8.186901731030117e-06, + "loss": 0.71747887134552, + "step": 1343 + }, + { + "epoch": 1.7435975424295882, + "grad_norm": 0.7592087388038635, + "learning_rate": 8.172841122596951e-06, + "loss": 0.8052394390106201, + "step": 1344 + }, + { + "epoch": 1.744895269379727, + "grad_norm": 0.6938197016716003, + "learning_rate": 8.158784250241226e-06, + "loss": 0.7313718795776367, + "step": 1345 + }, + { + "epoch": 1.746192996329866, + "grad_norm": 0.6459118723869324, + "learning_rate": 8.144731142705693e-06, + "loss": 0.632814884185791, + "step": 1346 + }, + { + "epoch": 1.7474907232800048, + "grad_norm": 0.6704484820365906, + "learning_rate": 8.130681828725394e-06, + "loss": 0.6906111836433411, + "step": 1347 + }, + { + "epoch": 1.7487884502301436, + "grad_norm": 0.6933112144470215, + "learning_rate": 8.116636337027626e-06, + "loss": 0.6973313093185425, + "step": 1348 + }, + { + "epoch": 1.7500861771802827, + "grad_norm": 0.6778403520584106, + "learning_rate": 8.10259469633186e-06, + "loss": 0.7237393260002136, + "step": 1349 + }, + { + "epoch": 1.7513839041304216, + "grad_norm": 0.7188864946365356, + "learning_rate": 8.0885569353497e-06, + "loss": 0.694682776927948, + "step": 1350 + }, + { + "epoch": 1.7526816310805604, + "grad_norm": 0.7305310368537903, + "learning_rate": 8.07452308278481e-06, + "loss": 0.7369967103004456, + "step": 1351 + }, + { + "epoch": 1.7539793580306995, + "grad_norm": 0.6883519291877747, + "learning_rate": 8.060493167332874e-06, + "loss": 0.6693746447563171, + "step": 1352 + }, + { + "epoch": 1.7552770849808383, + "grad_norm": 0.6861468553543091, + "learning_rate": 8.04646721768151e-06, + "loss": 0.7269149422645569, + "step": 1353 + }, + { + "epoch": 1.7565748119309772, + "grad_norm": 0.6963792443275452, + "learning_rate": 8.032445262510241e-06, + "loss": 0.7375723123550415, + "step": 1354 + }, + { + "epoch": 1.757872538881116, + "grad_norm": 0.70611572265625, + "learning_rate": 8.018427330490411e-06, + "loss": 0.6536609530448914, + "step": 1355 + }, + { + "epoch": 1.759170265831255, + "grad_norm": 0.6833199262619019, + "learning_rate": 8.004413450285147e-06, + "loss": 0.7803836464881897, + "step": 1356 + }, + { + "epoch": 1.7604679927813938, + "grad_norm": 0.7367565631866455, + "learning_rate": 7.990403650549285e-06, + "loss": 0.7431750893592834, + "step": 1357 + }, + { + "epoch": 1.7617657197315326, + "grad_norm": 0.7168142795562744, + "learning_rate": 7.976397959929324e-06, + "loss": 0.708920955657959, + "step": 1358 + }, + { + "epoch": 1.7630634466816715, + "grad_norm": 0.7081824541091919, + "learning_rate": 7.962396407063346e-06, + "loss": 0.7360220551490784, + "step": 1359 + }, + { + "epoch": 1.7643611736318106, + "grad_norm": 0.7008010149002075, + "learning_rate": 7.948399020580995e-06, + "loss": 0.6721465587615967, + "step": 1360 + }, + { + "epoch": 1.7656589005819494, + "grad_norm": 0.7550066709518433, + "learning_rate": 7.934405829103376e-06, + "loss": 0.7266613245010376, + "step": 1361 + }, + { + "epoch": 1.7669566275320885, + "grad_norm": 0.713932454586029, + "learning_rate": 7.920416861243028e-06, + "loss": 0.7003293037414551, + "step": 1362 + }, + { + "epoch": 1.7682543544822273, + "grad_norm": 0.6848137378692627, + "learning_rate": 7.906432145603844e-06, + "loss": 0.7255281805992126, + "step": 1363 + }, + { + "epoch": 1.7695520814323662, + "grad_norm": 0.7302910685539246, + "learning_rate": 7.892451710781035e-06, + "loss": 0.7285719513893127, + "step": 1364 + }, + { + "epoch": 1.770849808382505, + "grad_norm": 0.7387238144874573, + "learning_rate": 7.878475585361045e-06, + "loss": 0.7333699464797974, + "step": 1365 + }, + { + "epoch": 1.772147535332644, + "grad_norm": 0.7755225300788879, + "learning_rate": 7.864503797921518e-06, + "loss": 0.7592843770980835, + "step": 1366 + }, + { + "epoch": 1.7734452622827828, + "grad_norm": 0.6892391443252563, + "learning_rate": 7.850536377031221e-06, + "loss": 0.7412334084510803, + "step": 1367 + }, + { + "epoch": 1.7747429892329216, + "grad_norm": 0.7299293279647827, + "learning_rate": 7.836573351249996e-06, + "loss": 0.7442951798439026, + "step": 1368 + }, + { + "epoch": 1.7760407161830605, + "grad_norm": 0.6848152875900269, + "learning_rate": 7.822614749128692e-06, + "loss": 0.6193121671676636, + "step": 1369 + }, + { + "epoch": 1.7773384431331996, + "grad_norm": 0.6931573748588562, + "learning_rate": 7.808660599209124e-06, + "loss": 0.7440711259841919, + "step": 1370 + }, + { + "epoch": 1.7786361700833384, + "grad_norm": 0.7260693907737732, + "learning_rate": 7.794710930023993e-06, + "loss": 0.7359597682952881, + "step": 1371 + }, + { + "epoch": 1.7799338970334773, + "grad_norm": 0.705436646938324, + "learning_rate": 7.78076577009684e-06, + "loss": 0.6207844614982605, + "step": 1372 + }, + { + "epoch": 1.7812316239836163, + "grad_norm": 0.6740301847457886, + "learning_rate": 7.76682514794199e-06, + "loss": 0.6975910663604736, + "step": 1373 + }, + { + "epoch": 1.7825293509337552, + "grad_norm": 0.6805901527404785, + "learning_rate": 7.752889092064484e-06, + "loss": 0.671751081943512, + "step": 1374 + }, + { + "epoch": 1.783827077883894, + "grad_norm": 0.7223953604698181, + "learning_rate": 7.738957630960037e-06, + "loss": 0.6885688900947571, + "step": 1375 + }, + { + "epoch": 1.785124804834033, + "grad_norm": 0.6852001547813416, + "learning_rate": 7.725030793114952e-06, + "loss": 0.7190781831741333, + "step": 1376 + }, + { + "epoch": 1.7864225317841718, + "grad_norm": 0.7344854474067688, + "learning_rate": 7.711108607006094e-06, + "loss": 0.7325436472892761, + "step": 1377 + }, + { + "epoch": 1.7877202587343106, + "grad_norm": 0.7047913670539856, + "learning_rate": 7.697191101100802e-06, + "loss": 0.7324240803718567, + "step": 1378 + }, + { + "epoch": 1.7890179856844495, + "grad_norm": 0.7197734713554382, + "learning_rate": 7.683278303856862e-06, + "loss": 0.7601778507232666, + "step": 1379 + }, + { + "epoch": 1.7903157126345883, + "grad_norm": 0.6842553615570068, + "learning_rate": 7.669370243722415e-06, + "loss": 0.7301578521728516, + "step": 1380 + }, + { + "epoch": 1.7903157126345883, + "eval_loss": 0.7464115023612976, + "eval_runtime": 143.4981, + "eval_samples_per_second": 36.182, + "eval_steps_per_second": 9.045, + "step": 1380 + }, + { + "epoch": 1.7916134395847274, + "grad_norm": 0.6962341666221619, + "learning_rate": 7.655466949135932e-06, + "loss": 0.7249746918678284, + "step": 1381 + }, + { + "epoch": 1.7929111665348663, + "grad_norm": 0.6840744018554688, + "learning_rate": 7.641568448526122e-06, + "loss": 0.6648120880126953, + "step": 1382 + }, + { + "epoch": 1.7942088934850051, + "grad_norm": 0.7047871947288513, + "learning_rate": 7.627674770311909e-06, + "loss": 0.6969434022903442, + "step": 1383 + }, + { + "epoch": 1.7955066204351442, + "grad_norm": 0.716124951839447, + "learning_rate": 7.613785942902343e-06, + "loss": 0.7197269201278687, + "step": 1384 + }, + { + "epoch": 1.796804347385283, + "grad_norm": 0.6727207899093628, + "learning_rate": 7.599901994696566e-06, + "loss": 0.6794359683990479, + "step": 1385 + }, + { + "epoch": 1.798102074335422, + "grad_norm": 0.6976568698883057, + "learning_rate": 7.586022954083731e-06, + "loss": 0.6372778415679932, + "step": 1386 + }, + { + "epoch": 1.7993998012855608, + "grad_norm": 0.683164656162262, + "learning_rate": 7.572148849442971e-06, + "loss": 0.6731259226799011, + "step": 1387 + }, + { + "epoch": 1.8006975282356996, + "grad_norm": 0.6801917552947998, + "learning_rate": 7.5582797091433105e-06, + "loss": 0.6921297907829285, + "step": 1388 + }, + { + "epoch": 1.8019952551858385, + "grad_norm": 0.7587413191795349, + "learning_rate": 7.544415561543639e-06, + "loss": 0.7684265971183777, + "step": 1389 + }, + { + "epoch": 1.8032929821359773, + "grad_norm": 0.7493230700492859, + "learning_rate": 7.5305564349926215e-06, + "loss": 0.6984431147575378, + "step": 1390 + }, + { + "epoch": 1.8045907090861162, + "grad_norm": 0.6897554993629456, + "learning_rate": 7.516702357828672e-06, + "loss": 0.739819347858429, + "step": 1391 + }, + { + "epoch": 1.8058884360362553, + "grad_norm": 0.6832559704780579, + "learning_rate": 7.502853358379865e-06, + "loss": 0.6518275141716003, + "step": 1392 + }, + { + "epoch": 1.8071861629863941, + "grad_norm": 0.7185218334197998, + "learning_rate": 7.489009464963903e-06, + "loss": 0.7867194414138794, + "step": 1393 + }, + { + "epoch": 1.8084838899365332, + "grad_norm": 0.6737310886383057, + "learning_rate": 7.475170705888042e-06, + "loss": 0.6979063749313354, + "step": 1394 + }, + { + "epoch": 1.809781616886672, + "grad_norm": 0.713076651096344, + "learning_rate": 7.461337109449045e-06, + "loss": 0.7293301224708557, + "step": 1395 + }, + { + "epoch": 1.811079343836811, + "grad_norm": 0.700568675994873, + "learning_rate": 7.447508703933109e-06, + "loss": 0.6935805678367615, + "step": 1396 + }, + { + "epoch": 1.8123770707869498, + "grad_norm": 0.7034053802490234, + "learning_rate": 7.433685517615831e-06, + "loss": 0.7284054160118103, + "step": 1397 + }, + { + "epoch": 1.8136747977370886, + "grad_norm": 0.6562127470970154, + "learning_rate": 7.4198675787621185e-06, + "loss": 0.721833348274231, + "step": 1398 + }, + { + "epoch": 1.8149725246872275, + "grad_norm": 0.6957826614379883, + "learning_rate": 7.406054915626172e-06, + "loss": 0.6763690114021301, + "step": 1399 + }, + { + "epoch": 1.8162702516373663, + "grad_norm": 0.758056104183197, + "learning_rate": 7.392247556451382e-06, + "loss": 0.7644186615943909, + "step": 1400 + }, + { + "epoch": 1.8175679785875052, + "grad_norm": 0.6855806708335876, + "learning_rate": 7.378445529470303e-06, + "loss": 0.7499503493309021, + "step": 1401 + }, + { + "epoch": 1.8188657055376443, + "grad_norm": 0.7280805706977844, + "learning_rate": 7.364648862904593e-06, + "loss": 0.7766327261924744, + "step": 1402 + }, + { + "epoch": 1.8201634324877831, + "grad_norm": 0.7023898959159851, + "learning_rate": 7.35085758496494e-06, + "loss": 0.6799028515815735, + "step": 1403 + }, + { + "epoch": 1.821461159437922, + "grad_norm": 0.696554958820343, + "learning_rate": 7.337071723851018e-06, + "loss": 0.6930332183837891, + "step": 1404 + }, + { + "epoch": 1.822758886388061, + "grad_norm": 0.7462826371192932, + "learning_rate": 7.323291307751418e-06, + "loss": 0.7603926658630371, + "step": 1405 + }, + { + "epoch": 1.8240566133382, + "grad_norm": 0.6899564266204834, + "learning_rate": 7.3095163648436115e-06, + "loss": 0.6602949500083923, + "step": 1406 + }, + { + "epoch": 1.8253543402883388, + "grad_norm": 0.7230206727981567, + "learning_rate": 7.295746923293865e-06, + "loss": 0.7429470419883728, + "step": 1407 + }, + { + "epoch": 1.8266520672384776, + "grad_norm": 0.6691879034042358, + "learning_rate": 7.2819830112572035e-06, + "loss": 0.7018039226531982, + "step": 1408 + }, + { + "epoch": 1.8279497941886165, + "grad_norm": 0.7611459493637085, + "learning_rate": 7.268224656877339e-06, + "loss": 0.7324895262718201, + "step": 1409 + }, + { + "epoch": 1.8292475211387553, + "grad_norm": 0.7313300967216492, + "learning_rate": 7.25447188828663e-06, + "loss": 0.7643807530403137, + "step": 1410 + }, + { + "epoch": 1.8305452480888942, + "grad_norm": 0.7345109581947327, + "learning_rate": 7.240724733606002e-06, + "loss": 0.7648757696151733, + "step": 1411 + }, + { + "epoch": 1.831842975039033, + "grad_norm": 0.6995144486427307, + "learning_rate": 7.2269832209449145e-06, + "loss": 0.6826534271240234, + "step": 1412 + }, + { + "epoch": 1.8331407019891721, + "grad_norm": 0.6842563152313232, + "learning_rate": 7.213247378401274e-06, + "loss": 0.7718407511711121, + "step": 1413 + }, + { + "epoch": 1.834438428939311, + "grad_norm": 0.6925626397132874, + "learning_rate": 7.199517234061408e-06, + "loss": 0.7063374519348145, + "step": 1414 + }, + { + "epoch": 1.83573615588945, + "grad_norm": 0.7153764963150024, + "learning_rate": 7.1857928159999814e-06, + "loss": 0.7116506695747375, + "step": 1415 + }, + { + "epoch": 1.837033882839589, + "grad_norm": 0.7008180022239685, + "learning_rate": 7.172074152279963e-06, + "loss": 0.6926634311676025, + "step": 1416 + }, + { + "epoch": 1.8383316097897278, + "grad_norm": 0.695785641670227, + "learning_rate": 7.1583612709525405e-06, + "loss": 0.7824428081512451, + "step": 1417 + }, + { + "epoch": 1.8396293367398666, + "grad_norm": 0.7137957215309143, + "learning_rate": 7.14465420005709e-06, + "loss": 0.7480607032775879, + "step": 1418 + }, + { + "epoch": 1.8409270636900055, + "grad_norm": 0.6970608234405518, + "learning_rate": 7.130952967621096e-06, + "loss": 0.6973427534103394, + "step": 1419 + }, + { + "epoch": 1.8422247906401443, + "grad_norm": 0.7116836309432983, + "learning_rate": 7.11725760166012e-06, + "loss": 0.7084696292877197, + "step": 1420 + }, + { + "epoch": 1.8435225175902832, + "grad_norm": 0.7125561833381653, + "learning_rate": 7.103568130177713e-06, + "loss": 0.6803657412528992, + "step": 1421 + }, + { + "epoch": 1.844820244540422, + "grad_norm": 0.66914963722229, + "learning_rate": 7.089884581165382e-06, + "loss": 0.6364957690238953, + "step": 1422 + }, + { + "epoch": 1.8461179714905611, + "grad_norm": 0.7396631240844727, + "learning_rate": 7.076206982602516e-06, + "loss": 0.7236162424087524, + "step": 1423 + }, + { + "epoch": 1.8474156984407, + "grad_norm": 0.7191373109817505, + "learning_rate": 7.06253536245635e-06, + "loss": 0.7462475895881653, + "step": 1424 + }, + { + "epoch": 1.8487134253908388, + "grad_norm": 0.7262799143791199, + "learning_rate": 7.048869748681879e-06, + "loss": 0.7678788900375366, + "step": 1425 + }, + { + "epoch": 1.850011152340978, + "grad_norm": 0.7085245847702026, + "learning_rate": 7.035210169221834e-06, + "loss": 0.7576820850372314, + "step": 1426 + }, + { + "epoch": 1.8513088792911168, + "grad_norm": 0.7027114629745483, + "learning_rate": 7.021556652006588e-06, + "loss": 0.755644679069519, + "step": 1427 + }, + { + "epoch": 1.8526066062412556, + "grad_norm": 0.6858870387077332, + "learning_rate": 7.007909224954135e-06, + "loss": 0.7338079810142517, + "step": 1428 + }, + { + "epoch": 1.8539043331913945, + "grad_norm": 0.7013359069824219, + "learning_rate": 6.994267915970003e-06, + "loss": 0.7038964033126831, + "step": 1429 + }, + { + "epoch": 1.8552020601415333, + "grad_norm": 0.7172896265983582, + "learning_rate": 6.980632752947221e-06, + "loss": 0.7479324340820312, + "step": 1430 + }, + { + "epoch": 1.8564997870916722, + "grad_norm": 0.7214548587799072, + "learning_rate": 6.967003763766247e-06, + "loss": 0.7139613032341003, + "step": 1431 + }, + { + "epoch": 1.857797514041811, + "grad_norm": 0.730970025062561, + "learning_rate": 6.953380976294907e-06, + "loss": 0.765926718711853, + "step": 1432 + }, + { + "epoch": 1.85909524099195, + "grad_norm": 0.6703609824180603, + "learning_rate": 6.9397644183883616e-06, + "loss": 0.7193933129310608, + "step": 1433 + }, + { + "epoch": 1.860392967942089, + "grad_norm": 0.6499923467636108, + "learning_rate": 6.926154117889022e-06, + "loss": 0.6723966002464294, + "step": 1434 + }, + { + "epoch": 1.8616906948922278, + "grad_norm": 0.7143534421920776, + "learning_rate": 6.91255010262651e-06, + "loss": 0.7171000838279724, + "step": 1435 + }, + { + "epoch": 1.8629884218423667, + "grad_norm": 0.6932517290115356, + "learning_rate": 6.898952400417587e-06, + "loss": 0.6997263431549072, + "step": 1436 + }, + { + "epoch": 1.8642861487925058, + "grad_norm": 0.7429547905921936, + "learning_rate": 6.885361039066121e-06, + "loss": 0.780619204044342, + "step": 1437 + }, + { + "epoch": 1.8655838757426446, + "grad_norm": 0.7190982699394226, + "learning_rate": 6.8717760463629965e-06, + "loss": 0.7348355054855347, + "step": 1438 + }, + { + "epoch": 1.8668816026927835, + "grad_norm": 0.7007834315299988, + "learning_rate": 6.858197450086097e-06, + "loss": 0.7280945181846619, + "step": 1439 + }, + { + "epoch": 1.8681793296429223, + "grad_norm": 0.7208773493766785, + "learning_rate": 6.844625278000205e-06, + "loss": 0.775151252746582, + "step": 1440 + }, + { + "epoch": 1.8694770565930612, + "grad_norm": 0.6837726831436157, + "learning_rate": 6.831059557856984e-06, + "loss": 0.7308005094528198, + "step": 1441 + }, + { + "epoch": 1.8707747835432, + "grad_norm": 0.6819126009941101, + "learning_rate": 6.81750031739489e-06, + "loss": 0.6529159545898438, + "step": 1442 + }, + { + "epoch": 1.872072510493339, + "grad_norm": 0.6784840226173401, + "learning_rate": 6.803947584339148e-06, + "loss": 0.6919572949409485, + "step": 1443 + }, + { + "epoch": 1.8733702374434777, + "grad_norm": 0.6869913935661316, + "learning_rate": 6.79040138640166e-06, + "loss": 0.6871669888496399, + "step": 1444 + }, + { + "epoch": 1.8746679643936168, + "grad_norm": 0.7124300599098206, + "learning_rate": 6.7768617512809745e-06, + "loss": 0.7206623554229736, + "step": 1445 + }, + { + "epoch": 1.8759656913437557, + "grad_norm": 0.71539306640625, + "learning_rate": 6.763328706662214e-06, + "loss": 0.7108519673347473, + "step": 1446 + }, + { + "epoch": 1.8772634182938948, + "grad_norm": 0.7159188985824585, + "learning_rate": 6.749802280217037e-06, + "loss": 0.7131993770599365, + "step": 1447 + }, + { + "epoch": 1.8785611452440336, + "grad_norm": 0.722147524356842, + "learning_rate": 6.7362824996035545e-06, + "loss": 0.6998387575149536, + "step": 1448 + }, + { + "epoch": 1.8798588721941725, + "grad_norm": 0.7286826968193054, + "learning_rate": 6.722769392466304e-06, + "loss": 0.7367603778839111, + "step": 1449 + }, + { + "epoch": 1.8811565991443113, + "grad_norm": 0.7212167382240295, + "learning_rate": 6.709262986436162e-06, + "loss": 0.7357022762298584, + "step": 1450 + }, + { + "epoch": 1.8824543260944502, + "grad_norm": 0.7026610374450684, + "learning_rate": 6.695763309130318e-06, + "loss": 0.7126086354255676, + "step": 1451 + }, + { + "epoch": 1.883752053044589, + "grad_norm": 0.7145894169807434, + "learning_rate": 6.682270388152185e-06, + "loss": 0.6773615479469299, + "step": 1452 + }, + { + "epoch": 1.885049779994728, + "grad_norm": 0.7049593925476074, + "learning_rate": 6.668784251091381e-06, + "loss": 0.6776928305625916, + "step": 1453 + }, + { + "epoch": 1.8863475069448667, + "grad_norm": 0.699505627155304, + "learning_rate": 6.655304925523635e-06, + "loss": 0.6610416173934937, + "step": 1454 + }, + { + "epoch": 1.8876452338950058, + "grad_norm": 0.7056293487548828, + "learning_rate": 6.641832439010765e-06, + "loss": 0.6919702291488647, + "step": 1455 + }, + { + "epoch": 1.8889429608451447, + "grad_norm": 0.702669084072113, + "learning_rate": 6.628366819100586e-06, + "loss": 0.682940661907196, + "step": 1456 + }, + { + "epoch": 1.8902406877952835, + "grad_norm": 0.6931704878807068, + "learning_rate": 6.614908093326891e-06, + "loss": 0.7477650046348572, + "step": 1457 + }, + { + "epoch": 1.8915384147454226, + "grad_norm": 0.7257412075996399, + "learning_rate": 6.601456289209362e-06, + "loss": 0.774404764175415, + "step": 1458 + }, + { + "epoch": 1.8928361416955615, + "grad_norm": 0.6645631194114685, + "learning_rate": 6.588011434253534e-06, + "loss": 0.647753119468689, + "step": 1459 + }, + { + "epoch": 1.8941338686457003, + "grad_norm": 0.70735102891922, + "learning_rate": 6.574573555950738e-06, + "loss": 0.6710544228553772, + "step": 1460 + }, + { + "epoch": 1.8954315955958392, + "grad_norm": 0.7064939141273499, + "learning_rate": 6.561142681778027e-06, + "loss": 0.6929414868354797, + "step": 1461 + }, + { + "epoch": 1.896729322545978, + "grad_norm": 0.6896395683288574, + "learning_rate": 6.547718839198145e-06, + "loss": 0.6804373264312744, + "step": 1462 + }, + { + "epoch": 1.898027049496117, + "grad_norm": 0.726024329662323, + "learning_rate": 6.53430205565945e-06, + "loss": 0.7252693772315979, + "step": 1463 + }, + { + "epoch": 1.8993247764462557, + "grad_norm": 0.7104306817054749, + "learning_rate": 6.520892358595869e-06, + "loss": 0.7321268916130066, + "step": 1464 + }, + { + "epoch": 1.9006225033963946, + "grad_norm": 0.680915892124176, + "learning_rate": 6.507489775426834e-06, + "loss": 0.7166538238525391, + "step": 1465 + }, + { + "epoch": 1.9019202303465337, + "grad_norm": 0.7132366895675659, + "learning_rate": 6.494094333557243e-06, + "loss": 0.708162784576416, + "step": 1466 + }, + { + "epoch": 1.9032179572966725, + "grad_norm": 0.6515333652496338, + "learning_rate": 6.4807060603773795e-06, + "loss": 0.7163029313087463, + "step": 1467 + }, + { + "epoch": 1.9045156842468114, + "grad_norm": 0.7042413353919983, + "learning_rate": 6.467324983262877e-06, + "loss": 0.6881014704704285, + "step": 1468 + }, + { + "epoch": 1.9058134111969505, + "grad_norm": 0.6660881042480469, + "learning_rate": 6.453951129574644e-06, + "loss": 0.678939938545227, + "step": 1469 + }, + { + "epoch": 1.9071111381470893, + "grad_norm": 0.7373862266540527, + "learning_rate": 6.4405845266588356e-06, + "loss": 0.7181136608123779, + "step": 1470 + }, + { + "epoch": 1.9084088650972282, + "grad_norm": 0.7122411727905273, + "learning_rate": 6.427225201846763e-06, + "loss": 0.6904677748680115, + "step": 1471 + }, + { + "epoch": 1.909706592047367, + "grad_norm": 0.7414330244064331, + "learning_rate": 6.413873182454873e-06, + "loss": 0.7363246083259583, + "step": 1472 + }, + { + "epoch": 1.911004318997506, + "grad_norm": 0.6871086359024048, + "learning_rate": 6.4005284957846546e-06, + "loss": 0.6799793243408203, + "step": 1473 + }, + { + "epoch": 1.9123020459476447, + "grad_norm": 0.7056854963302612, + "learning_rate": 6.3871911691226276e-06, + "loss": 0.7036612033843994, + "step": 1474 + }, + { + "epoch": 1.9135997728977836, + "grad_norm": 0.7454568147659302, + "learning_rate": 6.373861229740237e-06, + "loss": 0.7416712045669556, + "step": 1475 + }, + { + "epoch": 1.9148974998479225, + "grad_norm": 0.6941256523132324, + "learning_rate": 6.360538704893845e-06, + "loss": 0.6659767031669617, + "step": 1476 + }, + { + "epoch": 1.9161952267980615, + "grad_norm": 0.7420505881309509, + "learning_rate": 6.3472236218246366e-06, + "loss": 0.7747020721435547, + "step": 1477 + }, + { + "epoch": 1.9174929537482004, + "grad_norm": 0.7113460302352905, + "learning_rate": 6.333916007758591e-06, + "loss": 0.7053021788597107, + "step": 1478 + }, + { + "epoch": 1.9187906806983395, + "grad_norm": 0.7145473957061768, + "learning_rate": 6.320615889906403e-06, + "loss": 0.7014235258102417, + "step": 1479 + }, + { + "epoch": 1.9200884076484783, + "grad_norm": 0.7099266052246094, + "learning_rate": 6.307323295463457e-06, + "loss": 0.7599897980690002, + "step": 1480 + }, + { + "epoch": 1.9213861345986172, + "grad_norm": 0.7005822062492371, + "learning_rate": 6.294038251609738e-06, + "loss": 0.6990090608596802, + "step": 1481 + }, + { + "epoch": 1.922683861548756, + "grad_norm": 0.6796419620513916, + "learning_rate": 6.280760785509802e-06, + "loss": 0.6529797911643982, + "step": 1482 + }, + { + "epoch": 1.923981588498895, + "grad_norm": 0.7199534773826599, + "learning_rate": 6.2674909243127e-06, + "loss": 0.714480459690094, + "step": 1483 + }, + { + "epoch": 1.9252793154490337, + "grad_norm": 0.7127954959869385, + "learning_rate": 6.254228695151949e-06, + "loss": 0.7583557367324829, + "step": 1484 + }, + { + "epoch": 1.9265770423991726, + "grad_norm": 0.7017828226089478, + "learning_rate": 6.240974125145443e-06, + "loss": 0.6976377367973328, + "step": 1485 + }, + { + "epoch": 1.9278747693493115, + "grad_norm": 0.6956459283828735, + "learning_rate": 6.227727241395429e-06, + "loss": 0.7237988114356995, + "step": 1486 + }, + { + "epoch": 1.9291724962994505, + "grad_norm": 0.7250760197639465, + "learning_rate": 6.214488070988424e-06, + "loss": 0.705412745475769, + "step": 1487 + }, + { + "epoch": 1.9304702232495894, + "grad_norm": 0.72161465883255, + "learning_rate": 6.201256640995184e-06, + "loss": 0.6755847930908203, + "step": 1488 + }, + { + "epoch": 1.9317679501997282, + "grad_norm": 0.6741456389427185, + "learning_rate": 6.188032978470639e-06, + "loss": 0.7194631099700928, + "step": 1489 + }, + { + "epoch": 1.9330656771498673, + "grad_norm": 0.6884588003158569, + "learning_rate": 6.174817110453828e-06, + "loss": 0.6863330006599426, + "step": 1490 + }, + { + "epoch": 1.9343634041000062, + "grad_norm": 0.7027184963226318, + "learning_rate": 6.161609063967857e-06, + "loss": 0.7379326224327087, + "step": 1491 + }, + { + "epoch": 1.935661131050145, + "grad_norm": 0.7299201488494873, + "learning_rate": 6.1484088660198325e-06, + "loss": 0.7956094145774841, + "step": 1492 + }, + { + "epoch": 1.936958858000284, + "grad_norm": 0.7009000182151794, + "learning_rate": 6.135216543600828e-06, + "loss": 0.7050310373306274, + "step": 1493 + }, + { + "epoch": 1.9382565849504227, + "grad_norm": 0.7212353944778442, + "learning_rate": 6.1220321236857974e-06, + "loss": 0.7898357510566711, + "step": 1494 + }, + { + "epoch": 1.9395543119005616, + "grad_norm": 0.7044717669487, + "learning_rate": 6.108855633233546e-06, + "loss": 0.7022029757499695, + "step": 1495 + }, + { + "epoch": 1.9408520388507005, + "grad_norm": 0.6811977624893188, + "learning_rate": 6.0956870991866545e-06, + "loss": 0.6920107007026672, + "step": 1496 + }, + { + "epoch": 1.9421497658008393, + "grad_norm": 0.6873610019683838, + "learning_rate": 6.0825265484714526e-06, + "loss": 0.6889206767082214, + "step": 1497 + }, + { + "epoch": 1.9434474927509784, + "grad_norm": 0.7255538702011108, + "learning_rate": 6.0693740079979235e-06, + "loss": 0.763762891292572, + "step": 1498 + }, + { + "epoch": 1.9447452197011172, + "grad_norm": 0.6617857217788696, + "learning_rate": 6.056229504659696e-06, + "loss": 0.65453040599823, + "step": 1499 + }, + { + "epoch": 1.9460429466512563, + "grad_norm": 0.7204879522323608, + "learning_rate": 6.043093065333945e-06, + "loss": 0.6839476823806763, + "step": 1500 + }, + { + "epoch": 1.9473406736013952, + "grad_norm": 0.695447564125061, + "learning_rate": 6.029964716881367e-06, + "loss": 0.6658032536506653, + "step": 1501 + }, + { + "epoch": 1.948638400551534, + "grad_norm": 0.6816181540489197, + "learning_rate": 6.016844486146106e-06, + "loss": 0.7248274087905884, + "step": 1502 + }, + { + "epoch": 1.9499361275016729, + "grad_norm": 0.7379606366157532, + "learning_rate": 6.003732399955722e-06, + "loss": 0.6768795251846313, + "step": 1503 + }, + { + "epoch": 1.9512338544518117, + "grad_norm": 0.6998269557952881, + "learning_rate": 5.990628485121106e-06, + "loss": 0.6504592895507812, + "step": 1504 + }, + { + "epoch": 1.9525315814019506, + "grad_norm": 0.7351219654083252, + "learning_rate": 5.97753276843645e-06, + "loss": 0.7741858959197998, + "step": 1505 + }, + { + "epoch": 1.9538293083520895, + "grad_norm": 0.6803948283195496, + "learning_rate": 5.964445276679176e-06, + "loss": 0.6615405678749084, + "step": 1506 + }, + { + "epoch": 1.9551270353022283, + "grad_norm": 0.7318346500396729, + "learning_rate": 5.9513660366099005e-06, + "loss": 0.7087497115135193, + "step": 1507 + }, + { + "epoch": 1.9564247622523674, + "grad_norm": 0.7118584513664246, + "learning_rate": 5.93829507497235e-06, + "loss": 0.647581934928894, + "step": 1508 + }, + { + "epoch": 1.9577224892025062, + "grad_norm": 0.7135505080223083, + "learning_rate": 5.925232418493338e-06, + "loss": 0.7108398079872131, + "step": 1509 + }, + { + "epoch": 1.959020216152645, + "grad_norm": 0.6982471346855164, + "learning_rate": 5.912178093882688e-06, + "loss": 0.7022315859794617, + "step": 1510 + }, + { + "epoch": 1.9603179431027842, + "grad_norm": 0.7076136469841003, + "learning_rate": 5.8991321278331934e-06, + "loss": 0.6406600475311279, + "step": 1511 + }, + { + "epoch": 1.961615670052923, + "grad_norm": 0.7392069101333618, + "learning_rate": 5.8860945470205466e-06, + "loss": 0.7887027859687805, + "step": 1512 + }, + { + "epoch": 1.9629133970030619, + "grad_norm": 0.7483602166175842, + "learning_rate": 5.8730653781033085e-06, + "loss": 0.7219119668006897, + "step": 1513 + }, + { + "epoch": 1.9642111239532007, + "grad_norm": 0.7024926543235779, + "learning_rate": 5.860044647722827e-06, + "loss": 0.7041683793067932, + "step": 1514 + }, + { + "epoch": 1.9655088509033396, + "grad_norm": 0.6939775347709656, + "learning_rate": 5.847032382503202e-06, + "loss": 0.6798254251480103, + "step": 1515 + }, + { + "epoch": 1.9668065778534785, + "grad_norm": 0.700524628162384, + "learning_rate": 5.834028609051218e-06, + "loss": 0.731053352355957, + "step": 1516 + }, + { + "epoch": 1.9681043048036173, + "grad_norm": 0.7189422845840454, + "learning_rate": 5.8210333539563e-06, + "loss": 0.6871148347854614, + "step": 1517 + }, + { + "epoch": 1.9694020317537562, + "grad_norm": 0.7418919205665588, + "learning_rate": 5.808046643790468e-06, + "loss": 0.7469598054885864, + "step": 1518 + }, + { + "epoch": 1.9706997587038952, + "grad_norm": 0.6783238649368286, + "learning_rate": 5.795068505108243e-06, + "loss": 0.6897709369659424, + "step": 1519 + }, + { + "epoch": 1.971997485654034, + "grad_norm": 0.7016989588737488, + "learning_rate": 5.782098964446641e-06, + "loss": 0.6978930830955505, + "step": 1520 + }, + { + "epoch": 1.973295212604173, + "grad_norm": 0.6924634575843811, + "learning_rate": 5.769138048325087e-06, + "loss": 0.6557913422584534, + "step": 1521 + }, + { + "epoch": 1.974592939554312, + "grad_norm": 0.6980036497116089, + "learning_rate": 5.756185783245376e-06, + "loss": 0.6883926391601562, + "step": 1522 + }, + { + "epoch": 1.9758906665044509, + "grad_norm": 0.6666119694709778, + "learning_rate": 5.743242195691612e-06, + "loss": 0.696445107460022, + "step": 1523 + }, + { + "epoch": 1.9771883934545897, + "grad_norm": 0.7082392573356628, + "learning_rate": 5.730307312130152e-06, + "loss": 0.7830109596252441, + "step": 1524 + }, + { + "epoch": 1.9784861204047286, + "grad_norm": 0.7415315508842468, + "learning_rate": 5.717381159009563e-06, + "loss": 0.6982215642929077, + "step": 1525 + }, + { + "epoch": 1.9797838473548675, + "grad_norm": 0.7484350800514221, + "learning_rate": 5.704463762760559e-06, + "loss": 0.727252721786499, + "step": 1526 + }, + { + "epoch": 1.9810815743050063, + "grad_norm": 0.6809999346733093, + "learning_rate": 5.691555149795933e-06, + "loss": 0.794657826423645, + "step": 1527 + }, + { + "epoch": 1.9823793012551452, + "grad_norm": 0.7138223648071289, + "learning_rate": 5.678655346510549e-06, + "loss": 0.7287296056747437, + "step": 1528 + }, + { + "epoch": 1.983677028205284, + "grad_norm": 0.6818944215774536, + "learning_rate": 5.6657643792812265e-06, + "loss": 0.6768350601196289, + "step": 1529 + }, + { + "epoch": 1.984974755155423, + "grad_norm": 0.7276642918586731, + "learning_rate": 5.652882274466736e-06, + "loss": 0.7598171830177307, + "step": 1530 + }, + { + "epoch": 1.986272482105562, + "grad_norm": 0.6802821159362793, + "learning_rate": 5.640009058407719e-06, + "loss": 0.682623028755188, + "step": 1531 + }, + { + "epoch": 1.987570209055701, + "grad_norm": 0.7515146732330322, + "learning_rate": 5.627144757426647e-06, + "loss": 0.7861851453781128, + "step": 1532 + }, + { + "epoch": 1.9888679360058399, + "grad_norm": 0.7353605628013611, + "learning_rate": 5.614289397827757e-06, + "loss": 0.7634737491607666, + "step": 1533 + }, + { + "epoch": 1.9901656629559787, + "grad_norm": 0.7560073137283325, + "learning_rate": 5.601443005897012e-06, + "loss": 0.7616620659828186, + "step": 1534 + }, + { + "epoch": 1.9914633899061176, + "grad_norm": 0.7289350628852844, + "learning_rate": 5.588605607902017e-06, + "loss": 0.7190179824829102, + "step": 1535 + }, + { + "epoch": 1.9927611168562565, + "grad_norm": 0.7019691467285156, + "learning_rate": 5.57577723009202e-06, + "loss": 0.671945333480835, + "step": 1536 + }, + { + "epoch": 1.9940588438063953, + "grad_norm": 0.6952185034751892, + "learning_rate": 5.5629578986977894e-06, + "loss": 0.7416089177131653, + "step": 1537 + }, + { + "epoch": 1.9953565707565342, + "grad_norm": 0.7558557987213135, + "learning_rate": 5.550147639931631e-06, + "loss": 0.7460814714431763, + "step": 1538 + }, + { + "epoch": 1.996654297706673, + "grad_norm": 0.6997542381286621, + "learning_rate": 5.537346479987269e-06, + "loss": 0.7162995338439941, + "step": 1539 + }, + { + "epoch": 1.997952024656812, + "grad_norm": 0.7319507002830505, + "learning_rate": 5.524554445039838e-06, + "loss": 0.7580918669700623, + "step": 1540 + }, + { + "epoch": 1.999249751606951, + "grad_norm": 0.7187158465385437, + "learning_rate": 5.511771561245813e-06, + "loss": 0.6829614043235779, + "step": 1541 + }, + { + "epoch": 2.0, + "grad_norm": 0.8756005167961121, + "learning_rate": 5.498997854742956e-06, + "loss": 0.654055118560791, + "step": 1542 + }, + { + "epoch": 2.001297726950139, + "grad_norm": 0.884756326675415, + "learning_rate": 5.4862333516502634e-06, + "loss": 0.6550735831260681, + "step": 1543 + }, + { + "epoch": 2.0025954539002777, + "grad_norm": 0.8835470080375671, + "learning_rate": 5.473478078067913e-06, + "loss": 0.7326578497886658, + "step": 1544 + }, + { + "epoch": 2.0038931808504166, + "grad_norm": 0.8778272867202759, + "learning_rate": 5.460732060077212e-06, + "loss": 0.6050289273262024, + "step": 1545 + }, + { + "epoch": 2.0051909078005554, + "grad_norm": 0.7473064661026001, + "learning_rate": 5.44799532374054e-06, + "loss": 0.6881033182144165, + "step": 1546 + }, + { + "epoch": 2.0064886347506943, + "grad_norm": 0.7708891034126282, + "learning_rate": 5.435267895101303e-06, + "loss": 0.6227023005485535, + "step": 1547 + }, + { + "epoch": 2.0077863617008336, + "grad_norm": 0.7482177019119263, + "learning_rate": 5.422549800183861e-06, + "loss": 0.6618348360061646, + "step": 1548 + }, + { + "epoch": 2.0090840886509724, + "grad_norm": 0.7345021963119507, + "learning_rate": 5.409841064993512e-06, + "loss": 0.6520942449569702, + "step": 1549 + }, + { + "epoch": 2.0103818156011113, + "grad_norm": 0.7631828188896179, + "learning_rate": 5.39714171551639e-06, + "loss": 0.6233668923377991, + "step": 1550 + }, + { + "epoch": 2.01167954255125, + "grad_norm": 0.813840925693512, + "learning_rate": 5.384451777719464e-06, + "loss": 0.7311254739761353, + "step": 1551 + }, + { + "epoch": 2.012977269501389, + "grad_norm": 0.8413859009742737, + "learning_rate": 5.371771277550432e-06, + "loss": 0.7018522024154663, + "step": 1552 + }, + { + "epoch": 2.014274996451528, + "grad_norm": 0.7750846147537231, + "learning_rate": 5.359100240937717e-06, + "loss": 0.6850703954696655, + "step": 1553 + }, + { + "epoch": 2.0155727234016667, + "grad_norm": 0.7778939604759216, + "learning_rate": 5.3464386937903764e-06, + "loss": 0.6811778545379639, + "step": 1554 + }, + { + "epoch": 2.0168704503518056, + "grad_norm": 0.7875815033912659, + "learning_rate": 5.33378666199807e-06, + "loss": 0.6062582731246948, + "step": 1555 + }, + { + "epoch": 2.0181681773019444, + "grad_norm": 0.8213943839073181, + "learning_rate": 5.321144171431003e-06, + "loss": 0.6217991709709167, + "step": 1556 + }, + { + "epoch": 2.0194659042520833, + "grad_norm": 0.8762441873550415, + "learning_rate": 5.308511247939872e-06, + "loss": 0.6675798296928406, + "step": 1557 + }, + { + "epoch": 2.0207636312022226, + "grad_norm": 0.7665208578109741, + "learning_rate": 5.295887917355794e-06, + "loss": 0.6503481268882751, + "step": 1558 + }, + { + "epoch": 2.0220613581523614, + "grad_norm": 0.7740142941474915, + "learning_rate": 5.283274205490303e-06, + "loss": 0.6113878488540649, + "step": 1559 + }, + { + "epoch": 2.0233590851025003, + "grad_norm": 0.7948552966117859, + "learning_rate": 5.270670138135234e-06, + "loss": 0.7041577100753784, + "step": 1560 + }, + { + "epoch": 2.024656812052639, + "grad_norm": 0.732266366481781, + "learning_rate": 5.25807574106272e-06, + "loss": 0.683874785900116, + "step": 1561 + }, + { + "epoch": 2.025954539002778, + "grad_norm": 0.7335087060928345, + "learning_rate": 5.245491040025115e-06, + "loss": 0.6318987011909485, + "step": 1562 + }, + { + "epoch": 2.027252265952917, + "grad_norm": 0.7172908186912537, + "learning_rate": 5.232916060754947e-06, + "loss": 0.6631210446357727, + "step": 1563 + }, + { + "epoch": 2.0285499929030557, + "grad_norm": 0.7232309579849243, + "learning_rate": 5.220350828964865e-06, + "loss": 0.6236647367477417, + "step": 1564 + }, + { + "epoch": 2.0298477198531946, + "grad_norm": 0.727989137172699, + "learning_rate": 5.207795370347588e-06, + "loss": 0.6853646039962769, + "step": 1565 + }, + { + "epoch": 2.0311454468033334, + "grad_norm": 0.7468066215515137, + "learning_rate": 5.195249710575853e-06, + "loss": 0.6544186472892761, + "step": 1566 + }, + { + "epoch": 2.0324431737534723, + "grad_norm": 0.7399063110351562, + "learning_rate": 5.182713875302361e-06, + "loss": 0.6106476783752441, + "step": 1567 + }, + { + "epoch": 2.033740900703611, + "grad_norm": 0.7420501708984375, + "learning_rate": 5.1701878901597106e-06, + "loss": 0.715307891368866, + "step": 1568 + }, + { + "epoch": 2.0350386276537504, + "grad_norm": 0.7202077507972717, + "learning_rate": 5.157671780760385e-06, + "loss": 0.6406188607215881, + "step": 1569 + }, + { + "epoch": 2.0363363546038893, + "grad_norm": 0.7133172154426575, + "learning_rate": 5.145165572696652e-06, + "loss": 0.6294587850570679, + "step": 1570 + }, + { + "epoch": 2.037634081554028, + "grad_norm": 0.7211350798606873, + "learning_rate": 5.132669291540544e-06, + "loss": 0.6074943542480469, + "step": 1571 + }, + { + "epoch": 2.038931808504167, + "grad_norm": 0.7271124124526978, + "learning_rate": 5.1201829628437926e-06, + "loss": 0.6158304214477539, + "step": 1572 + }, + { + "epoch": 2.040229535454306, + "grad_norm": 0.7051241397857666, + "learning_rate": 5.107706612137776e-06, + "loss": 0.6632368564605713, + "step": 1573 + }, + { + "epoch": 2.0415272624044447, + "grad_norm": 0.7206335067749023, + "learning_rate": 5.095240264933486e-06, + "loss": 0.6133254766464233, + "step": 1574 + }, + { + "epoch": 2.0428249893545836, + "grad_norm": 0.7106805443763733, + "learning_rate": 5.082783946721434e-06, + "loss": 0.629423201084137, + "step": 1575 + }, + { + "epoch": 2.0441227163047224, + "grad_norm": 0.7104700207710266, + "learning_rate": 5.070337682971642e-06, + "loss": 0.6985434293746948, + "step": 1576 + }, + { + "epoch": 2.0454204432548613, + "grad_norm": 0.6845932006835938, + "learning_rate": 5.057901499133573e-06, + "loss": 0.6254795789718628, + "step": 1577 + }, + { + "epoch": 2.046718170205, + "grad_norm": 0.7214529514312744, + "learning_rate": 5.0454754206360705e-06, + "loss": 0.6072602868080139, + "step": 1578 + }, + { + "epoch": 2.048015897155139, + "grad_norm": 0.71996009349823, + "learning_rate": 5.033059472887322e-06, + "loss": 0.6534575819969177, + "step": 1579 + }, + { + "epoch": 2.0493136241052783, + "grad_norm": 0.7217608690261841, + "learning_rate": 5.0206536812748004e-06, + "loss": 0.6317112445831299, + "step": 1580 + }, + { + "epoch": 2.050611351055417, + "grad_norm": 0.7069404125213623, + "learning_rate": 5.008258071165202e-06, + "loss": 0.6474272608757019, + "step": 1581 + }, + { + "epoch": 2.051909078005556, + "grad_norm": 0.7298946976661682, + "learning_rate": 4.995872667904424e-06, + "loss": 0.6893925666809082, + "step": 1582 + }, + { + "epoch": 2.053206804955695, + "grad_norm": 0.750266432762146, + "learning_rate": 4.98349749681747e-06, + "loss": 0.6087015271186829, + "step": 1583 + }, + { + "epoch": 2.0545045319058337, + "grad_norm": 0.7133123278617859, + "learning_rate": 4.971132583208438e-06, + "loss": 0.624868631362915, + "step": 1584 + }, + { + "epoch": 2.0558022588559726, + "grad_norm": 0.7388240694999695, + "learning_rate": 4.958777952360445e-06, + "loss": 0.6425670981407166, + "step": 1585 + }, + { + "epoch": 2.0570999858061114, + "grad_norm": 0.7531347870826721, + "learning_rate": 4.946433629535585e-06, + "loss": 0.6272885799407959, + "step": 1586 + }, + { + "epoch": 2.0583977127562503, + "grad_norm": 0.7500084042549133, + "learning_rate": 4.934099639974874e-06, + "loss": 0.6620087027549744, + "step": 1587 + }, + { + "epoch": 2.059695439706389, + "grad_norm": 0.708791196346283, + "learning_rate": 4.921776008898198e-06, + "loss": 0.5606707334518433, + "step": 1588 + }, + { + "epoch": 2.060993166656528, + "grad_norm": 0.7260934114456177, + "learning_rate": 4.909462761504264e-06, + "loss": 0.67381352186203, + "step": 1589 + }, + { + "epoch": 2.0622908936066673, + "grad_norm": 0.6928997039794922, + "learning_rate": 4.897159922970551e-06, + "loss": 0.6307032704353333, + "step": 1590 + }, + { + "epoch": 2.063588620556806, + "grad_norm": 0.7362192869186401, + "learning_rate": 4.884867518453238e-06, + "loss": 0.6901969313621521, + "step": 1591 + }, + { + "epoch": 2.064886347506945, + "grad_norm": 0.722802460193634, + "learning_rate": 4.872585573087195e-06, + "loss": 0.7266512513160706, + "step": 1592 + }, + { + "epoch": 2.066184074457084, + "grad_norm": 0.7570728659629822, + "learning_rate": 4.860314111985881e-06, + "loss": 0.7014977335929871, + "step": 1593 + }, + { + "epoch": 2.0674818014072227, + "grad_norm": 0.7065424919128418, + "learning_rate": 4.848053160241333e-06, + "loss": 0.623349130153656, + "step": 1594 + }, + { + "epoch": 2.0687795283573616, + "grad_norm": 0.7208600044250488, + "learning_rate": 4.835802742924091e-06, + "loss": 0.6265473961830139, + "step": 1595 + }, + { + "epoch": 2.0700772553075004, + "grad_norm": 0.7267877459526062, + "learning_rate": 4.823562885083161e-06, + "loss": 0.6631119251251221, + "step": 1596 + }, + { + "epoch": 2.0713749822576393, + "grad_norm": 0.7265859842300415, + "learning_rate": 4.811333611745953e-06, + "loss": 0.655154824256897, + "step": 1597 + }, + { + "epoch": 2.072672709207778, + "grad_norm": 0.7422747015953064, + "learning_rate": 4.799114947918238e-06, + "loss": 0.6400114297866821, + "step": 1598 + }, + { + "epoch": 2.073970436157917, + "grad_norm": 0.7720977663993835, + "learning_rate": 4.786906918584083e-06, + "loss": 0.6592541337013245, + "step": 1599 + }, + { + "epoch": 2.075268163108056, + "grad_norm": 0.741809606552124, + "learning_rate": 4.774709548705831e-06, + "loss": 0.6636130213737488, + "step": 1600 + }, + { + "epoch": 2.076565890058195, + "grad_norm": 0.7521026730537415, + "learning_rate": 4.762522863224001e-06, + "loss": 0.6645440459251404, + "step": 1601 + }, + { + "epoch": 2.077863617008334, + "grad_norm": 0.735471248626709, + "learning_rate": 4.750346887057292e-06, + "loss": 0.6191429495811462, + "step": 1602 + }, + { + "epoch": 2.079161343958473, + "grad_norm": 0.7346929907798767, + "learning_rate": 4.738181645102493e-06, + "loss": 0.616767406463623, + "step": 1603 + }, + { + "epoch": 2.0804590709086117, + "grad_norm": 0.7322461605072021, + "learning_rate": 4.726027162234434e-06, + "loss": 0.6997534036636353, + "step": 1604 + }, + { + "epoch": 2.0817567978587506, + "grad_norm": 0.7436448335647583, + "learning_rate": 4.713883463305972e-06, + "loss": 0.6780825853347778, + "step": 1605 + }, + { + "epoch": 2.0830545248088894, + "grad_norm": 0.7452847361564636, + "learning_rate": 4.701750573147885e-06, + "loss": 0.6652136445045471, + "step": 1606 + }, + { + "epoch": 2.0843522517590283, + "grad_norm": 0.7359369993209839, + "learning_rate": 4.689628516568866e-06, + "loss": 0.676584780216217, + "step": 1607 + }, + { + "epoch": 2.085649978709167, + "grad_norm": 0.7257094979286194, + "learning_rate": 4.677517318355455e-06, + "loss": 0.6461347937583923, + "step": 1608 + }, + { + "epoch": 2.086947705659306, + "grad_norm": 0.7261176705360413, + "learning_rate": 4.6654170032719825e-06, + "loss": 0.6190035343170166, + "step": 1609 + }, + { + "epoch": 2.088245432609445, + "grad_norm": 0.7273695468902588, + "learning_rate": 4.6533275960605355e-06, + "loss": 0.6539610028266907, + "step": 1610 + }, + { + "epoch": 2.088245432609445, + "eval_loss": 0.7521457076072693, + "eval_runtime": 140.3222, + "eval_samples_per_second": 37.001, + "eval_steps_per_second": 9.25, + "step": 1610 + }, + { + "epoch": 2.089543159559584, + "grad_norm": 0.7120246291160583, + "learning_rate": 4.641249121440892e-06, + "loss": 0.6520042419433594, + "step": 1611 + }, + { + "epoch": 2.090840886509723, + "grad_norm": 0.7543119788169861, + "learning_rate": 4.629181604110464e-06, + "loss": 0.6681778430938721, + "step": 1612 + }, + { + "epoch": 2.092138613459862, + "grad_norm": 0.7003790736198425, + "learning_rate": 4.617125068744288e-06, + "loss": 0.5710310935974121, + "step": 1613 + }, + { + "epoch": 2.0934363404100007, + "grad_norm": 0.7836804986000061, + "learning_rate": 4.605079539994911e-06, + "loss": 0.686365008354187, + "step": 1614 + }, + { + "epoch": 2.0947340673601396, + "grad_norm": 0.7372239828109741, + "learning_rate": 4.593045042492404e-06, + "loss": 0.684090256690979, + "step": 1615 + }, + { + "epoch": 2.0960317943102784, + "grad_norm": 0.7506935000419617, + "learning_rate": 4.581021600844258e-06, + "loss": 0.6425600647926331, + "step": 1616 + }, + { + "epoch": 2.0973295212604173, + "grad_norm": 0.7384741306304932, + "learning_rate": 4.569009239635374e-06, + "loss": 0.675249457359314, + "step": 1617 + }, + { + "epoch": 2.098627248210556, + "grad_norm": 0.7220048308372498, + "learning_rate": 4.557007983427987e-06, + "loss": 0.6857472658157349, + "step": 1618 + }, + { + "epoch": 2.099924975160695, + "grad_norm": 0.7698497772216797, + "learning_rate": 4.54501785676163e-06, + "loss": 0.6067232489585876, + "step": 1619 + }, + { + "epoch": 2.101222702110834, + "grad_norm": 0.7213151454925537, + "learning_rate": 4.533038884153077e-06, + "loss": 0.7489792704582214, + "step": 1620 + }, + { + "epoch": 2.1025204290609727, + "grad_norm": 0.7353917956352234, + "learning_rate": 4.521071090096298e-06, + "loss": 0.6004921793937683, + "step": 1621 + }, + { + "epoch": 2.103818156011112, + "grad_norm": 0.712821364402771, + "learning_rate": 4.509114499062393e-06, + "loss": 0.632519006729126, + "step": 1622 + }, + { + "epoch": 2.105115882961251, + "grad_norm": 0.7335408926010132, + "learning_rate": 4.4971691354995795e-06, + "loss": 0.6487690210342407, + "step": 1623 + }, + { + "epoch": 2.1064136099113897, + "grad_norm": 0.7657801508903503, + "learning_rate": 4.485235023833087e-06, + "loss": 0.7272740602493286, + "step": 1624 + }, + { + "epoch": 2.1077113368615286, + "grad_norm": 0.7787186503410339, + "learning_rate": 4.4733121884651665e-06, + "loss": 0.6530774235725403, + "step": 1625 + }, + { + "epoch": 2.1090090638116674, + "grad_norm": 0.7693159580230713, + "learning_rate": 4.46140065377499e-06, + "loss": 0.6131106019020081, + "step": 1626 + }, + { + "epoch": 2.1103067907618063, + "grad_norm": 0.7225230932235718, + "learning_rate": 4.449500444118633e-06, + "loss": 0.6403114199638367, + "step": 1627 + }, + { + "epoch": 2.111604517711945, + "grad_norm": 0.7100993990898132, + "learning_rate": 4.437611583829014e-06, + "loss": 0.6448891162872314, + "step": 1628 + }, + { + "epoch": 2.112902244662084, + "grad_norm": 0.6913020610809326, + "learning_rate": 4.42573409721584e-06, + "loss": 0.6105331778526306, + "step": 1629 + }, + { + "epoch": 2.114199971612223, + "grad_norm": 0.7184289693832397, + "learning_rate": 4.413868008565569e-06, + "loss": 0.6300491690635681, + "step": 1630 + }, + { + "epoch": 2.1154976985623617, + "grad_norm": 0.7327896356582642, + "learning_rate": 4.402013342141347e-06, + "loss": 0.5891982316970825, + "step": 1631 + }, + { + "epoch": 2.1167954255125006, + "grad_norm": 0.7524354457855225, + "learning_rate": 4.390170122182965e-06, + "loss": 0.6236910820007324, + "step": 1632 + }, + { + "epoch": 2.11809315246264, + "grad_norm": 0.69328373670578, + "learning_rate": 4.378338372906813e-06, + "loss": 0.6320694088935852, + "step": 1633 + }, + { + "epoch": 2.1193908794127787, + "grad_norm": 0.7765412926673889, + "learning_rate": 4.3665181185058255e-06, + "loss": 0.6867218613624573, + "step": 1634 + }, + { + "epoch": 2.1206886063629176, + "grad_norm": 0.7132006883621216, + "learning_rate": 4.354709383149421e-06, + "loss": 0.6264625787734985, + "step": 1635 + }, + { + "epoch": 2.1219863333130564, + "grad_norm": 0.7659435272216797, + "learning_rate": 4.342912190983487e-06, + "loss": 0.7046580910682678, + "step": 1636 + }, + { + "epoch": 2.1232840602631953, + "grad_norm": 0.7297986149787903, + "learning_rate": 4.331126566130284e-06, + "loss": 0.7077990174293518, + "step": 1637 + }, + { + "epoch": 2.124581787213334, + "grad_norm": 0.7537614107131958, + "learning_rate": 4.319352532688444e-06, + "loss": 0.652155876159668, + "step": 1638 + }, + { + "epoch": 2.125879514163473, + "grad_norm": 0.7315341234207153, + "learning_rate": 4.3075901147328745e-06, + "loss": 0.6733738780021667, + "step": 1639 + }, + { + "epoch": 2.127177241113612, + "grad_norm": 0.7361832857131958, + "learning_rate": 4.295839336314749e-06, + "loss": 0.635147750377655, + "step": 1640 + }, + { + "epoch": 2.1284749680637507, + "grad_norm": 0.7507902383804321, + "learning_rate": 4.284100221461432e-06, + "loss": 0.6047714948654175, + "step": 1641 + }, + { + "epoch": 2.1297726950138895, + "grad_norm": 0.7528434991836548, + "learning_rate": 4.272372794176446e-06, + "loss": 0.7513724565505981, + "step": 1642 + }, + { + "epoch": 2.1310704219640284, + "grad_norm": 0.7637490034103394, + "learning_rate": 4.260657078439409e-06, + "loss": 0.67987060546875, + "step": 1643 + }, + { + "epoch": 2.1323681489141677, + "grad_norm": 0.7283375859260559, + "learning_rate": 4.248953098205997e-06, + "loss": 0.6341656446456909, + "step": 1644 + }, + { + "epoch": 2.1336658758643066, + "grad_norm": 0.7419525980949402, + "learning_rate": 4.237260877407878e-06, + "loss": 0.6832218766212463, + "step": 1645 + }, + { + "epoch": 2.1349636028144454, + "grad_norm": 0.7223761081695557, + "learning_rate": 4.225580439952699e-06, + "loss": 0.6866045594215393, + "step": 1646 + }, + { + "epoch": 2.1362613297645843, + "grad_norm": 0.7388637065887451, + "learning_rate": 4.213911809723987e-06, + "loss": 0.6384668350219727, + "step": 1647 + }, + { + "epoch": 2.137559056714723, + "grad_norm": 0.755170464515686, + "learning_rate": 4.20225501058114e-06, + "loss": 0.6708781123161316, + "step": 1648 + }, + { + "epoch": 2.138856783664862, + "grad_norm": 0.7287908792495728, + "learning_rate": 4.190610066359364e-06, + "loss": 0.6631587743759155, + "step": 1649 + }, + { + "epoch": 2.140154510615001, + "grad_norm": 0.7358418107032776, + "learning_rate": 4.1789770008696205e-06, + "loss": 0.6789165735244751, + "step": 1650 + }, + { + "epoch": 2.1414522375651397, + "grad_norm": 0.7651984691619873, + "learning_rate": 4.167355837898585e-06, + "loss": 0.7314514517784119, + "step": 1651 + }, + { + "epoch": 2.1427499645152785, + "grad_norm": 0.7463676333427429, + "learning_rate": 4.155746601208594e-06, + "loss": 0.6692876219749451, + "step": 1652 + }, + { + "epoch": 2.1440476914654174, + "grad_norm": 0.7222311496734619, + "learning_rate": 4.144149314537599e-06, + "loss": 0.6298620104789734, + "step": 1653 + }, + { + "epoch": 2.1453454184155567, + "grad_norm": 0.6989638805389404, + "learning_rate": 4.1325640015991185e-06, + "loss": 0.6444326043128967, + "step": 1654 + }, + { + "epoch": 2.1466431453656956, + "grad_norm": 0.7494760155677795, + "learning_rate": 4.120990686082174e-06, + "loss": 0.6625097990036011, + "step": 1655 + }, + { + "epoch": 2.1479408723158344, + "grad_norm": 0.7078225016593933, + "learning_rate": 4.109429391651283e-06, + "loss": 0.5881250500679016, + "step": 1656 + }, + { + "epoch": 2.1492385992659733, + "grad_norm": 0.767970621585846, + "learning_rate": 4.097880141946354e-06, + "loss": 0.6296786665916443, + "step": 1657 + }, + { + "epoch": 2.150536326216112, + "grad_norm": 0.7743704319000244, + "learning_rate": 4.08634296058268e-06, + "loss": 0.6085373759269714, + "step": 1658 + }, + { + "epoch": 2.151834053166251, + "grad_norm": 0.7132009267807007, + "learning_rate": 4.074817871150887e-06, + "loss": 0.6695290803909302, + "step": 1659 + }, + { + "epoch": 2.15313178011639, + "grad_norm": 0.7174614667892456, + "learning_rate": 4.063304897216856e-06, + "loss": 0.6345046758651733, + "step": 1660 + }, + { + "epoch": 2.1544295070665287, + "grad_norm": 0.756147027015686, + "learning_rate": 4.051804062321706e-06, + "loss": 0.6537505388259888, + "step": 1661 + }, + { + "epoch": 2.1557272340166675, + "grad_norm": 0.7213236093521118, + "learning_rate": 4.040315389981736e-06, + "loss": 0.702519953250885, + "step": 1662 + }, + { + "epoch": 2.1570249609668064, + "grad_norm": 0.7155364751815796, + "learning_rate": 4.028838903688372e-06, + "loss": 0.681422770023346, + "step": 1663 + }, + { + "epoch": 2.1583226879169457, + "grad_norm": 0.7463889122009277, + "learning_rate": 4.017374626908125e-06, + "loss": 0.6635671854019165, + "step": 1664 + }, + { + "epoch": 2.1596204148670846, + "grad_norm": 0.7302799820899963, + "learning_rate": 4.005922583082538e-06, + "loss": 0.6605507731437683, + "step": 1665 + }, + { + "epoch": 2.1609181418172234, + "grad_norm": 0.7709221243858337, + "learning_rate": 3.994482795628142e-06, + "loss": 0.6744245290756226, + "step": 1666 + }, + { + "epoch": 2.1622158687673623, + "grad_norm": 0.7545700669288635, + "learning_rate": 3.983055287936411e-06, + "loss": 0.7104499340057373, + "step": 1667 + }, + { + "epoch": 2.163513595717501, + "grad_norm": 0.7296931743621826, + "learning_rate": 3.971640083373696e-06, + "loss": 0.6586728096008301, + "step": 1668 + }, + { + "epoch": 2.16481132266764, + "grad_norm": 0.7653056383132935, + "learning_rate": 3.960237205281213e-06, + "loss": 0.6596845388412476, + "step": 1669 + }, + { + "epoch": 2.166109049617779, + "grad_norm": 0.740091860294342, + "learning_rate": 3.948846676974953e-06, + "loss": 0.6983301043510437, + "step": 1670 + }, + { + "epoch": 2.1674067765679177, + "grad_norm": 0.7317189574241638, + "learning_rate": 3.937468521745666e-06, + "loss": 0.6039131879806519, + "step": 1671 + }, + { + "epoch": 2.1687045035180565, + "grad_norm": 0.7543178200721741, + "learning_rate": 3.9261027628588e-06, + "loss": 0.7082279324531555, + "step": 1672 + }, + { + "epoch": 2.1700022304681954, + "grad_norm": 0.7396308779716492, + "learning_rate": 3.9147494235544544e-06, + "loss": 0.6432596445083618, + "step": 1673 + }, + { + "epoch": 2.1712999574183343, + "grad_norm": 0.7311068177223206, + "learning_rate": 3.903408527047336e-06, + "loss": 0.6383781433105469, + "step": 1674 + }, + { + "epoch": 2.1725976843684736, + "grad_norm": 0.7544176578521729, + "learning_rate": 3.892080096526707e-06, + "loss": 0.6584154367446899, + "step": 1675 + }, + { + "epoch": 2.1738954113186124, + "grad_norm": 0.7279508113861084, + "learning_rate": 3.880764155156339e-06, + "loss": 0.6078423261642456, + "step": 1676 + }, + { + "epoch": 2.1751931382687513, + "grad_norm": 0.7655706405639648, + "learning_rate": 3.8694607260744745e-06, + "loss": 0.716061532497406, + "step": 1677 + }, + { + "epoch": 2.17649086521889, + "grad_norm": 0.7374406456947327, + "learning_rate": 3.858169832393752e-06, + "loss": 0.6383547782897949, + "step": 1678 + }, + { + "epoch": 2.177788592169029, + "grad_norm": 0.7599214911460876, + "learning_rate": 3.846891497201206e-06, + "loss": 0.734661340713501, + "step": 1679 + }, + { + "epoch": 2.179086319119168, + "grad_norm": 0.7564613819122314, + "learning_rate": 3.835625743558168e-06, + "loss": 0.6974920630455017, + "step": 1680 + }, + { + "epoch": 2.1803840460693067, + "grad_norm": 0.7368860244750977, + "learning_rate": 3.824372594500256e-06, + "loss": 0.7153822183609009, + "step": 1681 + }, + { + "epoch": 2.1816817730194455, + "grad_norm": 0.7436947226524353, + "learning_rate": 3.813132073037309e-06, + "loss": 0.6690018773078918, + "step": 1682 + }, + { + "epoch": 2.1829794999695844, + "grad_norm": 0.7441128492355347, + "learning_rate": 3.8019042021533513e-06, + "loss": 0.6398620009422302, + "step": 1683 + }, + { + "epoch": 2.1842772269197233, + "grad_norm": 0.7101579308509827, + "learning_rate": 3.7906890048065358e-06, + "loss": 0.6713053584098816, + "step": 1684 + }, + { + "epoch": 2.185574953869862, + "grad_norm": 0.7423803210258484, + "learning_rate": 3.779486503929106e-06, + "loss": 0.6554515957832336, + "step": 1685 + }, + { + "epoch": 2.1868726808200014, + "grad_norm": 0.7913647890090942, + "learning_rate": 3.7682967224273317e-06, + "loss": 0.6829732656478882, + "step": 1686 + }, + { + "epoch": 2.1881704077701403, + "grad_norm": 0.7406657338142395, + "learning_rate": 3.757119683181493e-06, + "loss": 0.6207722425460815, + "step": 1687 + }, + { + "epoch": 2.189468134720279, + "grad_norm": 0.755535900592804, + "learning_rate": 3.7459554090458018e-06, + "loss": 0.5663500428199768, + "step": 1688 + }, + { + "epoch": 2.190765861670418, + "grad_norm": 0.736067533493042, + "learning_rate": 3.7348039228483758e-06, + "loss": 0.6010056734085083, + "step": 1689 + }, + { + "epoch": 2.192063588620557, + "grad_norm": 0.7262256741523743, + "learning_rate": 3.7236652473911817e-06, + "loss": 0.6251591444015503, + "step": 1690 + }, + { + "epoch": 2.1933613155706957, + "grad_norm": 0.7204144597053528, + "learning_rate": 3.7125394054499843e-06, + "loss": 0.6580095887184143, + "step": 1691 + }, + { + "epoch": 2.1946590425208345, + "grad_norm": 0.7472013235092163, + "learning_rate": 3.7014264197743267e-06, + "loss": 0.6532347798347473, + "step": 1692 + }, + { + "epoch": 2.1959567694709734, + "grad_norm": 0.7987051010131836, + "learning_rate": 3.6903263130874423e-06, + "loss": 0.7221670746803284, + "step": 1693 + }, + { + "epoch": 2.1972544964211123, + "grad_norm": 0.6925249695777893, + "learning_rate": 3.679239108086241e-06, + "loss": 0.6809045672416687, + "step": 1694 + }, + { + "epoch": 2.198552223371251, + "grad_norm": 0.7587743997573853, + "learning_rate": 3.668164827441254e-06, + "loss": 0.6878798007965088, + "step": 1695 + }, + { + "epoch": 2.19984995032139, + "grad_norm": 0.7842516899108887, + "learning_rate": 3.657103493796581e-06, + "loss": 0.6502532958984375, + "step": 1696 + }, + { + "epoch": 2.2011476772715293, + "grad_norm": 0.7169952392578125, + "learning_rate": 3.6460551297698486e-06, + "loss": 0.6481271386146545, + "step": 1697 + }, + { + "epoch": 2.202445404221668, + "grad_norm": 0.7124336957931519, + "learning_rate": 3.6350197579521696e-06, + "loss": 0.6550193428993225, + "step": 1698 + }, + { + "epoch": 2.203743131171807, + "grad_norm": 0.7990091443061829, + "learning_rate": 3.6239974009080746e-06, + "loss": 0.6425266265869141, + "step": 1699 + }, + { + "epoch": 2.205040858121946, + "grad_norm": 0.7323048114776611, + "learning_rate": 3.6129880811755093e-06, + "loss": 0.6682150959968567, + "step": 1700 + }, + { + "epoch": 2.2063385850720847, + "grad_norm": 0.7515720129013062, + "learning_rate": 3.601991821265731e-06, + "loss": 0.6324195265769958, + "step": 1701 + }, + { + "epoch": 2.2076363120222235, + "grad_norm": 0.7524798512458801, + "learning_rate": 3.591008643663323e-06, + "loss": 0.6398360729217529, + "step": 1702 + }, + { + "epoch": 2.2089340389723624, + "grad_norm": 0.7255743741989136, + "learning_rate": 3.580038570826093e-06, + "loss": 0.6324408650398254, + "step": 1703 + }, + { + "epoch": 2.2102317659225013, + "grad_norm": 0.7248579263687134, + "learning_rate": 3.5690816251850657e-06, + "loss": 0.6215530037879944, + "step": 1704 + }, + { + "epoch": 2.21152949287264, + "grad_norm": 0.7395302057266235, + "learning_rate": 3.5581378291444223e-06, + "loss": 0.6551209092140198, + "step": 1705 + }, + { + "epoch": 2.212827219822779, + "grad_norm": 0.7041357755661011, + "learning_rate": 3.5472072050814565e-06, + "loss": 0.5609908103942871, + "step": 1706 + }, + { + "epoch": 2.2141249467729183, + "grad_norm": 0.7290234565734863, + "learning_rate": 3.5362897753465265e-06, + "loss": 0.6203784346580505, + "step": 1707 + }, + { + "epoch": 2.215422673723057, + "grad_norm": 0.7435030341148376, + "learning_rate": 3.5253855622630174e-06, + "loss": 0.6926784515380859, + "step": 1708 + }, + { + "epoch": 2.216720400673196, + "grad_norm": 0.8078302145004272, + "learning_rate": 3.514494588127275e-06, + "loss": 0.7228481769561768, + "step": 1709 + }, + { + "epoch": 2.218018127623335, + "grad_norm": 0.7225632667541504, + "learning_rate": 3.5036168752085977e-06, + "loss": 0.6265015006065369, + "step": 1710 + }, + { + "epoch": 2.2193158545734737, + "grad_norm": 0.7306722402572632, + "learning_rate": 3.4927524457491456e-06, + "loss": 0.6289119720458984, + "step": 1711 + }, + { + "epoch": 2.2206135815236125, + "grad_norm": 0.7898452281951904, + "learning_rate": 3.4819013219639295e-06, + "loss": 0.597404420375824, + "step": 1712 + }, + { + "epoch": 2.2219113084737514, + "grad_norm": 0.6890703439712524, + "learning_rate": 3.471063526040752e-06, + "loss": 0.6129499673843384, + "step": 1713 + }, + { + "epoch": 2.2232090354238903, + "grad_norm": 0.710536777973175, + "learning_rate": 3.460239080140163e-06, + "loss": 0.5661106109619141, + "step": 1714 + }, + { + "epoch": 2.224506762374029, + "grad_norm": 0.7644726634025574, + "learning_rate": 3.4494280063954146e-06, + "loss": 0.6964048147201538, + "step": 1715 + }, + { + "epoch": 2.225804489324168, + "grad_norm": 0.7347561120986938, + "learning_rate": 3.4386303269124142e-06, + "loss": 0.6240056157112122, + "step": 1716 + }, + { + "epoch": 2.2271022162743073, + "grad_norm": 0.7397733330726624, + "learning_rate": 3.4278460637696865e-06, + "loss": 0.6740396022796631, + "step": 1717 + }, + { + "epoch": 2.228399943224446, + "grad_norm": 0.7311684489250183, + "learning_rate": 3.4170752390183183e-06, + "loss": 0.666801929473877, + "step": 1718 + }, + { + "epoch": 2.229697670174585, + "grad_norm": 0.7383760213851929, + "learning_rate": 3.4063178746819193e-06, + "loss": 0.6334900259971619, + "step": 1719 + }, + { + "epoch": 2.230995397124724, + "grad_norm": 0.7332467436790466, + "learning_rate": 3.395573992756579e-06, + "loss": 0.6466909646987915, + "step": 1720 + }, + { + "epoch": 2.2322931240748627, + "grad_norm": 0.7475365996360779, + "learning_rate": 3.384843615210819e-06, + "loss": 0.6753822565078735, + "step": 1721 + }, + { + "epoch": 2.2335908510250015, + "grad_norm": 0.7616447806358337, + "learning_rate": 3.3741267639855345e-06, + "loss": 0.7791091203689575, + "step": 1722 + }, + { + "epoch": 2.2348885779751404, + "grad_norm": 0.7229276299476624, + "learning_rate": 3.3634234609939888e-06, + "loss": 0.6403383016586304, + "step": 1723 + }, + { + "epoch": 2.2361863049252793, + "grad_norm": 0.7077613472938538, + "learning_rate": 3.352733728121712e-06, + "loss": 0.6446459889411926, + "step": 1724 + }, + { + "epoch": 2.237484031875418, + "grad_norm": 0.6968312859535217, + "learning_rate": 3.3420575872265184e-06, + "loss": 0.5743072032928467, + "step": 1725 + }, + { + "epoch": 2.238781758825557, + "grad_norm": 0.7185531854629517, + "learning_rate": 3.3313950601384016e-06, + "loss": 0.6074244379997253, + "step": 1726 + }, + { + "epoch": 2.240079485775696, + "grad_norm": 0.7392717599868774, + "learning_rate": 3.320746168659534e-06, + "loss": 0.7010684609413147, + "step": 1727 + }, + { + "epoch": 2.241377212725835, + "grad_norm": 0.7549191117286682, + "learning_rate": 3.3101109345642056e-06, + "loss": 0.6260566115379333, + "step": 1728 + }, + { + "epoch": 2.242674939675974, + "grad_norm": 0.7569594383239746, + "learning_rate": 3.299489379598777e-06, + "loss": 0.6684094667434692, + "step": 1729 + }, + { + "epoch": 2.243972666626113, + "grad_norm": 0.7654653787612915, + "learning_rate": 3.288881525481639e-06, + "loss": 0.6516446471214294, + "step": 1730 + }, + { + "epoch": 2.2452703935762517, + "grad_norm": 0.7150068879127502, + "learning_rate": 3.278287393903172e-06, + "loss": 0.6244807839393616, + "step": 1731 + }, + { + "epoch": 2.2465681205263905, + "grad_norm": 0.7367082238197327, + "learning_rate": 3.2677070065256855e-06, + "loss": 0.6541182398796082, + "step": 1732 + }, + { + "epoch": 2.2478658474765294, + "grad_norm": 0.7309427857398987, + "learning_rate": 3.257140384983405e-06, + "loss": 0.6608707308769226, + "step": 1733 + }, + { + "epoch": 2.2491635744266683, + "grad_norm": 0.7438578009605408, + "learning_rate": 3.2465875508823876e-06, + "loss": 0.6337431073188782, + "step": 1734 + }, + { + "epoch": 2.250461301376807, + "grad_norm": 0.7018159627914429, + "learning_rate": 3.2360485258005115e-06, + "loss": 0.614033043384552, + "step": 1735 + }, + { + "epoch": 2.251759028326946, + "grad_norm": 0.7361255884170532, + "learning_rate": 3.2255233312874155e-06, + "loss": 0.6730838418006897, + "step": 1736 + }, + { + "epoch": 2.253056755277085, + "grad_norm": 0.7623570561408997, + "learning_rate": 3.2150119888644594e-06, + "loss": 0.659545361995697, + "step": 1737 + }, + { + "epoch": 2.2543544822272237, + "grad_norm": 0.6926621198654175, + "learning_rate": 3.2045145200246763e-06, + "loss": 0.5896809697151184, + "step": 1738 + }, + { + "epoch": 2.255652209177363, + "grad_norm": 0.7644792795181274, + "learning_rate": 3.1940309462327334e-06, + "loss": 0.688497006893158, + "step": 1739 + }, + { + "epoch": 2.256949936127502, + "grad_norm": 0.7479227185249329, + "learning_rate": 3.1835612889248868e-06, + "loss": 0.6612273454666138, + "step": 1740 + }, + { + "epoch": 2.2582476630776407, + "grad_norm": 0.7315995693206787, + "learning_rate": 3.1731055695089384e-06, + "loss": 0.5924808382987976, + "step": 1741 + }, + { + "epoch": 2.2595453900277795, + "grad_norm": 0.7356354594230652, + "learning_rate": 3.162663809364178e-06, + "loss": 0.6635130047798157, + "step": 1742 + }, + { + "epoch": 2.2608431169779184, + "grad_norm": 0.7253445982933044, + "learning_rate": 3.152236029841376e-06, + "loss": 0.6303724646568298, + "step": 1743 + }, + { + "epoch": 2.2621408439280573, + "grad_norm": 0.7351011037826538, + "learning_rate": 3.1418222522626907e-06, + "loss": 0.720777153968811, + "step": 1744 + }, + { + "epoch": 2.263438570878196, + "grad_norm": 0.7059449553489685, + "learning_rate": 3.1314224979216633e-06, + "loss": 0.598090648651123, + "step": 1745 + }, + { + "epoch": 2.264736297828335, + "grad_norm": 0.7039961218833923, + "learning_rate": 3.1210367880831684e-06, + "loss": 0.5808880925178528, + "step": 1746 + }, + { + "epoch": 2.266034024778474, + "grad_norm": 0.7747211456298828, + "learning_rate": 3.1106651439833434e-06, + "loss": 0.6428390741348267, + "step": 1747 + }, + { + "epoch": 2.2673317517286127, + "grad_norm": 0.7529793381690979, + "learning_rate": 3.1003075868295794e-06, + "loss": 0.6959705352783203, + "step": 1748 + }, + { + "epoch": 2.2686294786787515, + "grad_norm": 0.7145947813987732, + "learning_rate": 3.0899641378004596e-06, + "loss": 0.6403526663780212, + "step": 1749 + }, + { + "epoch": 2.269927205628891, + "grad_norm": 0.7092662453651428, + "learning_rate": 3.079634818045719e-06, + "loss": 0.5681431889533997, + "step": 1750 + }, + { + "epoch": 2.2712249325790297, + "grad_norm": 0.7515605688095093, + "learning_rate": 3.069319648686202e-06, + "loss": 0.633612334728241, + "step": 1751 + }, + { + "epoch": 2.2725226595291685, + "grad_norm": 0.7028906941413879, + "learning_rate": 3.0590186508138186e-06, + "loss": 0.6241360902786255, + "step": 1752 + }, + { + "epoch": 2.2738203864793074, + "grad_norm": 0.7183363437652588, + "learning_rate": 3.048731845491504e-06, + "loss": 0.5909807085990906, + "step": 1753 + }, + { + "epoch": 2.2751181134294463, + "grad_norm": 0.7331669926643372, + "learning_rate": 3.038459253753172e-06, + "loss": 0.6321236491203308, + "step": 1754 + }, + { + "epoch": 2.276415840379585, + "grad_norm": 0.6997974514961243, + "learning_rate": 3.0282008966036647e-06, + "loss": 0.6245713829994202, + "step": 1755 + }, + { + "epoch": 2.277713567329724, + "grad_norm": 0.7051255702972412, + "learning_rate": 3.0179567950187396e-06, + "loss": 0.6196664571762085, + "step": 1756 + }, + { + "epoch": 2.279011294279863, + "grad_norm": 0.7281318306922913, + "learning_rate": 3.0077269699449795e-06, + "loss": 0.6078094840049744, + "step": 1757 + }, + { + "epoch": 2.2803090212300017, + "grad_norm": 0.7404606938362122, + "learning_rate": 2.9975114422997932e-06, + "loss": 0.6296783685684204, + "step": 1758 + }, + { + "epoch": 2.2816067481801405, + "grad_norm": 0.7832150459289551, + "learning_rate": 2.9873102329713478e-06, + "loss": 0.6518726348876953, + "step": 1759 + }, + { + "epoch": 2.2829044751302794, + "grad_norm": 0.715710461139679, + "learning_rate": 2.9771233628185346e-06, + "loss": 0.5865130424499512, + "step": 1760 + }, + { + "epoch": 2.2842022020804187, + "grad_norm": 0.7315993309020996, + "learning_rate": 2.9669508526709256e-06, + "loss": 0.7027003765106201, + "step": 1761 + }, + { + "epoch": 2.2854999290305575, + "grad_norm": 0.7398679852485657, + "learning_rate": 2.9567927233287307e-06, + "loss": 0.6710663437843323, + "step": 1762 + }, + { + "epoch": 2.2867976559806964, + "grad_norm": 0.7295849323272705, + "learning_rate": 2.9466489955627452e-06, + "loss": 0.7136781811714172, + "step": 1763 + }, + { + "epoch": 2.2880953829308353, + "grad_norm": 0.7286946773529053, + "learning_rate": 2.936519690114338e-06, + "loss": 0.6223260760307312, + "step": 1764 + }, + { + "epoch": 2.289393109880974, + "grad_norm": 0.7104554772377014, + "learning_rate": 2.9264048276953606e-06, + "loss": 0.6340541839599609, + "step": 1765 + }, + { + "epoch": 2.290690836831113, + "grad_norm": 0.7115781903266907, + "learning_rate": 2.9163044289881604e-06, + "loss": 0.6645469069480896, + "step": 1766 + }, + { + "epoch": 2.291988563781252, + "grad_norm": 0.733094334602356, + "learning_rate": 2.906218514645487e-06, + "loss": 0.6235517859458923, + "step": 1767 + }, + { + "epoch": 2.2932862907313907, + "grad_norm": 0.7436304688453674, + "learning_rate": 2.8961471052904855e-06, + "loss": 0.66838139295578, + "step": 1768 + }, + { + "epoch": 2.2945840176815295, + "grad_norm": 0.7022131681442261, + "learning_rate": 2.8860902215166374e-06, + "loss": 0.6098725199699402, + "step": 1769 + }, + { + "epoch": 2.295881744631669, + "grad_norm": 0.725817859172821, + "learning_rate": 2.876047883887727e-06, + "loss": 0.7111449837684631, + "step": 1770 + }, + { + "epoch": 2.2971794715818072, + "grad_norm": 0.7336429357528687, + "learning_rate": 2.866020112937792e-06, + "loss": 0.6535848379135132, + "step": 1771 + }, + { + "epoch": 2.2984771985319465, + "grad_norm": 0.743033230304718, + "learning_rate": 2.8560069291710857e-06, + "loss": 0.6946330070495605, + "step": 1772 + }, + { + "epoch": 2.2997749254820854, + "grad_norm": 0.7527621388435364, + "learning_rate": 2.8460083530620342e-06, + "loss": 0.67728191614151, + "step": 1773 + }, + { + "epoch": 2.3010726524322243, + "grad_norm": 0.7036607265472412, + "learning_rate": 2.8360244050551943e-06, + "loss": 0.5508571267127991, + "step": 1774 + }, + { + "epoch": 2.302370379382363, + "grad_norm": 0.698133647441864, + "learning_rate": 2.8260551055652154e-06, + "loss": 0.680967390537262, + "step": 1775 + }, + { + "epoch": 2.303668106332502, + "grad_norm": 0.7584355473518372, + "learning_rate": 2.8161004749767893e-06, + "loss": 0.6776391863822937, + "step": 1776 + }, + { + "epoch": 2.304965833282641, + "grad_norm": 0.7389799356460571, + "learning_rate": 2.8061605336446194e-06, + "loss": 0.6526666879653931, + "step": 1777 + }, + { + "epoch": 2.3062635602327797, + "grad_norm": 0.7454041242599487, + "learning_rate": 2.796235301893362e-06, + "loss": 0.6357724666595459, + "step": 1778 + }, + { + "epoch": 2.3075612871829185, + "grad_norm": 0.745415210723877, + "learning_rate": 2.7863248000176146e-06, + "loss": 0.6145803928375244, + "step": 1779 + }, + { + "epoch": 2.3088590141330574, + "grad_norm": 0.7515760660171509, + "learning_rate": 2.776429048281837e-06, + "loss": 0.6784413456916809, + "step": 1780 + }, + { + "epoch": 2.3101567410831967, + "grad_norm": 0.7618042230606079, + "learning_rate": 2.7665480669203383e-06, + "loss": 0.6697713136672974, + "step": 1781 + }, + { + "epoch": 2.3114544680333355, + "grad_norm": 0.69931560754776, + "learning_rate": 2.756681876137227e-06, + "loss": 0.5977004766464233, + "step": 1782 + }, + { + "epoch": 2.3127521949834744, + "grad_norm": 0.7272830605506897, + "learning_rate": 2.7468304961063642e-06, + "loss": 0.6867664456367493, + "step": 1783 + }, + { + "epoch": 2.3140499219336133, + "grad_norm": 0.7531746029853821, + "learning_rate": 2.736993946971329e-06, + "loss": 0.6313377022743225, + "step": 1784 + }, + { + "epoch": 2.315347648883752, + "grad_norm": 0.7396632432937622, + "learning_rate": 2.727172248845378e-06, + "loss": 0.6548261642456055, + "step": 1785 + }, + { + "epoch": 2.316645375833891, + "grad_norm": 0.7558153867721558, + "learning_rate": 2.717365421811389e-06, + "loss": 0.6362917423248291, + "step": 1786 + }, + { + "epoch": 2.31794310278403, + "grad_norm": 0.7348777055740356, + "learning_rate": 2.7075734859218526e-06, + "loss": 0.617246150970459, + "step": 1787 + }, + { + "epoch": 2.3192408297341687, + "grad_norm": 0.7107247710227966, + "learning_rate": 2.6977964611987885e-06, + "loss": 0.6115847229957581, + "step": 1788 + }, + { + "epoch": 2.3205385566843075, + "grad_norm": 0.7372192740440369, + "learning_rate": 2.6880343676337485e-06, + "loss": 0.653107762336731, + "step": 1789 + }, + { + "epoch": 2.3218362836344464, + "grad_norm": 0.7087644338607788, + "learning_rate": 2.6782872251877347e-06, + "loss": 0.6624957919120789, + "step": 1790 + }, + { + "epoch": 2.3231340105845852, + "grad_norm": 0.7231054902076721, + "learning_rate": 2.6685550537911886e-06, + "loss": 0.6585568189620972, + "step": 1791 + }, + { + "epoch": 2.3244317375347245, + "grad_norm": 0.7619837522506714, + "learning_rate": 2.658837873343938e-06, + "loss": 0.6406753063201904, + "step": 1792 + }, + { + "epoch": 2.3257294644848634, + "grad_norm": 0.7381089329719543, + "learning_rate": 2.6491357037151565e-06, + "loss": 0.6516512036323547, + "step": 1793 + }, + { + "epoch": 2.3270271914350023, + "grad_norm": 0.7420887351036072, + "learning_rate": 2.639448564743328e-06, + "loss": 0.6555370688438416, + "step": 1794 + }, + { + "epoch": 2.328324918385141, + "grad_norm": 0.7358477115631104, + "learning_rate": 2.6297764762362e-06, + "loss": 0.6229339838027954, + "step": 1795 + }, + { + "epoch": 2.32962264533528, + "grad_norm": 0.7449919581413269, + "learning_rate": 2.6201194579707377e-06, + "loss": 0.6487348675727844, + "step": 1796 + }, + { + "epoch": 2.330920372285419, + "grad_norm": 0.755095362663269, + "learning_rate": 2.6104775296931118e-06, + "loss": 0.709601640701294, + "step": 1797 + }, + { + "epoch": 2.3322180992355577, + "grad_norm": 0.7726845145225525, + "learning_rate": 2.6008507111186142e-06, + "loss": 0.6235072016716003, + "step": 1798 + }, + { + "epoch": 2.3335158261856965, + "grad_norm": 0.7045385241508484, + "learning_rate": 2.5912390219316573e-06, + "loss": 0.5908339619636536, + "step": 1799 + }, + { + "epoch": 2.3348135531358354, + "grad_norm": 0.7490655779838562, + "learning_rate": 2.5816424817857122e-06, + "loss": 0.7369755506515503, + "step": 1800 + }, + { + "epoch": 2.3361112800859742, + "grad_norm": 0.7135450839996338, + "learning_rate": 2.572061110303271e-06, + "loss": 0.6987670063972473, + "step": 1801 + }, + { + "epoch": 2.337409007036113, + "grad_norm": 0.7187747359275818, + "learning_rate": 2.562494927075824e-06, + "loss": 0.5778123140335083, + "step": 1802 + }, + { + "epoch": 2.3387067339862524, + "grad_norm": 0.7786324620246887, + "learning_rate": 2.552943951663782e-06, + "loss": 0.6605340838432312, + "step": 1803 + }, + { + "epoch": 2.3400044609363913, + "grad_norm": 0.785906195640564, + "learning_rate": 2.543408203596479e-06, + "loss": 0.6925969123840332, + "step": 1804 + }, + { + "epoch": 2.34130218788653, + "grad_norm": 0.7589930891990662, + "learning_rate": 2.5338877023721055e-06, + "loss": 0.6296513676643372, + "step": 1805 + }, + { + "epoch": 2.342599914836669, + "grad_norm": 0.6791945695877075, + "learning_rate": 2.5243824674576743e-06, + "loss": 0.6128097176551819, + "step": 1806 + }, + { + "epoch": 2.343897641786808, + "grad_norm": 0.737198531627655, + "learning_rate": 2.514892518288988e-06, + "loss": 0.60391765832901, + "step": 1807 + }, + { + "epoch": 2.3451953687369467, + "grad_norm": 0.7078155279159546, + "learning_rate": 2.5054178742705936e-06, + "loss": 0.6364641189575195, + "step": 1808 + }, + { + "epoch": 2.3464930956870855, + "grad_norm": 0.7275543808937073, + "learning_rate": 2.4959585547757294e-06, + "loss": 0.6722849011421204, + "step": 1809 + }, + { + "epoch": 2.3477908226372244, + "grad_norm": 0.8179038166999817, + "learning_rate": 2.486514579146322e-06, + "loss": 0.6581687927246094, + "step": 1810 + }, + { + "epoch": 2.3490885495873632, + "grad_norm": 0.766876757144928, + "learning_rate": 2.4770859666929027e-06, + "loss": 0.6003885865211487, + "step": 1811 + }, + { + "epoch": 2.350386276537502, + "grad_norm": 0.7353731989860535, + "learning_rate": 2.4676727366945995e-06, + "loss": 0.6582502722740173, + "step": 1812 + }, + { + "epoch": 2.351684003487641, + "grad_norm": 0.7552323341369629, + "learning_rate": 2.4582749083990875e-06, + "loss": 0.6586010456085205, + "step": 1813 + }, + { + "epoch": 2.3529817304377803, + "grad_norm": 0.7750751376152039, + "learning_rate": 2.448892501022544e-06, + "loss": 0.6576810479164124, + "step": 1814 + }, + { + "epoch": 2.354279457387919, + "grad_norm": 0.755615770816803, + "learning_rate": 2.4395255337496202e-06, + "loss": 0.6574745178222656, + "step": 1815 + }, + { + "epoch": 2.355577184338058, + "grad_norm": 0.7417405843734741, + "learning_rate": 2.4301740257333918e-06, + "loss": 0.6290728449821472, + "step": 1816 + }, + { + "epoch": 2.356874911288197, + "grad_norm": 0.7301021814346313, + "learning_rate": 2.4208379960953255e-06, + "loss": 0.6600069403648376, + "step": 1817 + }, + { + "epoch": 2.3581726382383357, + "grad_norm": 0.7170204520225525, + "learning_rate": 2.4115174639252425e-06, + "loss": 0.5834653973579407, + "step": 1818 + }, + { + "epoch": 2.3594703651884745, + "grad_norm": 0.7591288089752197, + "learning_rate": 2.4022124482812627e-06, + "loss": 0.6460838913917542, + "step": 1819 + }, + { + "epoch": 2.3607680921386134, + "grad_norm": 0.7465713024139404, + "learning_rate": 2.3929229681898005e-06, + "loss": 0.670021116733551, + "step": 1820 + }, + { + "epoch": 2.3620658190887522, + "grad_norm": 0.7204452753067017, + "learning_rate": 2.3836490426454816e-06, + "loss": 0.6367021799087524, + "step": 1821 + }, + { + "epoch": 2.363363546038891, + "grad_norm": 0.7174842357635498, + "learning_rate": 2.3743906906111415e-06, + "loss": 0.6825685501098633, + "step": 1822 + }, + { + "epoch": 2.3646612729890304, + "grad_norm": 0.6899293065071106, + "learning_rate": 2.365147931017764e-06, + "loss": 0.642341673374176, + "step": 1823 + }, + { + "epoch": 2.365958999939169, + "grad_norm": 0.7295400500297546, + "learning_rate": 2.355920782764455e-06, + "loss": 0.6189469695091248, + "step": 1824 + }, + { + "epoch": 2.367256726889308, + "grad_norm": 0.7334946393966675, + "learning_rate": 2.3467092647183962e-06, + "loss": 0.642494261264801, + "step": 1825 + }, + { + "epoch": 2.368554453839447, + "grad_norm": 0.727120041847229, + "learning_rate": 2.337513395714812e-06, + "loss": 0.6564252972602844, + "step": 1826 + }, + { + "epoch": 2.369852180789586, + "grad_norm": 0.7781887650489807, + "learning_rate": 2.3283331945569256e-06, + "loss": 0.7230110764503479, + "step": 1827 + }, + { + "epoch": 2.3711499077397247, + "grad_norm": 0.7318363189697266, + "learning_rate": 2.3191686800159272e-06, + "loss": 0.6312495470046997, + "step": 1828 + }, + { + "epoch": 2.3724476346898635, + "grad_norm": 0.7348397374153137, + "learning_rate": 2.310019870830923e-06, + "loss": 0.6707776784896851, + "step": 1829 + }, + { + "epoch": 2.3737453616400024, + "grad_norm": 0.7550859451293945, + "learning_rate": 2.300886785708919e-06, + "loss": 0.6729933023452759, + "step": 1830 + }, + { + "epoch": 2.3750430885901412, + "grad_norm": 0.724520206451416, + "learning_rate": 2.2917694433247626e-06, + "loss": 0.6436410546302795, + "step": 1831 + }, + { + "epoch": 2.37634081554028, + "grad_norm": 0.7761313319206238, + "learning_rate": 2.282667862321104e-06, + "loss": 0.6961484551429749, + "step": 1832 + }, + { + "epoch": 2.377638542490419, + "grad_norm": 0.7718027234077454, + "learning_rate": 2.2735820613083837e-06, + "loss": 0.731279194355011, + "step": 1833 + }, + { + "epoch": 2.3789362694405582, + "grad_norm": 0.7511587738990784, + "learning_rate": 2.264512058864755e-06, + "loss": 0.6527747511863708, + "step": 1834 + }, + { + "epoch": 2.380233996390697, + "grad_norm": 0.7314983010292053, + "learning_rate": 2.2554578735360823e-06, + "loss": 0.6660367846488953, + "step": 1835 + }, + { + "epoch": 2.381531723340836, + "grad_norm": 0.7481415867805481, + "learning_rate": 2.246419523835882e-06, + "loss": 0.6034996509552002, + "step": 1836 + }, + { + "epoch": 2.382829450290975, + "grad_norm": 0.7201923131942749, + "learning_rate": 2.2373970282452916e-06, + "loss": 0.618115246295929, + "step": 1837 + }, + { + "epoch": 2.3841271772411137, + "grad_norm": 0.7333959341049194, + "learning_rate": 2.2283904052130313e-06, + "loss": 0.679516077041626, + "step": 1838 + }, + { + "epoch": 2.3854249041912525, + "grad_norm": 0.7144783735275269, + "learning_rate": 2.2193996731553656e-06, + "loss": 0.6412646174430847, + "step": 1839 + }, + { + "epoch": 2.3867226311413914, + "grad_norm": 0.7374799251556396, + "learning_rate": 2.2104248504560643e-06, + "loss": 0.6004337072372437, + "step": 1840 + }, + { + "epoch": 2.3867226311413914, + "eval_loss": 0.7504242062568665, + "eval_runtime": 140.6905, + "eval_samples_per_second": 36.904, + "eval_steps_per_second": 9.226, + "step": 1840 + }, + { + "epoch": 2.3880203580915302, + "grad_norm": 0.7108725905418396, + "learning_rate": 2.2014659554663732e-06, + "loss": 0.6515002250671387, + "step": 1841 + }, + { + "epoch": 2.389318085041669, + "grad_norm": 0.744311511516571, + "learning_rate": 2.192523006504956e-06, + "loss": 0.5911805033683777, + "step": 1842 + }, + { + "epoch": 2.390615811991808, + "grad_norm": 0.7513126730918884, + "learning_rate": 2.183596021857891e-06, + "loss": 0.5855857133865356, + "step": 1843 + }, + { + "epoch": 2.391913538941947, + "grad_norm": 0.7308302521705627, + "learning_rate": 2.1746850197785928e-06, + "loss": 0.6079833507537842, + "step": 1844 + }, + { + "epoch": 2.393211265892086, + "grad_norm": 0.7567104697227478, + "learning_rate": 2.16579001848781e-06, + "loss": 0.6419387459754944, + "step": 1845 + }, + { + "epoch": 2.394508992842225, + "grad_norm": 0.7667451500892639, + "learning_rate": 2.156911036173568e-06, + "loss": 0.6022201776504517, + "step": 1846 + }, + { + "epoch": 2.395806719792364, + "grad_norm": 0.700553297996521, + "learning_rate": 2.1480480909911384e-06, + "loss": 0.6151991486549377, + "step": 1847 + }, + { + "epoch": 2.3971044467425027, + "grad_norm": 0.7488269209861755, + "learning_rate": 2.139201201062999e-06, + "loss": 0.6688805222511292, + "step": 1848 + }, + { + "epoch": 2.3984021736926415, + "grad_norm": 0.7348271608352661, + "learning_rate": 2.130370384478807e-06, + "loss": 0.6284016370773315, + "step": 1849 + }, + { + "epoch": 2.3996999006427804, + "grad_norm": 0.7548435926437378, + "learning_rate": 2.1215556592953357e-06, + "loss": 0.6753513216972351, + "step": 1850 + }, + { + "epoch": 2.4009976275929192, + "grad_norm": 0.7015430927276611, + "learning_rate": 2.11275704353648e-06, + "loss": 0.5835912823677063, + "step": 1851 + }, + { + "epoch": 2.402295354543058, + "grad_norm": 0.732021689414978, + "learning_rate": 2.10397455519317e-06, + "loss": 0.645444929599762, + "step": 1852 + }, + { + "epoch": 2.403593081493197, + "grad_norm": 0.7345272302627563, + "learning_rate": 2.095208212223383e-06, + "loss": 0.666027843952179, + "step": 1853 + }, + { + "epoch": 2.404890808443336, + "grad_norm": 0.694179356098175, + "learning_rate": 2.0864580325520623e-06, + "loss": 0.6171280145645142, + "step": 1854 + }, + { + "epoch": 2.4061885353934747, + "grad_norm": 0.7522391080856323, + "learning_rate": 2.077724034071116e-06, + "loss": 0.6551393270492554, + "step": 1855 + }, + { + "epoch": 2.407486262343614, + "grad_norm": 0.731461226940155, + "learning_rate": 2.069006234639357e-06, + "loss": 0.5965202450752258, + "step": 1856 + }, + { + "epoch": 2.408783989293753, + "grad_norm": 0.7376645803451538, + "learning_rate": 2.060304652082481e-06, + "loss": 0.6684772372245789, + "step": 1857 + }, + { + "epoch": 2.4100817162438917, + "grad_norm": 0.8123404383659363, + "learning_rate": 2.051619304193022e-06, + "loss": 0.726719856262207, + "step": 1858 + }, + { + "epoch": 2.4113794431940305, + "grad_norm": 0.723229169845581, + "learning_rate": 2.0429502087303164e-06, + "loss": 0.6310455799102783, + "step": 1859 + }, + { + "epoch": 2.4126771701441694, + "grad_norm": 0.7440442442893982, + "learning_rate": 2.0342973834204715e-06, + "loss": 0.6147751808166504, + "step": 1860 + }, + { + "epoch": 2.4139748970943082, + "grad_norm": 0.7190000414848328, + "learning_rate": 2.0256608459563244e-06, + "loss": 0.6343541741371155, + "step": 1861 + }, + { + "epoch": 2.415272624044447, + "grad_norm": 0.7396417260169983, + "learning_rate": 2.017040613997412e-06, + "loss": 0.6213467121124268, + "step": 1862 + }, + { + "epoch": 2.416570350994586, + "grad_norm": 0.7148772478103638, + "learning_rate": 2.008436705169917e-06, + "loss": 0.5708230137825012, + "step": 1863 + }, + { + "epoch": 2.417868077944725, + "grad_norm": 0.7284368872642517, + "learning_rate": 1.9998491370666684e-06, + "loss": 0.5845701098442078, + "step": 1864 + }, + { + "epoch": 2.4191658048948637, + "grad_norm": 0.7286568284034729, + "learning_rate": 1.991277927247056e-06, + "loss": 0.636822521686554, + "step": 1865 + }, + { + "epoch": 2.4204635318450025, + "grad_norm": 0.741385817527771, + "learning_rate": 1.9827230932370467e-06, + "loss": 0.6635302305221558, + "step": 1866 + }, + { + "epoch": 2.421761258795142, + "grad_norm": 0.7097977995872498, + "learning_rate": 1.9741846525291033e-06, + "loss": 0.5913397669792175, + "step": 1867 + }, + { + "epoch": 2.4230589857452807, + "grad_norm": 0.748805582523346, + "learning_rate": 1.9656626225821774e-06, + "loss": 0.6394146680831909, + "step": 1868 + }, + { + "epoch": 2.4243567126954195, + "grad_norm": 0.7540968656539917, + "learning_rate": 1.957157020821664e-06, + "loss": 0.6580138802528381, + "step": 1869 + }, + { + "epoch": 2.4256544396455584, + "grad_norm": 0.7199598550796509, + "learning_rate": 1.9486678646393654e-06, + "loss": 0.6445693969726562, + "step": 1870 + }, + { + "epoch": 2.4269521665956972, + "grad_norm": 0.722776472568512, + "learning_rate": 1.9401951713934574e-06, + "loss": 0.6294406056404114, + "step": 1871 + }, + { + "epoch": 2.428249893545836, + "grad_norm": 0.776488184928894, + "learning_rate": 1.931738958408457e-06, + "loss": 0.6513455510139465, + "step": 1872 + }, + { + "epoch": 2.429547620495975, + "grad_norm": 0.751055121421814, + "learning_rate": 1.9232992429751694e-06, + "loss": 0.6255248785018921, + "step": 1873 + }, + { + "epoch": 2.430845347446114, + "grad_norm": 0.7133703827857971, + "learning_rate": 1.9148760423506884e-06, + "loss": 0.5895485281944275, + "step": 1874 + }, + { + "epoch": 2.4321430743962527, + "grad_norm": 0.7120479941368103, + "learning_rate": 1.9064693737583173e-06, + "loss": 0.6799072027206421, + "step": 1875 + }, + { + "epoch": 2.433440801346392, + "grad_norm": 0.7090493440628052, + "learning_rate": 1.8980792543875758e-06, + "loss": 0.6845042705535889, + "step": 1876 + }, + { + "epoch": 2.4347385282965304, + "grad_norm": 0.7474452257156372, + "learning_rate": 1.8897057013941256e-06, + "loss": 0.6170677542686462, + "step": 1877 + }, + { + "epoch": 2.4360362552466697, + "grad_norm": 0.7024904489517212, + "learning_rate": 1.8813487318997658e-06, + "loss": 0.6431372165679932, + "step": 1878 + }, + { + "epoch": 2.4373339821968085, + "grad_norm": 0.7497063875198364, + "learning_rate": 1.8730083629923857e-06, + "loss": 0.6090019345283508, + "step": 1879 + }, + { + "epoch": 2.4386317091469474, + "grad_norm": 0.7273635268211365, + "learning_rate": 1.8646846117259277e-06, + "loss": 0.6302788257598877, + "step": 1880 + }, + { + "epoch": 2.4399294360970862, + "grad_norm": 0.745716392993927, + "learning_rate": 1.856377495120355e-06, + "loss": 0.6740216612815857, + "step": 1881 + }, + { + "epoch": 2.441227163047225, + "grad_norm": 0.6912100911140442, + "learning_rate": 1.8480870301616227e-06, + "loss": 0.6371436715126038, + "step": 1882 + }, + { + "epoch": 2.442524889997364, + "grad_norm": 0.73276287317276, + "learning_rate": 1.839813233801626e-06, + "loss": 0.6914728283882141, + "step": 1883 + }, + { + "epoch": 2.443822616947503, + "grad_norm": 0.6954025626182556, + "learning_rate": 1.8315561229581925e-06, + "loss": 0.6365620493888855, + "step": 1884 + }, + { + "epoch": 2.4451203438976417, + "grad_norm": 0.7226231098175049, + "learning_rate": 1.8233157145150183e-06, + "loss": 0.6907994151115417, + "step": 1885 + }, + { + "epoch": 2.4464180708477805, + "grad_norm": 0.7429067492485046, + "learning_rate": 1.8150920253216542e-06, + "loss": 0.6867068409919739, + "step": 1886 + }, + { + "epoch": 2.44771579779792, + "grad_norm": 0.7071108818054199, + "learning_rate": 1.8068850721934639e-06, + "loss": 0.6865320205688477, + "step": 1887 + }, + { + "epoch": 2.4490135247480587, + "grad_norm": 0.7338579893112183, + "learning_rate": 1.7986948719115872e-06, + "loss": 0.6243481636047363, + "step": 1888 + }, + { + "epoch": 2.4503112516981975, + "grad_norm": 0.727736234664917, + "learning_rate": 1.7905214412229177e-06, + "loss": 0.6568608283996582, + "step": 1889 + }, + { + "epoch": 2.4516089786483364, + "grad_norm": 0.7110669612884521, + "learning_rate": 1.7823647968400437e-06, + "loss": 0.6400637626647949, + "step": 1890 + }, + { + "epoch": 2.4529067055984752, + "grad_norm": 0.7366207242012024, + "learning_rate": 1.7742249554412426e-06, + "loss": 0.6992728114128113, + "step": 1891 + }, + { + "epoch": 2.454204432548614, + "grad_norm": 0.7760360836982727, + "learning_rate": 1.76610193367043e-06, + "loss": 0.660463809967041, + "step": 1892 + }, + { + "epoch": 2.455502159498753, + "grad_norm": 0.7349168658256531, + "learning_rate": 1.757995748137129e-06, + "loss": 0.6087374091148376, + "step": 1893 + }, + { + "epoch": 2.456799886448892, + "grad_norm": 0.7244678139686584, + "learning_rate": 1.7499064154164358e-06, + "loss": 0.6310493350028992, + "step": 1894 + }, + { + "epoch": 2.4580976133990307, + "grad_norm": 0.735069215297699, + "learning_rate": 1.7418339520489936e-06, + "loss": 0.6924616098403931, + "step": 1895 + }, + { + "epoch": 2.4593953403491695, + "grad_norm": 0.7370489239692688, + "learning_rate": 1.7337783745409363e-06, + "loss": 0.6034020781517029, + "step": 1896 + }, + { + "epoch": 2.4606930672993084, + "grad_norm": 0.7326070666313171, + "learning_rate": 1.7257396993638942e-06, + "loss": 0.6212228536605835, + "step": 1897 + }, + { + "epoch": 2.4619907942494477, + "grad_norm": 0.6936232447624207, + "learning_rate": 1.717717942954914e-06, + "loss": 0.705615758895874, + "step": 1898 + }, + { + "epoch": 2.4632885211995865, + "grad_norm": 0.7247579097747803, + "learning_rate": 1.7097131217164598e-06, + "loss": 0.6505810618400574, + "step": 1899 + }, + { + "epoch": 2.4645862481497254, + "grad_norm": 0.7129016518592834, + "learning_rate": 1.7017252520163652e-06, + "loss": 0.637854814529419, + "step": 1900 + }, + { + "epoch": 2.4658839750998642, + "grad_norm": 0.7215719819068909, + "learning_rate": 1.6937543501878018e-06, + "loss": 0.6486891508102417, + "step": 1901 + }, + { + "epoch": 2.467181702050003, + "grad_norm": 0.7112030386924744, + "learning_rate": 1.6858004325292466e-06, + "loss": 0.6466121673583984, + "step": 1902 + }, + { + "epoch": 2.468479429000142, + "grad_norm": 0.7482553124427795, + "learning_rate": 1.6778635153044486e-06, + "loss": 0.6906379461288452, + "step": 1903 + }, + { + "epoch": 2.469777155950281, + "grad_norm": 0.7411786317825317, + "learning_rate": 1.6699436147423942e-06, + "loss": 0.613003134727478, + "step": 1904 + }, + { + "epoch": 2.4710748829004197, + "grad_norm": 0.7285057902336121, + "learning_rate": 1.662040747037277e-06, + "loss": 0.7423882484436035, + "step": 1905 + }, + { + "epoch": 2.4723726098505585, + "grad_norm": 0.7251142859458923, + "learning_rate": 1.654154928348455e-06, + "loss": 0.6890588402748108, + "step": 1906 + }, + { + "epoch": 2.4736703368006974, + "grad_norm": 0.7212609052658081, + "learning_rate": 1.646286174800441e-06, + "loss": 0.6591873168945312, + "step": 1907 + }, + { + "epoch": 2.4749680637508362, + "grad_norm": 0.7344200611114502, + "learning_rate": 1.6384345024828374e-06, + "loss": 0.6354522705078125, + "step": 1908 + }, + { + "epoch": 2.4762657907009755, + "grad_norm": 0.7125760316848755, + "learning_rate": 1.6305999274503282e-06, + "loss": 0.6043302416801453, + "step": 1909 + }, + { + "epoch": 2.4775635176511144, + "grad_norm": 0.7003780603408813, + "learning_rate": 1.6227824657226366e-06, + "loss": 0.5772091150283813, + "step": 1910 + }, + { + "epoch": 2.4788612446012532, + "grad_norm": 0.7161146998405457, + "learning_rate": 1.614982133284495e-06, + "loss": 0.6129906177520752, + "step": 1911 + }, + { + "epoch": 2.480158971551392, + "grad_norm": 0.7459210157394409, + "learning_rate": 1.6071989460856063e-06, + "loss": 0.6741005182266235, + "step": 1912 + }, + { + "epoch": 2.481456698501531, + "grad_norm": 0.7306010723114014, + "learning_rate": 1.5994329200406223e-06, + "loss": 0.6048024296760559, + "step": 1913 + }, + { + "epoch": 2.48275442545167, + "grad_norm": 0.7296182513237, + "learning_rate": 1.5916840710290937e-06, + "loss": 0.6497235298156738, + "step": 1914 + }, + { + "epoch": 2.4840521524018087, + "grad_norm": 0.7177472114562988, + "learning_rate": 1.5839524148954622e-06, + "loss": 0.5927858352661133, + "step": 1915 + }, + { + "epoch": 2.4853498793519475, + "grad_norm": 0.7376892566680908, + "learning_rate": 1.5762379674490048e-06, + "loss": 0.591650128364563, + "step": 1916 + }, + { + "epoch": 2.4866476063020864, + "grad_norm": 0.7759072780609131, + "learning_rate": 1.5685407444638146e-06, + "loss": 0.686072051525116, + "step": 1917 + }, + { + "epoch": 2.487945333252225, + "grad_norm": 0.7239146828651428, + "learning_rate": 1.5608607616787663e-06, + "loss": 0.6082277297973633, + "step": 1918 + }, + { + "epoch": 2.489243060202364, + "grad_norm": 0.7537539005279541, + "learning_rate": 1.553198034797474e-06, + "loss": 0.7451168298721313, + "step": 1919 + }, + { + "epoch": 2.4905407871525034, + "grad_norm": 0.7346340417861938, + "learning_rate": 1.5455525794882841e-06, + "loss": 0.611229658126831, + "step": 1920 + }, + { + "epoch": 2.4918385141026422, + "grad_norm": 0.731436550617218, + "learning_rate": 1.5379244113842106e-06, + "loss": 0.659216582775116, + "step": 1921 + }, + { + "epoch": 2.493136241052781, + "grad_norm": 0.7185493111610413, + "learning_rate": 1.53031354608293e-06, + "loss": 0.7043588161468506, + "step": 1922 + }, + { + "epoch": 2.49443396800292, + "grad_norm": 0.7525856494903564, + "learning_rate": 1.5227199991467335e-06, + "loss": 0.6584152579307556, + "step": 1923 + }, + { + "epoch": 2.495731694953059, + "grad_norm": 0.7316333055496216, + "learning_rate": 1.5151437861025032e-06, + "loss": 0.5660229921340942, + "step": 1924 + }, + { + "epoch": 2.4970294219031977, + "grad_norm": 0.7230735421180725, + "learning_rate": 1.5075849224416783e-06, + "loss": 0.6512929201126099, + "step": 1925 + }, + { + "epoch": 2.4983271488533365, + "grad_norm": 0.7257496118545532, + "learning_rate": 1.5000434236202211e-06, + "loss": 0.665654718875885, + "step": 1926 + }, + { + "epoch": 2.4996248758034754, + "grad_norm": 0.7206733226776123, + "learning_rate": 1.4925193050585873e-06, + "loss": 0.656543493270874, + "step": 1927 + }, + { + "epoch": 2.500922602753614, + "grad_norm": 0.7368682026863098, + "learning_rate": 1.4850125821416983e-06, + "loss": 0.6262930035591125, + "step": 1928 + }, + { + "epoch": 2.5022203297037535, + "grad_norm": 0.7327122092247009, + "learning_rate": 1.4775232702188947e-06, + "loss": 0.6124476790428162, + "step": 1929 + }, + { + "epoch": 2.503518056653892, + "grad_norm": 0.7396702170372009, + "learning_rate": 1.4700513846039332e-06, + "loss": 0.5858893990516663, + "step": 1930 + }, + { + "epoch": 2.5048157836040312, + "grad_norm": 0.7264795899391174, + "learning_rate": 1.4625969405749218e-06, + "loss": 0.6673074960708618, + "step": 1931 + }, + { + "epoch": 2.50611351055417, + "grad_norm": 0.7444024085998535, + "learning_rate": 1.4551599533743155e-06, + "loss": 0.6632063388824463, + "step": 1932 + }, + { + "epoch": 2.507411237504309, + "grad_norm": 0.7873533964157104, + "learning_rate": 1.4477404382088689e-06, + "loss": 0.6932485103607178, + "step": 1933 + }, + { + "epoch": 2.508708964454448, + "grad_norm": 0.7218677997589111, + "learning_rate": 1.4403384102496132e-06, + "loss": 0.6060501933097839, + "step": 1934 + }, + { + "epoch": 2.5100066914045867, + "grad_norm": 0.7189037203788757, + "learning_rate": 1.4329538846318225e-06, + "loss": 0.6672825217247009, + "step": 1935 + }, + { + "epoch": 2.5113044183547255, + "grad_norm": 0.7413656115531921, + "learning_rate": 1.4255868764549852e-06, + "loss": 0.6226930022239685, + "step": 1936 + }, + { + "epoch": 2.5126021453048644, + "grad_norm": 0.7134820222854614, + "learning_rate": 1.4182374007827605e-06, + "loss": 0.6670020818710327, + "step": 1937 + }, + { + "epoch": 2.513899872255003, + "grad_norm": 0.7409310340881348, + "learning_rate": 1.410905472642975e-06, + "loss": 0.6528188586235046, + "step": 1938 + }, + { + "epoch": 2.515197599205142, + "grad_norm": 0.7328957319259644, + "learning_rate": 1.4035911070275576e-06, + "loss": 0.6440276503562927, + "step": 1939 + }, + { + "epoch": 2.5164953261552814, + "grad_norm": 0.7795917391777039, + "learning_rate": 1.3962943188925438e-06, + "loss": 0.6895844340324402, + "step": 1940 + }, + { + "epoch": 2.51779305310542, + "grad_norm": 0.7205235958099365, + "learning_rate": 1.3890151231580117e-06, + "loss": 0.6578382253646851, + "step": 1941 + }, + { + "epoch": 2.519090780055559, + "grad_norm": 0.7230272889137268, + "learning_rate": 1.3817535347080768e-06, + "loss": 0.6839146614074707, + "step": 1942 + }, + { + "epoch": 2.520388507005698, + "grad_norm": 0.7740436792373657, + "learning_rate": 1.3745095683908482e-06, + "loss": 0.6639747619628906, + "step": 1943 + }, + { + "epoch": 2.521686233955837, + "grad_norm": 0.7473544478416443, + "learning_rate": 1.3672832390184042e-06, + "loss": 0.6539671421051025, + "step": 1944 + }, + { + "epoch": 2.5229839609059757, + "grad_norm": 0.7322369813919067, + "learning_rate": 1.3600745613667598e-06, + "loss": 0.6508328318595886, + "step": 1945 + }, + { + "epoch": 2.5242816878561145, + "grad_norm": 0.7107250094413757, + "learning_rate": 1.3528835501758365e-06, + "loss": 0.6462997198104858, + "step": 1946 + }, + { + "epoch": 2.5255794148062534, + "grad_norm": 0.7492804527282715, + "learning_rate": 1.345710220149431e-06, + "loss": 0.6402596235275269, + "step": 1947 + }, + { + "epoch": 2.526877141756392, + "grad_norm": 0.7333636283874512, + "learning_rate": 1.3385545859551886e-06, + "loss": 0.6897069811820984, + "step": 1948 + }, + { + "epoch": 2.528174868706531, + "grad_norm": 0.7276363372802734, + "learning_rate": 1.3314166622245717e-06, + "loss": 0.6612985134124756, + "step": 1949 + }, + { + "epoch": 2.52947259565667, + "grad_norm": 0.7273007035255432, + "learning_rate": 1.324296463552821e-06, + "loss": 0.6120861172676086, + "step": 1950 + }, + { + "epoch": 2.5307703226068092, + "grad_norm": 0.7370741963386536, + "learning_rate": 1.3171940044989495e-06, + "loss": 0.7364912033081055, + "step": 1951 + }, + { + "epoch": 2.5320680495569476, + "grad_norm": 0.7171733379364014, + "learning_rate": 1.3101092995856802e-06, + "loss": 0.6327986121177673, + "step": 1952 + }, + { + "epoch": 2.533365776507087, + "grad_norm": 0.7327584028244019, + "learning_rate": 1.3030423632994493e-06, + "loss": 0.6383181810379028, + "step": 1953 + }, + { + "epoch": 2.534663503457226, + "grad_norm": 0.7271527051925659, + "learning_rate": 1.2959932100903472e-06, + "loss": 0.6336721777915955, + "step": 1954 + }, + { + "epoch": 2.5359612304073647, + "grad_norm": 0.7524319887161255, + "learning_rate": 1.2889618543721094e-06, + "loss": 0.662846028804779, + "step": 1955 + }, + { + "epoch": 2.5372589573575035, + "grad_norm": 0.7470775842666626, + "learning_rate": 1.2819483105220798e-06, + "loss": 0.6556363105773926, + "step": 1956 + }, + { + "epoch": 2.5385566843076424, + "grad_norm": 0.7219761610031128, + "learning_rate": 1.274952592881179e-06, + "loss": 0.6259469389915466, + "step": 1957 + }, + { + "epoch": 2.539854411257781, + "grad_norm": 0.7156399488449097, + "learning_rate": 1.2679747157538801e-06, + "loss": 0.6495680212974548, + "step": 1958 + }, + { + "epoch": 2.54115213820792, + "grad_norm": 0.7380321621894836, + "learning_rate": 1.2610146934081768e-06, + "loss": 0.6329517960548401, + "step": 1959 + }, + { + "epoch": 2.542449865158059, + "grad_norm": 0.7332315444946289, + "learning_rate": 1.2540725400755472e-06, + "loss": 0.7250087261199951, + "step": 1960 + }, + { + "epoch": 2.543747592108198, + "grad_norm": 0.6943919658660889, + "learning_rate": 1.2471482699509463e-06, + "loss": 0.6895512938499451, + "step": 1961 + }, + { + "epoch": 2.545045319058337, + "grad_norm": 0.7061095237731934, + "learning_rate": 1.2402418971927487e-06, + "loss": 0.6665888428688049, + "step": 1962 + }, + { + "epoch": 2.546343046008476, + "grad_norm": 0.7387134432792664, + "learning_rate": 1.2333534359227383e-06, + "loss": 0.6526239514350891, + "step": 1963 + }, + { + "epoch": 2.547640772958615, + "grad_norm": 0.7360694408416748, + "learning_rate": 1.226482900226077e-06, + "loss": 0.6126471161842346, + "step": 1964 + }, + { + "epoch": 2.5489384999087537, + "grad_norm": 0.7157735824584961, + "learning_rate": 1.2196303041512714e-06, + "loss": 0.6631340384483337, + "step": 1965 + }, + { + "epoch": 2.5502362268588925, + "grad_norm": 0.7504985332489014, + "learning_rate": 1.2127956617101445e-06, + "loss": 0.6746035218238831, + "step": 1966 + }, + { + "epoch": 2.5515339538090314, + "grad_norm": 0.7058922648429871, + "learning_rate": 1.2059789868778116e-06, + "loss": 0.641784131526947, + "step": 1967 + }, + { + "epoch": 2.55283168075917, + "grad_norm": 0.7049847841262817, + "learning_rate": 1.1991802935926455e-06, + "loss": 0.5715856552124023, + "step": 1968 + }, + { + "epoch": 2.554129407709309, + "grad_norm": 0.7680399417877197, + "learning_rate": 1.1923995957562585e-06, + "loss": 0.6144214272499084, + "step": 1969 + }, + { + "epoch": 2.555427134659448, + "grad_norm": 0.7535842657089233, + "learning_rate": 1.1856369072334517e-06, + "loss": 0.6755169630050659, + "step": 1970 + }, + { + "epoch": 2.5567248616095872, + "grad_norm": 0.7342673540115356, + "learning_rate": 1.178892241852222e-06, + "loss": 0.6000391244888306, + "step": 1971 + }, + { + "epoch": 2.5580225885597256, + "grad_norm": 0.7472249865531921, + "learning_rate": 1.1721656134036962e-06, + "loss": 0.6413825750350952, + "step": 1972 + }, + { + "epoch": 2.559320315509865, + "grad_norm": 0.7509233355522156, + "learning_rate": 1.165457035642128e-06, + "loss": 0.662197470664978, + "step": 1973 + }, + { + "epoch": 2.560618042460004, + "grad_norm": 0.7827663421630859, + "learning_rate": 1.1587665222848643e-06, + "loss": 0.6412524580955505, + "step": 1974 + }, + { + "epoch": 2.5619157694101427, + "grad_norm": 0.7427447438240051, + "learning_rate": 1.1520940870123065e-06, + "loss": 0.6249580979347229, + "step": 1975 + }, + { + "epoch": 2.5632134963602815, + "grad_norm": 0.7329998016357422, + "learning_rate": 1.1454397434679022e-06, + "loss": 0.67298424243927, + "step": 1976 + }, + { + "epoch": 2.5645112233104204, + "grad_norm": 0.7379522919654846, + "learning_rate": 1.1388035052580936e-06, + "loss": 0.6553415060043335, + "step": 1977 + }, + { + "epoch": 2.565808950260559, + "grad_norm": 0.7228721380233765, + "learning_rate": 1.1321853859523113e-06, + "loss": 0.6369103193283081, + "step": 1978 + }, + { + "epoch": 2.567106677210698, + "grad_norm": 0.7016708850860596, + "learning_rate": 1.1255853990829323e-06, + "loss": 0.5797883868217468, + "step": 1979 + }, + { + "epoch": 2.568404404160837, + "grad_norm": 0.7308626174926758, + "learning_rate": 1.119003558145262e-06, + "loss": 0.6397665143013, + "step": 1980 + }, + { + "epoch": 2.569702131110976, + "grad_norm": 0.7535097599029541, + "learning_rate": 1.1124398765974976e-06, + "loss": 0.6552141308784485, + "step": 1981 + }, + { + "epoch": 2.570999858061115, + "grad_norm": 0.7034752368927002, + "learning_rate": 1.1058943678607082e-06, + "loss": 0.5966861844062805, + "step": 1982 + }, + { + "epoch": 2.5722975850112535, + "grad_norm": 0.7308294177055359, + "learning_rate": 1.0993670453187965e-06, + "loss": 0.678621768951416, + "step": 1983 + }, + { + "epoch": 2.573595311961393, + "grad_norm": 0.7100163698196411, + "learning_rate": 1.0928579223184943e-06, + "loss": 0.629210889339447, + "step": 1984 + }, + { + "epoch": 2.5748930389115317, + "grad_norm": 0.715771496295929, + "learning_rate": 1.0863670121693037e-06, + "loss": 0.6395845413208008, + "step": 1985 + }, + { + "epoch": 2.5761907658616705, + "grad_norm": 0.7279219627380371, + "learning_rate": 1.0798943281434958e-06, + "loss": 0.6864475607872009, + "step": 1986 + }, + { + "epoch": 2.5774884928118094, + "grad_norm": 0.7253682613372803, + "learning_rate": 1.0734398834760695e-06, + "loss": 0.613013505935669, + "step": 1987 + }, + { + "epoch": 2.578786219761948, + "grad_norm": 0.7802004814147949, + "learning_rate": 1.067003691364733e-06, + "loss": 0.686352014541626, + "step": 1988 + }, + { + "epoch": 2.580083946712087, + "grad_norm": 0.7534424066543579, + "learning_rate": 1.060585764969867e-06, + "loss": 0.7019538283348083, + "step": 1989 + }, + { + "epoch": 2.581381673662226, + "grad_norm": 0.7177249789237976, + "learning_rate": 1.0541861174145097e-06, + "loss": 0.6038709282875061, + "step": 1990 + }, + { + "epoch": 2.582679400612365, + "grad_norm": 0.7184469103813171, + "learning_rate": 1.047804761784319e-06, + "loss": 0.6142391562461853, + "step": 1991 + }, + { + "epoch": 2.5839771275625036, + "grad_norm": 0.7472144961357117, + "learning_rate": 1.0414417111275533e-06, + "loss": 0.6911140084266663, + "step": 1992 + }, + { + "epoch": 2.585274854512643, + "grad_norm": 0.7293811440467834, + "learning_rate": 1.0350969784550368e-06, + "loss": 0.6472504138946533, + "step": 1993 + }, + { + "epoch": 2.5865725814627814, + "grad_norm": 0.7172240018844604, + "learning_rate": 1.028770576740148e-06, + "loss": 0.674932599067688, + "step": 1994 + }, + { + "epoch": 2.5878703084129207, + "grad_norm": 0.70241379737854, + "learning_rate": 1.022462518918772e-06, + "loss": 0.5798804759979248, + "step": 1995 + }, + { + "epoch": 2.5891680353630595, + "grad_norm": 0.7364243865013123, + "learning_rate": 1.0161728178892928e-06, + "loss": 0.5872079133987427, + "step": 1996 + }, + { + "epoch": 2.5904657623131984, + "grad_norm": 0.7111935615539551, + "learning_rate": 1.0099014865125557e-06, + "loss": 0.609887421131134, + "step": 1997 + }, + { + "epoch": 2.591763489263337, + "grad_norm": 0.7527702450752258, + "learning_rate": 1.0036485376118477e-06, + "loss": 0.7164459824562073, + "step": 1998 + }, + { + "epoch": 2.593061216213476, + "grad_norm": 0.7354010939598083, + "learning_rate": 9.974139839728658e-07, + "loss": 0.7024336457252502, + "step": 1999 + }, + { + "epoch": 2.594358943163615, + "grad_norm": 0.7463487982749939, + "learning_rate": 9.91197838343696e-07, + "loss": 0.6939477324485779, + "step": 2000 + }, + { + "epoch": 2.595656670113754, + "grad_norm": 0.736788809299469, + "learning_rate": 9.850001134347765e-07, + "loss": 0.6644649505615234, + "step": 2001 + }, + { + "epoch": 2.5969543970638926, + "grad_norm": 0.7293047904968262, + "learning_rate": 9.788208219188932e-07, + "loss": 0.6119586825370789, + "step": 2002 + }, + { + "epoch": 2.5982521240140315, + "grad_norm": 0.7182607054710388, + "learning_rate": 9.726599764311318e-07, + "loss": 0.611649215221405, + "step": 2003 + }, + { + "epoch": 2.599549850964171, + "grad_norm": 0.7259273529052734, + "learning_rate": 9.665175895688594e-07, + "loss": 0.6101284623146057, + "step": 2004 + }, + { + "epoch": 2.600847577914309, + "grad_norm": 0.701677680015564, + "learning_rate": 9.603936738917063e-07, + "loss": 0.6807554364204407, + "step": 2005 + }, + { + "epoch": 2.6021453048644485, + "grad_norm": 0.7464570999145508, + "learning_rate": 9.54288241921525e-07, + "loss": 0.6781387329101562, + "step": 2006 + }, + { + "epoch": 2.6034430318145874, + "grad_norm": 0.7273631691932678, + "learning_rate": 9.482013061423833e-07, + "loss": 0.6723061203956604, + "step": 2007 + }, + { + "epoch": 2.604740758764726, + "grad_norm": 0.7473943829536438, + "learning_rate": 9.421328790005213e-07, + "loss": 0.6500118970870972, + "step": 2008 + }, + { + "epoch": 2.606038485714865, + "grad_norm": 0.7298744320869446, + "learning_rate": 9.360829729043375e-07, + "loss": 0.647000789642334, + "step": 2009 + }, + { + "epoch": 2.607336212665004, + "grad_norm": 0.7570067644119263, + "learning_rate": 9.300516002243587e-07, + "loss": 0.658997118473053, + "step": 2010 + }, + { + "epoch": 2.608633939615143, + "grad_norm": 0.7472216486930847, + "learning_rate": 9.240387732932155e-07, + "loss": 0.6748676300048828, + "step": 2011 + }, + { + "epoch": 2.6099316665652816, + "grad_norm": 0.7370826005935669, + "learning_rate": 9.180445044056164e-07, + "loss": 0.6571428179740906, + "step": 2012 + }, + { + "epoch": 2.6112293935154205, + "grad_norm": 0.7431361675262451, + "learning_rate": 9.120688058183269e-07, + "loss": 0.6858744025230408, + "step": 2013 + }, + { + "epoch": 2.6125271204655593, + "grad_norm": 0.7619893550872803, + "learning_rate": 9.061116897501321e-07, + "loss": 0.6860224008560181, + "step": 2014 + }, + { + "epoch": 2.6138248474156986, + "grad_norm": 0.6949592232704163, + "learning_rate": 9.001731683818338e-07, + "loss": 0.6436545848846436, + "step": 2015 + }, + { + "epoch": 2.6151225743658375, + "grad_norm": 0.7831428647041321, + "learning_rate": 8.942532538561988e-07, + "loss": 0.7231192588806152, + "step": 2016 + }, + { + "epoch": 2.6164203013159764, + "grad_norm": 0.7632724046707153, + "learning_rate": 8.883519582779598e-07, + "loss": 0.7117716073989868, + "step": 2017 + }, + { + "epoch": 2.617718028266115, + "grad_norm": 0.7610095739364624, + "learning_rate": 8.82469293713768e-07, + "loss": 0.6059130430221558, + "step": 2018 + }, + { + "epoch": 2.619015755216254, + "grad_norm": 0.7569096684455872, + "learning_rate": 8.766052721921858e-07, + "loss": 0.6521672010421753, + "step": 2019 + }, + { + "epoch": 2.620313482166393, + "grad_norm": 0.7089208960533142, + "learning_rate": 8.70759905703652e-07, + "loss": 0.6266563534736633, + "step": 2020 + }, + { + "epoch": 2.621611209116532, + "grad_norm": 0.7617636919021606, + "learning_rate": 8.649332062004622e-07, + "loss": 0.6242752075195312, + "step": 2021 + }, + { + "epoch": 2.6229089360666706, + "grad_norm": 0.7356528043746948, + "learning_rate": 8.59125185596742e-07, + "loss": 0.6804662942886353, + "step": 2022 + }, + { + "epoch": 2.6242066630168095, + "grad_norm": 0.730805516242981, + "learning_rate": 8.533358557684246e-07, + "loss": 0.6591053605079651, + "step": 2023 + }, + { + "epoch": 2.625504389966949, + "grad_norm": 0.740450382232666, + "learning_rate": 8.475652285532199e-07, + "loss": 0.6597458720207214, + "step": 2024 + }, + { + "epoch": 2.626802116917087, + "grad_norm": 0.7419881224632263, + "learning_rate": 8.41813315750607e-07, + "loss": 0.6208306550979614, + "step": 2025 + }, + { + "epoch": 2.6280998438672265, + "grad_norm": 0.7380879521369934, + "learning_rate": 8.360801291217835e-07, + "loss": 0.6311178803443909, + "step": 2026 + }, + { + "epoch": 2.6293975708173654, + "grad_norm": 0.6968350410461426, + "learning_rate": 8.303656803896731e-07, + "loss": 0.6126903891563416, + "step": 2027 + }, + { + "epoch": 2.630695297767504, + "grad_norm": 0.6993783712387085, + "learning_rate": 8.246699812388714e-07, + "loss": 0.6219539642333984, + "step": 2028 + }, + { + "epoch": 2.631993024717643, + "grad_norm": 0.7296315431594849, + "learning_rate": 8.189930433156424e-07, + "loss": 0.6454072594642639, + "step": 2029 + }, + { + "epoch": 2.633290751667782, + "grad_norm": 0.7435656785964966, + "learning_rate": 8.133348782278916e-07, + "loss": 0.640640139579773, + "step": 2030 + }, + { + "epoch": 2.634588478617921, + "grad_norm": 0.7254202961921692, + "learning_rate": 8.07695497545129e-07, + "loss": 0.574336588382721, + "step": 2031 + }, + { + "epoch": 2.6358862055680596, + "grad_norm": 0.7589125037193298, + "learning_rate": 8.020749127984629e-07, + "loss": 0.6744675636291504, + "step": 2032 + }, + { + "epoch": 2.6371839325181985, + "grad_norm": 0.7237491011619568, + "learning_rate": 7.964731354805677e-07, + "loss": 0.6050382852554321, + "step": 2033 + }, + { + "epoch": 2.6384816594683373, + "grad_norm": 0.736615777015686, + "learning_rate": 7.908901770456579e-07, + "loss": 0.6752466559410095, + "step": 2034 + }, + { + "epoch": 2.6397793864184766, + "grad_norm": 0.7375562787055969, + "learning_rate": 7.853260489094727e-07, + "loss": 0.6168178915977478, + "step": 2035 + }, + { + "epoch": 2.641077113368615, + "grad_norm": 0.7463002800941467, + "learning_rate": 7.79780762449246e-07, + "loss": 0.6608278751373291, + "step": 2036 + }, + { + "epoch": 2.6423748403187544, + "grad_norm": 0.7306200861930847, + "learning_rate": 7.742543290036797e-07, + "loss": 0.6231617331504822, + "step": 2037 + }, + { + "epoch": 2.643672567268893, + "grad_norm": 0.7191357612609863, + "learning_rate": 7.687467598729403e-07, + "loss": 0.6745753884315491, + "step": 2038 + }, + { + "epoch": 2.644970294219032, + "grad_norm": 0.6983992457389832, + "learning_rate": 7.63258066318604e-07, + "loss": 0.6209067702293396, + "step": 2039 + }, + { + "epoch": 2.646268021169171, + "grad_norm": 0.7191793322563171, + "learning_rate": 7.577882595636665e-07, + "loss": 0.6866878867149353, + "step": 2040 + }, + { + "epoch": 2.64756574811931, + "grad_norm": 0.7254435420036316, + "learning_rate": 7.523373507924947e-07, + "loss": 0.6178576946258545, + "step": 2041 + }, + { + "epoch": 2.6488634750694486, + "grad_norm": 0.7166338562965393, + "learning_rate": 7.469053511508184e-07, + "loss": 0.6005609035491943, + "step": 2042 + }, + { + "epoch": 2.6501612020195875, + "grad_norm": 0.7637789249420166, + "learning_rate": 7.414922717457018e-07, + "loss": 0.718099057674408, + "step": 2043 + }, + { + "epoch": 2.6514589289697263, + "grad_norm": 0.7439664006233215, + "learning_rate": 7.360981236455222e-07, + "loss": 0.6896740198135376, + "step": 2044 + }, + { + "epoch": 2.652756655919865, + "grad_norm": 0.7089899182319641, + "learning_rate": 7.307229178799469e-07, + "loss": 0.6416285634040833, + "step": 2045 + }, + { + "epoch": 2.6540543828700045, + "grad_norm": 0.7403551340103149, + "learning_rate": 7.253666654399128e-07, + "loss": 0.6686422824859619, + "step": 2046 + }, + { + "epoch": 2.655352109820143, + "grad_norm": 0.7438167333602905, + "learning_rate": 7.200293772775968e-07, + "loss": 0.6786326766014099, + "step": 2047 + }, + { + "epoch": 2.656649836770282, + "grad_norm": 0.7066054344177246, + "learning_rate": 7.14711064306407e-07, + "loss": 0.6346741318702698, + "step": 2048 + }, + { + "epoch": 2.657947563720421, + "grad_norm": 0.7646064758300781, + "learning_rate": 7.094117374009446e-07, + "loss": 0.67086261510849, + "step": 2049 + }, + { + "epoch": 2.65924529067056, + "grad_norm": 0.7251279950141907, + "learning_rate": 7.041314073969918e-07, + "loss": 0.6325028538703918, + "step": 2050 + }, + { + "epoch": 2.660543017620699, + "grad_norm": 0.7678724527359009, + "learning_rate": 6.988700850914876e-07, + "loss": 0.6267367005348206, + "step": 2051 + }, + { + "epoch": 2.6618407445708376, + "grad_norm": 0.7265689969062805, + "learning_rate": 6.93627781242504e-07, + "loss": 0.6617064476013184, + "step": 2052 + }, + { + "epoch": 2.6631384715209765, + "grad_norm": 0.7217026352882385, + "learning_rate": 6.884045065692257e-07, + "loss": 0.6587082743644714, + "step": 2053 + }, + { + "epoch": 2.6644361984711153, + "grad_norm": 0.7629426121711731, + "learning_rate": 6.83200271751927e-07, + "loss": 0.692336916923523, + "step": 2054 + }, + { + "epoch": 2.665733925421254, + "grad_norm": 0.7733954191207886, + "learning_rate": 6.780150874319524e-07, + "loss": 0.6802124381065369, + "step": 2055 + }, + { + "epoch": 2.667031652371393, + "grad_norm": 0.7317995429039001, + "learning_rate": 6.72848964211692e-07, + "loss": 0.6866804957389832, + "step": 2056 + }, + { + "epoch": 2.6683293793215324, + "grad_norm": 0.7314664721488953, + "learning_rate": 6.677019126545548e-07, + "loss": 0.6293746829032898, + "step": 2057 + }, + { + "epoch": 2.6696271062716708, + "grad_norm": 0.7272669076919556, + "learning_rate": 6.625739432849643e-07, + "loss": 0.673871636390686, + "step": 2058 + }, + { + "epoch": 2.67092483322181, + "grad_norm": 0.7291983962059021, + "learning_rate": 6.574650665883197e-07, + "loss": 0.6971457004547119, + "step": 2059 + }, + { + "epoch": 2.672222560171949, + "grad_norm": 0.746300458908081, + "learning_rate": 6.523752930109761e-07, + "loss": 0.6644643545150757, + "step": 2060 + }, + { + "epoch": 2.673520287122088, + "grad_norm": 0.7214688062667847, + "learning_rate": 6.473046329602384e-07, + "loss": 0.579256534576416, + "step": 2061 + }, + { + "epoch": 2.6748180140722266, + "grad_norm": 0.7157896757125854, + "learning_rate": 6.422530968043173e-07, + "loss": 0.6934089660644531, + "step": 2062 + }, + { + "epoch": 2.6761157410223655, + "grad_norm": 0.7446689605712891, + "learning_rate": 6.372206948723292e-07, + "loss": 0.6685813665390015, + "step": 2063 + }, + { + "epoch": 2.6774134679725043, + "grad_norm": 0.7324274182319641, + "learning_rate": 6.322074374542608e-07, + "loss": 0.6548044085502625, + "step": 2064 + }, + { + "epoch": 2.678711194922643, + "grad_norm": 0.7366431951522827, + "learning_rate": 6.272133348009546e-07, + "loss": 0.6561753153800964, + "step": 2065 + }, + { + "epoch": 2.680008921872782, + "grad_norm": 0.6906739473342896, + "learning_rate": 6.222383971240875e-07, + "loss": 0.6162272095680237, + "step": 2066 + }, + { + "epoch": 2.681306648822921, + "grad_norm": 0.7250291109085083, + "learning_rate": 6.17282634596148e-07, + "loss": 0.6417672038078308, + "step": 2067 + }, + { + "epoch": 2.68260437577306, + "grad_norm": 0.7425340414047241, + "learning_rate": 6.123460573504147e-07, + "loss": 0.6258097887039185, + "step": 2068 + }, + { + "epoch": 2.6839021027231986, + "grad_norm": 0.7179927825927734, + "learning_rate": 6.074286754809411e-07, + "loss": 0.6689911484718323, + "step": 2069 + }, + { + "epoch": 2.685199829673338, + "grad_norm": 0.7198472619056702, + "learning_rate": 6.025304990425241e-07, + "loss": 0.6711916923522949, + "step": 2070 + }, + { + "epoch": 2.685199829673338, + "eval_loss": 0.7492260932922363, + "eval_runtime": 145.3339, + "eval_samples_per_second": 35.725, + "eval_steps_per_second": 8.931, + "step": 2070 + }, + { + "epoch": 2.686497556623477, + "grad_norm": 0.7170226573944092, + "learning_rate": 5.976515380507008e-07, + "loss": 0.6783643960952759, + "step": 2071 + }, + { + "epoch": 2.6877952835736156, + "grad_norm": 0.7576429843902588, + "learning_rate": 5.927918024817059e-07, + "loss": 0.7274392247200012, + "step": 2072 + }, + { + "epoch": 2.6890930105237545, + "grad_norm": 0.7014567255973816, + "learning_rate": 5.879513022724714e-07, + "loss": 0.6101505160331726, + "step": 2073 + }, + { + "epoch": 2.6903907374738933, + "grad_norm": 0.7218198180198669, + "learning_rate": 5.831300473205948e-07, + "loss": 0.6697475910186768, + "step": 2074 + }, + { + "epoch": 2.691688464424032, + "grad_norm": 0.7351176738739014, + "learning_rate": 5.783280474843222e-07, + "loss": 0.6683188080787659, + "step": 2075 + }, + { + "epoch": 2.692986191374171, + "grad_norm": 0.7387964129447937, + "learning_rate": 5.735453125825275e-07, + "loss": 0.6495317220687866, + "step": 2076 + }, + { + "epoch": 2.69428391832431, + "grad_norm": 0.7699364423751831, + "learning_rate": 5.687818523946931e-07, + "loss": 0.6670310497283936, + "step": 2077 + }, + { + "epoch": 2.6955816452744488, + "grad_norm": 0.7399834394454956, + "learning_rate": 5.640376766608902e-07, + "loss": 0.6311538219451904, + "step": 2078 + }, + { + "epoch": 2.696879372224588, + "grad_norm": 0.7210641503334045, + "learning_rate": 5.593127950817579e-07, + "loss": 0.6419323682785034, + "step": 2079 + }, + { + "epoch": 2.698177099174727, + "grad_norm": 0.7432581186294556, + "learning_rate": 5.546072173184791e-07, + "loss": 0.6984769701957703, + "step": 2080 + }, + { + "epoch": 2.699474826124866, + "grad_norm": 0.7039175629615784, + "learning_rate": 5.499209529927751e-07, + "loss": 0.6130697727203369, + "step": 2081 + }, + { + "epoch": 2.7007725530750046, + "grad_norm": 0.7450562715530396, + "learning_rate": 5.452540116868654e-07, + "loss": 0.709285318851471, + "step": 2082 + }, + { + "epoch": 2.7020702800251435, + "grad_norm": 0.7391056418418884, + "learning_rate": 5.406064029434666e-07, + "loss": 0.7196047306060791, + "step": 2083 + }, + { + "epoch": 2.7033680069752823, + "grad_norm": 0.7550768852233887, + "learning_rate": 5.359781362657623e-07, + "loss": 0.6528761982917786, + "step": 2084 + }, + { + "epoch": 2.704665733925421, + "grad_norm": 0.7071364521980286, + "learning_rate": 5.313692211173838e-07, + "loss": 0.664832353591919, + "step": 2085 + }, + { + "epoch": 2.70596346087556, + "grad_norm": 0.7408220171928406, + "learning_rate": 5.26779666922399e-07, + "loss": 0.6972253322601318, + "step": 2086 + }, + { + "epoch": 2.707261187825699, + "grad_norm": 0.706516683101654, + "learning_rate": 5.222094830652835e-07, + "loss": 0.6413928866386414, + "step": 2087 + }, + { + "epoch": 2.708558914775838, + "grad_norm": 0.6609142422676086, + "learning_rate": 5.176586788909066e-07, + "loss": 0.61426842212677, + "step": 2088 + }, + { + "epoch": 2.7098566417259766, + "grad_norm": 0.7437728047370911, + "learning_rate": 5.131272637045104e-07, + "loss": 0.7072603106498718, + "step": 2089 + }, + { + "epoch": 2.711154368676116, + "grad_norm": 0.7043668627738953, + "learning_rate": 5.086152467716932e-07, + "loss": 0.6285822987556458, + "step": 2090 + }, + { + "epoch": 2.7124520956262548, + "grad_norm": 0.740922212600708, + "learning_rate": 5.041226373183861e-07, + "loss": 0.6565816402435303, + "step": 2091 + }, + { + "epoch": 2.7137498225763936, + "grad_norm": 0.716456949710846, + "learning_rate": 4.996494445308409e-07, + "loss": 0.6037598848342896, + "step": 2092 + }, + { + "epoch": 2.7150475495265325, + "grad_norm": 0.7253233194351196, + "learning_rate": 4.951956775556e-07, + "loss": 0.6392321586608887, + "step": 2093 + }, + { + "epoch": 2.7163452764766713, + "grad_norm": 0.7206777334213257, + "learning_rate": 4.907613454994964e-07, + "loss": 0.6381296515464783, + "step": 2094 + }, + { + "epoch": 2.71764300342681, + "grad_norm": 0.7042269110679626, + "learning_rate": 4.863464574296106e-07, + "loss": 0.6764304041862488, + "step": 2095 + }, + { + "epoch": 2.718940730376949, + "grad_norm": 0.7474066019058228, + "learning_rate": 4.819510223732738e-07, + "loss": 0.710769534111023, + "step": 2096 + }, + { + "epoch": 2.720238457327088, + "grad_norm": 0.7537234425544739, + "learning_rate": 4.775750493180386e-07, + "loss": 0.6200648546218872, + "step": 2097 + }, + { + "epoch": 2.7215361842772268, + "grad_norm": 0.7299405336380005, + "learning_rate": 4.7321854721166127e-07, + "loss": 0.6677811741828918, + "step": 2098 + }, + { + "epoch": 2.722833911227366, + "grad_norm": 0.6883127093315125, + "learning_rate": 4.6888152496208593e-07, + "loss": 0.5572382211685181, + "step": 2099 + }, + { + "epoch": 2.7241316381775045, + "grad_norm": 0.730640709400177, + "learning_rate": 4.645639914374278e-07, + "loss": 0.6930029392242432, + "step": 2100 + }, + { + "epoch": 2.7254293651276438, + "grad_norm": 0.7166103720664978, + "learning_rate": 4.602659554659461e-07, + "loss": 0.5943949818611145, + "step": 2101 + }, + { + "epoch": 2.7267270920777826, + "grad_norm": 0.7555888295173645, + "learning_rate": 4.559874258360408e-07, + "loss": 0.6563291549682617, + "step": 2102 + }, + { + "epoch": 2.7280248190279215, + "grad_norm": 0.7199954390525818, + "learning_rate": 4.5172841129621726e-07, + "loss": 0.6438056826591492, + "step": 2103 + }, + { + "epoch": 2.7293225459780603, + "grad_norm": 0.7394102811813354, + "learning_rate": 4.474889205550881e-07, + "loss": 0.6618061065673828, + "step": 2104 + }, + { + "epoch": 2.730620272928199, + "grad_norm": 0.7350549697875977, + "learning_rate": 4.4326896228133354e-07, + "loss": 0.6392850875854492, + "step": 2105 + }, + { + "epoch": 2.731917999878338, + "grad_norm": 0.7010295391082764, + "learning_rate": 4.3906854510370245e-07, + "loss": 0.6507184505462646, + "step": 2106 + }, + { + "epoch": 2.733215726828477, + "grad_norm": 0.7381558418273926, + "learning_rate": 4.348876776109856e-07, + "loss": 0.6545774936676025, + "step": 2107 + }, + { + "epoch": 2.7345134537786158, + "grad_norm": 0.7013775110244751, + "learning_rate": 4.307263683519969e-07, + "loss": 0.6212908625602722, + "step": 2108 + }, + { + "epoch": 2.7358111807287546, + "grad_norm": 0.7366412878036499, + "learning_rate": 4.2658462583556216e-07, + "loss": 0.684171736240387, + "step": 2109 + }, + { + "epoch": 2.737108907678894, + "grad_norm": 0.7112710475921631, + "learning_rate": 4.2246245853049706e-07, + "loss": 0.6173405051231384, + "step": 2110 + }, + { + "epoch": 2.7384066346290323, + "grad_norm": 0.7728049159049988, + "learning_rate": 4.1835987486558595e-07, + "loss": 0.6173956990242004, + "step": 2111 + }, + { + "epoch": 2.7397043615791716, + "grad_norm": 0.6931276321411133, + "learning_rate": 4.142768832295807e-07, + "loss": 0.6579814553260803, + "step": 2112 + }, + { + "epoch": 2.7410020885293105, + "grad_norm": 0.7127827405929565, + "learning_rate": 4.102134919711609e-07, + "loss": 0.6169605255126953, + "step": 2113 + }, + { + "epoch": 2.7422998154794493, + "grad_norm": 0.7167375683784485, + "learning_rate": 4.061697093989347e-07, + "loss": 0.6766916513442993, + "step": 2114 + }, + { + "epoch": 2.743597542429588, + "grad_norm": 0.7316383719444275, + "learning_rate": 4.021455437814148e-07, + "loss": 0.6033115983009338, + "step": 2115 + }, + { + "epoch": 2.744895269379727, + "grad_norm": 0.7062050104141235, + "learning_rate": 3.981410033469979e-07, + "loss": 0.6221883296966553, + "step": 2116 + }, + { + "epoch": 2.746192996329866, + "grad_norm": 0.7120285630226135, + "learning_rate": 3.941560962839619e-07, + "loss": 0.6118264198303223, + "step": 2117 + }, + { + "epoch": 2.7474907232800048, + "grad_norm": 0.7053149938583374, + "learning_rate": 3.9019083074042784e-07, + "loss": 0.5848374962806702, + "step": 2118 + }, + { + "epoch": 2.7487884502301436, + "grad_norm": 0.7223408818244934, + "learning_rate": 3.862452148243623e-07, + "loss": 0.6187662482261658, + "step": 2119 + }, + { + "epoch": 2.7500861771802825, + "grad_norm": 0.7368988394737244, + "learning_rate": 3.823192566035494e-07, + "loss": 0.647794783115387, + "step": 2120 + }, + { + "epoch": 2.7513839041304218, + "grad_norm": 0.7369173765182495, + "learning_rate": 3.7841296410558225e-07, + "loss": 0.6177867650985718, + "step": 2121 + }, + { + "epoch": 2.75268163108056, + "grad_norm": 0.7405387759208679, + "learning_rate": 3.7452634531783935e-07, + "loss": 0.6547641754150391, + "step": 2122 + }, + { + "epoch": 2.7539793580306995, + "grad_norm": 0.7224996089935303, + "learning_rate": 3.706594081874737e-07, + "loss": 0.6353644132614136, + "step": 2123 + }, + { + "epoch": 2.7552770849808383, + "grad_norm": 0.7474029660224915, + "learning_rate": 3.6681216062138923e-07, + "loss": 0.682817816734314, + "step": 2124 + }, + { + "epoch": 2.756574811930977, + "grad_norm": 0.7351192235946655, + "learning_rate": 3.6298461048623887e-07, + "loss": 0.6670258641242981, + "step": 2125 + }, + { + "epoch": 2.757872538881116, + "grad_norm": 0.6816844344139099, + "learning_rate": 3.5917676560838775e-07, + "loss": 0.609431803226471, + "step": 2126 + }, + { + "epoch": 2.759170265831255, + "grad_norm": 0.7361696362495422, + "learning_rate": 3.5538863377392095e-07, + "loss": 0.6345561742782593, + "step": 2127 + }, + { + "epoch": 2.7604679927813938, + "grad_norm": 0.750041663646698, + "learning_rate": 3.5162022272860475e-07, + "loss": 0.6858513951301575, + "step": 2128 + }, + { + "epoch": 2.7617657197315326, + "grad_norm": 0.7399468421936035, + "learning_rate": 3.478715401778876e-07, + "loss": 0.6643052697181702, + "step": 2129 + }, + { + "epoch": 2.7630634466816715, + "grad_norm": 0.764750063419342, + "learning_rate": 3.44142593786877e-07, + "loss": 0.7398065328598022, + "step": 2130 + }, + { + "epoch": 2.7643611736318103, + "grad_norm": 0.7458817958831787, + "learning_rate": 3.404333911803237e-07, + "loss": 0.6310020685195923, + "step": 2131 + }, + { + "epoch": 2.7656589005819496, + "grad_norm": 0.7141246199607849, + "learning_rate": 3.367439399426087e-07, + "loss": 0.6750156879425049, + "step": 2132 + }, + { + "epoch": 2.7669566275320885, + "grad_norm": 0.7121133804321289, + "learning_rate": 3.330742476177273e-07, + "loss": 0.6371780037879944, + "step": 2133 + }, + { + "epoch": 2.7682543544822273, + "grad_norm": 0.7298391461372375, + "learning_rate": 3.2942432170926743e-07, + "loss": 0.5725361108779907, + "step": 2134 + }, + { + "epoch": 2.769552081432366, + "grad_norm": 0.742504358291626, + "learning_rate": 3.257941696804079e-07, + "loss": 0.6555971503257751, + "step": 2135 + }, + { + "epoch": 2.770849808382505, + "grad_norm": 0.7092410922050476, + "learning_rate": 3.2218379895388896e-07, + "loss": 0.5985562205314636, + "step": 2136 + }, + { + "epoch": 2.772147535332644, + "grad_norm": 0.7868666648864746, + "learning_rate": 3.185932169120043e-07, + "loss": 0.6679819226264954, + "step": 2137 + }, + { + "epoch": 2.7734452622827828, + "grad_norm": 0.7421088814735413, + "learning_rate": 3.150224308965866e-07, + "loss": 0.6530116200447083, + "step": 2138 + }, + { + "epoch": 2.7747429892329216, + "grad_norm": 0.8364231586456299, + "learning_rate": 3.114714482089898e-07, + "loss": 0.7263075709342957, + "step": 2139 + }, + { + "epoch": 2.7760407161830605, + "grad_norm": 0.7070637345314026, + "learning_rate": 3.079402761100736e-07, + "loss": 0.5931848883628845, + "step": 2140 + }, + { + "epoch": 2.7773384431331998, + "grad_norm": 0.715865433216095, + "learning_rate": 3.0442892182019236e-07, + "loss": 0.5411802530288696, + "step": 2141 + }, + { + "epoch": 2.778636170083338, + "grad_norm": 0.7688911557197571, + "learning_rate": 3.00937392519175e-07, + "loss": 0.6958683133125305, + "step": 2142 + }, + { + "epoch": 2.7799338970334775, + "grad_norm": 0.7352038621902466, + "learning_rate": 2.974656953463173e-07, + "loss": 0.5754610896110535, + "step": 2143 + }, + { + "epoch": 2.7812316239836163, + "grad_norm": 0.7284995913505554, + "learning_rate": 2.9401383740035983e-07, + "loss": 0.6452664136886597, + "step": 2144 + }, + { + "epoch": 2.782529350933755, + "grad_norm": 0.7445150017738342, + "learning_rate": 2.905818257394799e-07, + "loss": 0.6866068243980408, + "step": 2145 + }, + { + "epoch": 2.783827077883894, + "grad_norm": 0.7142398357391357, + "learning_rate": 2.871696673812718e-07, + "loss": 0.6363600492477417, + "step": 2146 + }, + { + "epoch": 2.785124804834033, + "grad_norm": 0.7269803285598755, + "learning_rate": 2.837773693027346e-07, + "loss": 0.6741392612457275, + "step": 2147 + }, + { + "epoch": 2.7864225317841718, + "grad_norm": 0.7683520317077637, + "learning_rate": 2.8040493844026185e-07, + "loss": 0.6339127421379089, + "step": 2148 + }, + { + "epoch": 2.7877202587343106, + "grad_norm": 0.7308069467544556, + "learning_rate": 2.7705238168961867e-07, + "loss": 0.6009587049484253, + "step": 2149 + }, + { + "epoch": 2.7890179856844495, + "grad_norm": 0.7165871858596802, + "learning_rate": 2.7371970590593597e-07, + "loss": 0.6652488708496094, + "step": 2150 + }, + { + "epoch": 2.7903157126345883, + "grad_norm": 0.7490328550338745, + "learning_rate": 2.7040691790369165e-07, + "loss": 0.6180223226547241, + "step": 2151 + }, + { + "epoch": 2.7916134395847276, + "grad_norm": 0.729664146900177, + "learning_rate": 2.671140244567005e-07, + "loss": 0.6324159502983093, + "step": 2152 + }, + { + "epoch": 2.792911166534866, + "grad_norm": 0.728609025478363, + "learning_rate": 2.6384103229809445e-07, + "loss": 0.6185531616210938, + "step": 2153 + }, + { + "epoch": 2.7942088934850053, + "grad_norm": 0.7523699402809143, + "learning_rate": 2.605879481203144e-07, + "loss": 0.6833655834197998, + "step": 2154 + }, + { + "epoch": 2.795506620435144, + "grad_norm": 0.7207692265510559, + "learning_rate": 2.5735477857509406e-07, + "loss": 0.6240508556365967, + "step": 2155 + }, + { + "epoch": 2.796804347385283, + "grad_norm": 0.7327904105186462, + "learning_rate": 2.5414153027344846e-07, + "loss": 0.6517814993858337, + "step": 2156 + }, + { + "epoch": 2.798102074335422, + "grad_norm": 0.7405744194984436, + "learning_rate": 2.5094820978565416e-07, + "loss": 0.6217131614685059, + "step": 2157 + }, + { + "epoch": 2.7993998012855608, + "grad_norm": 0.7404962182044983, + "learning_rate": 2.4777482364124695e-07, + "loss": 0.6210229992866516, + "step": 2158 + }, + { + "epoch": 2.8006975282356996, + "grad_norm": 0.7105421423912048, + "learning_rate": 2.446213783289941e-07, + "loss": 0.6224609613418579, + "step": 2159 + }, + { + "epoch": 2.8019952551858385, + "grad_norm": 0.777541995048523, + "learning_rate": 2.4148788029689565e-07, + "loss": 0.6957967877388, + "step": 2160 + }, + { + "epoch": 2.8032929821359773, + "grad_norm": 0.7556023001670837, + "learning_rate": 2.3837433595216174e-07, + "loss": 0.6769660115242004, + "step": 2161 + }, + { + "epoch": 2.804590709086116, + "grad_norm": 0.7225756049156189, + "learning_rate": 2.3528075166120323e-07, + "loss": 0.6382290124893188, + "step": 2162 + }, + { + "epoch": 2.8058884360362555, + "grad_norm": 0.7236006259918213, + "learning_rate": 2.3220713374961457e-07, + "loss": 0.6584991216659546, + "step": 2163 + }, + { + "epoch": 2.807186162986394, + "grad_norm": 0.7643389701843262, + "learning_rate": 2.2915348850216955e-07, + "loss": 0.6372033953666687, + "step": 2164 + }, + { + "epoch": 2.808483889936533, + "grad_norm": 0.6990427374839783, + "learning_rate": 2.2611982216279693e-07, + "loss": 0.6647629141807556, + "step": 2165 + }, + { + "epoch": 2.809781616886672, + "grad_norm": 0.7442436814308167, + "learning_rate": 2.2310614093457917e-07, + "loss": 0.6188019514083862, + "step": 2166 + }, + { + "epoch": 2.811079343836811, + "grad_norm": 0.7379173040390015, + "learning_rate": 2.2011245097972812e-07, + "loss": 0.643206000328064, + "step": 2167 + }, + { + "epoch": 2.8123770707869498, + "grad_norm": 0.7450693249702454, + "learning_rate": 2.171387584195861e-07, + "loss": 0.6626617312431335, + "step": 2168 + }, + { + "epoch": 2.8136747977370886, + "grad_norm": 0.7376441359519958, + "learning_rate": 2.1418506933459926e-07, + "loss": 0.6287381052970886, + "step": 2169 + }, + { + "epoch": 2.8149725246872275, + "grad_norm": 0.7581092715263367, + "learning_rate": 2.1125138976431425e-07, + "loss": 0.6942882537841797, + "step": 2170 + }, + { + "epoch": 2.8162702516373663, + "grad_norm": 0.7551229596138, + "learning_rate": 2.0833772570736376e-07, + "loss": 0.6641190052032471, + "step": 2171 + }, + { + "epoch": 2.817567978587505, + "grad_norm": 0.723896861076355, + "learning_rate": 2.0544408312145325e-07, + "loss": 0.6406188607215881, + "step": 2172 + }, + { + "epoch": 2.818865705537644, + "grad_norm": 0.7154518961906433, + "learning_rate": 2.025704679233498e-07, + "loss": 0.6102049946784973, + "step": 2173 + }, + { + "epoch": 2.8201634324877833, + "grad_norm": 0.7203720808029175, + "learning_rate": 1.9971688598886874e-07, + "loss": 0.6299295425415039, + "step": 2174 + }, + { + "epoch": 2.8214611594379218, + "grad_norm": 0.7477232217788696, + "learning_rate": 1.9688334315286383e-07, + "loss": 0.657807469367981, + "step": 2175 + }, + { + "epoch": 2.822758886388061, + "grad_norm": 0.7149349451065063, + "learning_rate": 1.9406984520921156e-07, + "loss": 0.6447558999061584, + "step": 2176 + }, + { + "epoch": 2.8240566133382, + "grad_norm": 0.7502943277359009, + "learning_rate": 1.9127639791080345e-07, + "loss": 0.7339900732040405, + "step": 2177 + }, + { + "epoch": 2.8253543402883388, + "grad_norm": 0.7233054637908936, + "learning_rate": 1.885030069695326e-07, + "loss": 0.668261706829071, + "step": 2178 + }, + { + "epoch": 2.8266520672384776, + "grad_norm": 0.7234363555908203, + "learning_rate": 1.8574967805628174e-07, + "loss": 0.6577302813529968, + "step": 2179 + }, + { + "epoch": 2.8279497941886165, + "grad_norm": 0.7601407766342163, + "learning_rate": 1.8301641680090965e-07, + "loss": 0.6615520715713501, + "step": 2180 + }, + { + "epoch": 2.8292475211387553, + "grad_norm": 0.7155176401138306, + "learning_rate": 1.8030322879224792e-07, + "loss": 0.6732202768325806, + "step": 2181 + }, + { + "epoch": 2.830545248088894, + "grad_norm": 0.7071481347084045, + "learning_rate": 1.7761011957807439e-07, + "loss": 0.6781343817710876, + "step": 2182 + }, + { + "epoch": 2.831842975039033, + "grad_norm": 0.7136833071708679, + "learning_rate": 1.7493709466511965e-07, + "loss": 0.6390227675437927, + "step": 2183 + }, + { + "epoch": 2.833140701989172, + "grad_norm": 0.741337239742279, + "learning_rate": 1.7228415951904165e-07, + "loss": 0.6472516059875488, + "step": 2184 + }, + { + "epoch": 2.834438428939311, + "grad_norm": 0.732276976108551, + "learning_rate": 1.6965131956442004e-07, + "loss": 0.6666471362113953, + "step": 2185 + }, + { + "epoch": 2.83573615588945, + "grad_norm": 0.7136049866676331, + "learning_rate": 1.670385801847485e-07, + "loss": 0.6376191973686218, + "step": 2186 + }, + { + "epoch": 2.837033882839589, + "grad_norm": 0.7336399555206299, + "learning_rate": 1.6444594672241688e-07, + "loss": 0.6784384846687317, + "step": 2187 + }, + { + "epoch": 2.8383316097897278, + "grad_norm": 0.7359493374824524, + "learning_rate": 1.6187342447870235e-07, + "loss": 0.6160508394241333, + "step": 2188 + }, + { + "epoch": 2.8396293367398666, + "grad_norm": 0.7054331302642822, + "learning_rate": 1.5932101871376503e-07, + "loss": 0.6256083846092224, + "step": 2189 + }, + { + "epoch": 2.8409270636900055, + "grad_norm": 0.7195982336997986, + "learning_rate": 1.567887346466257e-07, + "loss": 0.5842984318733215, + "step": 2190 + }, + { + "epoch": 2.8422247906401443, + "grad_norm": 0.7330359220504761, + "learning_rate": 1.54276577455168e-07, + "loss": 0.655302882194519, + "step": 2191 + }, + { + "epoch": 2.843522517590283, + "grad_norm": 0.7195461392402649, + "learning_rate": 1.517845522761141e-07, + "loss": 0.695612370967865, + "step": 2192 + }, + { + "epoch": 2.844820244540422, + "grad_norm": 0.7142940759658813, + "learning_rate": 1.4931266420502687e-07, + "loss": 0.671156108379364, + "step": 2193 + }, + { + "epoch": 2.8461179714905613, + "grad_norm": 0.7329767346382141, + "learning_rate": 1.468609182962899e-07, + "loss": 0.6843516230583191, + "step": 2194 + }, + { + "epoch": 2.8474156984406997, + "grad_norm": 0.7575559616088867, + "learning_rate": 1.4442931956310525e-07, + "loss": 0.6152229309082031, + "step": 2195 + }, + { + "epoch": 2.848713425390839, + "grad_norm": 0.7627936005592346, + "learning_rate": 1.420178729774746e-07, + "loss": 0.6545628905296326, + "step": 2196 + }, + { + "epoch": 2.850011152340978, + "grad_norm": 0.7592964768409729, + "learning_rate": 1.3962658347019819e-07, + "loss": 0.7087745666503906, + "step": 2197 + }, + { + "epoch": 2.8513088792911168, + "grad_norm": 0.7184759974479675, + "learning_rate": 1.372554559308559e-07, + "loss": 0.6886664032936096, + "step": 2198 + }, + { + "epoch": 2.8526066062412556, + "grad_norm": 0.7686153054237366, + "learning_rate": 1.3490449520780492e-07, + "loss": 0.65256667137146, + "step": 2199 + }, + { + "epoch": 2.8539043331913945, + "grad_norm": 0.722467839717865, + "learning_rate": 1.3257370610816333e-07, + "loss": 0.6053767800331116, + "step": 2200 + }, + { + "epoch": 2.8552020601415333, + "grad_norm": 0.7348204255104065, + "learning_rate": 1.3026309339780442e-07, + "loss": 0.57970130443573, + "step": 2201 + }, + { + "epoch": 2.856499787091672, + "grad_norm": 0.724539041519165, + "learning_rate": 1.2797266180134994e-07, + "loss": 0.6097747087478638, + "step": 2202 + }, + { + "epoch": 2.857797514041811, + "grad_norm": 0.7563627362251282, + "learning_rate": 1.2570241600214805e-07, + "loss": 0.6322290897369385, + "step": 2203 + }, + { + "epoch": 2.85909524099195, + "grad_norm": 0.7333301901817322, + "learning_rate": 1.2345236064228216e-07, + "loss": 0.6172837615013123, + "step": 2204 + }, + { + "epoch": 2.860392967942089, + "grad_norm": 0.7645448446273804, + "learning_rate": 1.212225003225409e-07, + "loss": 0.6847653388977051, + "step": 2205 + }, + { + "epoch": 2.8616906948922276, + "grad_norm": 0.7139600515365601, + "learning_rate": 1.1901283960242704e-07, + "loss": 0.641283392906189, + "step": 2206 + }, + { + "epoch": 2.862988421842367, + "grad_norm": 0.7192294597625732, + "learning_rate": 1.168233830001364e-07, + "loss": 0.6558660864830017, + "step": 2207 + }, + { + "epoch": 2.8642861487925058, + "grad_norm": 0.7247057557106018, + "learning_rate": 1.1465413499255452e-07, + "loss": 0.648059070110321, + "step": 2208 + }, + { + "epoch": 2.8655838757426446, + "grad_norm": 0.7141038179397583, + "learning_rate": 1.1250510001524329e-07, + "loss": 0.7089075446128845, + "step": 2209 + }, + { + "epoch": 2.8668816026927835, + "grad_norm": 0.7448967099189758, + "learning_rate": 1.103762824624377e-07, + "loss": 0.655659019947052, + "step": 2210 + }, + { + "epoch": 2.8681793296429223, + "grad_norm": 0.7217125296592712, + "learning_rate": 1.0826768668702691e-07, + "loss": 0.6335598826408386, + "step": 2211 + }, + { + "epoch": 2.869477056593061, + "grad_norm": 0.7432066202163696, + "learning_rate": 1.0617931700055984e-07, + "loss": 0.6629352569580078, + "step": 2212 + }, + { + "epoch": 2.8707747835432, + "grad_norm": 0.759253740310669, + "learning_rate": 1.0411117767322065e-07, + "loss": 0.6971714496612549, + "step": 2213 + }, + { + "epoch": 2.872072510493339, + "grad_norm": 0.7214189171791077, + "learning_rate": 1.0206327293383222e-07, + "loss": 0.6498401165008545, + "step": 2214 + }, + { + "epoch": 2.8733702374434777, + "grad_norm": 0.7300909161567688, + "learning_rate": 1.000356069698416e-07, + "loss": 0.6666358113288879, + "step": 2215 + }, + { + "epoch": 2.874667964393617, + "grad_norm": 0.7169894576072693, + "learning_rate": 9.802818392731117e-08, + "loss": 0.6067378520965576, + "step": 2216 + }, + { + "epoch": 2.8759656913437555, + "grad_norm": 0.7870055437088013, + "learning_rate": 9.60410079109153e-08, + "loss": 0.7164538502693176, + "step": 2217 + }, + { + "epoch": 2.8772634182938948, + "grad_norm": 0.731452465057373, + "learning_rate": 9.407408298392373e-08, + "loss": 0.6627915501594543, + "step": 2218 + }, + { + "epoch": 2.8785611452440336, + "grad_norm": 0.7452148795127869, + "learning_rate": 9.212741316820039e-08, + "loss": 0.6090914607048035, + "step": 2219 + }, + { + "epoch": 2.8798588721941725, + "grad_norm": 0.7165141701698303, + "learning_rate": 9.020100244419461e-08, + "loss": 0.7527438998222351, + "step": 2220 + }, + { + "epoch": 2.8811565991443113, + "grad_norm": 0.7165322303771973, + "learning_rate": 8.829485475092548e-08, + "loss": 0.663241446018219, + "step": 2221 + }, + { + "epoch": 2.88245432609445, + "grad_norm": 0.8054161667823792, + "learning_rate": 8.640897398598525e-08, + "loss": 0.765292227268219, + "step": 2222 + }, + { + "epoch": 2.883752053044589, + "grad_norm": 0.7372357249259949, + "learning_rate": 8.454336400552154e-08, + "loss": 0.6321142911911011, + "step": 2223 + }, + { + "epoch": 2.885049779994728, + "grad_norm": 0.7551286220550537, + "learning_rate": 8.269802862423405e-08, + "loss": 0.6694223880767822, + "step": 2224 + }, + { + "epoch": 2.8863475069448667, + "grad_norm": 0.6954628825187683, + "learning_rate": 8.087297161536778e-08, + "loss": 0.650575578212738, + "step": 2225 + }, + { + "epoch": 2.8876452338950056, + "grad_norm": 0.6984097957611084, + "learning_rate": 7.906819671070098e-08, + "loss": 0.6023176908493042, + "step": 2226 + }, + { + "epoch": 2.888942960845145, + "grad_norm": 0.7234562635421753, + "learning_rate": 7.728370760054283e-08, + "loss": 0.6330822110176086, + "step": 2227 + }, + { + "epoch": 2.8902406877952833, + "grad_norm": 0.7173102498054504, + "learning_rate": 7.55195079337212e-08, + "loss": 0.6250259876251221, + "step": 2228 + }, + { + "epoch": 2.8915384147454226, + "grad_norm": 0.7292760610580444, + "learning_rate": 7.377560131757832e-08, + "loss": 0.6211444139480591, + "step": 2229 + }, + { + "epoch": 2.8928361416955615, + "grad_norm": 0.7143842577934265, + "learning_rate": 7.205199131796182e-08, + "loss": 0.6102809906005859, + "step": 2230 + }, + { + "epoch": 2.8941338686457003, + "grad_norm": 0.7200958132743835, + "learning_rate": 7.034868145921802e-08, + "loss": 0.6820523142814636, + "step": 2231 + }, + { + "epoch": 2.895431595595839, + "grad_norm": 0.7009389400482178, + "learning_rate": 6.866567522418322e-08, + "loss": 0.6737648248672485, + "step": 2232 + }, + { + "epoch": 2.896729322545978, + "grad_norm": 0.7720589637756348, + "learning_rate": 6.700297605418127e-08, + "loss": 0.6236926317214966, + "step": 2233 + }, + { + "epoch": 2.898027049496117, + "grad_norm": 0.7273607850074768, + "learning_rate": 6.53605873490093e-08, + "loss": 0.673498272895813, + "step": 2234 + }, + { + "epoch": 2.8993247764462557, + "grad_norm": 0.7236337065696716, + "learning_rate": 6.373851246693763e-08, + "loss": 0.6256372928619385, + "step": 2235 + }, + { + "epoch": 2.9006225033963946, + "grad_norm": 0.7014041543006897, + "learning_rate": 6.21367547246976e-08, + "loss": 0.6363632678985596, + "step": 2236 + }, + { + "epoch": 2.9019202303465335, + "grad_norm": 0.7210372686386108, + "learning_rate": 6.055531739747933e-08, + "loss": 0.6491326689720154, + "step": 2237 + }, + { + "epoch": 2.9032179572966728, + "grad_norm": 0.766070544719696, + "learning_rate": 5.899420371892173e-08, + "loss": 0.606798529624939, + "step": 2238 + }, + { + "epoch": 2.904515684246811, + "grad_norm": 0.7013832330703735, + "learning_rate": 5.745341688110806e-08, + "loss": 0.6418301463127136, + "step": 2239 + }, + { + "epoch": 2.9058134111969505, + "grad_norm": 0.7240904569625854, + "learning_rate": 5.593296003455595e-08, + "loss": 0.6093890070915222, + "step": 2240 + }, + { + "epoch": 2.9071111381470893, + "grad_norm": 0.7125054001808167, + "learning_rate": 5.4432836288215165e-08, + "loss": 0.6541129350662231, + "step": 2241 + }, + { + "epoch": 2.908408865097228, + "grad_norm": 0.7161985635757446, + "learning_rate": 5.2953048709459834e-08, + "loss": 0.617908239364624, + "step": 2242 + }, + { + "epoch": 2.909706592047367, + "grad_norm": 0.737856388092041, + "learning_rate": 5.1493600324080684e-08, + "loss": 0.649212121963501, + "step": 2243 + }, + { + "epoch": 2.911004318997506, + "grad_norm": 0.7285069227218628, + "learning_rate": 5.0054494116279497e-08, + "loss": 0.6526796221733093, + "step": 2244 + }, + { + "epoch": 2.9123020459476447, + "grad_norm": 0.715974748134613, + "learning_rate": 4.8635733028664644e-08, + "loss": 0.6148603558540344, + "step": 2245 + }, + { + "epoch": 2.9135997728977836, + "grad_norm": 0.7559519410133362, + "learning_rate": 4.723731996224446e-08, + "loss": 0.6750462055206299, + "step": 2246 + }, + { + "epoch": 2.9148974998479225, + "grad_norm": 0.7167734503746033, + "learning_rate": 4.585925777641831e-08, + "loss": 0.6933612823486328, + "step": 2247 + }, + { + "epoch": 2.9161952267980613, + "grad_norm": 0.7255918383598328, + "learning_rate": 4.450154928897443e-08, + "loss": 0.6560993194580078, + "step": 2248 + }, + { + "epoch": 2.9174929537482006, + "grad_norm": 0.7656079530715942, + "learning_rate": 4.316419727608434e-08, + "loss": 0.6685020923614502, + "step": 2249 + }, + { + "epoch": 2.9187906806983395, + "grad_norm": 0.7287185788154602, + "learning_rate": 4.1847204472293954e-08, + "loss": 0.646466851234436, + "step": 2250 + }, + { + "epoch": 2.9200884076484783, + "grad_norm": 0.7272042036056519, + "learning_rate": 4.055057357052139e-08, + "loss": 0.6481143236160278, + "step": 2251 + }, + { + "epoch": 2.921386134598617, + "grad_norm": 0.7513357996940613, + "learning_rate": 3.927430722204473e-08, + "loss": 0.6382118463516235, + "step": 2252 + }, + { + "epoch": 2.922683861548756, + "grad_norm": 0.7202178239822388, + "learning_rate": 3.801840803651091e-08, + "loss": 0.6208593845367432, + "step": 2253 + }, + { + "epoch": 2.923981588498895, + "grad_norm": 0.7391272783279419, + "learning_rate": 3.678287858191132e-08, + "loss": 0.62124103307724, + "step": 2254 + }, + { + "epoch": 2.9252793154490337, + "grad_norm": 0.7046197056770325, + "learning_rate": 3.5567721384593965e-08, + "loss": 0.6635320782661438, + "step": 2255 + }, + { + "epoch": 2.9265770423991726, + "grad_norm": 0.7366517782211304, + "learning_rate": 3.437293892924576e-08, + "loss": 0.657387912273407, + "step": 2256 + }, + { + "epoch": 2.9278747693493115, + "grad_norm": 0.7833458781242371, + "learning_rate": 3.3198533658895804e-08, + "loss": 0.681797981262207, + "step": 2257 + }, + { + "epoch": 2.9291724962994508, + "grad_norm": 0.7216890454292297, + "learning_rate": 3.2044507974905433e-08, + "loss": 0.5936287641525269, + "step": 2258 + }, + { + "epoch": 2.930470223249589, + "grad_norm": 0.736221969127655, + "learning_rate": 3.091086423696377e-08, + "loss": 0.6654385328292847, + "step": 2259 + }, + { + "epoch": 2.9317679501997285, + "grad_norm": 0.7042406797409058, + "learning_rate": 2.9797604763087684e-08, + "loss": 0.6541644930839539, + "step": 2260 + }, + { + "epoch": 2.9330656771498673, + "grad_norm": 0.7537480592727661, + "learning_rate": 2.8704731829609643e-08, + "loss": 0.6462427377700806, + "step": 2261 + }, + { + "epoch": 2.934363404100006, + "grad_norm": 0.748501718044281, + "learning_rate": 2.763224767117767e-08, + "loss": 0.6837744116783142, + "step": 2262 + }, + { + "epoch": 2.935661131050145, + "grad_norm": 0.7571681141853333, + "learning_rate": 2.6580154480750907e-08, + "loss": 0.6494276523590088, + "step": 2263 + }, + { + "epoch": 2.936958858000284, + "grad_norm": 0.7051231265068054, + "learning_rate": 2.554845440959408e-08, + "loss": 0.6642428040504456, + "step": 2264 + }, + { + "epoch": 2.9382565849504227, + "grad_norm": 0.7481043934822083, + "learning_rate": 2.4537149567271935e-08, + "loss": 0.7524136900901794, + "step": 2265 + }, + { + "epoch": 2.9395543119005616, + "grad_norm": 0.7172916531562805, + "learning_rate": 2.3546242021648126e-08, + "loss": 0.6545467972755432, + "step": 2266 + }, + { + "epoch": 2.9408520388507005, + "grad_norm": 0.7390909790992737, + "learning_rate": 2.2575733798876342e-08, + "loss": 0.6789126396179199, + "step": 2267 + }, + { + "epoch": 2.9421497658008393, + "grad_norm": 0.6911484003067017, + "learning_rate": 2.162562688340142e-08, + "loss": 0.5900536775588989, + "step": 2268 + }, + { + "epoch": 2.9434474927509786, + "grad_norm": 0.7650425434112549, + "learning_rate": 2.0695923217950442e-08, + "loss": 0.6601477861404419, + "step": 2269 + }, + { + "epoch": 2.944745219701117, + "grad_norm": 0.7415356040000916, + "learning_rate": 1.9786624703532764e-08, + "loss": 0.7132882475852966, + "step": 2270 + }, + { + "epoch": 2.9460429466512563, + "grad_norm": 0.7267791032791138, + "learning_rate": 1.8897733199434443e-08, + "loss": 0.6234641075134277, + "step": 2271 + }, + { + "epoch": 2.947340673601395, + "grad_norm": 0.7090092897415161, + "learning_rate": 1.8029250523211582e-08, + "loss": 0.6485676765441895, + "step": 2272 + }, + { + "epoch": 2.948638400551534, + "grad_norm": 0.7129170298576355, + "learning_rate": 1.718117845069367e-08, + "loss": 0.6410534977912903, + "step": 2273 + }, + { + "epoch": 2.949936127501673, + "grad_norm": 0.7186943292617798, + "learning_rate": 1.635351871597246e-08, + "loss": 0.7133535146713257, + "step": 2274 + }, + { + "epoch": 2.9512338544518117, + "grad_norm": 0.7258438467979431, + "learning_rate": 1.554627301140199e-08, + "loss": 0.5933857560157776, + "step": 2275 + }, + { + "epoch": 2.9525315814019506, + "grad_norm": 0.7135540843009949, + "learning_rate": 1.4759442987596351e-08, + "loss": 0.6514700055122375, + "step": 2276 + }, + { + "epoch": 2.9538293083520895, + "grad_norm": 0.7308082580566406, + "learning_rate": 1.3993030253423023e-08, + "loss": 0.6132031679153442, + "step": 2277 + }, + { + "epoch": 2.9551270353022283, + "grad_norm": 0.7810271382331848, + "learning_rate": 1.3247036376002886e-08, + "loss": 0.654386043548584, + "step": 2278 + }, + { + "epoch": 2.956424762252367, + "grad_norm": 0.761455237865448, + "learning_rate": 1.252146288070355e-08, + "loss": 0.6730161309242249, + "step": 2279 + }, + { + "epoch": 2.9577224892025065, + "grad_norm": 0.7196770906448364, + "learning_rate": 1.1816311251140466e-08, + "loss": 0.6393716931343079, + "step": 2280 + }, + { + "epoch": 2.959020216152645, + "grad_norm": 0.6943092346191406, + "learning_rate": 1.113158292916916e-08, + "loss": 0.6582570672035217, + "step": 2281 + }, + { + "epoch": 2.960317943102784, + "grad_norm": 0.7215139865875244, + "learning_rate": 1.0467279314886336e-08, + "loss": 0.6728758215904236, + "step": 2282 + }, + { + "epoch": 2.961615670052923, + "grad_norm": 0.7100042700767517, + "learning_rate": 9.82340176662433e-09, + "loss": 0.6192055344581604, + "step": 2283 + }, + { + "epoch": 2.962913397003062, + "grad_norm": 0.772715151309967, + "learning_rate": 9.199951600951106e-09, + "loss": 0.6373339295387268, + "step": 2284 + }, + { + "epoch": 2.9642111239532007, + "grad_norm": 0.6952692866325378, + "learning_rate": 8.596930092662493e-09, + "loss": 0.6480576992034912, + "step": 2285 + }, + { + "epoch": 2.9655088509033396, + "grad_norm": 0.729654848575592, + "learning_rate": 8.014338474785499e-09, + "loss": 0.5901361107826233, + "step": 2286 + }, + { + "epoch": 2.9668065778534785, + "grad_norm": 0.7037022709846497, + "learning_rate": 7.45217793857389e-09, + "loss": 0.6541380882263184, + "step": 2287 + }, + { + "epoch": 2.9681043048036173, + "grad_norm": 0.7359015941619873, + "learning_rate": 6.910449633501515e-09, + "loss": 0.6508733630180359, + "step": 2288 + }, + { + "epoch": 2.969402031753756, + "grad_norm": 0.6860209703445435, + "learning_rate": 6.389154667266751e-09, + "loss": 0.6324610710144043, + "step": 2289 + }, + { + "epoch": 2.970699758703895, + "grad_norm": 0.7234740257263184, + "learning_rate": 5.888294105785841e-09, + "loss": 0.6781293749809265, + "step": 2290 + }, + { + "epoch": 2.9719974856540343, + "grad_norm": 0.748229444026947, + "learning_rate": 5.407868973191788e-09, + "loss": 0.7036339640617371, + "step": 2291 + }, + { + "epoch": 2.9732952126041727, + "grad_norm": 0.7085629105567932, + "learning_rate": 4.947880251832127e-09, + "loss": 0.6461360454559326, + "step": 2292 + }, + { + "epoch": 2.974592939554312, + "grad_norm": 0.7789812088012695, + "learning_rate": 4.508328882268931e-09, + "loss": 0.6448870897293091, + "step": 2293 + }, + { + "epoch": 2.975890666504451, + "grad_norm": 0.7379918694496155, + "learning_rate": 4.089215763271037e-09, + "loss": 0.5733003616333008, + "step": 2294 + }, + { + "epoch": 2.9771883934545897, + "grad_norm": 0.700847864151001, + "learning_rate": 3.6905417518195985e-09, + "loss": 0.6530927419662476, + "step": 2295 + }, + { + "epoch": 2.9784861204047286, + "grad_norm": 0.7081441879272461, + "learning_rate": 3.312307663103642e-09, + "loss": 0.643142819404602, + "step": 2296 + }, + { + "epoch": 2.9797838473548675, + "grad_norm": 0.7461786270141602, + "learning_rate": 2.954514270513409e-09, + "loss": 0.6704539060592651, + "step": 2297 + }, + { + "epoch": 2.9810815743050063, + "grad_norm": 0.7500106692314148, + "learning_rate": 2.6171623056481245e-09, + "loss": 0.6799619197845459, + "step": 2298 + }, + { + "epoch": 2.982379301255145, + "grad_norm": 0.7831278443336487, + "learning_rate": 2.300252458306007e-09, + "loss": 0.6943190097808838, + "step": 2299 + }, + { + "epoch": 2.983677028205284, + "grad_norm": 0.7213168740272522, + "learning_rate": 2.0037853764887096e-09, + "loss": 0.677469789981842, + "step": 2300 + }, + { + "epoch": 2.983677028205284, + "eval_loss": 0.748992383480072, + "eval_runtime": 142.0816, + "eval_samples_per_second": 36.542, + "eval_steps_per_second": 9.136, + "step": 2300 + }, + { + "epoch": 2.984974755155423, + "grad_norm": 0.7276439070701599, + "learning_rate": 1.7277616663946562e-09, + "loss": 0.6558234095573425, + "step": 2301 + }, + { + "epoch": 2.986272482105562, + "grad_norm": 0.7110369205474854, + "learning_rate": 1.4721818924223752e-09, + "loss": 0.6696679592132568, + "step": 2302 + }, + { + "epoch": 2.987570209055701, + "grad_norm": 0.7226536273956299, + "learning_rate": 1.2370465771693874e-09, + "loss": 0.6655494570732117, + "step": 2303 + }, + { + "epoch": 2.98886793600584, + "grad_norm": 0.7148182988166809, + "learning_rate": 1.0223562014277654e-09, + "loss": 0.6355955600738525, + "step": 2304 + }, + { + "epoch": 2.9901656629559787, + "grad_norm": 0.7424213886260986, + "learning_rate": 8.281112041841343e-10, + "loss": 0.6586095094680786, + "step": 2305 + }, + { + "epoch": 2.9914633899061176, + "grad_norm": 0.742540717124939, + "learning_rate": 6.543119826207811e-10, + "loss": 0.6475944519042969, + "step": 2306 + }, + { + "epoch": 2.9927611168562565, + "grad_norm": 0.7507250905036926, + "learning_rate": 5.009588921123243e-10, + "loss": 0.6415051221847534, + "step": 2307 + }, + { + "epoch": 2.9940588438063953, + "grad_norm": 0.7377761602401733, + "learning_rate": 3.680522462279346e-10, + "loss": 0.6111840009689331, + "step": 2308 + }, + { + "epoch": 2.995356570756534, + "grad_norm": 0.735808789730072, + "learning_rate": 2.555923167291141e-10, + "loss": 0.6714158058166504, + "step": 2309 + }, + { + "epoch": 2.996654297706673, + "grad_norm": 0.7448306083679199, + "learning_rate": 1.635793335652558e-10, + "loss": 0.7029042840003967, + "step": 2310 + }, + { + "epoch": 2.9979520246568123, + "grad_norm": 0.761702299118042, + "learning_rate": 9.20134848814147e-11, + "loss": 0.6207424998283386, + "step": 2311 + }, + { + "epoch": 2.9992497516069507, + "grad_norm": 0.76186203956604, + "learning_rate": 4.08949170105366e-11, + "loss": 0.6319787502288818, + "step": 2312 + }, + { + "epoch": 3.0, + "grad_norm": 0.9660639762878418, + "learning_rate": 1.022373447900904e-11, + "loss": 0.7546765804290771, + "step": 2313 + }, + { + "epoch": 3.0, + "step": 2313, + "total_flos": 4.1917370482093916e+18, + "train_loss": 0.06833103949818527, + "train_runtime": 3477.4713, + "train_samples_per_second": 85.09, + "train_steps_per_second": 0.665 + } + ], + "logging_steps": 1.0, + "max_steps": 2313, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 230, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.1917370482093916e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}