{ "log_history": [ { "loss": 3.6598, "grad_norm": 4.916348934173584, "learning_rate": 1.8750000000000003e-06, "entropy": 2.4575500309467317, "num_tokens": 59642.0, "mean_token_accuracy": 0.4153611570596695, "epoch": 0.012630249447426587, "step": 10 }, { "loss": 3.3603, "grad_norm": 3.8026137351989746, "learning_rate": 3.958333333333333e-06, "entropy": 2.4072387635707857, "num_tokens": 119219.0, "mean_token_accuracy": 0.4350100517272949, "epoch": 0.025260498894853173, "step": 20 }, { "loss": 2.9434, "grad_norm": 3.7880399227142334, "learning_rate": 6.041666666666667e-06, "entropy": 2.3899864494800567, "num_tokens": 179590.0, "mean_token_accuracy": 0.4788561977446079, "epoch": 0.03789074834227976, "step": 30 }, { "loss": 2.3919, "grad_norm": 3.0592074394226074, "learning_rate": 8.125000000000001e-06, "entropy": 2.1122478008270265, "num_tokens": 238845.0, "mean_token_accuracy": 0.574567300081253, "epoch": 0.05052099778970635, "step": 40 }, { "loss": 1.912, "grad_norm": 1.5836262702941895, "learning_rate": 9.993489583333334e-06, "entropy": 1.7037649989128112, "num_tokens": 298317.0, "mean_token_accuracy": 0.6467478528618813, "epoch": 0.06315124723713293, "step": 50 }, { "loss": 1.6762, "grad_norm": 1.217679738998413, "learning_rate": 9.928385416666668e-06, "entropy": 1.5623225390911102, "num_tokens": 357858.0, "mean_token_accuracy": 0.679128734767437, "epoch": 0.07578149668455952, "step": 60 }, { "loss": 1.5372, "grad_norm": 0.973615288734436, "learning_rate": 9.863281250000001e-06, "entropy": 1.5071247130632401, "num_tokens": 418834.0, "mean_token_accuracy": 0.6943170607089997, "epoch": 0.0884117461319861, "step": 70 }, { "loss": 1.4751, "grad_norm": 0.9853116869926453, "learning_rate": 9.798177083333335e-06, "entropy": 1.4568549275398255, "num_tokens": 478960.0, "mean_token_accuracy": 0.7024633795022964, "epoch": 0.1010419955794127, "step": 80 }, { "loss": 1.474, "grad_norm": 0.9147132039070129, "learning_rate": 9.733072916666667e-06, "entropy": 1.4889154583215714, "num_tokens": 541795.0, "mean_token_accuracy": 0.6996816232800483, "epoch": 0.11367224502683929, "step": 90 }, { "loss": 1.3805, "grad_norm": 0.9684887528419495, "learning_rate": 9.66796875e-06, "entropy": 1.4158774405717849, "num_tokens": 601174.0, "mean_token_accuracy": 0.7165829420089722, "epoch": 0.12630249447426586, "step": 100 }, { "loss": 1.3718, "grad_norm": 0.9440239667892456, "learning_rate": 9.602864583333335e-06, "entropy": 1.4276181221008302, "num_tokens": 661048.0, "mean_token_accuracy": 0.7143253713846207, "epoch": 0.13893274392169244, "step": 110 }, { "loss": 1.3661, "grad_norm": 0.8779081702232361, "learning_rate": 9.537760416666667e-06, "entropy": 1.4359370201826096, "num_tokens": 722298.0, "mean_token_accuracy": 0.7162409156560898, "epoch": 0.15156299336911905, "step": 120 }, { "loss": 1.3193, "grad_norm": 0.8999291062355042, "learning_rate": 9.47265625e-06, "entropy": 1.3943599790334702, "num_tokens": 782683.0, "mean_token_accuracy": 0.7252198755741119, "epoch": 0.16419324281654563, "step": 130 }, { "loss": 1.3054, "grad_norm": 0.8218080997467041, "learning_rate": 9.407552083333334e-06, "entropy": 1.3758090347051621, "num_tokens": 842988.0, "mean_token_accuracy": 0.7277572214603424, "epoch": 0.1768234922639722, "step": 140 }, { "loss": 1.3291, "grad_norm": 0.8062577843666077, "learning_rate": 9.342447916666668e-06, "entropy": 1.381770172715187, "num_tokens": 903912.0, "mean_token_accuracy": 0.7222751513123512, "epoch": 0.1894537417113988, "step": 150 }, { "loss": 1.2974, "grad_norm": 0.8221862316131592, "learning_rate": 9.277343750000001e-06, "entropy": 1.352141672372818, "num_tokens": 964887.0, "mean_token_accuracy": 0.7260218441486359, "epoch": 0.2020839911588254, "step": 160 }, { "loss": 1.2969, "grad_norm": 0.7375346422195435, "learning_rate": 9.212239583333335e-06, "entropy": 1.346352329850197, "num_tokens": 1026887.0, "mean_token_accuracy": 0.7252495244145394, "epoch": 0.21471424060625197, "step": 170 }, { "loss": 1.2824, "grad_norm": 0.7950690388679504, "learning_rate": 9.147135416666667e-06, "entropy": 1.3165962457656861, "num_tokens": 1086995.0, "mean_token_accuracy": 0.7250601649284363, "epoch": 0.22734449005367857, "step": 180 }, { "loss": 1.2628, "grad_norm": 0.7147737145423889, "learning_rate": 9.082031250000001e-06, "entropy": 1.3047442227602004, "num_tokens": 1147209.0, "mean_token_accuracy": 0.7318986386060715, "epoch": 0.23997473950110515, "step": 190 }, { "loss": 1.2484, "grad_norm": 0.756094753742218, "learning_rate": 9.016927083333335e-06, "entropy": 1.2989415228366852, "num_tokens": 1207602.0, "mean_token_accuracy": 0.7319697335362434, "epoch": 0.25260498894853173, "step": 200 }, { "loss": 1.2447, "grad_norm": 0.7715655565261841, "learning_rate": 8.951822916666667e-06, "entropy": 1.2904020875692368, "num_tokens": 1267500.0, "mean_token_accuracy": 0.7349080622196198, "epoch": 0.2652352383959583, "step": 210 }, { "loss": 1.2111, "grad_norm": 0.6824166774749756, "learning_rate": 8.88671875e-06, "entropy": 1.2543610483407974, "num_tokens": 1327666.0, "mean_token_accuracy": 0.7386362582445145, "epoch": 0.2778654878433849, "step": 220 }, { "loss": 1.2574, "grad_norm": 0.6559598445892334, "learning_rate": 8.821614583333334e-06, "entropy": 1.2946221768856048, "num_tokens": 1389712.0, "mean_token_accuracy": 0.7287471711635589, "epoch": 0.2904957372908115, "step": 230 }, { "loss": 1.2092, "grad_norm": 0.7000382542610168, "learning_rate": 8.756510416666666e-06, "entropy": 1.2489944666624069, "num_tokens": 1448670.0, "mean_token_accuracy": 0.7372458636760711, "epoch": 0.3031259867382381, "step": 240 }, { "loss": 1.2132, "grad_norm": 0.6579836010932922, "learning_rate": 8.69140625e-06, "entropy": 1.2534994542598725, "num_tokens": 1508428.0, "mean_token_accuracy": 0.7380462676286698, "epoch": 0.3157562361856647, "step": 250 }, { "loss": 1.2103, "grad_norm": 0.6546089053153992, "learning_rate": 8.626302083333334e-06, "entropy": 1.2474523901939392, "num_tokens": 1568018.0, "mean_token_accuracy": 0.7395781621336937, "epoch": 0.32838648563309125, "step": 260 }, { "loss": 1.2007, "grad_norm": 0.6377413868904114, "learning_rate": 8.561197916666667e-06, "entropy": 1.2445458561182021, "num_tokens": 1627904.0, "mean_token_accuracy": 0.7419240340590477, "epoch": 0.34101673508051783, "step": 270 }, { "loss": 1.2497, "grad_norm": 0.6460844278335571, "learning_rate": 8.496093750000001e-06, "entropy": 1.279063493013382, "num_tokens": 1689637.0, "mean_token_accuracy": 0.729638360440731, "epoch": 0.3536469845279444, "step": 280 }, { "loss": 1.2091, "grad_norm": 0.6648440361022949, "learning_rate": 8.430989583333335e-06, "entropy": 1.2362476408481597, "num_tokens": 1749861.0, "mean_token_accuracy": 0.7385585099458695, "epoch": 0.366277233975371, "step": 290 }, { "loss": 1.2163, "grad_norm": 0.6637682318687439, "learning_rate": 8.365885416666667e-06, "entropy": 1.2533661901950837, "num_tokens": 1810407.0, "mean_token_accuracy": 0.7371826618909836, "epoch": 0.3789074834227976, "step": 300 }, { "loss": 1.2026, "grad_norm": 0.660043478012085, "learning_rate": 8.30078125e-06, "entropy": 1.2383619010448457, "num_tokens": 1871544.0, "mean_token_accuracy": 0.7364327058196067, "epoch": 0.3915377328702242, "step": 310 }, { "loss": 1.2064, "grad_norm": 0.6285788416862488, "learning_rate": 8.235677083333334e-06, "entropy": 1.2316229462623596, "num_tokens": 1932125.0, "mean_token_accuracy": 0.7371214032173157, "epoch": 0.4041679823176508, "step": 320 }, { "loss": 1.1997, "grad_norm": 0.6204569339752197, "learning_rate": 8.170572916666666e-06, "entropy": 1.2459111303091048, "num_tokens": 1993924.0, "mean_token_accuracy": 0.7365512102842331, "epoch": 0.41679823176507735, "step": 330 }, { "loss": 1.1863, "grad_norm": 0.6501284241676331, "learning_rate": 8.10546875e-06, "entropy": 1.2156363114714623, "num_tokens": 2054496.0, "mean_token_accuracy": 0.741255110502243, "epoch": 0.42942848121250393, "step": 340 }, { "loss": 1.1913, "grad_norm": 0.602418065071106, "learning_rate": 8.040364583333334e-06, "entropy": 1.2222040683031081, "num_tokens": 2114825.0, "mean_token_accuracy": 0.739654652774334, "epoch": 0.4420587306599305, "step": 350 }, { "loss": 1.2142, "grad_norm": 0.6289706230163574, "learning_rate": 7.975260416666668e-06, "entropy": 1.2437947690486908, "num_tokens": 2176058.0, "mean_token_accuracy": 0.7374308854341507, "epoch": 0.45468898010735714, "step": 360 }, { "loss": 1.1769, "grad_norm": 0.6439516544342041, "learning_rate": 7.910156250000001e-06, "entropy": 1.2139764934778214, "num_tokens": 2236783.0, "mean_token_accuracy": 0.7426491379737854, "epoch": 0.4673192295547837, "step": 370 }, { "loss": 1.1829, "grad_norm": 0.6499606966972351, "learning_rate": 7.845052083333335e-06, "entropy": 1.19720456302166, "num_tokens": 2298432.0, "mean_token_accuracy": 0.7399616882205009, "epoch": 0.4799494790022103, "step": 380 }, { "loss": 1.1577, "grad_norm": 0.6545577645301819, "learning_rate": 7.779947916666667e-06, "entropy": 1.205560651421547, "num_tokens": 2357808.0, "mean_token_accuracy": 0.7463845536112785, "epoch": 0.4925797284496369, "step": 390 }, { "loss": 1.1583, "grad_norm": 0.6930111050605774, "learning_rate": 7.71484375e-06, "entropy": 1.19621299803257, "num_tokens": 2417574.0, "mean_token_accuracy": 0.7453805327415466, "epoch": 0.5052099778970635, "step": 400 }, { "loss": 1.1723, "grad_norm": 0.648593544960022, "learning_rate": 7.649739583333334e-06, "entropy": 1.1963690370321274, "num_tokens": 2478088.0, "mean_token_accuracy": 0.7415376961231231, "epoch": 0.5178402273444901, "step": 410 }, { "loss": 1.1701, "grad_norm": 0.6348926424980164, "learning_rate": 7.5846354166666665e-06, "entropy": 1.216522666811943, "num_tokens": 2538612.0, "mean_token_accuracy": 0.7432737082242966, "epoch": 0.5304704767919166, "step": 420 }, { "loss": 1.1688, "grad_norm": 0.627249002456665, "learning_rate": 7.51953125e-06, "entropy": 1.1990931153297424, "num_tokens": 2599023.0, "mean_token_accuracy": 0.7435364574193954, "epoch": 0.5431007262393432, "step": 430 }, { "loss": 1.1622, "grad_norm": 0.6614134311676025, "learning_rate": 7.454427083333334e-06, "entropy": 1.1872963696718215, "num_tokens": 2658338.0, "mean_token_accuracy": 0.7470521196722985, "epoch": 0.5557309756867698, "step": 440 }, { "loss": 1.1898, "grad_norm": 0.6228342652320862, "learning_rate": 7.389322916666667e-06, "entropy": 1.215770760178566, "num_tokens": 2719316.0, "mean_token_accuracy": 0.7409805700182914, "epoch": 0.5683612251341964, "step": 450 }, { "loss": 1.167, "grad_norm": 0.6525698304176331, "learning_rate": 7.3242187500000006e-06, "entropy": 1.1998004853725432, "num_tokens": 2780272.0, "mean_token_accuracy": 0.7438512742519379, "epoch": 0.580991474581623, "step": 460 }, { "loss": 1.1669, "grad_norm": 0.6669884324073792, "learning_rate": 7.259114583333334e-06, "entropy": 1.1898580551147462, "num_tokens": 2840261.0, "mean_token_accuracy": 0.7437147945165634, "epoch": 0.5936217240290496, "step": 470 }, { "loss": 1.177, "grad_norm": 0.6129422783851624, "learning_rate": 7.194010416666667e-06, "entropy": 1.21882204413414, "num_tokens": 2901347.0, "mean_token_accuracy": 0.7423913896083831, "epoch": 0.6062519734764762, "step": 480 }, { "loss": 1.1393, "grad_norm": 0.6334741115570068, "learning_rate": 7.128906250000001e-06, "entropy": 1.163309469819069, "num_tokens": 2960518.0, "mean_token_accuracy": 0.7511255607008934, "epoch": 0.6188822229239027, "step": 490 }, { "loss": 1.1605, "grad_norm": 0.6261829733848572, "learning_rate": 7.063802083333335e-06, "entropy": 1.1994746267795562, "num_tokens": 3021957.0, "mean_token_accuracy": 0.7433080047369003, "epoch": 0.6315124723713293, "step": 500 }, { "loss": 1.1468, "grad_norm": 0.5909908413887024, "learning_rate": 6.998697916666667e-06, "entropy": 1.167793545126915, "num_tokens": 3083301.0, "mean_token_accuracy": 0.7475745663046837, "epoch": 0.6441427218187559, "step": 510 }, { "loss": 1.1425, "grad_norm": 0.6018249988555908, "learning_rate": 6.93359375e-06, "entropy": 1.1670663714408875, "num_tokens": 3143187.0, "mean_token_accuracy": 0.7485125616192818, "epoch": 0.6567729712661825, "step": 520 }, { "loss": 1.1297, "grad_norm": 0.6088816523551941, "learning_rate": 6.868489583333334e-06, "entropy": 1.1626142784953117, "num_tokens": 3202489.0, "mean_token_accuracy": 0.7490727782249451, "epoch": 0.6694032207136091, "step": 530 }, { "loss": 1.1656, "grad_norm": 0.6021592020988464, "learning_rate": 6.803385416666667e-06, "entropy": 1.1758243769407273, "num_tokens": 3263476.0, "mean_token_accuracy": 0.7443674057722092, "epoch": 0.6820334701610357, "step": 540 }, { "loss": 1.1385, "grad_norm": 0.5955655574798584, "learning_rate": 6.738281250000001e-06, "entropy": 1.179671287536621, "num_tokens": 3324008.0, "mean_token_accuracy": 0.7481714516878128, "epoch": 0.6946637196084623, "step": 550 }, { "loss": 1.1607, "grad_norm": 0.6246835589408875, "learning_rate": 6.6731770833333345e-06, "entropy": 1.1886188358068466, "num_tokens": 3383861.0, "mean_token_accuracy": 0.7447509884834289, "epoch": 0.7072939690558888, "step": 560 }, { "loss": 1.1298, "grad_norm": 0.606743335723877, "learning_rate": 6.6080729166666665e-06, "entropy": 1.1690475821495057, "num_tokens": 3443946.0, "mean_token_accuracy": 0.7493681326508522, "epoch": 0.7199242185033154, "step": 570 }, { "loss": 1.1452, "grad_norm": 0.6846170425415039, "learning_rate": 6.54296875e-06, "entropy": 1.1725697651505471, "num_tokens": 3503787.0, "mean_token_accuracy": 0.7482522815465927, "epoch": 0.732554467950742, "step": 580 }, { "loss": 1.1338, "grad_norm": 0.6522074341773987, "learning_rate": 6.477864583333334e-06, "entropy": 1.1713406786322593, "num_tokens": 3563403.0, "mean_token_accuracy": 0.7498400524258614, "epoch": 0.7451847173981686, "step": 590 }, { "loss": 1.1499, "grad_norm": 0.6417824625968933, "learning_rate": 6.412760416666667e-06, "entropy": 1.1848436295986176, "num_tokens": 3625007.0, "mean_token_accuracy": 0.7452719643712044, "epoch": 0.7578149668455952, "step": 600 }, { "loss": 1.159, "grad_norm": 0.6329619884490967, "learning_rate": 6.3476562500000006e-06, "entropy": 1.1822121858596801, "num_tokens": 3686099.0, "mean_token_accuracy": 0.7452733591198921, "epoch": 0.7704452162930218, "step": 610 }, { "loss": 1.1558, "grad_norm": 0.6627410054206848, "learning_rate": 6.282552083333334e-06, "entropy": 1.190292978286743, "num_tokens": 3747233.0, "mean_token_accuracy": 0.7438480347394943, "epoch": 0.7830754657404484, "step": 620 }, { "loss": 1.1377, "grad_norm": 0.5941329002380371, "learning_rate": 6.217447916666667e-06, "entropy": 1.1619529083371163, "num_tokens": 3807833.0, "mean_token_accuracy": 0.7503219902515411, "epoch": 0.7957057151878749, "step": 630 }, { "loss": 1.1397, "grad_norm": 0.6438832879066467, "learning_rate": 6.152343750000001e-06, "entropy": 1.1658748656511306, "num_tokens": 3868549.0, "mean_token_accuracy": 0.7471553102135658, "epoch": 0.8083359646353016, "step": 640 }, { "loss": 1.1434, "grad_norm": 0.6389635801315308, "learning_rate": 6.087239583333335e-06, "entropy": 1.1782082825899125, "num_tokens": 3929057.0, "mean_token_accuracy": 0.7477709770202636, "epoch": 0.8209662140827282, "step": 650 }, { "loss": 1.1352, "grad_norm": 0.6134201288223267, "learning_rate": 6.022135416666667e-06, "entropy": 1.1625961899757384, "num_tokens": 3990676.0, "mean_token_accuracy": 0.748055274784565, "epoch": 0.8335964635301547, "step": 660 }, { "loss": 1.1182, "grad_norm": 0.6336613893508911, "learning_rate": 5.95703125e-06, "entropy": 1.1510928481817246, "num_tokens": 4051046.0, "mean_token_accuracy": 0.7524245917797089, "epoch": 0.8462267129775813, "step": 670 }, { "loss": 1.1186, "grad_norm": 0.6758144497871399, "learning_rate": 5.891927083333334e-06, "entropy": 1.1498646020889283, "num_tokens": 4111084.0, "mean_token_accuracy": 0.7507978692650795, "epoch": 0.8588569624250079, "step": 680 }, { "loss": 1.1395, "grad_norm": 0.6285990476608276, "learning_rate": 5.826822916666667e-06, "entropy": 1.167962297797203, "num_tokens": 4172628.0, "mean_token_accuracy": 0.7476246923208236, "epoch": 0.8714872118724345, "step": 690 }, { "loss": 1.0919, "grad_norm": 0.64762282371521, "learning_rate": 5.761718750000001e-06, "entropy": 1.1178194358944893, "num_tokens": 4231821.0, "mean_token_accuracy": 0.7569874793291091, "epoch": 0.884117461319861, "step": 700 }, { "loss": 1.1354, "grad_norm": 0.6292758584022522, "learning_rate": 5.6966145833333344e-06, "entropy": 1.1606462925672532, "num_tokens": 4292646.0, "mean_token_accuracy": 0.750880953669548, "epoch": 0.8967477107672877, "step": 710 }, { "loss": 1.1205, "grad_norm": 0.6393706798553467, "learning_rate": 5.6315104166666665e-06, "entropy": 1.1580617666244506, "num_tokens": 4353199.0, "mean_token_accuracy": 0.7499566927552224, "epoch": 0.9093779602147143, "step": 720 }, { "loss": 1.1138, "grad_norm": 0.687380313873291, "learning_rate": 5.56640625e-06, "entropy": 1.1515695974230766, "num_tokens": 4414122.0, "mean_token_accuracy": 0.7514134287834168, "epoch": 0.9220082096621408, "step": 730 }, { "loss": 1.1302, "grad_norm": 0.6102684736251831, "learning_rate": 5.501302083333334e-06, "entropy": 1.1574165880680085, "num_tokens": 4474548.0, "mean_token_accuracy": 0.7507740229368209, "epoch": 0.9346384591095674, "step": 740 }, { "loss": 1.129, "grad_norm": 0.623504638671875, "learning_rate": 5.436197916666667e-06, "entropy": 1.1491190433502196, "num_tokens": 4534678.0, "mean_token_accuracy": 0.7512574091553688, "epoch": 0.947268708556994, "step": 750 }, { "loss": 1.1181, "grad_norm": 0.6368807554244995, "learning_rate": 5.3710937500000005e-06, "entropy": 1.1538215219974517, "num_tokens": 4594878.0, "mean_token_accuracy": 0.7520082175731659, "epoch": 0.9598989580044206, "step": 760 }, { "loss": 1.1308, "grad_norm": 0.6332852840423584, "learning_rate": 5.305989583333334e-06, "entropy": 1.1623035803437234, "num_tokens": 4656513.0, "mean_token_accuracy": 0.7497873172163964, "epoch": 0.9725292074518471, "step": 770 }, { "loss": 1.1142, "grad_norm": 0.6341389417648315, "learning_rate": 5.240885416666667e-06, "entropy": 1.1483627527952194, "num_tokens": 4717111.0, "mean_token_accuracy": 0.7533516198396683, "epoch": 0.9851594568992738, "step": 780 }, { "loss": 1.1117, "grad_norm": 0.6641396880149841, "learning_rate": 5.17578125e-06, "entropy": 1.1455359414219857, "num_tokens": 4777713.0, "mean_token_accuracy": 0.7530950620770455, "epoch": 0.9977897063467004, "step": 790 }, { "loss": 1.1146, "grad_norm": 0.6454346776008606, "learning_rate": 5.110677083333334e-06, "entropy": 1.148778918461922, "num_tokens": 4837103.0, "mean_token_accuracy": 0.7511914097345792, "epoch": 1.0101041995579412, "step": 800 }, { "loss": 1.1003, "grad_norm": 0.6368332505226135, "learning_rate": 5.045572916666667e-06, "entropy": 1.1441998034715652, "num_tokens": 4898715.0, "mean_token_accuracy": 0.7535203993320465, "epoch": 1.0227344490053678, "step": 810 }, { "loss": 1.0924, "grad_norm": 0.6546683311462402, "learning_rate": 4.98046875e-06, "entropy": 1.1195117503404617, "num_tokens": 4959681.0, "mean_token_accuracy": 0.7574156150221825, "epoch": 1.0353646984527944, "step": 820 }, { "loss": 1.1031, "grad_norm": 0.6645976305007935, "learning_rate": 4.915364583333333e-06, "entropy": 1.1403603315353394, "num_tokens": 5020382.0, "mean_token_accuracy": 0.7548869714140892, "epoch": 1.047994947900221, "step": 830 }, { "loss": 1.0915, "grad_norm": 0.6225126385688782, "learning_rate": 4.850260416666667e-06, "entropy": 1.1299657106399537, "num_tokens": 5080360.0, "mean_token_accuracy": 0.7562400087714195, "epoch": 1.0606251973476477, "step": 840 }, { "loss": 1.1064, "grad_norm": 0.6478942036628723, "learning_rate": 4.785156250000001e-06, "entropy": 1.12370226085186, "num_tokens": 5140349.0, "mean_token_accuracy": 0.7542634457349777, "epoch": 1.0732554467950741, "step": 850 }, { "loss": 1.1043, "grad_norm": 0.615678608417511, "learning_rate": 4.7200520833333336e-06, "entropy": 1.1469928681850434, "num_tokens": 5201690.0, "mean_token_accuracy": 0.7529336720705032, "epoch": 1.0858856962425008, "step": 860 }, { "loss": 1.1081, "grad_norm": 0.6458525061607361, "learning_rate": 4.654947916666667e-06, "entropy": 1.137891921401024, "num_tokens": 5261698.0, "mean_token_accuracy": 0.7543051362037658, "epoch": 1.0985159456899274, "step": 870 }, { "loss": 1.0951, "grad_norm": 0.6362131237983704, "learning_rate": 4.58984375e-06, "entropy": 1.1202880129218102, "num_tokens": 5321775.0, "mean_token_accuracy": 0.7552427321672439, "epoch": 1.111146195137354, "step": 880 }, { "loss": 1.0961, "grad_norm": 0.6511764526367188, "learning_rate": 4.524739583333334e-06, "entropy": 1.1365787714719773, "num_tokens": 5383140.0, "mean_token_accuracy": 0.7562274217605591, "epoch": 1.1237764445847804, "step": 890 }, { "loss": 1.0848, "grad_norm": 0.6207822561264038, "learning_rate": 4.459635416666668e-06, "entropy": 1.1074503496289254, "num_tokens": 5443006.0, "mean_token_accuracy": 0.7591574639081955, "epoch": 1.136406694032207, "step": 900 }, { "loss": 1.1121, "grad_norm": 0.6404831409454346, "learning_rate": 4.3945312500000005e-06, "entropy": 1.1545074522495269, "num_tokens": 5503942.0, "mean_token_accuracy": 0.7507721096277237, "epoch": 1.1490369434796337, "step": 910 }, { "loss": 1.1011, "grad_norm": 0.6468749046325684, "learning_rate": 4.329427083333333e-06, "entropy": 1.1401477769017219, "num_tokens": 5564518.0, "mean_token_accuracy": 0.753543746471405, "epoch": 1.1616671929270603, "step": 920 }, { "loss": 1.0614, "grad_norm": 0.6418051719665527, "learning_rate": 4.264322916666667e-06, "entropy": 1.0945423126220704, "num_tokens": 5624109.0, "mean_token_accuracy": 0.7643799662590027, "epoch": 1.174297442374487, "step": 930 }, { "loss": 1.0974, "grad_norm": 0.6422064304351807, "learning_rate": 4.19921875e-06, "entropy": 1.1136713281273842, "num_tokens": 5684801.0, "mean_token_accuracy": 0.7561314895749092, "epoch": 1.1869276918219134, "step": 940 }, { "loss": 1.0801, "grad_norm": 0.6453995108604431, "learning_rate": 4.134114583333334e-06, "entropy": 1.1215770334005355, "num_tokens": 5745499.0, "mean_token_accuracy": 0.7590720430016518, "epoch": 1.19955794126934, "step": 950 }, { "loss": 1.049, "grad_norm": 0.61696857213974, "learning_rate": 4.0690104166666675e-06, "entropy": 1.1010483756661416, "num_tokens": 5806117.0, "mean_token_accuracy": 0.7627070844173431, "epoch": 1.2121881907167666, "step": 960 }, { "loss": 1.0807, "grad_norm": 0.6523500680923462, "learning_rate": 4.00390625e-06, "entropy": 1.1082940384745599, "num_tokens": 5865537.0, "mean_token_accuracy": 0.7579552844166756, "epoch": 1.2248184401641933, "step": 970 }, { "loss": 1.0679, "grad_norm": 0.6376118063926697, "learning_rate": 3.938802083333333e-06, "entropy": 1.102595229446888, "num_tokens": 5925254.0, "mean_token_accuracy": 0.7592279806733131, "epoch": 1.23744868961162, "step": 980 }, { "loss": 1.0888, "grad_norm": 0.6571747660636902, "learning_rate": 3.873697916666667e-06, "entropy": 1.1277900233864784, "num_tokens": 5986084.0, "mean_token_accuracy": 0.7549166217446327, "epoch": 1.2500789390590463, "step": 990 }, { "loss": 1.0718, "grad_norm": 0.6531611084938049, "learning_rate": 3.8085937500000002e-06, "entropy": 1.113915103673935, "num_tokens": 6046857.0, "mean_token_accuracy": 0.7577856734395028, "epoch": 1.262709188506473, "step": 1000 }, { "loss": 1.0699, "grad_norm": 0.636698842048645, "learning_rate": 3.7434895833333336e-06, "entropy": 1.0966202467679977, "num_tokens": 6106886.0, "mean_token_accuracy": 0.7601938605308532, "epoch": 1.2753394379538996, "step": 1010 }, { "loss": 1.0851, "grad_norm": 0.6492161750793457, "learning_rate": 3.6783854166666673e-06, "entropy": 1.1121985822916032, "num_tokens": 6167935.0, "mean_token_accuracy": 0.7588792949914932, "epoch": 1.2879696874013262, "step": 1020 }, { "loss": 1.094, "grad_norm": 0.6697131395339966, "learning_rate": 3.61328125e-06, "entropy": 1.1355163961648942, "num_tokens": 6228870.0, "mean_token_accuracy": 0.754327917098999, "epoch": 1.3005999368487529, "step": 1030 }, { "loss": 1.0893, "grad_norm": 0.6773020625114441, "learning_rate": 3.5481770833333335e-06, "entropy": 1.11816665828228, "num_tokens": 6288847.0, "mean_token_accuracy": 0.7571294933557511, "epoch": 1.3132301862961793, "step": 1040 }, { "loss": 1.0875, "grad_norm": 0.6566488146781921, "learning_rate": 3.483072916666667e-06, "entropy": 1.1343947052955627, "num_tokens": 6350161.0, "mean_token_accuracy": 0.755756102502346, "epoch": 1.325860435743606, "step": 1050 }, { "loss": 1.0782, "grad_norm": 0.6575057506561279, "learning_rate": 3.41796875e-06, "entropy": 1.1109364911913873, "num_tokens": 6410972.0, "mean_token_accuracy": 0.7591001376509666, "epoch": 1.3384906851910325, "step": 1060 }, { "loss": 1.0901, "grad_norm": 0.6655089259147644, "learning_rate": 3.3528645833333334e-06, "entropy": 1.1165167808532714, "num_tokens": 6471984.0, "mean_token_accuracy": 0.7573199763894081, "epoch": 1.3511209346384592, "step": 1070 }, { "loss": 1.0716, "grad_norm": 0.6363748908042908, "learning_rate": 3.287760416666667e-06, "entropy": 1.1066906094551086, "num_tokens": 6532514.0, "mean_token_accuracy": 0.7598252177238465, "epoch": 1.3637511840858858, "step": 1080 }, { "loss": 1.0823, "grad_norm": 0.6684281826019287, "learning_rate": 3.2226562500000004e-06, "entropy": 1.1047193810343743, "num_tokens": 6592949.0, "mean_token_accuracy": 0.7593759268522262, "epoch": 1.3763814335333122, "step": 1090 }, { "loss": 1.1031, "grad_norm": 0.6439023017883301, "learning_rate": 3.1575520833333333e-06, "entropy": 1.1348285049200058, "num_tokens": 6654231.0, "mean_token_accuracy": 0.7526842474937439, "epoch": 1.3890116829807388, "step": 1100 }, { "loss": 1.0799, "grad_norm": 0.6556984186172485, "learning_rate": 3.092447916666667e-06, "entropy": 1.1191302105784415, "num_tokens": 6714430.0, "mean_token_accuracy": 0.7590983435511589, "epoch": 1.4016419324281655, "step": 1110 }, { "loss": 1.0614, "grad_norm": 0.6618829965591431, "learning_rate": 3.0273437500000003e-06, "entropy": 1.093433029949665, "num_tokens": 6774176.0, "mean_token_accuracy": 0.7611085593700408, "epoch": 1.4142721818755921, "step": 1120 }, { "loss": 1.0939, "grad_norm": 0.6382298469543457, "learning_rate": 2.962239583333333e-06, "entropy": 1.135184645652771, "num_tokens": 6836522.0, "mean_token_accuracy": 0.7532851651310921, "epoch": 1.4269024313230187, "step": 1130 }, { "loss": 1.0709, "grad_norm": 0.6382166147232056, "learning_rate": 2.897135416666667e-06, "entropy": 1.1093149304389953, "num_tokens": 6896353.0, "mean_token_accuracy": 0.7608326107263566, "epoch": 1.4395326807704452, "step": 1140 }, { "loss": 1.0738, "grad_norm": 0.6356373429298401, "learning_rate": 2.8320312500000002e-06, "entropy": 1.1047044202685357, "num_tokens": 6956828.0, "mean_token_accuracy": 0.7615469440817833, "epoch": 1.4521629302178718, "step": 1150 }, { "loss": 1.0589, "grad_norm": 0.6593008041381836, "learning_rate": 2.7669270833333335e-06, "entropy": 1.1073317646980285, "num_tokens": 7017026.0, "mean_token_accuracy": 0.7599197804927826, "epoch": 1.4647931796652984, "step": 1160 }, { "loss": 1.0584, "grad_norm": 0.6466282606124878, "learning_rate": 2.7018229166666673e-06, "entropy": 1.0851576775312424, "num_tokens": 7076806.0, "mean_token_accuracy": 0.7626572713255882, "epoch": 1.4774234291127248, "step": 1170 }, { "loss": 1.0753, "grad_norm": 0.6285493969917297, "learning_rate": 2.63671875e-06, "entropy": 1.1103300124406814, "num_tokens": 7137946.0, "mean_token_accuracy": 0.7593718692660332, "epoch": 1.4900536785601517, "step": 1180 }, { "loss": 1.0642, "grad_norm": 0.6664257645606995, "learning_rate": 2.5716145833333334e-06, "entropy": 1.1066975593566895, "num_tokens": 7200103.0, "mean_token_accuracy": 0.7612839996814728, "epoch": 1.502683928007578, "step": 1190 }, { "loss": 1.0726, "grad_norm": 0.683022141456604, "learning_rate": 2.506510416666667e-06, "entropy": 1.0994308680295943, "num_tokens": 7259051.0, "mean_token_accuracy": 0.7611020535230637, "epoch": 1.5153141774550047, "step": 1200 }, { "loss": 1.0758, "grad_norm": 0.6556797623634338, "learning_rate": 2.44140625e-06, "entropy": 1.1130555748939515, "num_tokens": 7318904.0, "mean_token_accuracy": 0.7601210430264473, "epoch": 1.5279444269024314, "step": 1210 }, { "loss": 1.0691, "grad_norm": 0.6336252689361572, "learning_rate": 2.3763020833333338e-06, "entropy": 1.112100276350975, "num_tokens": 7378611.0, "mean_token_accuracy": 0.7613141894340515, "epoch": 1.5405746763498578, "step": 1220 }, { "loss": 1.074, "grad_norm": 0.6907696723937988, "learning_rate": 2.3111979166666667e-06, "entropy": 1.1018309980630874, "num_tokens": 7438960.0, "mean_token_accuracy": 0.7605119064450264, "epoch": 1.5532049257972846, "step": 1230 }, { "loss": 1.0413, "grad_norm": 0.6463876962661743, "learning_rate": 2.2460937500000004e-06, "entropy": 1.071268692612648, "num_tokens": 7497275.0, "mean_token_accuracy": 0.7670892596244812, "epoch": 1.565835175244711, "step": 1240 }, { "loss": 1.092, "grad_norm": 0.6366226077079773, "learning_rate": 2.1809895833333337e-06, "entropy": 1.134592017531395, "num_tokens": 7558285.0, "mean_token_accuracy": 0.7548690542578698, "epoch": 1.5784654246921377, "step": 1250 }, { "loss": 1.0752, "grad_norm": 0.6590870022773743, "learning_rate": 2.1158854166666666e-06, "entropy": 1.1141762882471085, "num_tokens": 7620287.0, "mean_token_accuracy": 0.7603784337639808, "epoch": 1.5910956741395643, "step": 1260 }, { "loss": 1.0593, "grad_norm": 0.656830370426178, "learning_rate": 2.0507812500000003e-06, "entropy": 1.09154414832592, "num_tokens": 7681170.0, "mean_token_accuracy": 0.76341772377491, "epoch": 1.6037259235869907, "step": 1270 }, { "loss": 1.0724, "grad_norm": 0.6511245965957642, "learning_rate": 1.9856770833333336e-06, "entropy": 1.10728869587183, "num_tokens": 7741125.0, "mean_token_accuracy": 0.7592613711953163, "epoch": 1.6163561730344176, "step": 1280 }, { "loss": 1.0678, "grad_norm": 0.648682713508606, "learning_rate": 1.920572916666667e-06, "entropy": 1.1026839420199395, "num_tokens": 7801002.0, "mean_token_accuracy": 0.7615165829658508, "epoch": 1.628986422481844, "step": 1290 }, { "loss": 1.0662, "grad_norm": 0.6691455245018005, "learning_rate": 1.8554687500000002e-06, "entropy": 1.113681361079216, "num_tokens": 7861077.0, "mean_token_accuracy": 0.76031324416399, "epoch": 1.6416166719292706, "step": 1300 }, { "loss": 1.0564, "grad_norm": 0.6221432685852051, "learning_rate": 1.7903645833333335e-06, "entropy": 1.0854344859719276, "num_tokens": 7920955.0, "mean_token_accuracy": 0.7638715595006943, "epoch": 1.6542469213766973, "step": 1310 }, { "loss": 1.0803, "grad_norm": 0.6376025080680847, "learning_rate": 1.7252604166666668e-06, "entropy": 1.1128123462200166, "num_tokens": 7981933.0, "mean_token_accuracy": 0.7579856783151626, "epoch": 1.6668771708241237, "step": 1320 }, { "loss": 1.0758, "grad_norm": 0.712565541267395, "learning_rate": 1.6601562500000001e-06, "entropy": 1.105194841325283, "num_tokens": 8042084.0, "mean_token_accuracy": 0.7577270165085792, "epoch": 1.6795074202715503, "step": 1330 }, { "loss": 1.0731, "grad_norm": 0.6228471994400024, "learning_rate": 1.5950520833333336e-06, "entropy": 1.1065697744488716, "num_tokens": 8102976.0, "mean_token_accuracy": 0.7590463057160377, "epoch": 1.692137669718977, "step": 1340 }, { "loss": 1.0708, "grad_norm": 0.6447433829307556, "learning_rate": 1.5299479166666667e-06, "entropy": 1.1064435616135597, "num_tokens": 8163206.0, "mean_token_accuracy": 0.7608707517385482, "epoch": 1.7047679191664036, "step": 1350 }, { "loss": 1.0606, "grad_norm": 0.6594550609588623, "learning_rate": 1.46484375e-06, "entropy": 1.1046179130673408, "num_tokens": 8224905.0, "mean_token_accuracy": 0.7627649754285812, "epoch": 1.7173981686138302, "step": 1360 }, { "loss": 1.0741, "grad_norm": 0.6550594568252563, "learning_rate": 1.3997395833333335e-06, "entropy": 1.1129515051841736, "num_tokens": 8286587.0, "mean_token_accuracy": 0.7577028945088387, "epoch": 1.7300284180612566, "step": 1370 }, { "loss": 1.0597, "grad_norm": 0.6420894265174866, "learning_rate": 1.3346354166666666e-06, "entropy": 1.094475807249546, "num_tokens": 8346283.0, "mean_token_accuracy": 0.7612502560019493, "epoch": 1.7426586675086833, "step": 1380 }, { "loss": 1.0705, "grad_norm": 0.6647622585296631, "learning_rate": 1.2695312500000002e-06, "entropy": 1.1026990562677383, "num_tokens": 8406590.0, "mean_token_accuracy": 0.7592111378908157, "epoch": 1.75528891695611, "step": 1390 }, { "loss": 1.0783, "grad_norm": 0.6625591516494751, "learning_rate": 1.2044270833333335e-06, "entropy": 1.1024970307946205, "num_tokens": 8467230.0, "mean_token_accuracy": 0.756389918923378, "epoch": 1.7679191664035365, "step": 1400 }, { "loss": 1.0869, "grad_norm": 0.6827495098114014, "learning_rate": 1.1393229166666668e-06, "entropy": 1.1201951175928115, "num_tokens": 8527820.0, "mean_token_accuracy": 0.756199948489666, "epoch": 1.7805494158509632, "step": 1410 }, { "loss": 1.0638, "grad_norm": 0.6496292948722839, "learning_rate": 1.07421875e-06, "entropy": 1.1082668006420135, "num_tokens": 8587544.0, "mean_token_accuracy": 0.7621515318751335, "epoch": 1.7931796652983896, "step": 1420 }, { "loss": 1.0587, "grad_norm": 0.6577737927436829, "learning_rate": 1.0091145833333334e-06, "entropy": 1.0907854005694388, "num_tokens": 8647424.0, "mean_token_accuracy": 0.7625794589519501, "epoch": 1.8058099147458162, "step": 1430 }, { "loss": 1.0648, "grad_norm": 0.6546240448951721, "learning_rate": 9.440104166666668e-07, "entropy": 1.0947823762893676, "num_tokens": 8706635.0, "mean_token_accuracy": 0.7618604898452759, "epoch": 1.8184401641932428, "step": 1440 }, { "loss": 1.0615, "grad_norm": 0.6349791884422302, "learning_rate": 8.789062500000001e-07, "entropy": 1.1016521960496903, "num_tokens": 8766624.0, "mean_token_accuracy": 0.7619734451174736, "epoch": 1.8310704136406692, "step": 1450 }, { "loss": 1.079, "grad_norm": 0.6273230314254761, "learning_rate": 8.138020833333334e-07, "entropy": 1.110970026254654, "num_tokens": 8826556.0, "mean_token_accuracy": 0.7600797146558762, "epoch": 1.843700663088096, "step": 1460 }, { "loss": 1.0695, "grad_norm": 0.6720101833343506, "learning_rate": 7.486979166666668e-07, "entropy": 1.1075817868113518, "num_tokens": 8887460.0, "mean_token_accuracy": 0.759764339029789, "epoch": 1.8563309125355225, "step": 1470 }, { "loss": 1.0658, "grad_norm": 0.6578065752983093, "learning_rate": 6.835937500000001e-07, "entropy": 1.0957570180296898, "num_tokens": 8947077.0, "mean_token_accuracy": 0.7630386680364609, "epoch": 1.8689611619829491, "step": 1480 }, { "loss": 1.062, "grad_norm": 0.6177386045455933, "learning_rate": 6.184895833333334e-07, "entropy": 1.1054737836122512, "num_tokens": 9008717.0, "mean_token_accuracy": 0.7603132933378219, "epoch": 1.8815914114303758, "step": 1490 }, { "loss": 1.0706, "grad_norm": 0.6226282119750977, "learning_rate": 5.533854166666667e-07, "entropy": 1.1026621460914612, "num_tokens": 9068623.0, "mean_token_accuracy": 0.7603669881820678, "epoch": 1.8942216608778022, "step": 1500 }, { "loss": 1.0583, "grad_norm": 0.6504780650138855, "learning_rate": 4.8828125e-07, "entropy": 1.1010279595851897, "num_tokens": 9129086.0, "mean_token_accuracy": 0.7632956698536872, "epoch": 1.906851910325229, "step": 1510 }, { "loss": 1.0698, "grad_norm": 0.667875349521637, "learning_rate": 4.2317708333333337e-07, "entropy": 1.1021641314029693, "num_tokens": 9189845.0, "mean_token_accuracy": 0.7609776973724365, "epoch": 1.9194821597726555, "step": 1520 }, { "loss": 1.0541, "grad_norm": 0.650221586227417, "learning_rate": 3.5807291666666667e-07, "entropy": 1.0909265503287315, "num_tokens": 9250808.0, "mean_token_accuracy": 0.7616324663162232, "epoch": 1.932112409220082, "step": 1530 }, { "loss": 1.0821, "grad_norm": 0.6560048460960388, "learning_rate": 2.9296875000000003e-07, "entropy": 1.1204875528812408, "num_tokens": 9312275.0, "mean_token_accuracy": 0.7588548183441162, "epoch": 1.9447426586675087, "step": 1540 }, { "loss": 1.058, "grad_norm": 0.6481816172599792, "learning_rate": 2.2786458333333333e-07, "entropy": 1.0873224779963493, "num_tokens": 9372216.0, "mean_token_accuracy": 0.762654073536396, "epoch": 1.9573729081149351, "step": 1550 }, { "loss": 1.07, "grad_norm": 0.6645349264144897, "learning_rate": 1.627604166666667e-07, "entropy": 1.1072645708918571, "num_tokens": 9431986.0, "mean_token_accuracy": 0.7614389002323151, "epoch": 1.970003157562362, "step": 1560 }, { "loss": 1.0754, "grad_norm": 0.6455146670341492, "learning_rate": 9.765625e-08, "entropy": 1.1167670994997025, "num_tokens": 9492961.0, "mean_token_accuracy": 0.7594234853982925, "epoch": 1.9826334070097884, "step": 1570 }, { "loss": 1.0718, "grad_norm": 0.6205505132675171, "learning_rate": 3.2552083333333335e-08, "entropy": 1.104009985923767, "num_tokens": 9553806.0, "mean_token_accuracy": 0.7597839057445526, "epoch": 1.995263656457215, "step": 1580 }, { "train_runtime": 22826.5387, "train_samples_per_second": 2.22, "train_steps_per_second": 0.069, "total_flos": 5.4237850982977536e+17, "train_loss": 1.193551750797214, "entropy": 1.1039235631624857, "num_tokens": 9576420.0, "mean_token_accuracy": 0.7573710878690084, "epoch": 2.0, "step": 1584 } ], "best_metric": null, "best_model_checkpoint": null, "global_step": 1584, "num_train_epochs": 2 }