{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1623, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0030807147258163892, "grad_norm": 1.2150330543518066, "learning_rate": 9.756097560975611e-06, "loss": 1.1177, "step": 5 }, { "epoch": 0.0061614294516327784, "grad_norm": 0.9305108189582825, "learning_rate": 2.1951219512195124e-05, "loss": 1.1874, "step": 10 }, { "epoch": 0.009242144177449169, "grad_norm": 0.47463372349739075, "learning_rate": 3.414634146341464e-05, "loss": 1.1992, "step": 15 }, { "epoch": 0.012322858903265557, "grad_norm": 0.5908312201499939, "learning_rate": 4.634146341463415e-05, "loss": 1.0286, "step": 20 }, { "epoch": 0.015403573629081947, "grad_norm": 4.2904276847839355, "learning_rate": 5.853658536585366e-05, "loss": 0.9678, "step": 25 }, { "epoch": 0.018484288354898338, "grad_norm": 0.4929400086402893, "learning_rate": 7.073170731707317e-05, "loss": 0.9721, "step": 30 }, { "epoch": 0.021565003080714726, "grad_norm": 0.48444443941116333, "learning_rate": 8.292682926829268e-05, "loss": 1.0496, "step": 35 }, { "epoch": 0.024645717806531114, "grad_norm": 0.3187097907066345, "learning_rate": 9.51219512195122e-05, "loss": 0.9946, "step": 40 }, { "epoch": 0.027726432532347505, "grad_norm": 0.6586086750030518, "learning_rate": 0.00010731707317073172, "loss": 0.9008, "step": 45 }, { "epoch": 0.030807147258163893, "grad_norm": 0.8100060820579529, "learning_rate": 0.00011951219512195122, "loss": 0.8526, "step": 50 }, { "epoch": 0.033887861983980284, "grad_norm": 0.3796943724155426, "learning_rate": 0.00013170731707317076, "loss": 0.9529, "step": 55 }, { "epoch": 0.036968576709796676, "grad_norm": 0.3700108230113983, "learning_rate": 0.00014390243902439025, "loss": 0.9401, "step": 60 }, { "epoch": 0.04004929143561306, "grad_norm": 0.30697202682495117, "learning_rate": 0.00015609756097560978, "loss": 0.9047, "step": 65 }, { "epoch": 0.04313000616142945, "grad_norm": 0.4444568455219269, "learning_rate": 0.00016829268292682927, "loss": 0.9589, "step": 70 }, { "epoch": 0.04621072088724584, "grad_norm": 0.5713182687759399, "learning_rate": 0.0001804878048780488, "loss": 0.797, "step": 75 }, { "epoch": 0.04929143561306223, "grad_norm": 0.3340379297733307, "learning_rate": 0.0001926829268292683, "loss": 0.8844, "step": 80 }, { "epoch": 0.05237215033887862, "grad_norm": 0.30011042952537537, "learning_rate": 0.0001999991687649223, "loss": 0.9395, "step": 85 }, { "epoch": 0.05545286506469501, "grad_norm": 0.2434113323688507, "learning_rate": 0.00019998981752900036, "loss": 0.9361, "step": 90 }, { "epoch": 0.0585335797905114, "grad_norm": 0.4201102554798126, "learning_rate": 0.00019997007698817557, "loss": 0.906, "step": 95 }, { "epoch": 0.061614294516327786, "grad_norm": 0.5135448575019836, "learning_rate": 0.00019993994919356167, "loss": 0.8074, "step": 100 }, { "epoch": 0.06469500924214418, "grad_norm": 0.2790917456150055, "learning_rate": 0.00019989943727554598, "loss": 0.8454, "step": 105 }, { "epoch": 0.06777572396796057, "grad_norm": 0.3034830391407013, "learning_rate": 0.00019984854544346367, "loss": 0.8807, "step": 110 }, { "epoch": 0.07085643869377696, "grad_norm": 0.27664294838905334, "learning_rate": 0.00019978727898516086, "loss": 0.9583, "step": 115 }, { "epoch": 0.07393715341959335, "grad_norm": 0.42292487621307373, "learning_rate": 0.0001997156442664449, "loss": 0.7675, "step": 120 }, { "epoch": 0.07701786814540973, "grad_norm": 0.5137152075767517, "learning_rate": 0.00019963364873042298, "loss": 0.7801, "step": 125 }, { "epoch": 0.08009858287122612, "grad_norm": 0.25252246856689453, "learning_rate": 0.0001995413008967289, "loss": 0.9171, "step": 130 }, { "epoch": 0.08317929759704251, "grad_norm": 0.2585853636264801, "learning_rate": 0.00019943861036063768, "loss": 0.865, "step": 135 }, { "epoch": 0.0862600123228589, "grad_norm": 0.2344920039176941, "learning_rate": 0.00019932558779206874, "loss": 0.8805, "step": 140 }, { "epoch": 0.0893407270486753, "grad_norm": 0.38098931312561035, "learning_rate": 0.00019920224493447702, "loss": 0.884, "step": 145 }, { "epoch": 0.09242144177449169, "grad_norm": 0.5012897253036499, "learning_rate": 0.00019906859460363307, "loss": 0.7584, "step": 150 }, { "epoch": 0.09550215650030808, "grad_norm": 0.289332777261734, "learning_rate": 0.00019892465068629131, "loss": 0.838, "step": 155 }, { "epoch": 0.09858287122612445, "grad_norm": 0.33769166469573975, "learning_rate": 0.0001987704281387471, "loss": 0.971, "step": 160 }, { "epoch": 0.10166358595194085, "grad_norm": 0.2457900047302246, "learning_rate": 0.00019860594298528282, "loss": 0.8543, "step": 165 }, { "epoch": 0.10474430067775724, "grad_norm": 0.42815569043159485, "learning_rate": 0.0001984312123165028, "loss": 0.9284, "step": 170 }, { "epoch": 0.10782501540357363, "grad_norm": 0.540185809135437, "learning_rate": 0.0001982462542875576, "loss": 0.7112, "step": 175 }, { "epoch": 0.11090573012939002, "grad_norm": 0.23496200144290924, "learning_rate": 0.00019805108811625773, "loss": 0.8433, "step": 180 }, { "epoch": 0.11398644485520641, "grad_norm": 0.2658989429473877, "learning_rate": 0.00019784573408107657, "loss": 0.8989, "step": 185 }, { "epoch": 0.1170671595810228, "grad_norm": 0.2345050424337387, "learning_rate": 0.00019763021351904358, "loss": 0.8656, "step": 190 }, { "epoch": 0.12014787430683918, "grad_norm": 0.38727903366088867, "learning_rate": 0.00019740454882352732, "loss": 0.8039, "step": 195 }, { "epoch": 0.12322858903265557, "grad_norm": 0.5198805928230286, "learning_rate": 0.0001971687634419086, "loss": 0.748, "step": 200 }, { "epoch": 0.12630930375847196, "grad_norm": 0.3000946342945099, "learning_rate": 0.0001969228818731442, "loss": 0.8237, "step": 205 }, { "epoch": 0.12939001848428835, "grad_norm": 0.2512762248516083, "learning_rate": 0.00019666692966522145, "loss": 0.8571, "step": 210 }, { "epoch": 0.13247073321010475, "grad_norm": 0.2670898735523224, "learning_rate": 0.00019640093341250357, "loss": 0.8693, "step": 215 }, { "epoch": 0.13555144793592114, "grad_norm": 0.38732317090034485, "learning_rate": 0.0001961249207529665, "loss": 0.8316, "step": 220 }, { "epoch": 0.13863216266173753, "grad_norm": 0.535698652267456, "learning_rate": 0.00019583892036532726, "loss": 0.776, "step": 225 }, { "epoch": 0.14171287738755392, "grad_norm": 0.2649231553077698, "learning_rate": 0.00019554296196606395, "loss": 0.8533, "step": 230 }, { "epoch": 0.1447935921133703, "grad_norm": 0.29415225982666016, "learning_rate": 0.00019523707630632835, "loss": 0.8079, "step": 235 }, { "epoch": 0.1478743068391867, "grad_norm": 0.24607762694358826, "learning_rate": 0.00019492129516875055, "loss": 0.8944, "step": 240 }, { "epoch": 0.15095502156500307, "grad_norm": 0.40626609325408936, "learning_rate": 0.00019459565136413666, "loss": 0.8825, "step": 245 }, { "epoch": 0.15403573629081946, "grad_norm": 0.4869477152824402, "learning_rate": 0.0001942601787280598, "loss": 0.745, "step": 250 }, { "epoch": 0.15711645101663585, "grad_norm": 0.3302883803844452, "learning_rate": 0.00019391491211734425, "loss": 0.8672, "step": 255 }, { "epoch": 0.16019716574245224, "grad_norm": 0.22587017714977264, "learning_rate": 0.0001935598874064438, "loss": 0.8139, "step": 260 }, { "epoch": 0.16327788046826863, "grad_norm": 0.2447589784860611, "learning_rate": 0.00019319514148371435, "loss": 0.7876, "step": 265 }, { "epoch": 0.16635859519408502, "grad_norm": 0.4069214463233948, "learning_rate": 0.00019282071224758091, "loss": 0.7761, "step": 270 }, { "epoch": 0.16943930991990142, "grad_norm": 0.6559478640556335, "learning_rate": 0.00019243663860259993, "loss": 0.7823, "step": 275 }, { "epoch": 0.1725200246457178, "grad_norm": 0.2602563798427582, "learning_rate": 0.00019204296045541685, "loss": 0.8695, "step": 280 }, { "epoch": 0.1756007393715342, "grad_norm": 0.23746228218078613, "learning_rate": 0.0001916397187106199, "loss": 0.7707, "step": 285 }, { "epoch": 0.1786814540973506, "grad_norm": 0.21985304355621338, "learning_rate": 0.00019122695526648968, "loss": 0.8086, "step": 290 }, { "epoch": 0.18176216882316698, "grad_norm": 0.40395426750183105, "learning_rate": 0.00019080471301064598, "loss": 0.8641, "step": 295 }, { "epoch": 0.18484288354898337, "grad_norm": 0.6175822615623474, "learning_rate": 0.00019037303581559143, "loss": 0.7143, "step": 300 }, { "epoch": 0.18792359827479976, "grad_norm": 0.2363148182630539, "learning_rate": 0.00018993196853415317, "loss": 0.8094, "step": 305 }, { "epoch": 0.19100431300061615, "grad_norm": 0.27667897939682007, "learning_rate": 0.00018948155699482244, "loss": 0.7817, "step": 310 }, { "epoch": 0.19408502772643252, "grad_norm": 0.29201361536979675, "learning_rate": 0.00018902184799699263, "loss": 0.9028, "step": 315 }, { "epoch": 0.1971657424522489, "grad_norm": 0.33440956473350525, "learning_rate": 0.00018855288930609692, "loss": 0.8056, "step": 320 }, { "epoch": 0.2002464571780653, "grad_norm": 0.42880240082740784, "learning_rate": 0.00018807472964864515, "loss": 0.7429, "step": 325 }, { "epoch": 0.2033271719038817, "grad_norm": 0.273947149515152, "learning_rate": 0.00018758741870716092, "loss": 0.897, "step": 330 }, { "epoch": 0.20640788662969808, "grad_norm": 0.22410909831523895, "learning_rate": 0.00018709100711501955, "loss": 0.8507, "step": 335 }, { "epoch": 0.20948860135551448, "grad_norm": 0.22884678840637207, "learning_rate": 0.0001865855464511869, "loss": 0.7909, "step": 340 }, { "epoch": 0.21256931608133087, "grad_norm": 0.3462975025177002, "learning_rate": 0.00018607108923486025, "loss": 0.6947, "step": 345 }, { "epoch": 0.21565003080714726, "grad_norm": 0.44200509786605835, "learning_rate": 0.00018554768892001136, "loss": 0.7239, "step": 350 }, { "epoch": 0.21873074553296365, "grad_norm": 0.3955911695957184, "learning_rate": 0.00018501539988983234, "loss": 0.7421, "step": 355 }, { "epoch": 0.22181146025878004, "grad_norm": 0.2413174957036972, "learning_rate": 0.0001844742774510851, "loss": 0.8316, "step": 360 }, { "epoch": 0.22489217498459643, "grad_norm": 0.19873136281967163, "learning_rate": 0.00018392437782835475, "loss": 0.8296, "step": 365 }, { "epoch": 0.22797288971041282, "grad_norm": 0.36676397919654846, "learning_rate": 0.00018336575815820766, "loss": 0.8001, "step": 370 }, { "epoch": 0.23105360443622922, "grad_norm": 0.5671842098236084, "learning_rate": 0.00018279847648325478, "loss": 0.7479, "step": 375 }, { "epoch": 0.2341343191620456, "grad_norm": 0.2957541048526764, "learning_rate": 0.0001822225917461208, "loss": 0.7542, "step": 380 }, { "epoch": 0.23721503388786197, "grad_norm": 0.29336631298065186, "learning_rate": 0.0001816381637833198, "loss": 0.8205, "step": 385 }, { "epoch": 0.24029574861367836, "grad_norm": 0.2481537014245987, "learning_rate": 0.00018104525331903799, "loss": 0.8362, "step": 390 }, { "epoch": 0.24337646333949475, "grad_norm": 0.41952452063560486, "learning_rate": 0.00018044392195882427, "loss": 0.9265, "step": 395 }, { "epoch": 0.24645717806531114, "grad_norm": 0.7947683334350586, "learning_rate": 0.00017983423218318918, "loss": 0.7255, "step": 400 }, { "epoch": 0.24953789279112754, "grad_norm": 0.4045758545398712, "learning_rate": 0.00017921624734111292, "loss": 0.8064, "step": 405 }, { "epoch": 0.2526186075169439, "grad_norm": 0.27069926261901855, "learning_rate": 0.00017859003164346336, "loss": 0.876, "step": 410 }, { "epoch": 0.2556993222427603, "grad_norm": 0.20166383683681488, "learning_rate": 0.0001779556501563239, "loss": 0.7996, "step": 415 }, { "epoch": 0.2587800369685767, "grad_norm": 0.36772456765174866, "learning_rate": 0.00017731316879423327, "loss": 0.8794, "step": 420 }, { "epoch": 0.2618607516943931, "grad_norm": 0.5478736758232117, "learning_rate": 0.00017666265431333654, "loss": 0.8172, "step": 425 }, { "epoch": 0.2649414664202095, "grad_norm": 0.2308587282896042, "learning_rate": 0.000176004174304449, "loss": 0.8298, "step": 430 }, { "epoch": 0.2680221811460259, "grad_norm": 0.26311734318733215, "learning_rate": 0.00017533779718603313, "loss": 0.8613, "step": 435 }, { "epoch": 0.2711028958718423, "grad_norm": 0.2165333479642868, "learning_rate": 0.00017466359219708985, "loss": 0.7564, "step": 440 }, { "epoch": 0.27418361059765867, "grad_norm": 0.30435317754745483, "learning_rate": 0.00017398162938996422, "loss": 0.7289, "step": 445 }, { "epoch": 0.27726432532347506, "grad_norm": 0.35832491517066956, "learning_rate": 0.00017329197962306664, "loss": 0.6967, "step": 450 }, { "epoch": 0.28034504004929145, "grad_norm": 0.25740063190460205, "learning_rate": 0.00017259471455351072, "loss": 0.7969, "step": 455 }, { "epoch": 0.28342575477510784, "grad_norm": 0.2562999129295349, "learning_rate": 0.0001718899066296675, "loss": 0.8194, "step": 460 }, { "epoch": 0.28650646950092423, "grad_norm": 0.24349433183670044, "learning_rate": 0.000171177629083638, "loss": 0.8007, "step": 465 }, { "epoch": 0.2895871842267406, "grad_norm": 0.36831149458885193, "learning_rate": 0.0001704579559236441, "loss": 0.8348, "step": 470 }, { "epoch": 0.292667898952557, "grad_norm": 0.510749876499176, "learning_rate": 0.00016973096192633884, "loss": 0.7043, "step": 475 }, { "epoch": 0.2957486136783734, "grad_norm": 0.28559786081314087, "learning_rate": 0.00016899672262903677, "loss": 0.7505, "step": 480 }, { "epoch": 0.2988293284041898, "grad_norm": 0.2619992792606354, "learning_rate": 0.00016825531432186543, "loss": 0.8856, "step": 485 }, { "epoch": 0.30191004313000613, "grad_norm": 0.21462175250053406, "learning_rate": 0.00016750681403983846, "loss": 0.913, "step": 490 }, { "epoch": 0.3049907578558225, "grad_norm": 0.4136449992656708, "learning_rate": 0.00016675129955485152, "loss": 0.7869, "step": 495 }, { "epoch": 0.3080714725816389, "grad_norm": 0.38793209195137024, "learning_rate": 0.00016598884936760131, "loss": 0.7076, "step": 500 }, { "epoch": 0.3111521873074553, "grad_norm": 0.24016061425209045, "learning_rate": 0.00016521954269942918, "loss": 0.8514, "step": 505 }, { "epoch": 0.3142329020332717, "grad_norm": 0.2508719563484192, "learning_rate": 0.00016444345948408984, "loss": 0.829, "step": 510 }, { "epoch": 0.3173136167590881, "grad_norm": 0.24618899822235107, "learning_rate": 0.0001636606803594457, "loss": 0.8197, "step": 515 }, { "epoch": 0.3203943314849045, "grad_norm": 0.4258035719394684, "learning_rate": 0.0001628712866590885, "loss": 0.7916, "step": 520 }, { "epoch": 0.3234750462107209, "grad_norm": 0.5021526217460632, "learning_rate": 0.00016207536040388845, "loss": 0.6987, "step": 525 }, { "epoch": 0.32655576093653726, "grad_norm": 0.2670142352581024, "learning_rate": 0.0001612729842934718, "loss": 0.8528, "step": 530 }, { "epoch": 0.32963647566235366, "grad_norm": 0.2767278552055359, "learning_rate": 0.00016046424169762827, "loss": 0.9161, "step": 535 }, { "epoch": 0.33271719038817005, "grad_norm": 0.20473924279212952, "learning_rate": 0.0001596492166476485, "loss": 0.8161, "step": 540 }, { "epoch": 0.33579790511398644, "grad_norm": 0.34235650300979614, "learning_rate": 0.0001588279938275929, "loss": 0.8034, "step": 545 }, { "epoch": 0.33887861983980283, "grad_norm": 0.4922890067100525, "learning_rate": 0.00015800065856549269, "loss": 0.6528, "step": 550 }, { "epoch": 0.3419593345656192, "grad_norm": 0.2635882496833801, "learning_rate": 0.00015716729682448393, "loss": 0.8168, "step": 555 }, { "epoch": 0.3450400492914356, "grad_norm": 0.25564834475517273, "learning_rate": 0.0001563279951938758, "loss": 0.8155, "step": 560 }, { "epoch": 0.348120764017252, "grad_norm": 0.2140437662601471, "learning_rate": 0.00015548284088015354, "loss": 0.7878, "step": 565 }, { "epoch": 0.3512014787430684, "grad_norm": 0.37995290756225586, "learning_rate": 0.00015463192169791741, "loss": 0.7966, "step": 570 }, { "epoch": 0.3542821934688848, "grad_norm": 0.4770684838294983, "learning_rate": 0.0001537753260607584, "loss": 0.7281, "step": 575 }, { "epoch": 0.3573629081947012, "grad_norm": 0.2325628101825714, "learning_rate": 0.00015291314297207175, "loss": 0.7927, "step": 580 }, { "epoch": 0.36044362292051757, "grad_norm": 0.2019563913345337, "learning_rate": 0.0001520454620158093, "loss": 0.9002, "step": 585 }, { "epoch": 0.36352433764633396, "grad_norm": 0.22624002397060394, "learning_rate": 0.00015117237334717117, "loss": 0.8, "step": 590 }, { "epoch": 0.36660505237215035, "grad_norm": 0.33231621980667114, "learning_rate": 0.00015029396768323846, "loss": 0.8996, "step": 595 }, { "epoch": 0.36968576709796674, "grad_norm": 0.4482916593551636, "learning_rate": 0.00014941033629354734, "loss": 0.7985, "step": 600 }, { "epoch": 0.37276648182378314, "grad_norm": 0.30958321690559387, "learning_rate": 0.00014852157099060596, "loss": 0.8665, "step": 605 }, { "epoch": 0.3758471965495995, "grad_norm": 0.2573617994785309, "learning_rate": 0.00014762776412035456, "loss": 0.8723, "step": 610 }, { "epoch": 0.3789279112754159, "grad_norm": 0.21480366587638855, "learning_rate": 0.00014672900855257056, "loss": 0.7742, "step": 615 }, { "epoch": 0.3820086260012323, "grad_norm": 0.4593837559223175, "learning_rate": 0.00014582539767121904, "loss": 0.8569, "step": 620 }, { "epoch": 0.3850893407270487, "grad_norm": 0.40562301874160767, "learning_rate": 0.0001449170253647498, "loss": 0.6574, "step": 625 }, { "epoch": 0.38817005545286504, "grad_norm": 0.2880898416042328, "learning_rate": 0.0001440039860163419, "loss": 0.804, "step": 630 }, { "epoch": 0.39125077017868143, "grad_norm": 0.2713911831378937, "learning_rate": 0.00014308637449409706, "loss": 0.8401, "step": 635 }, { "epoch": 0.3943314849044978, "grad_norm": 0.23417170345783234, "learning_rate": 0.00014216428614118243, "loss": 0.8221, "step": 640 }, { "epoch": 0.3974121996303142, "grad_norm": 0.3655683696269989, "learning_rate": 0.00014123781676592418, "loss": 0.8223, "step": 645 }, { "epoch": 0.4004929143561306, "grad_norm": 0.48337438702583313, "learning_rate": 0.00014030706263185247, "loss": 0.7682, "step": 650 }, { "epoch": 0.403573629081947, "grad_norm": 0.2492971569299698, "learning_rate": 0.00013937212044769955, "loss": 0.7924, "step": 655 }, { "epoch": 0.4066543438077634, "grad_norm": 0.33426758646965027, "learning_rate": 0.0001384330873573513, "loss": 0.8693, "step": 660 }, { "epoch": 0.4097350585335798, "grad_norm": 0.20911215245723724, "learning_rate": 0.00013749006092975347, "loss": 0.8298, "step": 665 }, { "epoch": 0.41281577325939617, "grad_norm": 0.38434621691703796, "learning_rate": 0.00013654313914877414, "loss": 0.8065, "step": 670 }, { "epoch": 0.41589648798521256, "grad_norm": 0.42332616448402405, "learning_rate": 0.00013559242040302272, "loss": 0.6792, "step": 675 }, { "epoch": 0.41897720271102895, "grad_norm": 0.2749365568161011, "learning_rate": 0.00013463800347562706, "loss": 0.8358, "step": 680 }, { "epoch": 0.42205791743684534, "grad_norm": 0.2405983805656433, "learning_rate": 0.00013367998753396944, "loss": 0.7455, "step": 685 }, { "epoch": 0.42513863216266173, "grad_norm": 0.22858920693397522, "learning_rate": 0.00013271847211938285, "loss": 0.8079, "step": 690 }, { "epoch": 0.4282193468884781, "grad_norm": 0.3635597229003906, "learning_rate": 0.0001317535571368082, "loss": 0.7994, "step": 695 }, { "epoch": 0.4313000616142945, "grad_norm": 0.44722065329551697, "learning_rate": 0.00013078534284441382, "loss": 0.7677, "step": 700 }, { "epoch": 0.4343807763401109, "grad_norm": 0.2262091040611267, "learning_rate": 0.00012981392984317834, "loss": 0.7453, "step": 705 }, { "epoch": 0.4374614910659273, "grad_norm": 0.25868818163871765, "learning_rate": 0.00012883941906643786, "loss": 0.8428, "step": 710 }, { "epoch": 0.4405422057917437, "grad_norm": 0.2550651431083679, "learning_rate": 0.00012786191176939848, "loss": 0.8179, "step": 715 }, { "epoch": 0.4436229205175601, "grad_norm": 0.34813833236694336, "learning_rate": 0.00012688150951861582, "loss": 0.7265, "step": 720 }, { "epoch": 0.4467036352433765, "grad_norm": 0.5182507634162903, "learning_rate": 0.00012589831418144154, "loss": 0.735, "step": 725 }, { "epoch": 0.44978434996919286, "grad_norm": 0.2376311868429184, "learning_rate": 0.00012491242791543922, "loss": 0.8237, "step": 730 }, { "epoch": 0.45286506469500926, "grad_norm": 0.2320733517408371, "learning_rate": 0.00012392395315776963, "loss": 0.8574, "step": 735 }, { "epoch": 0.45594577942082565, "grad_norm": 0.22170424461364746, "learning_rate": 0.00012293299261454725, "loss": 0.7741, "step": 740 }, { "epoch": 0.45902649414664204, "grad_norm": 0.3497295081615448, "learning_rate": 0.00012193964925016872, "loss": 0.7643, "step": 745 }, { "epoch": 0.46210720887245843, "grad_norm": 0.4965573251247406, "learning_rate": 0.00012094402627661447, "loss": 0.6955, "step": 750 }, { "epoch": 0.4651879235982748, "grad_norm": 0.27073344588279724, "learning_rate": 0.00011994622714272448, "loss": 0.8408, "step": 755 }, { "epoch": 0.4682686383240912, "grad_norm": 0.28380638360977173, "learning_rate": 0.00011894635552344975, "loss": 0.8311, "step": 760 }, { "epoch": 0.4713493530499076, "grad_norm": 0.1889481097459793, "learning_rate": 0.00011794451530908011, "loss": 0.7574, "step": 765 }, { "epoch": 0.47443006777572394, "grad_norm": 0.3745817244052887, "learning_rate": 0.00011694081059444946, "loss": 0.7866, "step": 770 }, { "epoch": 0.47751078250154033, "grad_norm": 0.4287342131137848, "learning_rate": 0.0001159353456681201, "loss": 0.7334, "step": 775 }, { "epoch": 0.4805914972273567, "grad_norm": 0.2466813325881958, "learning_rate": 0.00011492822500154667, "loss": 0.755, "step": 780 }, { "epoch": 0.4836722119531731, "grad_norm": 0.24119067192077637, "learning_rate": 0.00011391955323822126, "loss": 0.7655, "step": 785 }, { "epoch": 0.4867529266789895, "grad_norm": 0.20918749272823334, "learning_rate": 0.00011290943518280057, "loss": 0.8439, "step": 790 }, { "epoch": 0.4898336414048059, "grad_norm": 0.517594575881958, "learning_rate": 0.0001118979757902162, "loss": 0.7538, "step": 795 }, { "epoch": 0.4929143561306223, "grad_norm": 0.4631362855434418, "learning_rate": 0.00011088528015476964, "loss": 0.7929, "step": 800 }, { "epoch": 0.4959950708564387, "grad_norm": 0.21728238463401794, "learning_rate": 0.00010987145349921251, "loss": 0.7764, "step": 805 }, { "epoch": 0.49907578558225507, "grad_norm": 0.23387549817562103, "learning_rate": 0.0001088566011638134, "loss": 0.7706, "step": 810 }, { "epoch": 0.5021565003080715, "grad_norm": 0.22177070379257202, "learning_rate": 0.00010784082859541292, "loss": 0.7951, "step": 815 }, { "epoch": 0.5052372150338879, "grad_norm": 0.3871817886829376, "learning_rate": 0.0001068242413364671, "loss": 0.821, "step": 820 }, { "epoch": 0.5083179297597042, "grad_norm": 0.4635089933872223, "learning_rate": 0.00010580694501408138, "loss": 0.7503, "step": 825 }, { "epoch": 0.5113986444855206, "grad_norm": 0.23239871859550476, "learning_rate": 0.00010478904532903535, "loss": 0.7393, "step": 830 }, { "epoch": 0.514479359211337, "grad_norm": 0.24534587562084198, "learning_rate": 0.00010377064804480025, "loss": 0.8473, "step": 835 }, { "epoch": 0.5175600739371534, "grad_norm": 0.20399847626686096, "learning_rate": 0.00010275185897654971, "loss": 0.7359, "step": 840 }, { "epoch": 0.5206407886629698, "grad_norm": 0.38309943675994873, "learning_rate": 0.00010173278398016501, "loss": 0.7669, "step": 845 }, { "epoch": 0.5237215033887862, "grad_norm": 0.4085754156112671, "learning_rate": 0.00010071352894123654, "loss": 0.7226, "step": 850 }, { "epoch": 0.5268022181146026, "grad_norm": 0.2643273174762726, "learning_rate": 9.969419976406165e-05, "loss": 0.7809, "step": 855 }, { "epoch": 0.529882932840419, "grad_norm": 0.2764434516429901, "learning_rate": 9.867490236064108e-05, "loss": 0.7778, "step": 860 }, { "epoch": 0.5329636475662354, "grad_norm": 0.24302256107330322, "learning_rate": 9.765574263967396e-05, "loss": 0.8407, "step": 865 }, { "epoch": 0.5360443622920518, "grad_norm": 0.3125867545604706, "learning_rate": 9.66368264955539e-05, "loss": 0.7848, "step": 870 }, { "epoch": 0.5391250770178682, "grad_norm": 0.41807302832603455, "learning_rate": 9.56182597973658e-05, "loss": 0.7592, "step": 875 }, { "epoch": 0.5422057917436846, "grad_norm": 0.23016157746315002, "learning_rate": 9.460014837788605e-05, "loss": 0.7994, "step": 880 }, { "epoch": 0.5452865064695009, "grad_norm": 0.24911296367645264, "learning_rate": 9.358259802258581e-05, "loss": 0.8176, "step": 885 }, { "epoch": 0.5483672211953173, "grad_norm": 0.21787093579769135, "learning_rate": 9.256571445863972e-05, "loss": 0.761, "step": 890 }, { "epoch": 0.5514479359211337, "grad_norm": 0.35370132327079773, "learning_rate": 9.154960334394027e-05, "loss": 0.8247, "step": 895 }, { "epoch": 0.5545286506469501, "grad_norm": 0.4619176983833313, "learning_rate": 9.053437025611973e-05, "loss": 0.7113, "step": 900 }, { "epoch": 0.5576093653727665, "grad_norm": 0.25977957248687744, "learning_rate": 8.952012068158027e-05, "loss": 0.8267, "step": 905 }, { "epoch": 0.5606900800985829, "grad_norm": 0.21369901299476624, "learning_rate": 8.850696000453326e-05, "loss": 0.8462, "step": 910 }, { "epoch": 0.5637707948243993, "grad_norm": 0.2048785239458084, "learning_rate": 8.749499349604993e-05, "loss": 0.8032, "step": 915 }, { "epoch": 0.5668515095502157, "grad_norm": 0.3455217182636261, "learning_rate": 8.64843263031228e-05, "loss": 0.7907, "step": 920 }, { "epoch": 0.5699322242760321, "grad_norm": 0.4672120213508606, "learning_rate": 8.547506343774097e-05, "loss": 0.6793, "step": 925 }, { "epoch": 0.5730129390018485, "grad_norm": 0.25407537817955017, "learning_rate": 8.446730976597878e-05, "loss": 0.8099, "step": 930 }, { "epoch": 0.5760936537276649, "grad_norm": 0.28147977590560913, "learning_rate": 8.346116999709975e-05, "loss": 0.7905, "step": 935 }, { "epoch": 0.5791743684534812, "grad_norm": 0.2097829431295395, "learning_rate": 8.245674867267724e-05, "loss": 0.7415, "step": 940 }, { "epoch": 0.5822550831792976, "grad_norm": 0.34850969910621643, "learning_rate": 8.145415015573183e-05, "loss": 0.8718, "step": 945 }, { "epoch": 0.585335797905114, "grad_norm": 0.42216917872428894, "learning_rate": 8.045347861988789e-05, "loss": 0.6458, "step": 950 }, { "epoch": 0.5884165126309304, "grad_norm": 0.3279024362564087, "learning_rate": 7.945483803854936e-05, "loss": 0.8005, "step": 955 }, { "epoch": 0.5914972273567468, "grad_norm": 0.27357375621795654, "learning_rate": 7.845833217409675e-05, "loss": 0.8931, "step": 960 }, { "epoch": 0.5945779420825632, "grad_norm": 0.2173648178577423, "learning_rate": 7.746406456710564e-05, "loss": 0.765, "step": 965 }, { "epoch": 0.5976586568083796, "grad_norm": 0.38057172298431396, "learning_rate": 7.64721385255886e-05, "loss": 0.7678, "step": 970 }, { "epoch": 0.600739371534196, "grad_norm": 0.5046900510787964, "learning_rate": 7.548265711426104e-05, "loss": 0.6781, "step": 975 }, { "epoch": 0.6038200862600123, "grad_norm": 0.2965739071369171, "learning_rate": 7.449572314383237e-05, "loss": 0.8683, "step": 980 }, { "epoch": 0.6069008009858287, "grad_norm": 0.24035529792308807, "learning_rate": 7.351143916032374e-05, "loss": 0.8702, "step": 985 }, { "epoch": 0.609981515711645, "grad_norm": 0.22509080171585083, "learning_rate": 7.252990743441293e-05, "loss": 0.7761, "step": 990 }, { "epoch": 0.6130622304374614, "grad_norm": 0.34243789315223694, "learning_rate": 7.155122995080827e-05, "loss": 0.825, "step": 995 }, { "epoch": 0.6161429451632778, "grad_norm": 0.4898974597454071, "learning_rate": 7.057550839765188e-05, "loss": 0.7639, "step": 1000 }, { "epoch": 0.6192236598890942, "grad_norm": 0.2895849347114563, "learning_rate": 6.960284415595407e-05, "loss": 0.8098, "step": 1005 }, { "epoch": 0.6223043746149106, "grad_norm": 0.25053802132606506, "learning_rate": 6.863333828905929e-05, "loss": 0.9302, "step": 1010 }, { "epoch": 0.625385089340727, "grad_norm": 0.2251722365617752, "learning_rate": 6.766709153214542e-05, "loss": 0.8277, "step": 1015 }, { "epoch": 0.6284658040665434, "grad_norm": 0.3466779589653015, "learning_rate": 6.670420428175705e-05, "loss": 0.8307, "step": 1020 }, { "epoch": 0.6315465187923598, "grad_norm": 0.3713000416755676, "learning_rate": 6.574477658537375e-05, "loss": 0.7061, "step": 1025 }, { "epoch": 0.6346272335181762, "grad_norm": 0.31640493869781494, "learning_rate": 6.4788908131015e-05, "loss": 0.7782, "step": 1030 }, { "epoch": 0.6377079482439926, "grad_norm": 0.24188384413719177, "learning_rate": 6.38366982368819e-05, "loss": 0.7245, "step": 1035 }, { "epoch": 0.640788662969809, "grad_norm": 0.22182178497314453, "learning_rate": 6.288824584103816e-05, "loss": 0.7947, "step": 1040 }, { "epoch": 0.6438693776956254, "grad_norm": 0.3702578842639923, "learning_rate": 6.194364949112953e-05, "loss": 0.7678, "step": 1045 }, { "epoch": 0.6469500924214417, "grad_norm": 0.4144516885280609, "learning_rate": 6.100300733414474e-05, "loss": 0.7027, "step": 1050 }, { "epoch": 0.6500308071472581, "grad_norm": 0.25839751958847046, "learning_rate": 6.0066417106217455e-05, "loss": 0.797, "step": 1055 }, { "epoch": 0.6531115218730745, "grad_norm": 0.24363520741462708, "learning_rate": 5.9133976122471214e-05, "loss": 0.8165, "step": 1060 }, { "epoch": 0.6561922365988909, "grad_norm": 0.21458542346954346, "learning_rate": 5.82057812669081e-05, "loss": 0.7934, "step": 1065 }, { "epoch": 0.6592729513247073, "grad_norm": 0.29251885414123535, "learning_rate": 5.728192898234195e-05, "loss": 0.7279, "step": 1070 }, { "epoch": 0.6623536660505237, "grad_norm": 0.4403933882713318, "learning_rate": 5.6362515260377835e-05, "loss": 0.6894, "step": 1075 }, { "epoch": 0.6654343807763401, "grad_norm": 0.29335981607437134, "learning_rate": 5.544763563143793e-05, "loss": 0.8523, "step": 1080 }, { "epoch": 0.6685150955021565, "grad_norm": 0.2615460157394409, "learning_rate": 5.4537385154835864e-05, "loss": 0.8002, "step": 1085 }, { "epoch": 0.6715958102279729, "grad_norm": 0.20354525744915009, "learning_rate": 5.363185840889935e-05, "loss": 0.7535, "step": 1090 }, { "epoch": 0.6746765249537893, "grad_norm": 0.33114710450172424, "learning_rate": 5.273114948114346e-05, "loss": 0.797, "step": 1095 }, { "epoch": 0.6777572396796057, "grad_norm": 0.4398595988750458, "learning_rate": 5.1835351958494515e-05, "loss": 0.6914, "step": 1100 }, { "epoch": 0.680837954405422, "grad_norm": 0.2481193095445633, "learning_rate": 5.094455891756587e-05, "loss": 0.8033, "step": 1105 }, { "epoch": 0.6839186691312384, "grad_norm": 0.2384270578622818, "learning_rate": 5.00588629149872e-05, "loss": 0.8644, "step": 1110 }, { "epoch": 0.6869993838570548, "grad_norm": 0.22420865297317505, "learning_rate": 4.91783559777873e-05, "loss": 0.7891, "step": 1115 }, { "epoch": 0.6900800985828712, "grad_norm": 0.38343584537506104, "learning_rate": 4.830312959383238e-05, "loss": 0.7373, "step": 1120 }, { "epoch": 0.6931608133086876, "grad_norm": 0.38524994254112244, "learning_rate": 4.7433274702319815e-05, "loss": 0.6168, "step": 1125 }, { "epoch": 0.696241528034504, "grad_norm": 0.2434820532798767, "learning_rate": 4.656888168432962e-05, "loss": 0.7838, "step": 1130 }, { "epoch": 0.6993222427603204, "grad_norm": 0.243381068110466, "learning_rate": 4.571004035343315e-05, "loss": 0.8248, "step": 1135 }, { "epoch": 0.7024029574861368, "grad_norm": 0.20797839760780334, "learning_rate": 4.485683994636144e-05, "loss": 0.7717, "step": 1140 }, { "epoch": 0.7054836722119532, "grad_norm": 0.357668399810791, "learning_rate": 4.400936911373308e-05, "loss": 0.8102, "step": 1145 }, { "epoch": 0.7085643869377696, "grad_norm": 0.4109031856060028, "learning_rate": 4.3167715910842966e-05, "loss": 0.7208, "step": 1150 }, { "epoch": 0.711645101663586, "grad_norm": 0.27286359667778015, "learning_rate": 4.2331967788513295e-05, "loss": 0.7993, "step": 1155 }, { "epoch": 0.7147258163894024, "grad_norm": 0.2251722365617752, "learning_rate": 4.1502211584006836e-05, "loss": 0.8202, "step": 1160 }, { "epoch": 0.7178065311152187, "grad_norm": 0.19988128542900085, "learning_rate": 4.067853351200446e-05, "loss": 0.8353, "step": 1165 }, { "epoch": 0.7208872458410351, "grad_norm": 0.32562148571014404, "learning_rate": 3.986101915564695e-05, "loss": 0.7811, "step": 1170 }, { "epoch": 0.7239679605668515, "grad_norm": 0.5299817323684692, "learning_rate": 3.904975345764262e-05, "loss": 0.7235, "step": 1175 }, { "epoch": 0.7270486752926679, "grad_norm": 0.24168257415294647, "learning_rate": 3.824482071144163e-05, "loss": 0.7845, "step": 1180 }, { "epoch": 0.7301293900184843, "grad_norm": 0.28862902522087097, "learning_rate": 3.744630455247739e-05, "loss": 0.8085, "step": 1185 }, { "epoch": 0.7332101047443007, "grad_norm": 0.19976598024368286, "learning_rate": 3.6654287949476626e-05, "loss": 0.7586, "step": 1190 }, { "epoch": 0.7362908194701171, "grad_norm": 0.320616215467453, "learning_rate": 3.586885319583858e-05, "loss": 0.7874, "step": 1195 }, { "epoch": 0.7393715341959335, "grad_norm": 0.4317881464958191, "learning_rate": 3.5090081901084525e-05, "loss": 0.7074, "step": 1200 }, { "epoch": 0.7424522489217499, "grad_norm": 0.2932340204715729, "learning_rate": 3.431805498237808e-05, "loss": 0.8518, "step": 1205 }, { "epoch": 0.7455329636475663, "grad_norm": 0.2546040117740631, "learning_rate": 3.355285265611784e-05, "loss": 0.8227, "step": 1210 }, { "epoch": 0.7486136783733827, "grad_norm": 0.19364731013774872, "learning_rate": 3.279455442960238e-05, "loss": 0.8077, "step": 1215 }, { "epoch": 0.751694393099199, "grad_norm": 0.3423250615596771, "learning_rate": 3.204323909276924e-05, "loss": 0.8205, "step": 1220 }, { "epoch": 0.7547751078250154, "grad_norm": 0.3572053015232086, "learning_rate": 3.1298984710008484e-05, "loss": 0.6807, "step": 1225 }, { "epoch": 0.7578558225508318, "grad_norm": 0.32454174757003784, "learning_rate": 3.056186861205136e-05, "loss": 0.7632, "step": 1230 }, { "epoch": 0.7609365372766482, "grad_norm": 0.23129241168498993, "learning_rate": 2.9831967387935467e-05, "loss": 0.884, "step": 1235 }, { "epoch": 0.7640172520024646, "grad_norm": 0.2326166033744812, "learning_rate": 2.9109356877046712e-05, "loss": 0.7617, "step": 1240 }, { "epoch": 0.767097966728281, "grad_norm": 0.36473801732063293, "learning_rate": 2.8394112161239605e-05, "loss": 0.8017, "step": 1245 }, { "epoch": 0.7701786814540974, "grad_norm": 0.4652582108974457, "learning_rate": 2.7686307557035685e-05, "loss": 0.6805, "step": 1250 }, { "epoch": 0.7732593961799138, "grad_norm": 0.2206043004989624, "learning_rate": 2.6986016607901908e-05, "loss": 0.7304, "step": 1255 }, { "epoch": 0.7763401109057301, "grad_norm": 0.2539527118206024, "learning_rate": 2.629331207660931e-05, "loss": 0.7691, "step": 1260 }, { "epoch": 0.7794208256315465, "grad_norm": 0.22998766601085663, "learning_rate": 2.5608265937672436e-05, "loss": 0.7646, "step": 1265 }, { "epoch": 0.7825015403573629, "grad_norm": 0.3425155282020569, "learning_rate": 2.4930949369871203e-05, "loss": 0.7994, "step": 1270 }, { "epoch": 0.7855822550831792, "grad_norm": 0.48752111196517944, "learning_rate": 2.426143274885493e-05, "loss": 0.6582, "step": 1275 }, { "epoch": 0.7886629698089956, "grad_norm": 0.30728885531425476, "learning_rate": 2.359978563983022e-05, "loss": 0.7866, "step": 1280 }, { "epoch": 0.791743684534812, "grad_norm": 0.21921317279338837, "learning_rate": 2.2946076790332827e-05, "loss": 0.7317, "step": 1285 }, { "epoch": 0.7948243992606284, "grad_norm": 0.21226321160793304, "learning_rate": 2.2300374123084522e-05, "loss": 0.7303, "step": 1290 }, { "epoch": 0.7979051139864448, "grad_norm": 0.4159514605998993, "learning_rate": 2.166274472893567e-05, "loss": 0.762, "step": 1295 }, { "epoch": 0.8009858287122612, "grad_norm": 0.3819718062877655, "learning_rate": 2.1033254859894226e-05, "loss": 0.6641, "step": 1300 }, { "epoch": 0.8040665434380776, "grad_norm": 0.21891655027866364, "learning_rate": 2.041196992224206e-05, "loss": 0.7669, "step": 1305 }, { "epoch": 0.807147258163894, "grad_norm": 0.2436547726392746, "learning_rate": 1.9798954469738762e-05, "loss": 0.7634, "step": 1310 }, { "epoch": 0.8102279728897104, "grad_norm": 0.2127164751291275, "learning_rate": 1.919427219691453e-05, "loss": 0.7888, "step": 1315 }, { "epoch": 0.8133086876155268, "grad_norm": 0.3121797442436218, "learning_rate": 1.8597985932451856e-05, "loss": 0.7472, "step": 1320 }, { "epoch": 0.8163894023413432, "grad_norm": 0.44053328037261963, "learning_rate": 1.8010157632657543e-05, "loss": 0.6843, "step": 1325 }, { "epoch": 0.8194701170671596, "grad_norm": 0.2766016125679016, "learning_rate": 1.7430848375025176e-05, "loss": 0.8204, "step": 1330 }, { "epoch": 0.822550831792976, "grad_norm": 0.2547926604747772, "learning_rate": 1.686011835188891e-05, "loss": 0.8059, "step": 1335 }, { "epoch": 0.8256315465187923, "grad_norm": 0.19498097896575928, "learning_rate": 1.6298026864169335e-05, "loss": 0.7734, "step": 1340 }, { "epoch": 0.8287122612446087, "grad_norm": 0.33014756441116333, "learning_rate": 1.5744632315211815e-05, "loss": 0.7823, "step": 1345 }, { "epoch": 0.8317929759704251, "grad_norm": 0.4378814995288849, "learning_rate": 1.5199992204718294e-05, "loss": 0.7058, "step": 1350 }, { "epoch": 0.8348736906962415, "grad_norm": 0.27441856265068054, "learning_rate": 1.4664163122772689e-05, "loss": 0.8222, "step": 1355 }, { "epoch": 0.8379544054220579, "grad_norm": 0.2730132043361664, "learning_rate": 1.4137200743961188e-05, "loss": 0.8079, "step": 1360 }, { "epoch": 0.8410351201478743, "grad_norm": 0.1908617615699768, "learning_rate": 1.3619159821587235e-05, "loss": 0.7629, "step": 1365 }, { "epoch": 0.8441158348736907, "grad_norm": 0.3156476616859436, "learning_rate": 1.3110094181982657e-05, "loss": 0.7778, "step": 1370 }, { "epoch": 0.8471965495995071, "grad_norm": 0.9144028425216675, "learning_rate": 1.261005671891482e-05, "loss": 0.6881, "step": 1375 }, { "epoch": 0.8502772643253235, "grad_norm": 0.2342410683631897, "learning_rate": 1.2119099388090716e-05, "loss": 0.7912, "step": 1380 }, { "epoch": 0.8533579790511399, "grad_norm": 0.29499155282974243, "learning_rate": 1.1637273201758748e-05, "loss": 0.8108, "step": 1385 }, { "epoch": 0.8564386937769563, "grad_norm": 0.21403230726718903, "learning_rate": 1.1164628223408168e-05, "loss": 0.7471, "step": 1390 }, { "epoch": 0.8595194085027726, "grad_norm": 0.3718407452106476, "learning_rate": 1.0701213562567492e-05, "loss": 0.7137, "step": 1395 }, { "epoch": 0.862600123228589, "grad_norm": 0.5841807723045349, "learning_rate": 1.0247077369701653e-05, "loss": 0.6953, "step": 1400 }, { "epoch": 0.8656808379544054, "grad_norm": 0.2595619261264801, "learning_rate": 9.802266831209206e-06, "loss": 0.7355, "step": 1405 }, { "epoch": 0.8687615526802218, "grad_norm": 0.19861729443073273, "learning_rate": 9.366828164519258e-06, "loss": 0.761, "step": 1410 }, { "epoch": 0.8718422674060382, "grad_norm": 0.2072681188583374, "learning_rate": 8.940806613289498e-06, "loss": 0.8041, "step": 1415 }, { "epoch": 0.8749229821318546, "grad_norm": 0.3955484628677368, "learning_rate": 8.524246442705153e-06, "loss": 0.7264, "step": 1420 }, { "epoch": 0.878003696857671, "grad_norm": 0.5402655005455017, "learning_rate": 8.117190934879593e-06, "loss": 0.6967, "step": 1425 }, { "epoch": 0.8810844115834874, "grad_norm": 0.2759842872619629, "learning_rate": 7.719682384357308e-06, "loss": 0.779, "step": 1430 }, { "epoch": 0.8841651263093038, "grad_norm": 0.26952704787254333, "learning_rate": 7.33176209371923e-06, "loss": 0.8355, "step": 1435 }, { "epoch": 0.8872458410351202, "grad_norm": 0.24499641358852386, "learning_rate": 6.953470369291348e-06, "loss": 0.7629, "step": 1440 }, { "epoch": 0.8903265557609366, "grad_norm": 0.35346612334251404, "learning_rate": 6.5848465169566e-06, "loss": 0.8149, "step": 1445 }, { "epoch": 0.893407270486753, "grad_norm": 0.450797438621521, "learning_rate": 6.225928838071016e-06, "loss": 0.6792, "step": 1450 }, { "epoch": 0.8964879852125693, "grad_norm": 0.2921581268310547, "learning_rate": 5.876754625483904e-06, "loss": 0.7122, "step": 1455 }, { "epoch": 0.8995686999383857, "grad_norm": 0.2305372953414917, "learning_rate": 5.537360159663108e-06, "loss": 0.7524, "step": 1460 }, { "epoch": 0.9026494146642021, "grad_norm": 0.23024219274520874, "learning_rate": 5.207780704925314e-06, "loss": 0.7386, "step": 1465 }, { "epoch": 0.9057301293900185, "grad_norm": 0.33967798948287964, "learning_rate": 4.888050505771868e-06, "loss": 0.7753, "step": 1470 }, { "epoch": 0.9088108441158349, "grad_norm": 0.4888007640838623, "learning_rate": 4.578202783330799e-06, "loss": 0.7009, "step": 1475 }, { "epoch": 0.9118915588416513, "grad_norm": 0.266887366771698, "learning_rate": 4.2782697319048605e-06, "loss": 0.775, "step": 1480 }, { "epoch": 0.9149722735674677, "grad_norm": 0.23958100378513336, "learning_rate": 3.988282515626585e-06, "loss": 0.7319, "step": 1485 }, { "epoch": 0.9180529882932841, "grad_norm": 0.2224549800157547, "learning_rate": 3.7082712652200867e-06, "loss": 0.7513, "step": 1490 }, { "epoch": 0.9211337030191005, "grad_norm": 0.32627391815185547, "learning_rate": 3.438265074870417e-06, "loss": 0.8114, "step": 1495 }, { "epoch": 0.9242144177449169, "grad_norm": 0.429466187953949, "learning_rate": 3.1782919992006333e-06, "loss": 0.6879, "step": 1500 }, { "epoch": 0.9272951324707333, "grad_norm": 0.2456153929233551, "learning_rate": 2.9283790503567222e-06, "loss": 0.7926, "step": 1505 }, { "epoch": 0.9303758471965496, "grad_norm": 0.2562521994113922, "learning_rate": 2.6885521952010105e-06, "loss": 0.804, "step": 1510 }, { "epoch": 0.933456561922366, "grad_norm": 0.232356458902359, "learning_rate": 2.458836352614069e-06, "loss": 0.7496, "step": 1515 }, { "epoch": 0.9365372766481824, "grad_norm": 0.37867313623428345, "learning_rate": 2.239255390905581e-06, "loss": 0.7937, "step": 1520 }, { "epoch": 0.9396179913739988, "grad_norm": 0.5115949511528015, "learning_rate": 2.029832125334319e-06, "loss": 0.6868, "step": 1525 }, { "epoch": 0.9426987060998152, "grad_norm": 0.23035408556461334, "learning_rate": 1.8305883157375804e-06, "loss": 0.7426, "step": 1530 }, { "epoch": 0.9457794208256316, "grad_norm": 0.24654920399188995, "learning_rate": 1.6415446642702337e-06, "loss": 0.8562, "step": 1535 }, { "epoch": 0.9488601355514479, "grad_norm": 0.22488954663276672, "learning_rate": 1.462720813253682e-06, "loss": 0.7478, "step": 1540 }, { "epoch": 0.9519408502772643, "grad_norm": 0.37392520904541016, "learning_rate": 1.2941353431350056e-06, "loss": 0.7557, "step": 1545 }, { "epoch": 0.9550215650030807, "grad_norm": 0.5033255815505981, "learning_rate": 1.135805770556364e-06, "loss": 0.7122, "step": 1550 }, { "epoch": 0.958102279728897, "grad_norm": 0.26435384154319763, "learning_rate": 9.877485465349058e-07, "loss": 0.8407, "step": 1555 }, { "epoch": 0.9611829944547134, "grad_norm": 0.24363724887371063, "learning_rate": 8.499790547535025e-07, "loss": 0.7504, "step": 1560 }, { "epoch": 0.9642637091805298, "grad_norm": 0.19287309050559998, "learning_rate": 7.225116099623286e-07, "loss": 0.7052, "step": 1565 }, { "epoch": 0.9673444239063462, "grad_norm": 0.36122244596481323, "learning_rate": 6.053594564914611e-07, "loss": 0.7875, "step": 1570 }, { "epoch": 0.9704251386321626, "grad_norm": 0.4581773281097412, "learning_rate": 4.985347668747809e-07, "loss": 0.685, "step": 1575 }, { "epoch": 0.973505853357979, "grad_norm": 0.26282060146331787, "learning_rate": 4.0204864058522864e-07, "loss": 0.8346, "step": 1580 }, { "epoch": 0.9765865680837954, "grad_norm": 0.19449281692504883, "learning_rate": 3.15911102881461e-07, "loss": 0.8744, "step": 1585 }, { "epoch": 0.9796672828096118, "grad_norm": 0.23728476464748383, "learning_rate": 2.40131103766239e-07, "loss": 0.7637, "step": 1590 }, { "epoch": 0.9827479975354282, "grad_norm": 0.550537109375, "learning_rate": 1.747165170564724e-07, "loss": 0.8459, "step": 1595 }, { "epoch": 0.9858287122612446, "grad_norm": 0.413791686296463, "learning_rate": 1.1967413956510686e-07, "loss": 0.729, "step": 1600 }, { "epoch": 0.988909426987061, "grad_norm": 0.3087040185928345, "learning_rate": 7.500969039491157e-08, "loss": 0.7882, "step": 1605 }, { "epoch": 0.9919901417128774, "grad_norm": 0.27406904101371765, "learning_rate": 4.0727810344254325e-08, "loss": 0.8378, "step": 1610 }, { "epoch": 0.9950708564386938, "grad_norm": 0.20725923776626587, "learning_rate": 1.6832061424865153e-08, "loss": 0.7447, "step": 1615 }, { "epoch": 0.9981515711645101, "grad_norm": 0.29900407791137695, "learning_rate": 3.3249264917878387e-09, "loss": 0.7179, "step": 1620 }, { "epoch": 1.0, "step": 1623, "total_flos": 889480391426048.0, "train_loss": 0.7996691749629163, "train_runtime": 17995.3775, "train_samples_per_second": 2.886, "train_steps_per_second": 0.09 } ], "logging_steps": 5, "max_steps": 1623, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 889480391426048.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }