{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 30, "global_step": 1635, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 2e-05, "loss": 2.9348, "step": 10 }, { "epoch": 0.01, "learning_rate": 4e-05, "loss": 2.993, "step": 20 }, { "epoch": 0.02, "learning_rate": 6e-05, "loss": 2.6684, "step": 30 }, { "epoch": 0.02, "eval_loss": 2.1267645359039307, "eval_runtime": 14.5639, "eval_samples_per_second": 80.404, "eval_steps_per_second": 6.729, "step": 30 }, { "epoch": 0.02, "learning_rate": 8e-05, "loss": 2.493, "step": 40 }, { "epoch": 0.03, "learning_rate": 0.0001, "loss": 2.1131, "step": 50 }, { "epoch": 0.04, "learning_rate": 0.00012, "loss": 1.7994, "step": 60 }, { "epoch": 0.04, "eval_loss": 1.461239218711853, "eval_runtime": 15.3978, "eval_samples_per_second": 76.05, "eval_steps_per_second": 6.365, "step": 60 }, { "epoch": 0.04, "learning_rate": 0.00014000000000000001, "loss": 1.3698, "step": 70 }, { "epoch": 0.05, "learning_rate": 0.00016, "loss": 1.205, "step": 80 }, { "epoch": 0.06, "learning_rate": 0.00017999999999999998, "loss": 1.1072, "step": 90 }, { "epoch": 0.06, "eval_loss": 0.979037344455719, "eval_runtime": 15.999, "eval_samples_per_second": 73.192, "eval_steps_per_second": 6.125, "step": 90 }, { "epoch": 0.06, "learning_rate": 0.0002, "loss": 1.154, "step": 100 }, { "epoch": 0.07, "learning_rate": 0.00022, "loss": 0.9462, "step": 110 }, { "epoch": 0.07, "learning_rate": 0.00024, "loss": 1.052, "step": 120 }, { "epoch": 0.07, "eval_loss": 0.8346178531646729, "eval_runtime": 15.0815, "eval_samples_per_second": 77.645, "eval_steps_per_second": 6.498, "step": 120 }, { "epoch": 0.08, "learning_rate": 0.00026000000000000003, "loss": 1.0541, "step": 130 }, { "epoch": 0.09, "learning_rate": 0.00028000000000000003, "loss": 0.9987, "step": 140 }, { "epoch": 0.09, "learning_rate": 0.0003, "loss": 0.9079, "step": 150 }, { "epoch": 0.09, "eval_loss": 0.8187346458435059, "eval_runtime": 15.0493, "eval_samples_per_second": 77.811, "eval_steps_per_second": 6.512, "step": 150 }, { "epoch": 0.1, "learning_rate": 0.00032, "loss": 0.9271, "step": 160 }, { "epoch": 0.1, "learning_rate": 0.00034, "loss": 0.9079, "step": 170 }, { "epoch": 0.11, "learning_rate": 0.00035999999999999997, "loss": 0.8843, "step": 180 }, { "epoch": 0.11, "eval_loss": 0.7424212694168091, "eval_runtime": 15.4048, "eval_samples_per_second": 76.016, "eval_steps_per_second": 6.362, "step": 180 }, { "epoch": 0.12, "learning_rate": 0.00038, "loss": 0.818, "step": 190 }, { "epoch": 0.12, "learning_rate": 0.0004, "loss": 0.8288, "step": 200 }, { "epoch": 0.13, "learning_rate": 0.00042, "loss": 0.9119, "step": 210 }, { "epoch": 0.13, "eval_loss": 0.7072588205337524, "eval_runtime": 15.4947, "eval_samples_per_second": 75.574, "eval_steps_per_second": 6.325, "step": 210 }, { "epoch": 0.13, "learning_rate": 0.00044, "loss": 0.8642, "step": 220 }, { "epoch": 0.14, "learning_rate": 0.00046, "loss": 0.7842, "step": 230 }, { "epoch": 0.15, "learning_rate": 0.00048, "loss": 0.8218, "step": 240 }, { "epoch": 0.15, "eval_loss": 0.6722940802574158, "eval_runtime": 15.2385, "eval_samples_per_second": 76.845, "eval_steps_per_second": 6.431, "step": 240 }, { "epoch": 0.15, "learning_rate": 0.0005, "loss": 0.8302, "step": 250 }, { "epoch": 0.16, "learning_rate": 0.0005200000000000001, "loss": 0.81, "step": 260 }, { "epoch": 0.17, "learning_rate": 0.00054, "loss": 0.7181, "step": 270 }, { "epoch": 0.17, "eval_loss": 0.6630179286003113, "eval_runtime": 15.2349, "eval_samples_per_second": 76.863, "eval_steps_per_second": 6.433, "step": 270 }, { "epoch": 0.17, "learning_rate": 0.0005600000000000001, "loss": 0.7874, "step": 280 }, { "epoch": 0.18, "learning_rate": 0.00058, "loss": 0.7636, "step": 290 }, { "epoch": 0.18, "learning_rate": 0.0006, "loss": 0.7933, "step": 300 }, { "epoch": 0.18, "eval_loss": 0.648048460483551, "eval_runtime": 15.3271, "eval_samples_per_second": 76.401, "eval_steps_per_second": 6.394, "step": 300 }, { "epoch": 0.19, "learning_rate": 0.00062, "loss": 0.806, "step": 310 }, { "epoch": 0.2, "learning_rate": 0.00064, "loss": 0.8229, "step": 320 }, { "epoch": 0.2, "learning_rate": 0.00066, "loss": 0.7979, "step": 330 }, { "epoch": 0.2, "eval_loss": 0.6749615669250488, "eval_runtime": 15.4179, "eval_samples_per_second": 75.951, "eval_steps_per_second": 6.356, "step": 330 }, { "epoch": 0.21, "learning_rate": 0.00068, "loss": 0.7493, "step": 340 }, { "epoch": 0.21, "learning_rate": 0.0007, "loss": 0.8082, "step": 350 }, { "epoch": 0.22, "learning_rate": 0.0007199999999999999, "loss": 0.7077, "step": 360 }, { "epoch": 0.22, "eval_loss": 0.6362787485122681, "eval_runtime": 15.3758, "eval_samples_per_second": 76.159, "eval_steps_per_second": 6.374, "step": 360 }, { "epoch": 0.23, "learning_rate": 0.00074, "loss": 0.7082, "step": 370 }, { "epoch": 0.23, "learning_rate": 0.00076, "loss": 0.7552, "step": 380 }, { "epoch": 0.24, "learning_rate": 0.0007800000000000001, "loss": 0.7012, "step": 390 }, { "epoch": 0.24, "eval_loss": 0.6303613781929016, "eval_runtime": 15.3461, "eval_samples_per_second": 76.306, "eval_steps_per_second": 6.386, "step": 390 }, { "epoch": 0.24, "learning_rate": 0.0008, "loss": 0.6524, "step": 400 }, { "epoch": 0.25, "learning_rate": 0.00082, "loss": 0.7403, "step": 410 }, { "epoch": 0.26, "learning_rate": 0.00084, "loss": 0.7808, "step": 420 }, { "epoch": 0.26, "eval_loss": 0.6001694798469543, "eval_runtime": 15.2815, "eval_samples_per_second": 76.629, "eval_steps_per_second": 6.413, "step": 420 }, { "epoch": 0.26, "learning_rate": 0.00086, "loss": 0.6942, "step": 430 }, { "epoch": 0.27, "learning_rate": 0.00088, "loss": 0.6691, "step": 440 }, { "epoch": 0.28, "learning_rate": 0.0009000000000000001, "loss": 0.613, "step": 450 }, { "epoch": 0.28, "eval_loss": 0.6457517743110657, "eval_runtime": 15.2997, "eval_samples_per_second": 76.537, "eval_steps_per_second": 6.405, "step": 450 }, { "epoch": 0.28, "learning_rate": 0.00092, "loss": 0.7459, "step": 460 }, { "epoch": 0.29, "learning_rate": 0.00094, "loss": 0.7077, "step": 470 }, { "epoch": 0.29, "learning_rate": 0.00096, "loss": 0.6939, "step": 480 }, { "epoch": 0.29, "eval_loss": 0.6020320057868958, "eval_runtime": 15.2909, "eval_samples_per_second": 76.581, "eval_steps_per_second": 6.409, "step": 480 }, { "epoch": 0.3, "learning_rate": 0.00098, "loss": 0.7313, "step": 490 }, { "epoch": 0.31, "learning_rate": 0.001, "loss": 0.6848, "step": 500 }, { "epoch": 0.31, "learning_rate": 0.0009993690851735015, "loss": 0.7805, "step": 510 }, { "epoch": 0.31, "eval_loss": 0.5767749547958374, "eval_runtime": 15.3007, "eval_samples_per_second": 76.532, "eval_steps_per_second": 6.405, "step": 510 }, { "epoch": 0.32, "learning_rate": 0.0009987381703470033, "loss": 0.6732, "step": 520 }, { "epoch": 0.32, "learning_rate": 0.0009981072555205047, "loss": 0.6052, "step": 530 }, { "epoch": 0.33, "learning_rate": 0.0009974763406940062, "loss": 0.664, "step": 540 }, { "epoch": 0.33, "eval_loss": 0.5918195843696594, "eval_runtime": 15.2896, "eval_samples_per_second": 76.588, "eval_steps_per_second": 6.41, "step": 540 }, { "epoch": 0.34, "learning_rate": 0.000996845425867508, "loss": 0.657, "step": 550 }, { "epoch": 0.34, "learning_rate": 0.0009962145110410095, "loss": 0.6524, "step": 560 }, { "epoch": 0.35, "learning_rate": 0.0009955835962145111, "loss": 0.7756, "step": 570 }, { "epoch": 0.35, "eval_loss": 0.5548932552337646, "eval_runtime": 15.3185, "eval_samples_per_second": 76.443, "eval_steps_per_second": 6.397, "step": 570 }, { "epoch": 0.35, "learning_rate": 0.0009949526813880128, "loss": 0.6771, "step": 580 }, { "epoch": 0.36, "learning_rate": 0.0009943217665615142, "loss": 0.6054, "step": 590 }, { "epoch": 0.37, "learning_rate": 0.0009936908517350158, "loss": 0.6582, "step": 600 }, { "epoch": 0.37, "eval_loss": 0.5589831471443176, "eval_runtime": 15.3491, "eval_samples_per_second": 76.291, "eval_steps_per_second": 6.385, "step": 600 }, { "epoch": 0.37, "learning_rate": 0.0009930599369085175, "loss": 0.6386, "step": 610 }, { "epoch": 0.38, "learning_rate": 0.000992429022082019, "loss": 0.6355, "step": 620 }, { "epoch": 0.39, "learning_rate": 0.0009917981072555206, "loss": 0.6883, "step": 630 }, { "epoch": 0.39, "eval_loss": 0.5954719185829163, "eval_runtime": 15.317, "eval_samples_per_second": 76.451, "eval_steps_per_second": 6.398, "step": 630 }, { "epoch": 0.39, "learning_rate": 0.0009911671924290222, "loss": 0.6283, "step": 640 }, { "epoch": 0.4, "learning_rate": 0.0009905362776025236, "loss": 0.6973, "step": 650 }, { "epoch": 0.4, "learning_rate": 0.0009899053627760253, "loss": 0.6252, "step": 660 }, { "epoch": 0.4, "eval_loss": 0.5695869326591492, "eval_runtime": 15.3301, "eval_samples_per_second": 76.386, "eval_steps_per_second": 6.393, "step": 660 }, { "epoch": 0.41, "learning_rate": 0.000989274447949527, "loss": 0.6971, "step": 670 }, { "epoch": 0.42, "learning_rate": 0.0009886435331230286, "loss": 0.6276, "step": 680 }, { "epoch": 0.42, "learning_rate": 0.00098801261829653, "loss": 0.6776, "step": 690 }, { "epoch": 0.42, "eval_loss": 0.5438072085380554, "eval_runtime": 15.2869, "eval_samples_per_second": 76.602, "eval_steps_per_second": 6.411, "step": 690 }, { "epoch": 0.43, "learning_rate": 0.0009873817034700316, "loss": 0.5928, "step": 700 }, { "epoch": 0.43, "learning_rate": 0.0009867507886435333, "loss": 0.6095, "step": 710 }, { "epoch": 0.44, "learning_rate": 0.0009861198738170347, "loss": 0.6427, "step": 720 }, { "epoch": 0.44, "eval_loss": 0.5402929186820984, "eval_runtime": 15.332, "eval_samples_per_second": 76.376, "eval_steps_per_second": 6.392, "step": 720 }, { "epoch": 0.45, "learning_rate": 0.0009854889589905364, "loss": 0.5991, "step": 730 }, { "epoch": 0.45, "learning_rate": 0.000984858044164038, "loss": 0.6153, "step": 740 }, { "epoch": 0.46, "learning_rate": 0.0009842271293375394, "loss": 0.6422, "step": 750 }, { "epoch": 0.46, "eval_loss": 0.55536949634552, "eval_runtime": 15.315, "eval_samples_per_second": 76.461, "eval_steps_per_second": 6.399, "step": 750 }, { "epoch": 0.46, "learning_rate": 0.000983596214511041, "loss": 0.6874, "step": 760 }, { "epoch": 0.47, "learning_rate": 0.0009829652996845427, "loss": 0.6588, "step": 770 }, { "epoch": 0.48, "learning_rate": 0.0009823343848580442, "loss": 0.5467, "step": 780 }, { "epoch": 0.48, "eval_loss": 0.5687702298164368, "eval_runtime": 15.3018, "eval_samples_per_second": 76.527, "eval_steps_per_second": 6.404, "step": 780 }, { "epoch": 0.48, "learning_rate": 0.0009817034700315458, "loss": 0.5929, "step": 790 }, { "epoch": 0.49, "learning_rate": 0.0009810725552050475, "loss": 0.6407, "step": 800 }, { "epoch": 0.5, "learning_rate": 0.0009804416403785489, "loss": 0.621, "step": 810 }, { "epoch": 0.5, "eval_loss": 0.5593787431716919, "eval_runtime": 15.32, "eval_samples_per_second": 76.436, "eval_steps_per_second": 6.397, "step": 810 }, { "epoch": 0.5, "learning_rate": 0.0009798107255520505, "loss": 0.6191, "step": 820 }, { "epoch": 0.51, "learning_rate": 0.0009791798107255522, "loss": 0.6468, "step": 830 }, { "epoch": 0.51, "learning_rate": 0.0009785488958990536, "loss": 0.6486, "step": 840 }, { "epoch": 0.51, "eval_loss": 0.542827308177948, "eval_runtime": 15.3141, "eval_samples_per_second": 76.466, "eval_steps_per_second": 6.399, "step": 840 }, { "epoch": 0.52, "learning_rate": 0.0009779179810725552, "loss": 0.6503, "step": 850 }, { "epoch": 0.53, "learning_rate": 0.000977287066246057, "loss": 0.5697, "step": 860 }, { "epoch": 0.53, "learning_rate": 0.0009766561514195583, "loss": 0.5993, "step": 870 }, { "epoch": 0.53, "eval_loss": 0.5758046507835388, "eval_runtime": 15.2882, "eval_samples_per_second": 76.595, "eval_steps_per_second": 6.41, "step": 870 }, { "epoch": 0.54, "learning_rate": 0.00097602523659306, "loss": 0.6151, "step": 880 }, { "epoch": 0.54, "learning_rate": 0.0009753943217665615, "loss": 0.5733, "step": 890 }, { "epoch": 0.55, "learning_rate": 0.0009747634069400632, "loss": 0.5637, "step": 900 }, { "epoch": 0.55, "eval_loss": 0.5380309224128723, "eval_runtime": 15.3344, "eval_samples_per_second": 76.364, "eval_steps_per_second": 6.391, "step": 900 }, { "epoch": 0.56, "learning_rate": 0.0009741324921135647, "loss": 0.5768, "step": 910 }, { "epoch": 0.56, "learning_rate": 0.0009735015772870662, "loss": 0.5445, "step": 920 }, { "epoch": 0.57, "learning_rate": 0.0009728706624605679, "loss": 0.627, "step": 930 }, { "epoch": 0.57, "eval_loss": 0.5249527096748352, "eval_runtime": 15.3162, "eval_samples_per_second": 76.455, "eval_steps_per_second": 6.398, "step": 930 }, { "epoch": 0.57, "learning_rate": 0.0009722397476340694, "loss": 0.5396, "step": 940 }, { "epoch": 0.58, "learning_rate": 0.000971608832807571, "loss": 0.6558, "step": 950 }, { "epoch": 0.59, "learning_rate": 0.0009709779179810726, "loss": 0.5876, "step": 960 }, { "epoch": 0.59, "eval_loss": 0.5324742197990417, "eval_runtime": 15.3008, "eval_samples_per_second": 76.532, "eval_steps_per_second": 6.405, "step": 960 }, { "epoch": 0.59, "learning_rate": 0.0009703470031545741, "loss": 0.581, "step": 970 }, { "epoch": 0.6, "learning_rate": 0.0009697160883280758, "loss": 0.5938, "step": 980 }, { "epoch": 0.61, "learning_rate": 0.0009690851735015773, "loss": 0.5539, "step": 990 }, { "epoch": 0.61, "eval_loss": 0.5384231209754944, "eval_runtime": 15.3241, "eval_samples_per_second": 76.416, "eval_steps_per_second": 6.395, "step": 990 }, { "epoch": 0.61, "learning_rate": 0.0009684542586750789, "loss": 0.6287, "step": 1000 }, { "epoch": 0.62, "learning_rate": 0.0009678233438485805, "loss": 0.6094, "step": 1010 }, { "epoch": 0.62, "learning_rate": 0.000967192429022082, "loss": 0.5671, "step": 1020 }, { "epoch": 0.62, "eval_loss": 0.5662776827812195, "eval_runtime": 15.3105, "eval_samples_per_second": 76.483, "eval_steps_per_second": 6.401, "step": 1020 }, { "epoch": 0.63, "learning_rate": 0.0009665615141955836, "loss": 0.5863, "step": 1030 }, { "epoch": 0.64, "learning_rate": 0.0009659305993690852, "loss": 0.5675, "step": 1040 }, { "epoch": 0.64, "learning_rate": 0.0009652996845425868, "loss": 0.714, "step": 1050 }, { "epoch": 0.64, "eval_loss": 0.5345898270606995, "eval_runtime": 15.3604, "eval_samples_per_second": 76.235, "eval_steps_per_second": 6.38, "step": 1050 }, { "epoch": 0.65, "learning_rate": 0.0009646687697160883, "loss": 0.6364, "step": 1060 }, { "epoch": 0.65, "learning_rate": 0.0009640378548895899, "loss": 0.5733, "step": 1070 }, { "epoch": 0.66, "learning_rate": 0.0009634069400630915, "loss": 0.6243, "step": 1080 }, { "epoch": 0.66, "eval_loss": 0.5188413858413696, "eval_runtime": 15.3443, "eval_samples_per_second": 76.315, "eval_steps_per_second": 6.387, "step": 1080 }, { "epoch": 0.67, "learning_rate": 0.0009627760252365931, "loss": 0.5955, "step": 1090 }, { "epoch": 0.67, "learning_rate": 0.0009621451104100947, "loss": 0.5949, "step": 1100 }, { "epoch": 0.68, "learning_rate": 0.0009615141955835962, "loss": 0.5342, "step": 1110 }, { "epoch": 0.68, "eval_loss": 0.5572788119316101, "eval_runtime": 15.4088, "eval_samples_per_second": 75.996, "eval_steps_per_second": 6.36, "step": 1110 }, { "epoch": 0.69, "learning_rate": 0.0009608832807570978, "loss": 0.5371, "step": 1120 }, { "epoch": 0.69, "learning_rate": 0.0009602523659305994, "loss": 0.6767, "step": 1130 }, { "epoch": 0.7, "learning_rate": 0.0009596214511041009, "loss": 0.589, "step": 1140 }, { "epoch": 0.7, "eval_loss": 0.5333449840545654, "eval_runtime": 15.3542, "eval_samples_per_second": 76.266, "eval_steps_per_second": 6.383, "step": 1140 }, { "epoch": 0.7, "learning_rate": 0.0009589905362776026, "loss": 0.5914, "step": 1150 }, { "epoch": 0.71, "learning_rate": 0.0009583596214511041, "loss": 0.547, "step": 1160 }, { "epoch": 0.72, "learning_rate": 0.0009577287066246056, "loss": 0.577, "step": 1170 }, { "epoch": 0.72, "eval_loss": 0.520677924156189, "eval_runtime": 15.3767, "eval_samples_per_second": 76.154, "eval_steps_per_second": 6.373, "step": 1170 }, { "epoch": 0.72, "learning_rate": 0.0009570977917981073, "loss": 0.6002, "step": 1180 }, { "epoch": 0.73, "learning_rate": 0.0009564668769716088, "loss": 0.6236, "step": 1190 }, { "epoch": 0.73, "learning_rate": 0.0009558359621451105, "loss": 0.5757, "step": 1200 }, { "epoch": 0.73, "eval_loss": 0.5401705503463745, "eval_runtime": 15.3622, "eval_samples_per_second": 76.226, "eval_steps_per_second": 6.379, "step": 1200 }, { "epoch": 0.74, "learning_rate": 0.000955205047318612, "loss": 0.681, "step": 1210 }, { "epoch": 0.75, "learning_rate": 0.0009545741324921136, "loss": 0.5508, "step": 1220 }, { "epoch": 0.75, "learning_rate": 0.0009539432176656152, "loss": 0.5726, "step": 1230 }, { "epoch": 0.75, "eval_loss": 0.5101749897003174, "eval_runtime": 15.3465, "eval_samples_per_second": 76.304, "eval_steps_per_second": 6.386, "step": 1230 }, { "epoch": 0.76, "learning_rate": 0.0009533123028391167, "loss": 0.5528, "step": 1240 }, { "epoch": 0.76, "learning_rate": 0.0009526813880126183, "loss": 0.6107, "step": 1250 }, { "epoch": 0.77, "learning_rate": 0.0009520504731861199, "loss": 0.5675, "step": 1260 }, { "epoch": 0.77, "eval_loss": 0.5156053900718689, "eval_runtime": 15.3646, "eval_samples_per_second": 76.214, "eval_steps_per_second": 6.378, "step": 1260 }, { "epoch": 0.78, "learning_rate": 0.0009514195583596215, "loss": 0.5795, "step": 1270 }, { "epoch": 0.78, "learning_rate": 0.000950788643533123, "loss": 0.5489, "step": 1280 }, { "epoch": 0.79, "learning_rate": 0.0009501577287066246, "loss": 0.5931, "step": 1290 }, { "epoch": 0.79, "eval_loss": 0.5232254266738892, "eval_runtime": 15.3627, "eval_samples_per_second": 76.224, "eval_steps_per_second": 6.379, "step": 1290 }, { "epoch": 0.8, "learning_rate": 0.0009495268138801262, "loss": 0.6082, "step": 1300 }, { "epoch": 0.8, "learning_rate": 0.0009488958990536278, "loss": 0.6085, "step": 1310 }, { "epoch": 0.81, "learning_rate": 0.0009482649842271294, "loss": 0.5727, "step": 1320 }, { "epoch": 0.81, "eval_loss": 0.5149978399276733, "eval_runtime": 15.3731, "eval_samples_per_second": 76.172, "eval_steps_per_second": 6.375, "step": 1320 }, { "epoch": 0.81, "learning_rate": 0.0009476340694006309, "loss": 0.5912, "step": 1330 }, { "epoch": 0.82, "learning_rate": 0.0009470031545741325, "loss": 0.6397, "step": 1340 }, { "epoch": 0.83, "learning_rate": 0.0009463722397476341, "loss": 0.6233, "step": 1350 }, { "epoch": 0.83, "eval_loss": 0.5305810570716858, "eval_runtime": 15.3303, "eval_samples_per_second": 76.385, "eval_steps_per_second": 6.393, "step": 1350 }, { "epoch": 0.83, "learning_rate": 0.0009457413249211356, "loss": 0.54, "step": 1360 }, { "epoch": 0.84, "learning_rate": 0.0009451104100946373, "loss": 0.6118, "step": 1370 }, { "epoch": 0.84, "learning_rate": 0.0009444794952681388, "loss": 0.5661, "step": 1380 }, { "epoch": 0.84, "eval_loss": 0.5178245306015015, "eval_runtime": 15.3439, "eval_samples_per_second": 76.317, "eval_steps_per_second": 6.387, "step": 1380 }, { "epoch": 0.85, "learning_rate": 0.0009438485804416403, "loss": 0.6365, "step": 1390 }, { "epoch": 0.86, "learning_rate": 0.000943217665615142, "loss": 0.5635, "step": 1400 }, { "epoch": 0.86, "learning_rate": 0.0009425867507886435, "loss": 0.5993, "step": 1410 }, { "epoch": 0.86, "eval_loss": 0.5094326734542847, "eval_runtime": 15.3033, "eval_samples_per_second": 76.519, "eval_steps_per_second": 6.404, "step": 1410 }, { "epoch": 0.87, "learning_rate": 0.0009419558359621452, "loss": 0.5921, "step": 1420 }, { "epoch": 0.87, "learning_rate": 0.0009413249211356467, "loss": 0.5571, "step": 1430 }, { "epoch": 0.88, "learning_rate": 0.0009406940063091482, "loss": 0.5359, "step": 1440 }, { "epoch": 0.88, "eval_loss": 0.5110692977905273, "eval_runtime": 15.3327, "eval_samples_per_second": 76.373, "eval_steps_per_second": 6.392, "step": 1440 }, { "epoch": 0.89, "learning_rate": 0.0009400630914826499, "loss": 0.5363, "step": 1450 }, { "epoch": 0.89, "learning_rate": 0.0009394321766561514, "loss": 0.5619, "step": 1460 }, { "epoch": 0.9, "learning_rate": 0.000938801261829653, "loss": 0.5925, "step": 1470 }, { "epoch": 0.9, "eval_loss": 0.5252500176429749, "eval_runtime": 15.3611, "eval_samples_per_second": 76.231, "eval_steps_per_second": 6.38, "step": 1470 }, { "epoch": 0.91, "learning_rate": 0.0009381703470031546, "loss": 0.611, "step": 1480 }, { "epoch": 0.91, "learning_rate": 0.0009375394321766562, "loss": 0.581, "step": 1490 }, { "epoch": 0.92, "learning_rate": 0.0009369085173501577, "loss": 0.4984, "step": 1500 }, { "epoch": 0.92, "eval_loss": 0.5312231779098511, "eval_runtime": 15.5553, "eval_samples_per_second": 75.28, "eval_steps_per_second": 6.3, "step": 1500 }, { "epoch": 0.92, "learning_rate": 0.0009362776025236593, "loss": 0.5921, "step": 1510 }, { "epoch": 0.93, "learning_rate": 0.0009356466876971609, "loss": 0.5776, "step": 1520 }, { "epoch": 0.94, "learning_rate": 0.0009350157728706625, "loss": 0.5551, "step": 1530 }, { "epoch": 0.94, "eval_loss": 0.515774130821228, "eval_runtime": 15.3618, "eval_samples_per_second": 76.228, "eval_steps_per_second": 6.379, "step": 1530 }, { "epoch": 0.94, "learning_rate": 0.0009343848580441641, "loss": 0.5832, "step": 1540 }, { "epoch": 0.95, "learning_rate": 0.0009337539432176656, "loss": 0.5673, "step": 1550 }, { "epoch": 0.95, "learning_rate": 0.0009331230283911672, "loss": 0.6288, "step": 1560 }, { "epoch": 0.95, "eval_loss": 0.4972882866859436, "eval_runtime": 15.359, "eval_samples_per_second": 76.242, "eval_steps_per_second": 6.381, "step": 1560 }, { "epoch": 0.96, "learning_rate": 0.0009324921135646688, "loss": 0.555, "step": 1570 }, { "epoch": 0.97, "learning_rate": 0.0009318611987381703, "loss": 0.5405, "step": 1580 }, { "epoch": 0.97, "learning_rate": 0.000931230283911672, "loss": 0.6164, "step": 1590 }, { "epoch": 0.97, "eval_loss": 0.5089274644851685, "eval_runtime": 15.3184, "eval_samples_per_second": 76.444, "eval_steps_per_second": 6.398, "step": 1590 }, { "epoch": 0.98, "learning_rate": 0.0009305993690851735, "loss": 0.5609, "step": 1600 }, { "epoch": 0.98, "learning_rate": 0.000929968454258675, "loss": 0.6188, "step": 1610 }, { "epoch": 0.99, "learning_rate": 0.0009293375394321767, "loss": 0.5423, "step": 1620 }, { "epoch": 0.99, "eval_loss": 0.5065773129463196, "eval_runtime": 15.2816, "eval_samples_per_second": 76.628, "eval_steps_per_second": 6.413, "step": 1620 }, { "epoch": 1.0, "learning_rate": 0.0009287066246056782, "loss": 0.5393, "step": 1630 } ], "logging_steps": 10, "max_steps": 16350, "num_train_epochs": 10, "save_steps": 500, "total_flos": 1250508061458432.0, "trial_name": null, "trial_params": null }