Feature Extraction
sentence-transformers
Safetensors
modernbert
sparse-encoder
sparse
Generated from Trainer
dataset_size:202427
loss:SpladeColbertTopKLoss
loss:FlopsLoss
text-embeddings-inference
Instructions to use UBC-SLIME/sparcol-large-k512-no-cls with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- sentence-transformers
How to use UBC-SLIME/sparcol-large-k512-no-cls with sentence-transformers:
```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("UBC-SLIME/sparcol-large-k512-no-cls")

sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium."
]
embeddings = model.encode(sentences)

similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]
```
- Notebooks
- Google Colab
- Kaggle
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 20000, | |
| "global_step": 12652, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.003161555485298767, | |
| "grad_norm": 128.30313110351562, | |
| "learning_rate": 5.00526870389884e-07, | |
| "loss": 16.6252, | |
| "sparse_loss": 16.6252, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.006323110970597534, | |
| "grad_norm": 90.16519165039062, | |
| "learning_rate": 1.0273972602739725e-06, | |
| "loss": 14.3114, | |
| "sparse_loss": 14.3114, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.0094846664558963, | |
| "grad_norm": 159.9249267578125, | |
| "learning_rate": 1.554267650158061e-06, | |
| "loss": 14.3623, | |
| "sparse_loss": 14.3623, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.012646221941195067, | |
| "grad_norm": 81.25911712646484, | |
| "learning_rate": 2.08113804004215e-06, | |
| "loss": 11.6206, | |
| "sparse_loss": 11.6206, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.015807777426493835, | |
| "grad_norm": 152.113525390625, | |
| "learning_rate": 2.6080084299262384e-06, | |
| "loss": 9.1902, | |
| "sparse_loss": 9.1902, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0189693329117926, | |
| "grad_norm": 68.47811889648438, | |
| "learning_rate": 3.1348788198103265e-06, | |
| "loss": 6.6879, | |
| "sparse_loss": 6.6879, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.022130888397091368, | |
| "grad_norm": 61.585304260253906, | |
| "learning_rate": 3.661749209694415e-06, | |
| "loss": 4.3414, | |
| "sparse_loss": 4.3414, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.025292443882390134, | |
| "grad_norm": 32.29441452026367, | |
| "learning_rate": 4.188619599578504e-06, | |
| "loss": 2.8611, | |
| "sparse_loss": 2.8611, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.028453999367688904, | |
| "grad_norm": 23.945159912109375, | |
| "learning_rate": 4.715489989462593e-06, | |
| "loss": 2.3256, | |
| "sparse_loss": 2.3256, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.03161555485298767, | |
| "grad_norm": 28.487728118896484, | |
| "learning_rate": 5.242360379346681e-06, | |
| "loss": 2.1056, | |
| "sparse_loss": 2.1056, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.034777110338286434, | |
| "grad_norm": 21.144609451293945, | |
| "learning_rate": 5.76923076923077e-06, | |
| "loss": 2.2798, | |
| "sparse_loss": 2.2798, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.0379386658235852, | |
| "grad_norm": 57.145225524902344, | |
| "learning_rate": 6.296101159114858e-06, | |
| "loss": 2.2509, | |
| "sparse_loss": 2.2509, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.04110022130888397, | |
| "grad_norm": 35.35124588012695, | |
| "learning_rate": 6.822971548998947e-06, | |
| "loss": 2.0234, | |
| "sparse_loss": 2.0234, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.044261776794182736, | |
| "grad_norm": 27.13028907775879, | |
| "learning_rate": 7.349841938883036e-06, | |
| "loss": 1.9, | |
| "sparse_loss": 1.9, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.047423332279481506, | |
| "grad_norm": 17.84204864501953, | |
| "learning_rate": 7.876712328767124e-06, | |
| "loss": 2.0252, | |
| "sparse_loss": 2.0252, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.05058488776478027, | |
| "grad_norm": 99.34933471679688, | |
| "learning_rate": 8.403582718651212e-06, | |
| "loss": 1.8767, | |
| "sparse_loss": 1.8767, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.05374644325007904, | |
| "grad_norm": 20.827627182006836, | |
| "learning_rate": 8.930453108535302e-06, | |
| "loss": 1.7484, | |
| "sparse_loss": 1.7484, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.05690799873537781, | |
| "grad_norm": 22.89020538330078, | |
| "learning_rate": 9.457323498419388e-06, | |
| "loss": 1.7255, | |
| "sparse_loss": 1.7255, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.06006955422067657, | |
| "grad_norm": 18.195846557617188, | |
| "learning_rate": 9.984193888303478e-06, | |
| "loss": 1.6325, | |
| "sparse_loss": 1.6325, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.06323110970597534, | |
| "grad_norm": 12.281404495239258, | |
| "learning_rate": 1.0511064278187566e-05, | |
| "loss": 1.9005, | |
| "sparse_loss": 1.9005, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.06639266519127411, | |
| "grad_norm": 13.496838569641113, | |
| "learning_rate": 1.1037934668071655e-05, | |
| "loss": 1.6418, | |
| "sparse_loss": 1.6418, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.06955422067657287, | |
| "grad_norm": 13.643138885498047, | |
| "learning_rate": 1.1564805057955744e-05, | |
| "loss": 1.6877, | |
| "sparse_loss": 1.6877, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.07271577616187164, | |
| "grad_norm": 9.594386100769043, | |
| "learning_rate": 1.209167544783983e-05, | |
| "loss": 1.698, | |
| "sparse_loss": 1.698, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.0758773316471704, | |
| "grad_norm": 234.09896850585938, | |
| "learning_rate": 1.2618545837723922e-05, | |
| "loss": 1.6121, | |
| "sparse_loss": 1.6121, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.07903888713246918, | |
| "grad_norm": 11.300992965698242, | |
| "learning_rate": 1.3145416227608009e-05, | |
| "loss": 1.618, | |
| "sparse_loss": 1.618, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.08220044261776795, | |
| "grad_norm": 12.089790344238281, | |
| "learning_rate": 1.3672286617492097e-05, | |
| "loss": 1.5691, | |
| "sparse_loss": 1.5691, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.0853619981030667, | |
| "grad_norm": 6.890669345855713, | |
| "learning_rate": 1.4199157007376185e-05, | |
| "loss": 1.7044, | |
| "sparse_loss": 1.7044, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.08852355358836547, | |
| "grad_norm": 7.337052345275879, | |
| "learning_rate": 1.4726027397260275e-05, | |
| "loss": 1.5826, | |
| "sparse_loss": 1.5826, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.09168510907366424, | |
| "grad_norm": 11.253660202026367, | |
| "learning_rate": 1.5252897787144363e-05, | |
| "loss": 1.4962, | |
| "sparse_loss": 1.4962, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.09484666455896301, | |
| "grad_norm": 11.72204875946045, | |
| "learning_rate": 1.577976817702845e-05, | |
| "loss": 1.5067, | |
| "sparse_loss": 1.5067, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.09800822004426178, | |
| "grad_norm": 17.81527328491211, | |
| "learning_rate": 1.630663856691254e-05, | |
| "loss": 1.3541, | |
| "sparse_loss": 1.3541, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.10116977552956054, | |
| "grad_norm": 7.416006088256836, | |
| "learning_rate": 1.683350895679663e-05, | |
| "loss": 1.4784, | |
| "sparse_loss": 1.4784, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.10433133101485931, | |
| "grad_norm": 11.03427791595459, | |
| "learning_rate": 1.7360379346680716e-05, | |
| "loss": 1.4868, | |
| "sparse_loss": 1.4868, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.10749288650015808, | |
| "grad_norm": 6.934051036834717, | |
| "learning_rate": 1.7887249736564805e-05, | |
| "loss": 1.5381, | |
| "sparse_loss": 1.5381, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.11065444198545685, | |
| "grad_norm": 9.158458709716797, | |
| "learning_rate": 1.8414120126448895e-05, | |
| "loss": 1.4103, | |
| "sparse_loss": 1.4103, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.11381599747075562, | |
| "grad_norm": 15.296072959899902, | |
| "learning_rate": 1.894099051633298e-05, | |
| "loss": 1.4227, | |
| "sparse_loss": 1.4227, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.11697755295605437, | |
| "grad_norm": 10.86883544921875, | |
| "learning_rate": 1.946786090621707e-05, | |
| "loss": 1.5968, | |
| "sparse_loss": 1.5968, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.12013910844135314, | |
| "grad_norm": 13.303630828857422, | |
| "learning_rate": 1.999473129610116e-05, | |
| "loss": 1.3709, | |
| "sparse_loss": 1.3709, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.12330066392665191, | |
| "grad_norm": 7.9238362312316895, | |
| "learning_rate": 2.0521601685985248e-05, | |
| "loss": 1.3811, | |
| "sparse_loss": 1.3811, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.12646221941195068, | |
| "grad_norm": 17.758581161499023, | |
| "learning_rate": 2.1048472075869338e-05, | |
| "loss": 1.5005, | |
| "sparse_loss": 1.5005, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.12962377489724944, | |
| "grad_norm": 3.8343210220336914, | |
| "learning_rate": 2.1575342465753427e-05, | |
| "loss": 1.6216, | |
| "sparse_loss": 1.6216, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.13278533038254822, | |
| "grad_norm": 3.8403618335723877, | |
| "learning_rate": 2.2102212855637514e-05, | |
| "loss": 1.37, | |
| "sparse_loss": 1.37, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.13594688586784698, | |
| "grad_norm": 5.895383358001709, | |
| "learning_rate": 2.2629083245521604e-05, | |
| "loss": 1.4373, | |
| "sparse_loss": 1.4373, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.13910844135314573, | |
| "grad_norm": 8.331881523132324, | |
| "learning_rate": 2.315595363540569e-05, | |
| "loss": 1.4005, | |
| "sparse_loss": 1.4005, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.14226999683844452, | |
| "grad_norm": 12.499529838562012, | |
| "learning_rate": 2.368282402528978e-05, | |
| "loss": 1.402, | |
| "sparse_loss": 1.402, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.14543155232374327, | |
| "grad_norm": 8.521995544433594, | |
| "learning_rate": 2.420969441517387e-05, | |
| "loss": 1.5812, | |
| "sparse_loss": 1.5812, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.14859310780904206, | |
| "grad_norm": 6.057605743408203, | |
| "learning_rate": 2.4736564805057956e-05, | |
| "loss": 1.3752, | |
| "sparse_loss": 1.3752, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.1517546632943408, | |
| "grad_norm": 26.1428165435791, | |
| "learning_rate": 2.5263435194942046e-05, | |
| "loss": 1.4496, | |
| "sparse_loss": 1.4496, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.15491621877963957, | |
| "grad_norm": 11.980278015136719, | |
| "learning_rate": 2.5790305584826136e-05, | |
| "loss": 1.4868, | |
| "sparse_loss": 1.4868, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.15807777426493835, | |
| "grad_norm": 5.677769184112549, | |
| "learning_rate": 2.6317175974710222e-05, | |
| "loss": 1.2911, | |
| "sparse_loss": 1.2911, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.1612393297502371, | |
| "grad_norm": 7.899789333343506, | |
| "learning_rate": 2.6844046364594312e-05, | |
| "loss": 1.2455, | |
| "sparse_loss": 1.2455, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.1644008852355359, | |
| "grad_norm": 9.248682975769043, | |
| "learning_rate": 2.73709167544784e-05, | |
| "loss": 1.4636, | |
| "sparse_loss": 1.4636, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.16756244072083465, | |
| "grad_norm": 25.693893432617188, | |
| "learning_rate": 2.7897787144362485e-05, | |
| "loss": 1.431, | |
| "sparse_loss": 1.431, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.1707239962061334, | |
| "grad_norm": 5.5594940185546875, | |
| "learning_rate": 2.842465753424658e-05, | |
| "loss": 1.4264, | |
| "sparse_loss": 1.4264, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.1738855516914322, | |
| "grad_norm": 9.96285629272461, | |
| "learning_rate": 2.8951527924130668e-05, | |
| "loss": 1.379, | |
| "sparse_loss": 1.379, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.17704710717673094, | |
| "grad_norm": 7.128167629241943, | |
| "learning_rate": 2.9478398314014755e-05, | |
| "loss": 1.482, | |
| "sparse_loss": 1.482, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.18020866266202973, | |
| "grad_norm": 4.2342658042907715, | |
| "learning_rate": 3.000526870389884e-05, | |
| "loss": 1.3317, | |
| "sparse_loss": 1.3317, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.18337021814732848, | |
| "grad_norm": 5.94893217086792, | |
| "learning_rate": 3.053213909378293e-05, | |
| "loss": 1.3711, | |
| "sparse_loss": 1.3711, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.18653177363262724, | |
| "grad_norm": 9.314913749694824, | |
| "learning_rate": 3.105900948366702e-05, | |
| "loss": 1.3961, | |
| "sparse_loss": 1.3961, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.18969332911792602, | |
| "grad_norm": 12.692520141601562, | |
| "learning_rate": 3.1585879873551104e-05, | |
| "loss": 1.2723, | |
| "sparse_loss": 1.2723, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.19285488460322478, | |
| "grad_norm": 4.648574352264404, | |
| "learning_rate": 3.21127502634352e-05, | |
| "loss": 1.4275, | |
| "sparse_loss": 1.4275, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.19601644008852356, | |
| "grad_norm": 10.362495422363281, | |
| "learning_rate": 3.2639620653319283e-05, | |
| "loss": 1.3174, | |
| "sparse_loss": 1.3174, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.19917799557382232, | |
| "grad_norm": 226.89773559570312, | |
| "learning_rate": 3.316649104320337e-05, | |
| "loss": 1.4134, | |
| "sparse_loss": 1.4134, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.20233955105912108, | |
| "grad_norm": 3.8117411136627197, | |
| "learning_rate": 3.369336143308746e-05, | |
| "loss": 1.3341, | |
| "sparse_loss": 1.3341, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.20550110654441986, | |
| "grad_norm": 4.760183334350586, | |
| "learning_rate": 3.4220231822971546e-05, | |
| "loss": 1.2803, | |
| "sparse_loss": 1.2803, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.20866266202971862, | |
| "grad_norm": 4.356404781341553, | |
| "learning_rate": 3.4747102212855636e-05, | |
| "loss": 1.3362, | |
| "sparse_loss": 1.3362, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.2118242175150174, | |
| "grad_norm": 13.460466384887695, | |
| "learning_rate": 3.527397260273973e-05, | |
| "loss": 1.285, | |
| "sparse_loss": 1.285, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.21498577300031615, | |
| "grad_norm": 6.3744940757751465, | |
| "learning_rate": 3.5800842992623816e-05, | |
| "loss": 1.3639, | |
| "sparse_loss": 1.3639, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.2181473284856149, | |
| "grad_norm": 9.473126411437988, | |
| "learning_rate": 3.6327713382507905e-05, | |
| "loss": 1.2435, | |
| "sparse_loss": 1.2435, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.2213088839709137, | |
| "grad_norm": 3.342799425125122, | |
| "learning_rate": 3.6854583772391995e-05, | |
| "loss": 1.4165, | |
| "sparse_loss": 1.4165, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.22447043945621245, | |
| "grad_norm": 22.65892791748047, | |
| "learning_rate": 3.738145416227608e-05, | |
| "loss": 1.3688, | |
| "sparse_loss": 1.3688, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.22763199494151123, | |
| "grad_norm": 5.911210536956787, | |
| "learning_rate": 3.790832455216017e-05, | |
| "loss": 1.3473, | |
| "sparse_loss": 1.3473, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.23079355042681, | |
| "grad_norm": 4.169831275939941, | |
| "learning_rate": 3.843519494204426e-05, | |
| "loss": 1.2015, | |
| "sparse_loss": 1.2015, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.23395510591210875, | |
| "grad_norm": 3.30840802192688, | |
| "learning_rate": 3.896206533192835e-05, | |
| "loss": 1.1465, | |
| "sparse_loss": 1.1465, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.23711666139740753, | |
| "grad_norm": 84.1337661743164, | |
| "learning_rate": 3.948893572181244e-05, | |
| "loss": 1.1855, | |
| "sparse_loss": 1.1855, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.24027821688270629, | |
| "grad_norm": 7.522827625274658, | |
| "learning_rate": 4.001580611169653e-05, | |
| "loss": 1.2851, | |
| "sparse_loss": 1.2851, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.24343977236800507, | |
| "grad_norm": 197.22821044921875, | |
| "learning_rate": 4.054267650158061e-05, | |
| "loss": 1.5259, | |
| "sparse_loss": 1.5259, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.24660132785330383, | |
| "grad_norm": 7.006889820098877, | |
| "learning_rate": 4.10695468914647e-05, | |
| "loss": 1.3605, | |
| "sparse_loss": 1.3605, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.24976288333860258, | |
| "grad_norm": 11.081674575805664, | |
| "learning_rate": 4.159641728134879e-05, | |
| "loss": 1.3445, | |
| "sparse_loss": 1.3445, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.25292443882390137, | |
| "grad_norm": 5.594365119934082, | |
| "learning_rate": 4.212328767123288e-05, | |
| "loss": 1.2714, | |
| "sparse_loss": 1.2714, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.25608599430920015, | |
| "grad_norm": 4.106484413146973, | |
| "learning_rate": 4.265015806111697e-05, | |
| "loss": 1.3731, | |
| "sparse_loss": 1.3731, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.2592475497944989, | |
| "grad_norm": 4.2914533615112305, | |
| "learning_rate": 4.317702845100105e-05, | |
| "loss": 1.2776, | |
| "sparse_loss": 1.2776, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.26240910527979766, | |
| "grad_norm": 4.921117782592773, | |
| "learning_rate": 4.370389884088514e-05, | |
| "loss": 1.4268, | |
| "sparse_loss": 1.4268, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.26557066076509644, | |
| "grad_norm": 4.515608787536621, | |
| "learning_rate": 4.423076923076923e-05, | |
| "loss": 1.3297, | |
| "sparse_loss": 1.3297, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.2687322162503952, | |
| "grad_norm": 4.874992847442627, | |
| "learning_rate": 4.4757639620653316e-05, | |
| "loss": 1.3508, | |
| "sparse_loss": 1.3508, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.27189377173569396, | |
| "grad_norm": 9.342555046081543, | |
| "learning_rate": 4.528451001053741e-05, | |
| "loss": 1.2828, | |
| "sparse_loss": 1.2828, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.27505532722099274, | |
| "grad_norm": 22.3294734954834, | |
| "learning_rate": 4.58113804004215e-05, | |
| "loss": 1.1539, | |
| "sparse_loss": 1.1539, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.27821688270629147, | |
| "grad_norm": 4.5409321784973145, | |
| "learning_rate": 4.6338250790305585e-05, | |
| "loss": 1.2459, | |
| "sparse_loss": 1.2459, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.28137843819159025, | |
| "grad_norm": 10.592023849487305, | |
| "learning_rate": 4.6865121180189675e-05, | |
| "loss": 1.241, | |
| "sparse_loss": 1.241, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.28453999367688904, | |
| "grad_norm": 92.1581802368164, | |
| "learning_rate": 4.7391991570073765e-05, | |
| "loss": 1.2905, | |
| "sparse_loss": 1.2905, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.2877015491621878, | |
| "grad_norm": 8.384659767150879, | |
| "learning_rate": 4.791886195995785e-05, | |
| "loss": 1.2808, | |
| "sparse_loss": 1.2808, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.29086310464748655, | |
| "grad_norm": 3.508902072906494, | |
| "learning_rate": 4.8445732349841945e-05, | |
| "loss": 1.2958, | |
| "sparse_loss": 1.2958, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.29402466013278533, | |
| "grad_norm": 2.9955217838287354, | |
| "learning_rate": 4.8972602739726034e-05, | |
| "loss": 1.0703, | |
| "sparse_loss": 1.0703, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.2971862156180841, | |
| "grad_norm": 3.8358285427093506, | |
| "learning_rate": 4.949947312961012e-05, | |
| "loss": 1.3482, | |
| "sparse_loss": 1.3482, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.30034777110338284, | |
| "grad_norm": 4.9970479011535645, | |
| "learning_rate": 4.999999893323271e-05, | |
| "loss": 1.2378, | |
| "sparse_loss": 1.2378, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.3035093265886816, | |
| "grad_norm": 5.74011754989624, | |
| "learning_rate": 4.999952955709672e-05, | |
| "loss": 1.2849, | |
| "sparse_loss": 1.2849, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.3066708820739804, | |
| "grad_norm": 4.790186882019043, | |
| "learning_rate": 4.999820678560873e-05, | |
| "loss": 1.2481, | |
| "sparse_loss": 1.2481, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.30983243755927914, | |
| "grad_norm": 4.901861667633057, | |
| "learning_rate": 4.999603066392346e-05, | |
| "loss": 1.2009, | |
| "sparse_loss": 1.2009, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.3129939930445779, | |
| "grad_norm": 8.495441436767578, | |
| "learning_rate": 4.999300126632601e-05, | |
| "loss": 1.2779, | |
| "sparse_loss": 1.2779, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.3161555485298767, | |
| "grad_norm": 3.7238235473632812, | |
| "learning_rate": 4.998911869622926e-05, | |
| "loss": 1.0882, | |
| "sparse_loss": 1.0882, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.3193171040151755, | |
| "grad_norm": 7.2108354568481445, | |
| "learning_rate": 4.998438308617042e-05, | |
| "loss": 1.3365, | |
| "sparse_loss": 1.3365, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.3224786595004742, | |
| "grad_norm": 3.252174139022827, | |
| "learning_rate": 4.997879459780641e-05, | |
| "loss": 1.3396, | |
| "sparse_loss": 1.3396, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.325640214985773, | |
| "grad_norm": 4.206798076629639, | |
| "learning_rate": 4.997235342190843e-05, | |
| "loss": 1.3244, | |
| "sparse_loss": 1.3244, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.3288017704710718, | |
| "grad_norm": 3.757575511932373, | |
| "learning_rate": 4.996505977835541e-05, | |
| "loss": 1.1792, | |
| "sparse_loss": 1.1792, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.3319633259563705, | |
| "grad_norm": 5.497296333312988, | |
| "learning_rate": 4.995691391612649e-05, | |
| "loss": 1.2505, | |
| "sparse_loss": 1.2505, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.3351248814416693, | |
| "grad_norm": 5.120765686035156, | |
| "learning_rate": 4.994791611329253e-05, | |
| "loss": 1.2359, | |
| "sparse_loss": 1.2359, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.3382864369269681, | |
| "grad_norm": 15.797529220581055, | |
| "learning_rate": 4.9938066677006644e-05, | |
| "loss": 1.2179, | |
| "sparse_loss": 1.2179, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.3414479924122668, | |
| "grad_norm": 7.707025051116943, | |
| "learning_rate": 4.9927365943493686e-05, | |
| "loss": 1.2451, | |
| "sparse_loss": 1.2451, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.3446095478975656, | |
| "grad_norm": 4.341976642608643, | |
| "learning_rate": 4.991581427803879e-05, | |
| "loss": 1.0311, | |
| "sparse_loss": 1.0311, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.3477711033828644, | |
| "grad_norm": 4.378412246704102, | |
| "learning_rate": 4.990341207497485e-05, | |
| "loss": 1.12, | |
| "sparse_loss": 1.12, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.35093265886816316, | |
| "grad_norm": 7.830859661102295, | |
| "learning_rate": 4.989015975766916e-05, | |
| "loss": 1.0732, | |
| "sparse_loss": 1.0732, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.3540942143534619, | |
| "grad_norm": 4.191309452056885, | |
| "learning_rate": 4.987605777850886e-05, | |
| "loss": 1.1785, | |
| "sparse_loss": 1.1785, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.3572557698387607, | |
| "grad_norm": 21.474308013916016, | |
| "learning_rate": 4.986110661888555e-05, | |
| "loss": 1.1085, | |
| "sparse_loss": 1.1085, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.36041732532405946, | |
| "grad_norm": 5.993708610534668, | |
| "learning_rate": 4.9845306789178833e-05, | |
| "loss": 1.2287, | |
| "sparse_loss": 1.2287, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.3635788808093582, | |
| "grad_norm": 3.8776209354400635, | |
| "learning_rate": 4.982865882873893e-05, | |
| "loss": 1.0967, | |
| "sparse_loss": 1.0967, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.36674043629465697, | |
| "grad_norm": 4.90541934967041, | |
| "learning_rate": 4.9811163305868185e-05, | |
| "loss": 1.157, | |
| "sparse_loss": 1.157, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.36990199177995575, | |
| "grad_norm": 14.786788940429688, | |
| "learning_rate": 4.9792820817801776e-05, | |
| "loss": 1.1239, | |
| "sparse_loss": 1.1239, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.3730635472652545, | |
| "grad_norm": 10.61219596862793, | |
| "learning_rate": 4.977363199068724e-05, | |
| "loss": 1.1468, | |
| "sparse_loss": 1.1468, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.37622510275055326, | |
| "grad_norm": 3.993335008621216, | |
| "learning_rate": 4.9753597479563135e-05, | |
| "loss": 1.1354, | |
| "sparse_loss": 1.1354, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.37938665823585205, | |
| "grad_norm": 25.283395767211914, | |
| "learning_rate": 4.9732717968336684e-05, | |
| "loss": 1.2344, | |
| "sparse_loss": 1.2344, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.38254821372115083, | |
| "grad_norm": 3.873542547225952, | |
| "learning_rate": 4.971099416976041e-05, | |
| "loss": 1.0804, | |
| "sparse_loss": 1.0804, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.38570976920644956, | |
| "grad_norm": 6.974337100982666, | |
| "learning_rate": 4.968842682540782e-05, | |
| "loss": 1.4594, | |
| "sparse_loss": 1.4594, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.38887132469174834, | |
| "grad_norm": 3.187790870666504, | |
| "learning_rate": 4.966501670564807e-05, | |
| "loss": 1.1694, | |
| "sparse_loss": 1.1694, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.3920328801770471, | |
| "grad_norm": 12.773173332214355, | |
| "learning_rate": 4.964076460961971e-05, | |
| "loss": 1.066, | |
| "sparse_loss": 1.066, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.39519443566234586, | |
| "grad_norm": 4.0825371742248535, | |
| "learning_rate": 4.961567136520335e-05, | |
| "loss": 1.0973, | |
| "sparse_loss": 1.0973, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.39835599114764464, | |
| "grad_norm": 3.0021812915802, | |
| "learning_rate": 4.958973782899344e-05, | |
| "loss": 1.3674, | |
| "sparse_loss": 1.3674, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.4015175466329434, | |
| "grad_norm": 4.639908790588379, | |
| "learning_rate": 4.9562964886269005e-05, | |
| "loss": 1.169, | |
| "sparse_loss": 1.169, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.40467910211824215, | |
| "grad_norm": 151.06048583984375, | |
| "learning_rate": 4.953535345096344e-05, | |
| "loss": 1.4649, | |
| "sparse_loss": 1.4649, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.40784065760354093, | |
| "grad_norm": 33.80763244628906, | |
| "learning_rate": 4.95069044656333e-05, | |
| "loss": 4.1995, | |
| "sparse_loss": 4.1995, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.4110022130888397, | |
| "grad_norm": 5.223995685577393, | |
| "learning_rate": 4.947761890142615e-05, | |
| "loss": 1.5354, | |
| "sparse_loss": 1.5354, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.4141637685741385, | |
| "grad_norm": 15.408181190490723, | |
| "learning_rate": 4.9447497758047354e-05, | |
| "loss": 1.2286, | |
| "sparse_loss": 1.2286, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.41732532405943723, | |
| "grad_norm": 2.977509021759033, | |
| "learning_rate": 4.941654206372602e-05, | |
| "loss": 1.1715, | |
| "sparse_loss": 1.1715, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.420486879544736, | |
| "grad_norm": 4.334832191467285, | |
| "learning_rate": 4.9384752875179876e-05, | |
| "loss": 1.1055, | |
| "sparse_loss": 1.1055, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.4236484350300348, | |
| "grad_norm": 5.943917751312256, | |
| "learning_rate": 4.9352131277579144e-05, | |
| "loss": 1.0437, | |
| "sparse_loss": 1.0437, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.4268099905153335, | |
| "grad_norm": 4.5376996994018555, | |
| "learning_rate": 4.931867838450959e-05, | |
| "loss": 1.0396, | |
| "sparse_loss": 1.0396, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.4299715460006323, | |
| "grad_norm": 8.081282615661621, | |
| "learning_rate": 4.928439533793443e-05, | |
| "loss": 1.0279, | |
| "sparse_loss": 1.0279, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.4331331014859311, | |
| "grad_norm": 2.9060018062591553, | |
| "learning_rate": 4.92492833081554e-05, | |
| "loss": 1.0892, | |
| "sparse_loss": 1.0892, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.4362946569712298, | |
| "grad_norm": 1.9138152599334717, | |
| "learning_rate": 4.921334349377277e-05, | |
| "loss": 1.0709, | |
| "sparse_loss": 1.0709, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.4394562124565286, | |
| "grad_norm": 3.0852744579315186, | |
| "learning_rate": 4.917657712164445e-05, | |
| "loss": 1.0889, | |
| "sparse_loss": 1.0889, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.4426177679418274, | |
| "grad_norm": 2.95231294631958, | |
| "learning_rate": 4.91389854468441e-05, | |
| "loss": 1.1733, | |
| "sparse_loss": 1.1733, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.4457793234271262, | |
| "grad_norm": 4.986962795257568, | |
| "learning_rate": 4.910056975261829e-05, | |
| "loss": 1.1991, | |
| "sparse_loss": 1.1991, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.4489408789124249, | |
| "grad_norm": 4.206987380981445, | |
| "learning_rate": 4.906133135034269e-05, | |
| "loss": 1.2295, | |
| "sparse_loss": 1.2295, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.4521024343977237, | |
| "grad_norm": 4.294253349304199, | |
| "learning_rate": 4.902127157947732e-05, | |
| "loss": 1.0959, | |
| "sparse_loss": 1.0959, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.45526398988302247, | |
| "grad_norm": 4.660038948059082, | |
| "learning_rate": 4.898039180752079e-05, | |
| "loss": 1.1378, | |
| "sparse_loss": 1.1378, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.4584255453683212, | |
| "grad_norm": 14.10377311706543, | |
| "learning_rate": 4.893869342996367e-05, | |
| "loss": 1.0923, | |
| "sparse_loss": 1.0923, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.46158710085362, | |
| "grad_norm": 55.42957305908203, | |
| "learning_rate": 4.889617787024079e-05, | |
| "loss": 1.1051, | |
| "sparse_loss": 1.1051, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.46474865633891876, | |
| "grad_norm": 2.9921209812164307, | |
| "learning_rate": 4.885284657968272e-05, | |
| "loss": 1.0354, | |
| "sparse_loss": 1.0354, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.4679102118242175, | |
| "grad_norm": 1.9534173011779785, | |
| "learning_rate": 4.880870103746617e-05, | |
| "loss": 1.1892, | |
| "sparse_loss": 1.1892, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.4710717673095163, | |
| "grad_norm": 3.1078741550445557, | |
| "learning_rate": 4.8763742750563515e-05, | |
| "loss": 1.1636, | |
| "sparse_loss": 1.1636, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.47423332279481506, | |
| "grad_norm": 5.104287147521973, | |
| "learning_rate": 4.8717973253691365e-05, | |
| "loss": 1.2148, | |
| "sparse_loss": 1.2148, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.47739487828011384, | |
| "grad_norm": 3.2280941009521484, | |
| "learning_rate": 4.867139410925815e-05, | |
| "loss": 1.196, | |
| "sparse_loss": 1.196, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.48055643376541257, | |
| "grad_norm": 2.9964301586151123, | |
| "learning_rate": 4.8624006907310804e-05, | |
| "loss": 1.1401, | |
| "sparse_loss": 1.1401, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.48371798925071136, | |
| "grad_norm": 8.02204418182373, | |
| "learning_rate": 4.857581326548049e-05, | |
| "loss": 1.1311, | |
| "sparse_loss": 1.1311, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.48687954473601014, | |
| "grad_norm": 4.118027687072754, | |
| "learning_rate": 4.852681482892735e-05, | |
| "loss": 1.191, | |
| "sparse_loss": 1.191, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.49004110022130887, | |
| "grad_norm": 2.0322353839874268, | |
| "learning_rate": 4.847701327028439e-05, | |
| "loss": 1.1354, | |
| "sparse_loss": 1.1354, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.49320265570660765, | |
| "grad_norm": 3.882215738296509, | |
| "learning_rate": 4.8426410289600356e-05, | |
| "loss": 1.1447, | |
| "sparse_loss": 1.1447, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.49636421119190643, | |
| "grad_norm": 45.387672424316406, | |
| "learning_rate": 4.837500761428167e-05, | |
| "loss": 0.9855, | |
| "sparse_loss": 0.9855, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.49952576667720516, | |
| "grad_norm": 4.83632755279541, | |
| "learning_rate": 4.832280699903355e-05, | |
| "loss": 1.1387, | |
| "sparse_loss": 1.1387, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.502687322162504, | |
| "grad_norm": 2.312962532043457, | |
| "learning_rate": 4.826981022580001e-05, | |
| "loss": 1.2482, | |
| "sparse_loss": 1.2482, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.5058488776478027, | |
| "grad_norm": 6.244421482086182, | |
| "learning_rate": 4.821601910370308e-05, | |
| "loss": 1.0939, | |
| "sparse_loss": 1.0939, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.5090104331331015, | |
| "grad_norm": 4.036285400390625, | |
| "learning_rate": 4.8161435468981074e-05, | |
| "loss": 1.1258, | |
| "sparse_loss": 1.1258, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.5121719886184003, | |
| "grad_norm": 7.3748884201049805, | |
| "learning_rate": 4.8106061184925856e-05, | |
| "loss": 1.0983, | |
| "sparse_loss": 1.0983, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.515333544103699, | |
| "grad_norm": 2.0951173305511475, | |
| "learning_rate": 4.804989814181926e-05, | |
| "loss": 0.9883, | |
| "sparse_loss": 0.9883, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.5184950995889978, | |
| "grad_norm": 5.343947410583496, | |
| "learning_rate": 4.799294825686855e-05, | |
| "loss": 1.0743, | |
| "sparse_loss": 1.0743, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.5216566550742966, | |
| "grad_norm": 4.659270286560059, | |
| "learning_rate": 4.793521347414102e-05, | |
| "loss": 1.0794, | |
| "sparse_loss": 1.0794, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.5248182105595953, | |
| "grad_norm": 4.440898895263672, | |
| "learning_rate": 4.787669576449755e-05, | |
| "loss": 1.0888, | |
| "sparse_loss": 1.0888, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.527979766044894, | |
| "grad_norm": 3.1608309745788574, | |
| "learning_rate": 4.781739712552539e-05, | |
| "loss": 1.0183, | |
| "sparse_loss": 1.0183, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.5311413215301929, | |
| "grad_norm": 5.888028144836426, | |
| "learning_rate": 4.775731958146995e-05, | |
| "loss": 1.0622, | |
| "sparse_loss": 1.0622, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.5343028770154916, | |
| "grad_norm": 5.10470724105835, | |
| "learning_rate": 4.769646518316568e-05, | |
| "loss": 1.0711, | |
| "sparse_loss": 1.0711, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.5374644325007903, | |
| "grad_norm": 6.20997953414917, | |
| "learning_rate": 4.763483600796612e-05, | |
| "loss": 1.0966, | |
| "sparse_loss": 1.0966, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.5406259879860892, | |
| "grad_norm": 4.204266548156738, | |
| "learning_rate": 4.757243415967291e-05, | |
| "loss": 1.0076, | |
| "sparse_loss": 1.0076, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.5437875434713879, | |
| "grad_norm": 2.871713399887085, | |
| "learning_rate": 4.750926176846404e-05, | |
| "loss": 1.0147, | |
| "sparse_loss": 1.0147, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.5469490989566866, | |
| "grad_norm": 3.438703775405884, | |
| "learning_rate": 4.744532099082107e-05, | |
| "loss": 0.948, | |
| "sparse_loss": 0.948, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.5501106544419855, | |
| "grad_norm": 2.7154433727264404, | |
| "learning_rate": 4.7380614009455595e-05, | |
| "loss": 1.0644, | |
| "sparse_loss": 1.0644, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.5532722099272842, | |
| "grad_norm": 34.20964813232422, | |
| "learning_rate": 4.7315143033234654e-05, | |
| "loss": 1.0487, | |
| "sparse_loss": 1.0487, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.5564337654125829, | |
| "grad_norm": 3.2565040588378906, | |
| "learning_rate": 4.724891029710537e-05, | |
| "loss": 1.0309, | |
| "sparse_loss": 1.0309, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.5595953208978818, | |
| "grad_norm": 2.995903968811035, | |
| "learning_rate": 4.7181918062018674e-05, | |
| "loss": 1.2914, | |
| "sparse_loss": 1.2914, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.5627568763831805, | |
| "grad_norm": 1.9953080415725708, | |
| "learning_rate": 4.7114168614852064e-05, | |
| "loss": 1.0632, | |
| "sparse_loss": 1.0632, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.5659184318684793, | |
| "grad_norm": 2.5321481227874756, | |
| "learning_rate": 4.70456642683316e-05, | |
| "loss": 1.1658, | |
| "sparse_loss": 1.1658, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.5690799873537781, | |
| "grad_norm": 4.2941131591796875, | |
| "learning_rate": 4.697640736095292e-05, | |
| "loss": 1.0742, | |
| "sparse_loss": 1.0742, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.5722415428390768, | |
| "grad_norm": 2.0793707370758057, | |
| "learning_rate": 4.690640025690143e-05, | |
| "loss": 1.1901, | |
| "sparse_loss": 1.1901, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.5754030983243756, | |
| "grad_norm": 9.403236389160156, | |
| "learning_rate": 4.683564534597159e-05, | |
| "loss": 0.9989, | |
| "sparse_loss": 0.9989, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.5785646538096744, | |
| "grad_norm": 4.044275760650635, | |
| "learning_rate": 4.676414504348533e-05, | |
| "loss": 1.0833, | |
| "sparse_loss": 1.0833, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.5817262092949731, | |
| "grad_norm": 3.725008487701416, | |
| "learning_rate": 4.669190179020962e-05, | |
| "loss": 1.3316, | |
| "sparse_loss": 1.3316, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.5848877647802719, | |
| "grad_norm": 2.5833184719085693, | |
| "learning_rate": 4.661891805227313e-05, | |
| "loss": 1.3381, | |
| "sparse_loss": 1.3381, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.5880493202655707, | |
| "grad_norm": 112.98980712890625, | |
| "learning_rate": 4.654519632108204e-05, | |
| "loss": 1.1105, | |
| "sparse_loss": 1.1105, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.5912108757508694, | |
| "grad_norm": 3.47724986076355, | |
| "learning_rate": 4.6470739113235026e-05, | |
| "loss": 1.0299, | |
| "sparse_loss": 1.0299, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.5943724312361682, | |
| "grad_norm": 5.579695701599121, | |
| "learning_rate": 4.639554897043731e-05, | |
| "loss": 0.9969, | |
| "sparse_loss": 0.9969, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.597533986721467, | |
| "grad_norm": 3.7847225666046143, | |
| "learning_rate": 4.6319628459413946e-05, | |
| "loss": 0.9187, | |
| "sparse_loss": 0.9187, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.6006955422067657, | |
| "grad_norm": 3.587329864501953, | |
| "learning_rate": 4.6242980171822134e-05, | |
| "loss": 1.0435, | |
| "sparse_loss": 1.0435, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.6038570976920645, | |
| "grad_norm": 8.99221420288086, | |
| "learning_rate": 4.6165606724162816e-05, | |
| "loss": 1.0201, | |
| "sparse_loss": 1.0201, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.6070186531773633, | |
| "grad_norm": 2.8481829166412354, | |
| "learning_rate": 4.608751075769131e-05, | |
| "loss": 1.2422, | |
| "sparse_loss": 1.2422, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.610180208662662, | |
| "grad_norm": 10.8621187210083, | |
| "learning_rate": 4.600869493832718e-05, | |
| "loss": 1.0296, | |
| "sparse_loss": 1.0296, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.6133417641479608, | |
| "grad_norm": 2.5077457427978516, | |
| "learning_rate": 4.592916195656322e-05, | |
| "loss": 1.0305, | |
| "sparse_loss": 1.0305, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.6165033196332595, | |
| "grad_norm": 4.512426376342773, | |
| "learning_rate": 4.5848914527373574e-05, | |
| "loss": 1.3777, | |
| "sparse_loss": 1.3777, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.6196648751185583, | |
| "grad_norm": 5.227989196777344, | |
| "learning_rate": 4.576795539012114e-05, | |
| "loss": 0.9716, | |
| "sparse_loss": 0.9716, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.6228264306038571, | |
| "grad_norm": 2.234457015991211, | |
| "learning_rate": 4.568628730846397e-05, | |
| "loss": 0.8175, | |
| "sparse_loss": 0.8175, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.6259879860891558, | |
| "grad_norm": 2.7576990127563477, | |
| "learning_rate": 4.560391307026097e-05, | |
| "loss": 1.0889, | |
| "sparse_loss": 1.0889, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.6291495415744547, | |
| "grad_norm": 3.3322880268096924, | |
| "learning_rate": 4.5520835487476753e-05, | |
| "loss": 0.9866, | |
| "sparse_loss": 0.9866, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 0.6323110970597534, | |
| "grad_norm": 2.961534261703491, | |
| "learning_rate": 4.5437057396085584e-05, | |
| "loss": 0.9558, | |
| "sparse_loss": 0.9558, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.6354726525450521, | |
| "grad_norm": 8.994413375854492, | |
| "learning_rate": 4.535258165597465e-05, | |
| "loss": 0.9634, | |
| "sparse_loss": 0.9634, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 0.638634208030351, | |
| "grad_norm": 9.025190353393555, | |
| "learning_rate": 4.526741115084636e-05, | |
| "loss": 1.0034, | |
| "sparse_loss": 1.0034, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 0.6417957635156497, | |
| "grad_norm": 5.431192398071289, | |
| "learning_rate": 4.518154878811997e-05, | |
| "loss": 1.0137, | |
| "sparse_loss": 1.0137, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 0.6449573190009484, | |
| "grad_norm": 7.579077243804932, | |
| "learning_rate": 4.509499749883226e-05, | |
| "loss": 1.0167, | |
| "sparse_loss": 1.0167, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 0.6481188744862473, | |
| "grad_norm": 5.1751909255981445, | |
| "learning_rate": 4.5007760237537566e-05, | |
| "loss": 0.9525, | |
| "sparse_loss": 0.9525, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.651280429971546, | |
| "grad_norm": 5.494132995605469, | |
| "learning_rate": 4.491983998220686e-05, | |
| "loss": 0.978, | |
| "sparse_loss": 0.978, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 0.6544419854568447, | |
| "grad_norm": 2.640233278274536, | |
| "learning_rate": 4.483123973412611e-05, | |
| "loss": 1.0011, | |
| "sparse_loss": 1.0011, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 0.6576035409421436, | |
| "grad_norm": 7.292989253997803, | |
| "learning_rate": 4.474196251779381e-05, | |
| "loss": 1.0074, | |
| "sparse_loss": 1.0074, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.6607650964274423, | |
| "grad_norm": 87.05461120605469, | |
| "learning_rate": 4.465201138081778e-05, | |
| "loss": 1.0582, | |
| "sparse_loss": 1.0582, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 0.663926651912741, | |
| "grad_norm": 4.470405578613281, | |
| "learning_rate": 4.4561389393811096e-05, | |
| "loss": 1.0093, | |
| "sparse_loss": 1.0093, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.6670882073980399, | |
| "grad_norm": 4.9339518547058105, | |
| "learning_rate": 4.4470099650287255e-05, | |
| "loss": 0.9241, | |
| "sparse_loss": 0.9241, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 0.6702497628833386, | |
| "grad_norm": 2.3951964378356934, | |
| "learning_rate": 4.4378145266554625e-05, | |
| "loss": 0.9304, | |
| "sparse_loss": 0.9304, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 0.6734113183686373, | |
| "grad_norm": 2.9704620838165283, | |
| "learning_rate": 4.428552938161002e-05, | |
| "loss": 1.04, | |
| "sparse_loss": 1.04, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 0.6765728738539362, | |
| "grad_norm": 46.931556701660156, | |
| "learning_rate": 4.419225515703155e-05, | |
| "loss": 0.9433, | |
| "sparse_loss": 0.9433, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 0.6797344293392349, | |
| "grad_norm": 2.5629138946533203, | |
| "learning_rate": 4.4098325776870734e-05, | |
| "loss": 1.2199, | |
| "sparse_loss": 1.2199, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.6828959848245336, | |
| "grad_norm": 40.42338180541992, | |
| "learning_rate": 4.400374444754376e-05, | |
| "loss": 0.9545, | |
| "sparse_loss": 0.9545, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.6860575403098325, | |
| "grad_norm": 3.68564510345459, | |
| "learning_rate": 4.3908514397722064e-05, | |
| "loss": 0.9973, | |
| "sparse_loss": 0.9973, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 0.6892190957951312, | |
| "grad_norm": 2.8047549724578857, | |
| "learning_rate": 4.3812638878222095e-05, | |
| "loss": 1.0985, | |
| "sparse_loss": 1.0985, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 0.69238065128043, | |
| "grad_norm": 4.292630672454834, | |
| "learning_rate": 4.371612116189434e-05, | |
| "loss": 0.8444, | |
| "sparse_loss": 0.8444, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 0.6955422067657288, | |
| "grad_norm": 2.901982307434082, | |
| "learning_rate": 4.361896454351162e-05, | |
| "loss": 0.9891, | |
| "sparse_loss": 0.9891, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.6987037622510275, | |
| "grad_norm": 5.447702407836914, | |
| "learning_rate": 4.3521172339656616e-05, | |
| "loss": 1.0007, | |
| "sparse_loss": 1.0007, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 0.7018653177363263, | |
| "grad_norm": 7.081545829772949, | |
| "learning_rate": 4.342274788860863e-05, | |
| "loss": 1.3366, | |
| "sparse_loss": 1.3366, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 0.705026873221625, | |
| "grad_norm": 4.312632083892822, | |
| "learning_rate": 4.332369455022965e-05, | |
| "loss": 0.9928, | |
| "sparse_loss": 0.9928, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 0.7081884287069238, | |
| "grad_norm": 2.817713499069214, | |
| "learning_rate": 4.322401570584965e-05, | |
| "loss": 0.9269, | |
| "sparse_loss": 0.9269, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 0.7113499841922226, | |
| "grad_norm": 3.021247148513794, | |
| "learning_rate": 4.312371475815116e-05, | |
| "loss": 0.8685, | |
| "sparse_loss": 0.8685, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.7145115396775213, | |
| "grad_norm": 4.620492935180664, | |
| "learning_rate": 4.3022795131053104e-05, | |
| "loss": 0.9361, | |
| "sparse_loss": 0.9361, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 0.7176730951628201, | |
| "grad_norm": 3.594322443008423, | |
| "learning_rate": 4.2921260269593954e-05, | |
| "loss": 1.0142, | |
| "sparse_loss": 1.0142, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 0.7208346506481189, | |
| "grad_norm": 2.517115831375122, | |
| "learning_rate": 4.281911363981407e-05, | |
| "loss": 0.9497, | |
| "sparse_loss": 0.9497, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 0.7239962061334176, | |
| "grad_norm": 2.5836758613586426, | |
| "learning_rate": 4.271635872863744e-05, | |
| "loss": 0.9003, | |
| "sparse_loss": 0.9003, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 0.7271577616187164, | |
| "grad_norm": 3.6526050567626953, | |
| "learning_rate": 4.261299904375261e-05, | |
| "loss": 1.6835, | |
| "sparse_loss": 1.6835, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.7303193171040152, | |
| "grad_norm": 3.3744492530822754, | |
| "learning_rate": 4.250903811349297e-05, | |
| "loss": 0.9629, | |
| "sparse_loss": 0.9629, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 0.7334808725893139, | |
| "grad_norm": 4.435449123382568, | |
| "learning_rate": 4.240447948671628e-05, | |
| "loss": 0.9577, | |
| "sparse_loss": 0.9577, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 0.7366424280746127, | |
| "grad_norm": 10.854448318481445, | |
| "learning_rate": 4.2299326732683555e-05, | |
| "loss": 0.8803, | |
| "sparse_loss": 0.8803, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 0.7398039835599115, | |
| "grad_norm": 13.589402198791504, | |
| "learning_rate": 4.219358344093719e-05, | |
| "loss": 0.8339, | |
| "sparse_loss": 0.8339, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 0.7429655390452102, | |
| "grad_norm": 3.220682144165039, | |
| "learning_rate": 4.208725322117848e-05, | |
| "loss": 0.98, | |
| "sparse_loss": 0.98, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.746127094530509, | |
| "grad_norm": 3.848068952560425, | |
| "learning_rate": 4.1980339703144325e-05, | |
| "loss": 0.9304, | |
| "sparse_loss": 0.9304, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 0.7492886500158078, | |
| "grad_norm": 4.343535423278809, | |
| "learning_rate": 4.1872846536483377e-05, | |
| "loss": 0.9197, | |
| "sparse_loss": 0.9197, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 0.7524502055011065, | |
| "grad_norm": 22.216102600097656, | |
| "learning_rate": 4.176477739063146e-05, | |
| "loss": 0.9199, | |
| "sparse_loss": 0.9199, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 0.7556117609864053, | |
| "grad_norm": 4.729385852813721, | |
| "learning_rate": 4.165613595468624e-05, | |
| "loss": 0.9136, | |
| "sparse_loss": 0.9136, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 0.7587733164717041, | |
| "grad_norm": 3.364084482192993, | |
| "learning_rate": 4.1546925937281376e-05, | |
| "loss": 1.0395, | |
| "sparse_loss": 1.0395, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.7619348719570028, | |
| "grad_norm": 2.467050790786743, | |
| "learning_rate": 4.143715106645986e-05, | |
| "loss": 0.896, | |
| "sparse_loss": 0.896, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 0.7650964274423017, | |
| "grad_norm": 81.41665649414062, | |
| "learning_rate": 4.13268150895468e-05, | |
| "loss": 1.0098, | |
| "sparse_loss": 1.0098, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 0.7682579829276004, | |
| "grad_norm": 2.784630298614502, | |
| "learning_rate": 4.121592177302147e-05, | |
| "loss": 0.9415, | |
| "sparse_loss": 0.9415, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 0.7714195384128991, | |
| "grad_norm": 3.2318930625915527, | |
| "learning_rate": 4.1104474902388734e-05, | |
| "loss": 1.0379, | |
| "sparse_loss": 1.0379, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 0.774581093898198, | |
| "grad_norm": 5.31190299987793, | |
| "learning_rate": 4.099247828204984e-05, | |
| "loss": 0.9674, | |
| "sparse_loss": 0.9674, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.7777426493834967, | |
| "grad_norm": 3.919340133666992, | |
| "learning_rate": 4.0879935735172526e-05, | |
| "loss": 0.9471, | |
| "sparse_loss": 0.9471, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 0.7809042048687954, | |
| "grad_norm": 26.871673583984375, | |
| "learning_rate": 4.076685110356057e-05, | |
| "loss": 0.9859, | |
| "sparse_loss": 0.9859, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 0.7840657603540943, | |
| "grad_norm": 9.538094520568848, | |
| "learning_rate": 4.0653228247522545e-05, | |
| "loss": 1.3406, | |
| "sparse_loss": 1.3406, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 0.787227315839393, | |
| "grad_norm": 3.3164422512054443, | |
| "learning_rate": 4.053907104574016e-05, | |
| "loss": 1.0039, | |
| "sparse_loss": 1.0039, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 0.7903888713246917, | |
| "grad_norm": 3.28509783744812, | |
| "learning_rate": 4.042438339513573e-05, | |
| "loss": 0.9906, | |
| "sparse_loss": 0.9906, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.7935504268099906, | |
| "grad_norm": 5.863593101501465, | |
| "learning_rate": 4.030916921073926e-05, | |
| "loss": 1.333, | |
| "sparse_loss": 1.333, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 0.7967119822952893, | |
| "grad_norm": 11.845443725585938, | |
| "learning_rate": 4.019343242555474e-05, | |
| "loss": 0.9501, | |
| "sparse_loss": 0.9501, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 0.799873537780588, | |
| "grad_norm": 3.840085506439209, | |
| "learning_rate": 4.00771769904259e-05, | |
| "loss": 0.9624, | |
| "sparse_loss": 0.9624, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 0.8030350932658868, | |
| "grad_norm": 7.45705509185791, | |
| "learning_rate": 3.9960406873901335e-05, | |
| "loss": 1.1257, | |
| "sparse_loss": 1.1257, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 0.8061966487511856, | |
| "grad_norm": 2.9892349243164062, | |
| "learning_rate": 3.984312606209904e-05, | |
| "loss": 1.0608, | |
| "sparse_loss": 1.0608, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.8093582042364843, | |
| "grad_norm": 4.22726583480835, | |
| "learning_rate": 3.9725338558570335e-05, | |
| "loss": 0.8869, | |
| "sparse_loss": 0.8869, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 0.8125197597217831, | |
| "grad_norm": 71.5716552734375, | |
| "learning_rate": 3.960704838416321e-05, | |
| "loss": 1.024, | |
| "sparse_loss": 1.024, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 0.8156813152070819, | |
| "grad_norm": 3.4674580097198486, | |
| "learning_rate": 3.948825957688506e-05, | |
| "loss": 0.9383, | |
| "sparse_loss": 0.9383, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 0.8188428706923806, | |
| "grad_norm": 22.559823989868164, | |
| "learning_rate": 3.9368976191764806e-05, | |
| "loss": 1.2298, | |
| "sparse_loss": 1.2298, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 0.8220044261776794, | |
| "grad_norm": 2.515516757965088, | |
| "learning_rate": 3.924920230071456e-05, | |
| "loss": 1.0466, | |
| "sparse_loss": 1.0466, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.8251659816629782, | |
| "grad_norm": 2.4846043586730957, | |
| "learning_rate": 3.912894199239052e-05, | |
| "loss": 0.8882, | |
| "sparse_loss": 0.8882, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 0.828327537148277, | |
| "grad_norm": 4.767989635467529, | |
| "learning_rate": 3.900819937205348e-05, | |
| "loss": 0.981, | |
| "sparse_loss": 0.981, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 0.8314890926335757, | |
| "grad_norm": 5.391225814819336, | |
| "learning_rate": 3.888697856142861e-05, | |
| "loss": 0.9302, | |
| "sparse_loss": 0.9302, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 0.8346506481188745, | |
| "grad_norm": 4.460025787353516, | |
| "learning_rate": 3.876528369856486e-05, | |
| "loss": 0.8613, | |
| "sparse_loss": 0.8613, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 0.8378122036041733, | |
| "grad_norm": 3.7380456924438477, | |
| "learning_rate": 3.864311893769361e-05, | |
| "loss": 1.1334, | |
| "sparse_loss": 1.1334, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.840973759089472, | |
| "grad_norm": 3.2851319313049316, | |
| "learning_rate": 3.85204884490869e-05, | |
| "loss": 1.0204, | |
| "sparse_loss": 1.0204, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 0.8441353145747708, | |
| "grad_norm": 4.777285575866699, | |
| "learning_rate": 3.839739641891506e-05, | |
| "loss": 0.9311, | |
| "sparse_loss": 0.9311, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 0.8472968700600696, | |
| "grad_norm": 5.82297945022583, | |
| "learning_rate": 3.8273847049103816e-05, | |
| "loss": 0.8136, | |
| "sparse_loss": 0.8136, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 0.8504584255453683, | |
| "grad_norm": 3.461158275604248, | |
| "learning_rate": 3.8149844557190855e-05, | |
| "loss": 0.8487, | |
| "sparse_loss": 0.8487, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 0.853619981030667, | |
| "grad_norm": 8.542764663696289, | |
| "learning_rate": 3.802539317618185e-05, | |
| "loss": 0.8648, | |
| "sparse_loss": 0.8648, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.8567815365159659, | |
| "grad_norm": 3.9751503467559814, | |
| "learning_rate": 3.790049715440592e-05, | |
| "loss": 0.8832, | |
| "sparse_loss": 0.8832, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 0.8599430920012646, | |
| "grad_norm": 6.192680358886719, | |
| "learning_rate": 3.7775160755370695e-05, | |
| "loss": 0.8357, | |
| "sparse_loss": 0.8357, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 0.8631046474865633, | |
| "grad_norm": 3.71183705329895, | |
| "learning_rate": 3.764938825761671e-05, | |
| "loss": 0.8037, | |
| "sparse_loss": 0.8037, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 0.8662662029718622, | |
| "grad_norm": 3.918074369430542, | |
| "learning_rate": 3.7523183954571336e-05, | |
| "loss": 0.9258, | |
| "sparse_loss": 0.9258, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 0.8694277584571609, | |
| "grad_norm": 3.448901891708374, | |
| "learning_rate": 3.739655215440228e-05, | |
| "loss": 0.8469, | |
| "sparse_loss": 0.8469, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.8725893139424596, | |
| "grad_norm": 3.151432752609253, | |
| "learning_rate": 3.726949717987048e-05, | |
| "loss": 0.8945, | |
| "sparse_loss": 0.8945, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 0.8757508694277585, | |
| "grad_norm": 2.4400060176849365, | |
| "learning_rate": 3.714202336818252e-05, | |
| "loss": 0.8608, | |
| "sparse_loss": 0.8608, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 0.8789124249130572, | |
| "grad_norm": 8.550524711608887, | |
| "learning_rate": 3.701413507084264e-05, | |
| "loss": 1.0025, | |
| "sparse_loss": 1.0025, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 0.8820739803983559, | |
| "grad_norm": 3.8528268337249756, | |
| "learning_rate": 3.6885836653504124e-05, | |
| "loss": 0.8956, | |
| "sparse_loss": 0.8956, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 0.8852355358836548, | |
| "grad_norm": 6.683523178100586, | |
| "learning_rate": 3.675713249582031e-05, | |
| "loss": 0.8487, | |
| "sparse_loss": 0.8487, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.8883970913689535, | |
| "grad_norm": 4.686063766479492, | |
| "learning_rate": 3.662802699129508e-05, | |
| "loss": 0.8578, | |
| "sparse_loss": 0.8578, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 0.8915586468542523, | |
| "grad_norm": 11.728353500366211, | |
| "learning_rate": 3.649852454713286e-05, | |
| "loss": 0.7899, | |
| "sparse_loss": 0.7899, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 0.8947202023395511, | |
| "grad_norm": 3.126070499420166, | |
| "learning_rate": 3.636862958408818e-05, | |
| "loss": 0.8662, | |
| "sparse_loss": 0.8662, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 0.8978817578248498, | |
| "grad_norm": 10.942789077758789, | |
| "learning_rate": 3.6238346536314815e-05, | |
| "loss": 0.8668, | |
| "sparse_loss": 0.8668, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 0.9010433133101486, | |
| "grad_norm": 7.786405563354492, | |
| "learning_rate": 3.610767985121433e-05, | |
| "loss": 0.8688, | |
| "sparse_loss": 0.8688, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.9042048687954474, | |
| "grad_norm": 4.515435218811035, | |
| "learning_rate": 3.597663398928435e-05, | |
| "loss": 1.035, | |
| "sparse_loss": 1.035, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 0.9073664242807461, | |
| "grad_norm": 5.420071601867676, | |
| "learning_rate": 3.584521342396623e-05, | |
| "loss": 0.8736, | |
| "sparse_loss": 0.8736, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 0.9105279797660449, | |
| "grad_norm": 4.7605767250061035, | |
| "learning_rate": 3.5713422641492355e-05, | |
| "loss": 0.9587, | |
| "sparse_loss": 0.9587, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 0.9136895352513437, | |
| "grad_norm": 4.38069486618042, | |
| "learning_rate": 3.558126614073305e-05, | |
| "loss": 0.9497, | |
| "sparse_loss": 0.9497, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 0.9168510907366424, | |
| "grad_norm": 3.70302152633667, | |
| "learning_rate": 3.544874843304294e-05, | |
| "loss": 0.8356, | |
| "sparse_loss": 0.8356, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.9200126462219412, | |
| "grad_norm": 3.1259734630584717, | |
| "learning_rate": 3.5315874042107e-05, | |
| "loss": 0.9136, | |
| "sparse_loss": 0.9136, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 0.92317420170724, | |
| "grad_norm": 9.207280158996582, | |
| "learning_rate": 3.518264750378606e-05, | |
| "loss": 0.8825, | |
| "sparse_loss": 0.8825, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 0.9263357571925387, | |
| "grad_norm": 4.420866012573242, | |
| "learning_rate": 3.5049073365962065e-05, | |
| "loss": 0.9497, | |
| "sparse_loss": 0.9497, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 0.9294973126778375, | |
| "grad_norm": 6.084704875946045, | |
| "learning_rate": 3.491515618838275e-05, | |
| "loss": 0.9559, | |
| "sparse_loss": 0.9559, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 0.9326588681631363, | |
| "grad_norm": 7.439093112945557, | |
| "learning_rate": 3.4780900542506e-05, | |
| "loss": 0.9589, | |
| "sparse_loss": 0.9589, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.935820423648435, | |
| "grad_norm": 2.716554880142212, | |
| "learning_rate": 3.464631101134385e-05, | |
| "loss": 0.8888, | |
| "sparse_loss": 0.8888, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 0.9389819791337338, | |
| "grad_norm": 242.3243865966797, | |
| "learning_rate": 3.451139218930595e-05, | |
| "loss": 0.8851, | |
| "sparse_loss": 0.8851, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 0.9421435346190326, | |
| "grad_norm": 4.233142852783203, | |
| "learning_rate": 3.43761486820428e-05, | |
| "loss": 0.9576, | |
| "sparse_loss": 0.9576, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 0.9453050901043313, | |
| "grad_norm": 4.0793328285217285, | |
| "learning_rate": 3.424058510628849e-05, | |
| "loss": 0.9139, | |
| "sparse_loss": 0.9139, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 0.9484666455896301, | |
| "grad_norm": 30.09717559814453, | |
| "learning_rate": 3.410470608970313e-05, | |
| "loss": 0.8681, | |
| "sparse_loss": 0.8681, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.9516282010749288, | |
| "grad_norm": 4.182109355926514, | |
| "learning_rate": 3.396851627071484e-05, | |
| "loss": 0.8576, | |
| "sparse_loss": 0.8576, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 0.9547897565602277, | |
| "grad_norm": 10.850983619689941, | |
| "learning_rate": 3.383202029836145e-05, | |
| "loss": 0.9598, | |
| "sparse_loss": 0.9598, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 0.9579513120455264, | |
| "grad_norm": 10.119805335998535, | |
| "learning_rate": 3.369522283213179e-05, | |
| "loss": 0.9393, | |
| "sparse_loss": 0.9393, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 0.9611128675308251, | |
| "grad_norm": 3.4797518253326416, | |
| "learning_rate": 3.3558128541806586e-05, | |
| "loss": 0.9382, | |
| "sparse_loss": 0.9382, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 0.964274423016124, | |
| "grad_norm": 3.270132064819336, | |
| "learning_rate": 3.3420742107299117e-05, | |
| "loss": 0.9342, | |
| "sparse_loss": 0.9342, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.9674359785014227, | |
| "grad_norm": 4.080615520477295, | |
| "learning_rate": 3.328306821849542e-05, | |
| "loss": 0.9381, | |
| "sparse_loss": 0.9381, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 0.9705975339867214, | |
| "grad_norm": 2.0365800857543945, | |
| "learning_rate": 3.314511157509422e-05, | |
| "loss": 0.826, | |
| "sparse_loss": 0.826, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 0.9737590894720203, | |
| "grad_norm": 5.458964824676514, | |
| "learning_rate": 3.300687688644644e-05, | |
| "loss": 0.9035, | |
| "sparse_loss": 0.9035, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 0.976920644957319, | |
| "grad_norm": 7.074070453643799, | |
| "learning_rate": 3.286836887139454e-05, | |
| "loss": 0.9065, | |
| "sparse_loss": 0.9065, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 0.9800822004426177, | |
| "grad_norm": 2.327319622039795, | |
| "learning_rate": 3.272959225811132e-05, | |
| "loss": 0.813, | |
| "sparse_loss": 0.813, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.9832437559279166, | |
| "grad_norm": 2.7987070083618164, | |
| "learning_rate": 3.259055178393859e-05, | |
| "loss": 0.8557, | |
| "sparse_loss": 0.8557, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 0.9864053114132153, | |
| "grad_norm": 3.6612040996551514, | |
| "learning_rate": 3.2451252195225476e-05, | |
| "loss": 0.8239, | |
| "sparse_loss": 0.8239, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 0.989566866898514, | |
| "grad_norm": 2.771278142929077, | |
| "learning_rate": 3.231169824716628e-05, | |
| "loss": 0.9103, | |
| "sparse_loss": 0.9103, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 0.9927284223838129, | |
| "grad_norm": 3.7233340740203857, | |
| "learning_rate": 3.2171894703638306e-05, | |
| "loss": 0.7789, | |
| "sparse_loss": 0.7789, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 0.9958899778691116, | |
| "grad_norm": 2.886253833770752, | |
| "learning_rate": 3.2031846337039105e-05, | |
| "loss": 1.0294, | |
| "sparse_loss": 1.0294, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.9990515333544103, | |
| "grad_norm": 3.4484128952026367, | |
| "learning_rate": 3.189155792812366e-05, | |
| "loss": 0.9973, | |
| "sparse_loss": 0.9973, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 1.0022130888397092, | |
| "grad_norm": 3.3322901725769043, | |
| "learning_rate": 3.175103426584113e-05, | |
| "loss": 0.851, | |
| "sparse_loss": 0.851, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 1.005374644325008, | |
| "grad_norm": 3.2941579818725586, | |
| "learning_rate": 3.161028014717138e-05, | |
| "loss": 0.8414, | |
| "sparse_loss": 0.8414, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 1.0085361998103066, | |
| "grad_norm": 2.743898868560791, | |
| "learning_rate": 3.146930037696127e-05, | |
| "loss": 0.8934, | |
| "sparse_loss": 0.8934, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 1.0116977552956055, | |
| "grad_norm": 5.593410015106201, | |
| "learning_rate": 3.1328099767760584e-05, | |
| "loss": 0.9014, | |
| "sparse_loss": 0.9014, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.0148593107809043, | |
| "grad_norm": 5.132786750793457, | |
| "learning_rate": 3.118668313965775e-05, | |
| "loss": 0.8435, | |
| "sparse_loss": 0.8435, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 1.018020866266203, | |
| "grad_norm": 23.462923049926758, | |
| "learning_rate": 3.1045055320115356e-05, | |
| "loss": 0.8969, | |
| "sparse_loss": 0.8969, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 1.0211824217515018, | |
| "grad_norm": 4.707699775695801, | |
| "learning_rate": 3.090322114380528e-05, | |
| "loss": 0.8335, | |
| "sparse_loss": 0.8335, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 1.0243439772368006, | |
| "grad_norm": 3.600597620010376, | |
| "learning_rate": 3.076118545244371e-05, | |
| "loss": 0.7767, | |
| "sparse_loss": 0.7767, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 1.0275055327220992, | |
| "grad_norm": 2.8197829723358154, | |
| "learning_rate": 3.0618953094625856e-05, | |
| "loss": 0.7853, | |
| "sparse_loss": 0.7853, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.030667088207398, | |
| "grad_norm": 3.2937378883361816, | |
| "learning_rate": 3.0476528925660382e-05, | |
| "loss": 0.8076, | |
| "sparse_loss": 0.8076, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 1.033828643692697, | |
| "grad_norm": 9.177017211914062, | |
| "learning_rate": 3.033391780740374e-05, | |
| "loss": 0.7984, | |
| "sparse_loss": 0.7984, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 1.0369901991779955, | |
| "grad_norm": 3.7408878803253174, | |
| "learning_rate": 3.019112460809415e-05, | |
| "loss": 0.8806, | |
| "sparse_loss": 0.8806, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 1.0401517546632943, | |
| "grad_norm": 3.173116445541382, | |
| "learning_rate": 3.0048154202185452e-05, | |
| "loss": 0.7925, | |
| "sparse_loss": 0.7925, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 1.0433133101485932, | |
| "grad_norm": 3.0256857872009277, | |
| "learning_rate": 2.9905011470180683e-05, | |
| "loss": 0.7768, | |
| "sparse_loss": 0.7768, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.0464748656338918, | |
| "grad_norm": 3.942674398422241, | |
| "learning_rate": 2.9761701298465465e-05, | |
| "loss": 0.7864, | |
| "sparse_loss": 0.7864, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 1.0496364211191906, | |
| "grad_norm": 2.6104464530944824, | |
| "learning_rate": 2.9618228579141244e-05, | |
| "loss": 0.8511, | |
| "sparse_loss": 0.8511, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 1.0527979766044895, | |
| "grad_norm": 4.09505033493042, | |
| "learning_rate": 2.9474598209858262e-05, | |
| "loss": 0.7585, | |
| "sparse_loss": 0.7585, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 1.055959532089788, | |
| "grad_norm": 3.442453145980835, | |
| "learning_rate": 2.9330815093648344e-05, | |
| "loss": 0.8245, | |
| "sparse_loss": 0.8245, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 1.059121087575087, | |
| "grad_norm": 4.608921527862549, | |
| "learning_rate": 2.9186884138757596e-05, | |
| "loss": 0.7885, | |
| "sparse_loss": 0.7885, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.0622826430603858, | |
| "grad_norm": 3.9954628944396973, | |
| "learning_rate": 2.9042810258478785e-05, | |
| "loss": 0.7727, | |
| "sparse_loss": 0.7727, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 1.0654441985456844, | |
| "grad_norm": 2.6084184646606445, | |
| "learning_rate": 2.8898598370983642e-05, | |
| "loss": 0.8582, | |
| "sparse_loss": 0.8582, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 1.0686057540309832, | |
| "grad_norm": 17.903118133544922, | |
| "learning_rate": 2.8754253399154995e-05, | |
| "loss": 0.7957, | |
| "sparse_loss": 0.7957, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 1.071767309516282, | |
| "grad_norm": 5.914791107177734, | |
| "learning_rate": 2.8609780270418684e-05, | |
| "loss": 0.7795, | |
| "sparse_loss": 0.7795, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 1.0749288650015807, | |
| "grad_norm": 3.333829641342163, | |
| "learning_rate": 2.846518391657538e-05, | |
| "loss": 0.8225, | |
| "sparse_loss": 0.8225, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.0780904204868795, | |
| "grad_norm": 3.164673089981079, | |
| "learning_rate": 2.832046927363221e-05, | |
| "loss": 0.799, | |
| "sparse_loss": 0.799, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 1.0812519759721784, | |
| "grad_norm": 12.461358070373535, | |
| "learning_rate": 2.8175641281634285e-05, | |
| "loss": 0.8586, | |
| "sparse_loss": 0.8586, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 1.084413531457477, | |
| "grad_norm": 11.53919792175293, | |
| "learning_rate": 2.8030704884496056e-05, | |
| "loss": 0.7813, | |
| "sparse_loss": 0.7813, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 1.0875750869427758, | |
| "grad_norm": 3.0285277366638184, | |
| "learning_rate": 2.7885665029832515e-05, | |
| "loss": 0.689, | |
| "sparse_loss": 0.689, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 1.0907366424280747, | |
| "grad_norm": 3.562075614929199, | |
| "learning_rate": 2.7740526668790355e-05, | |
| "loss": 0.8058, | |
| "sparse_loss": 0.8058, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 1.0938981979133733, | |
| "grad_norm": 3.2898313999176025, | |
| "learning_rate": 2.7595294755878914e-05, | |
| "loss": 0.7827, | |
| "sparse_loss": 0.7827, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 1.0970597533986721, | |
| "grad_norm": 3.8332738876342773, | |
| "learning_rate": 2.744997424880107e-05, | |
| "loss": 0.7735, | |
| "sparse_loss": 0.7735, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 1.100221308883971, | |
| "grad_norm": 3.5667922496795654, | |
| "learning_rate": 2.7304570108283978e-05, | |
| "loss": 0.7801, | |
| "sparse_loss": 0.7801, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 1.1033828643692696, | |
| "grad_norm": 6.848054885864258, | |
| "learning_rate": 2.715908729790974e-05, | |
| "loss": 0.7815, | |
| "sparse_loss": 0.7815, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 1.1065444198545684, | |
| "grad_norm": 22.703956604003906, | |
| "learning_rate": 2.701353078394599e-05, | |
| "loss": 0.9333, | |
| "sparse_loss": 0.9333, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.1097059753398673, | |
| "grad_norm": 2.80373215675354, | |
| "learning_rate": 2.686790553517632e-05, | |
| "loss": 0.7076, | |
| "sparse_loss": 0.7076, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 1.112867530825166, | |
| "grad_norm": 3.815857172012329, | |
| "learning_rate": 2.6722216522730693e-05, | |
| "loss": 0.785, | |
| "sparse_loss": 0.785, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 1.1160290863104647, | |
| "grad_norm": 3.837503433227539, | |
| "learning_rate": 2.657646871991575e-05, | |
| "loss": 0.8114, | |
| "sparse_loss": 0.8114, | |
| "step": 7060 | |
| }, | |
| { | |
| "epoch": 1.1191906417957636, | |
| "grad_norm": 3.5608763694763184, | |
| "learning_rate": 2.6430667102044994e-05, | |
| "loss": 0.8866, | |
| "sparse_loss": 0.8866, | |
| "step": 7080 | |
| }, | |
| { | |
| "epoch": 1.1223521972810624, | |
| "grad_norm": 13.049933433532715, | |
| "learning_rate": 2.628481664626901e-05, | |
| "loss": 0.7034, | |
| "sparse_loss": 0.7034, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 1.125513752766361, | |
| "grad_norm": 3.2948811054229736, | |
| "learning_rate": 2.6138922331405545e-05, | |
| "loss": 0.7277, | |
| "sparse_loss": 0.7277, | |
| "step": 7120 | |
| }, | |
| { | |
| "epoch": 1.1286753082516598, | |
| "grad_norm": 3.614666223526001, | |
| "learning_rate": 2.5992989137769512e-05, | |
| "loss": 0.7875, | |
| "sparse_loss": 0.7875, | |
| "step": 7140 | |
| }, | |
| { | |
| "epoch": 1.1318368637369587, | |
| "grad_norm": 9.70922565460205, | |
| "learning_rate": 2.5847022047003016e-05, | |
| "loss": 0.8039, | |
| "sparse_loss": 0.8039, | |
| "step": 7160 | |
| }, | |
| { | |
| "epoch": 1.1349984192222573, | |
| "grad_norm": 4.468625068664551, | |
| "learning_rate": 2.5701026041905306e-05, | |
| "loss": 0.6848, | |
| "sparse_loss": 0.6848, | |
| "step": 7180 | |
| }, | |
| { | |
| "epoch": 1.1381599747075561, | |
| "grad_norm": 2.4922146797180176, | |
| "learning_rate": 2.555500610626264e-05, | |
| "loss": 0.853, | |
| "sparse_loss": 0.853, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 1.141321530192855, | |
| "grad_norm": 4.103279113769531, | |
| "learning_rate": 2.5408967224678203e-05, | |
| "loss": 0.7167, | |
| "sparse_loss": 0.7167, | |
| "step": 7220 | |
| }, | |
| { | |
| "epoch": 1.1444830856781536, | |
| "grad_norm": 14.72502326965332, | |
| "learning_rate": 2.5262914382401908e-05, | |
| "loss": 0.8225, | |
| "sparse_loss": 0.8225, | |
| "step": 7240 | |
| }, | |
| { | |
| "epoch": 1.1476446411634524, | |
| "grad_norm": 3.0032124519348145, | |
| "learning_rate": 2.5116852565160253e-05, | |
| "loss": 0.7334, | |
| "sparse_loss": 0.7334, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 1.1508061966487513, | |
| "grad_norm": 2.6268882751464844, | |
| "learning_rate": 2.4970786758986098e-05, | |
| "loss": 0.771, | |
| "sparse_loss": 0.771, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 1.15396775213405, | |
| "grad_norm": 3.5500500202178955, | |
| "learning_rate": 2.482472195004847e-05, | |
| "loss": 0.7327, | |
| "sparse_loss": 0.7327, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 1.1571293076193487, | |
| "grad_norm": 3.703057289123535, | |
| "learning_rate": 2.4678663124482358e-05, | |
| "loss": 0.7744, | |
| "sparse_loss": 0.7744, | |
| "step": 7320 | |
| }, | |
| { | |
| "epoch": 1.1602908631046476, | |
| "grad_norm": 5.458900451660156, | |
| "learning_rate": 2.4532615268218503e-05, | |
| "loss": 0.7852, | |
| "sparse_loss": 0.7852, | |
| "step": 7340 | |
| }, | |
| { | |
| "epoch": 1.1634524185899462, | |
| "grad_norm": 3.3717634677886963, | |
| "learning_rate": 2.438658336681319e-05, | |
| "loss": 0.8021, | |
| "sparse_loss": 0.8021, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 1.166613974075245, | |
| "grad_norm": 6.050585746765137, | |
| "learning_rate": 2.4240572405278065e-05, | |
| "loss": 0.7909, | |
| "sparse_loss": 0.7909, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 1.1697755295605439, | |
| "grad_norm": 5.964054584503174, | |
| "learning_rate": 2.4094587367909942e-05, | |
| "loss": 0.884, | |
| "sparse_loss": 0.884, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 1.1729370850458425, | |
| "grad_norm": 4.546367645263672, | |
| "learning_rate": 2.394863323812072e-05, | |
| "loss": 0.7565, | |
| "sparse_loss": 0.7565, | |
| "step": 7420 | |
| }, | |
| { | |
| "epoch": 1.1760986405311413, | |
| "grad_norm": 1.9227008819580078, | |
| "learning_rate": 2.3802714998267177e-05, | |
| "loss": 0.7934, | |
| "sparse_loss": 0.7934, | |
| "step": 7440 | |
| }, | |
| { | |
| "epoch": 1.1792601960164402, | |
| "grad_norm": 4.1103010177612305, | |
| "learning_rate": 2.365683762948094e-05, | |
| "loss": 0.753, | |
| "sparse_loss": 0.753, | |
| "step": 7460 | |
| }, | |
| { | |
| "epoch": 1.1824217515017388, | |
| "grad_norm": 4.982396602630615, | |
| "learning_rate": 2.3511006111498486e-05, | |
| "loss": 0.7338, | |
| "sparse_loss": 0.7338, | |
| "step": 7480 | |
| }, | |
| { | |
| "epoch": 1.1855833069870376, | |
| "grad_norm": 4.692390441894531, | |
| "learning_rate": 2.3365225422491045e-05, | |
| "loss": 0.8314, | |
| "sparse_loss": 0.8314, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.1887448624723365, | |
| "grad_norm": 4.539749622344971, | |
| "learning_rate": 2.3219500538894796e-05, | |
| "loss": 0.766, | |
| "sparse_loss": 0.766, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 1.191906417957635, | |
| "grad_norm": 2.8554506301879883, | |
| "learning_rate": 2.307383643524085e-05, | |
| "loss": 0.8289, | |
| "sparse_loss": 0.8289, | |
| "step": 7540 | |
| }, | |
| { | |
| "epoch": 1.195067973442934, | |
| "grad_norm": 2.8660402297973633, | |
| "learning_rate": 2.292823808398554e-05, | |
| "loss": 0.7801, | |
| "sparse_loss": 0.7801, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 1.1982295289282328, | |
| "grad_norm": 2.8601577281951904, | |
| "learning_rate": 2.2782710455340666e-05, | |
| "loss": 0.8038, | |
| "sparse_loss": 0.8038, | |
| "step": 7580 | |
| }, | |
| { | |
| "epoch": 1.2013910844135314, | |
| "grad_norm": 3.46244478225708, | |
| "learning_rate": 2.2637258517103754e-05, | |
| "loss": 0.7507, | |
| "sparse_loss": 0.7507, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 1.2045526398988302, | |
| "grad_norm": 5.021501541137695, | |
| "learning_rate": 2.249188723448859e-05, | |
| "loss": 0.8116, | |
| "sparse_loss": 0.8116, | |
| "step": 7620 | |
| }, | |
| { | |
| "epoch": 1.207714195384129, | |
| "grad_norm": 2.997774839401245, | |
| "learning_rate": 2.2346601569955622e-05, | |
| "loss": 0.7993, | |
| "sparse_loss": 0.7993, | |
| "step": 7640 | |
| }, | |
| { | |
| "epoch": 1.2108757508694277, | |
| "grad_norm": 2.6975152492523193, | |
| "learning_rate": 2.2201406483042592e-05, | |
| "loss": 0.7502, | |
| "sparse_loss": 0.7502, | |
| "step": 7660 | |
| }, | |
| { | |
| "epoch": 1.2140373063547265, | |
| "grad_norm": 2.3823089599609375, | |
| "learning_rate": 2.205630693019529e-05, | |
| "loss": 0.7218, | |
| "sparse_loss": 0.7218, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 1.2171988618400253, | |
| "grad_norm": 4.206768989562988, | |
| "learning_rate": 2.1911307864598253e-05, | |
| "loss": 0.7008, | |
| "sparse_loss": 0.7008, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 1.220360417325324, | |
| "grad_norm": 3.4627437591552734, | |
| "learning_rate": 2.1766414236005795e-05, | |
| "loss": 0.7893, | |
| "sparse_loss": 0.7893, | |
| "step": 7720 | |
| }, | |
| { | |
| "epoch": 1.2235219728106228, | |
| "grad_norm": 2.8255398273468018, | |
| "learning_rate": 2.162163099057295e-05, | |
| "loss": 0.7608, | |
| "sparse_loss": 0.7608, | |
| "step": 7740 | |
| }, | |
| { | |
| "epoch": 1.2266835282959216, | |
| "grad_norm": 4.011843204498291, | |
| "learning_rate": 2.1476963070686658e-05, | |
| "loss": 0.807, | |
| "sparse_loss": 0.807, | |
| "step": 7760 | |
| }, | |
| { | |
| "epoch": 1.2298450837812203, | |
| "grad_norm": 1.6373748779296875, | |
| "learning_rate": 2.1332415414797083e-05, | |
| "loss": 0.7559, | |
| "sparse_loss": 0.7559, | |
| "step": 7780 | |
| }, | |
| { | |
| "epoch": 1.233006639266519, | |
| "grad_norm": 5.056970119476318, | |
| "learning_rate": 2.1187992957248975e-05, | |
| "loss": 0.7906, | |
| "sparse_loss": 0.7906, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 1.236168194751818, | |
| "grad_norm": 2.484866142272949, | |
| "learning_rate": 2.1043700628113274e-05, | |
| "loss": 0.7988, | |
| "sparse_loss": 0.7988, | |
| "step": 7820 | |
| }, | |
| { | |
| "epoch": 1.2393297502371166, | |
| "grad_norm": 3.7188453674316406, | |
| "learning_rate": 2.0899543353018792e-05, | |
| "loss": 0.7656, | |
| "sparse_loss": 0.7656, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 1.2424913057224154, | |
| "grad_norm": 6.211063861846924, | |
| "learning_rate": 2.0755526052984048e-05, | |
| "loss": 0.715, | |
| "sparse_loss": 0.715, | |
| "step": 7860 | |
| }, | |
| { | |
| "epoch": 1.2456528612077142, | |
| "grad_norm": 7.281302452087402, | |
| "learning_rate": 2.0611653644249363e-05, | |
| "loss": 0.7624, | |
| "sparse_loss": 0.7624, | |
| "step": 7880 | |
| }, | |
| { | |
| "epoch": 1.2488144166930129, | |
| "grad_norm": 8.34732723236084, | |
| "learning_rate": 2.0467931038108933e-05, | |
| "loss": 0.6751, | |
| "sparse_loss": 0.6751, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 1.2519759721783117, | |
| "grad_norm": 3.84628963470459, | |
| "learning_rate": 2.032436314074326e-05, | |
| "loss": 0.8031, | |
| "sparse_loss": 0.8031, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 1.2551375276636105, | |
| "grad_norm": 4.450046062469482, | |
| "learning_rate": 2.01809548530516e-05, | |
| "loss": 0.8891, | |
| "sparse_loss": 0.8891, | |
| "step": 7940 | |
| }, | |
| { | |
| "epoch": 1.2582990831489091, | |
| "grad_norm": 4.694953441619873, | |
| "learning_rate": 2.003771107048474e-05, | |
| "loss": 0.7813, | |
| "sparse_loss": 0.7813, | |
| "step": 7960 | |
| }, | |
| { | |
| "epoch": 1.261460638634208, | |
| "grad_norm": 3.50838303565979, | |
| "learning_rate": 1.9894636682877812e-05, | |
| "loss": 0.6967, | |
| "sparse_loss": 0.6967, | |
| "step": 7980 | |
| }, | |
| { | |
| "epoch": 1.2646221941195068, | |
| "grad_norm": 6.803529262542725, | |
| "learning_rate": 1.9751736574283416e-05, | |
| "loss": 0.7321, | |
| "sparse_loss": 0.7321, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.2677837496048054, | |
| "grad_norm": 3.093456268310547, | |
| "learning_rate": 1.96090156228049e-05, | |
| "loss": 0.777, | |
| "sparse_loss": 0.777, | |
| "step": 8020 | |
| }, | |
| { | |
| "epoch": 1.2709453050901043, | |
| "grad_norm": 4.572612762451172, | |
| "learning_rate": 1.9466478700429793e-05, | |
| "loss": 0.7862, | |
| "sparse_loss": 0.7862, | |
| "step": 8040 | |
| }, | |
| { | |
| "epoch": 1.2741068605754031, | |
| "grad_norm": 3.8324055671691895, | |
| "learning_rate": 1.932413067286355e-05, | |
| "loss": 0.7502, | |
| "sparse_loss": 0.7502, | |
| "step": 8060 | |
| }, | |
| { | |
| "epoch": 1.2772684160607017, | |
| "grad_norm": 4.8424763679504395, | |
| "learning_rate": 1.9181976399363415e-05, | |
| "loss": 0.7847, | |
| "sparse_loss": 0.7847, | |
| "step": 8080 | |
| }, | |
| { | |
| "epoch": 1.2804299715460006, | |
| "grad_norm": 3.3925790786743164, | |
| "learning_rate": 1.904002073257254e-05, | |
| "loss": 0.6804, | |
| "sparse_loss": 0.6804, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 1.2835915270312994, | |
| "grad_norm": 3.5479822158813477, | |
| "learning_rate": 1.8898268518354383e-05, | |
| "loss": 0.7036, | |
| "sparse_loss": 0.7036, | |
| "step": 8120 | |
| }, | |
| { | |
| "epoch": 1.286753082516598, | |
| "grad_norm": 4.0524396896362305, | |
| "learning_rate": 1.8756724595627207e-05, | |
| "loss": 0.7484, | |
| "sparse_loss": 0.7484, | |
| "step": 8140 | |
| }, | |
| { | |
| "epoch": 1.2899146380018969, | |
| "grad_norm": 3.455230474472046, | |
| "learning_rate": 1.861539379619899e-05, | |
| "loss": 0.8262, | |
| "sparse_loss": 0.8262, | |
| "step": 8160 | |
| }, | |
| { | |
| "epoch": 1.2930761934871957, | |
| "grad_norm": 3.041726589202881, | |
| "learning_rate": 1.84742809446024e-05, | |
| "loss": 0.732, | |
| "sparse_loss": 0.732, | |
| "step": 8180 | |
| }, | |
| { | |
| "epoch": 1.2962377489724943, | |
| "grad_norm": 4.920287132263184, | |
| "learning_rate": 1.8333390857930144e-05, | |
| "loss": 0.6776, | |
| "sparse_loss": 0.6776, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 1.2993993044577932, | |
| "grad_norm": 3.5329527854919434, | |
| "learning_rate": 1.8192728345670547e-05, | |
| "loss": 0.7193, | |
| "sparse_loss": 0.7193, | |
| "step": 8220 | |
| }, | |
| { | |
| "epoch": 1.302560859943092, | |
| "grad_norm": 2.954718589782715, | |
| "learning_rate": 1.8052298209543315e-05, | |
| "loss": 0.7685, | |
| "sparse_loss": 0.7685, | |
| "step": 8240 | |
| }, | |
| { | |
| "epoch": 1.3057224154283908, | |
| "grad_norm": 3.8826847076416016, | |
| "learning_rate": 1.7912105243335687e-05, | |
| "loss": 0.8173, | |
| "sparse_loss": 0.8173, | |
| "step": 8260 | |
| }, | |
| { | |
| "epoch": 1.3088839709136895, | |
| "grad_norm": 3.8182241916656494, | |
| "learning_rate": 1.7772154232738745e-05, | |
| "loss": 0.7295, | |
| "sparse_loss": 0.7295, | |
| "step": 8280 | |
| }, | |
| { | |
| "epoch": 1.3120455263989883, | |
| "grad_norm": 8.97372817993164, | |
| "learning_rate": 1.763244995518406e-05, | |
| "loss": 0.7348, | |
| "sparse_loss": 0.7348, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 1.3152070818842871, | |
| "grad_norm": 4.511361122131348, | |
| "learning_rate": 1.749299717968063e-05, | |
| "loss": 0.7708, | |
| "sparse_loss": 0.7708, | |
| "step": 8320 | |
| }, | |
| { | |
| "epoch": 1.3183686373695858, | |
| "grad_norm": 6.190827369689941, | |
| "learning_rate": 1.7353800666652046e-05, | |
| "loss": 0.7471, | |
| "sparse_loss": 0.7471, | |
| "step": 8340 | |
| }, | |
| { | |
| "epoch": 1.3215301928548846, | |
| "grad_norm": 5.850632667541504, | |
| "learning_rate": 1.721486516777402e-05, | |
| "loss": 0.6956, | |
| "sparse_loss": 0.6956, | |
| "step": 8360 | |
| }, | |
| { | |
| "epoch": 1.3246917483401834, | |
| "grad_norm": 3.439373016357422, | |
| "learning_rate": 1.707619542581215e-05, | |
| "loss": 0.7216, | |
| "sparse_loss": 0.7216, | |
| "step": 8380 | |
| }, | |
| { | |
| "epoch": 1.327853303825482, | |
| "grad_norm": 3.1243553161621094, | |
| "learning_rate": 1.6937796174460044e-05, | |
| "loss": 0.7003, | |
| "sparse_loss": 0.7003, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 1.331014859310781, | |
| "grad_norm": 2.189188003540039, | |
| "learning_rate": 1.6799672138177726e-05, | |
| "loss": 0.7786, | |
| "sparse_loss": 0.7786, | |
| "step": 8420 | |
| }, | |
| { | |
| "epoch": 1.3341764147960797, | |
| "grad_norm": 4.986023902893066, | |
| "learning_rate": 1.6661828032030334e-05, | |
| "loss": 0.7062, | |
| "sparse_loss": 0.7062, | |
| "step": 8440 | |
| }, | |
| { | |
| "epoch": 1.3373379702813786, | |
| "grad_norm": 4.733263969421387, | |
| "learning_rate": 1.652426856152721e-05, | |
| "loss": 0.728, | |
| "sparse_loss": 0.728, | |
| "step": 8460 | |
| }, | |
| { | |
| "epoch": 1.3404995257666772, | |
| "grad_norm": 3.997431516647339, | |
| "learning_rate": 1.638699842246121e-05, | |
| "loss": 0.7494, | |
| "sparse_loss": 0.7494, | |
| "step": 8480 | |
| }, | |
| { | |
| "epoch": 1.343661081251976, | |
| "grad_norm": 3.0536346435546875, | |
| "learning_rate": 1.6250022300748486e-05, | |
| "loss": 0.6844, | |
| "sparse_loss": 0.6844, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.3468226367372749, | |
| "grad_norm": 5.487432956695557, | |
| "learning_rate": 1.611334487226842e-05, | |
| "loss": 0.6523, | |
| "sparse_loss": 0.6523, | |
| "step": 8520 | |
| }, | |
| { | |
| "epoch": 1.3499841922225735, | |
| "grad_norm": 5.399076461791992, | |
| "learning_rate": 1.5976970802704106e-05, | |
| "loss": 0.7584, | |
| "sparse_loss": 0.7584, | |
| "step": 8540 | |
| }, | |
| { | |
| "epoch": 1.3531457477078723, | |
| "grad_norm": 3.9257521629333496, | |
| "learning_rate": 1.584090474738305e-05, | |
| "loss": 0.9882, | |
| "sparse_loss": 0.9882, | |
| "step": 8560 | |
| }, | |
| { | |
| "epoch": 1.3563073031931712, | |
| "grad_norm": 3.830868721008301, | |
| "learning_rate": 1.5705151351118192e-05, | |
| "loss": 0.9246, | |
| "sparse_loss": 0.9246, | |
| "step": 8580 | |
| }, | |
| { | |
| "epoch": 1.3594688586784698, | |
| "grad_norm": 3.6509013175964355, | |
| "learning_rate": 1.5569715248049457e-05, | |
| "loss": 0.9254, | |
| "sparse_loss": 0.9254, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 1.3626304141637686, | |
| "grad_norm": 4.829010009765625, | |
| "learning_rate": 1.5434601061485477e-05, | |
| "loss": 0.7008, | |
| "sparse_loss": 0.7008, | |
| "step": 8620 | |
| }, | |
| { | |
| "epoch": 1.3657919696490675, | |
| "grad_norm": 4.709140300750732, | |
| "learning_rate": 1.5299813403745777e-05, | |
| "loss": 0.7746, | |
| "sparse_loss": 0.7746, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 1.368953525134366, | |
| "grad_norm": 3.270906448364258, | |
| "learning_rate": 1.5165356876003395e-05, | |
| "loss": 0.7616, | |
| "sparse_loss": 0.7616, | |
| "step": 8660 | |
| }, | |
| { | |
| "epoch": 1.372115080619665, | |
| "grad_norm": 3.6882264614105225, | |
| "learning_rate": 1.5031236068127701e-05, | |
| "loss": 0.736, | |
| "sparse_loss": 0.736, | |
| "step": 8680 | |
| }, | |
| { | |
| "epoch": 1.3752766361049638, | |
| "grad_norm": 3.479408025741577, | |
| "learning_rate": 1.4897455558527845e-05, | |
| "loss": 0.7595, | |
| "sparse_loss": 0.7595, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 1.3784381915902624, | |
| "grad_norm": 3.896000623703003, | |
| "learning_rate": 1.4764019913996355e-05, | |
| "loss": 0.7135, | |
| "sparse_loss": 0.7135, | |
| "step": 8720 | |
| }, | |
| { | |
| "epoch": 1.3815997470755612, | |
| "grad_norm": 2.6327970027923584, | |
| "learning_rate": 1.463093368955328e-05, | |
| "loss": 0.8016, | |
| "sparse_loss": 0.8016, | |
| "step": 8740 | |
| }, | |
| { | |
| "epoch": 1.38476130256086, | |
| "grad_norm": 3.2666549682617188, | |
| "learning_rate": 1.4498201428290759e-05, | |
| "loss": 0.7027, | |
| "sparse_loss": 0.7027, | |
| "step": 8760 | |
| }, | |
| { | |
| "epoch": 1.3879228580461587, | |
| "grad_norm": 3.704594612121582, | |
| "learning_rate": 1.4365827661217815e-05, | |
| "loss": 0.7176, | |
| "sparse_loss": 0.7176, | |
| "step": 8780 | |
| }, | |
| { | |
| "epoch": 1.3910844135314575, | |
| "grad_norm": 4.167449474334717, | |
| "learning_rate": 1.4233816907105808e-05, | |
| "loss": 0.7141, | |
| "sparse_loss": 0.7141, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 1.3942459690167563, | |
| "grad_norm": 3.4307448863983154, | |
| "learning_rate": 1.4102173672334087e-05, | |
| "loss": 0.6909, | |
| "sparse_loss": 0.6909, | |
| "step": 8820 | |
| }, | |
| { | |
| "epoch": 1.397407524502055, | |
| "grad_norm": 6.434845447540283, | |
| "learning_rate": 1.3970902450736207e-05, | |
| "loss": 0.8462, | |
| "sparse_loss": 0.8462, | |
| "step": 8840 | |
| }, | |
| { | |
| "epoch": 1.4005690799873538, | |
| "grad_norm": 3.118427038192749, | |
| "learning_rate": 1.3840007723446497e-05, | |
| "loss": 1.653, | |
| "sparse_loss": 1.653, | |
| "step": 8860 | |
| }, | |
| { | |
| "epoch": 1.4037306354726526, | |
| "grad_norm": 3.181898355484009, | |
| "learning_rate": 1.3709493958747114e-05, | |
| "loss": 1.1503, | |
| "sparse_loss": 1.1503, | |
| "step": 8880 | |
| }, | |
| { | |
| "epoch": 1.4068921909579513, | |
| "grad_norm": 2.60162091255188, | |
| "learning_rate": 1.3579365611915517e-05, | |
| "loss": 0.7187, | |
| "sparse_loss": 0.7187, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 1.41005374644325, | |
| "grad_norm": 3.9322071075439453, | |
| "learning_rate": 1.3449627125072348e-05, | |
| "loss": 0.7415, | |
| "sparse_loss": 0.7415, | |
| "step": 8920 | |
| }, | |
| { | |
| "epoch": 1.413215301928549, | |
| "grad_norm": 12.794656753540039, | |
| "learning_rate": 1.3320282927029806e-05, | |
| "loss": 0.9116, | |
| "sparse_loss": 0.9116, | |
| "step": 8940 | |
| }, | |
| { | |
| "epoch": 1.4163768574138476, | |
| "grad_norm": 2.7025208473205566, | |
| "learning_rate": 1.3191337433140477e-05, | |
| "loss": 0.8108, | |
| "sparse_loss": 0.8108, | |
| "step": 8960 | |
| }, | |
| { | |
| "epoch": 1.4195384128991464, | |
| "grad_norm": 7.067229270935059, | |
| "learning_rate": 1.3062795045146586e-05, | |
| "loss": 0.7282, | |
| "sparse_loss": 0.7282, | |
| "step": 8980 | |
| }, | |
| { | |
| "epoch": 1.4226999683844452, | |
| "grad_norm": 17.709030151367188, | |
| "learning_rate": 1.2934660151029787e-05, | |
| "loss": 0.7099, | |
| "sparse_loss": 0.7099, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.4258615238697439, | |
| "grad_norm": 2.6624441146850586, | |
| "learning_rate": 1.280693712486129e-05, | |
| "loss": 0.709, | |
| "sparse_loss": 0.709, | |
| "step": 9020 | |
| }, | |
| { | |
| "epoch": 1.4290230793550427, | |
| "grad_norm": 4.256680011749268, | |
| "learning_rate": 1.2679630326652637e-05, | |
| "loss": 0.7621, | |
| "sparse_loss": 0.7621, | |
| "step": 9040 | |
| }, | |
| { | |
| "epoch": 1.4321846348403415, | |
| "grad_norm": 3.280846357345581, | |
| "learning_rate": 1.2552744102206795e-05, | |
| "loss": 0.7032, | |
| "sparse_loss": 0.7032, | |
| "step": 9060 | |
| }, | |
| { | |
| "epoch": 1.4353461903256401, | |
| "grad_norm": 4.684087753295898, | |
| "learning_rate": 1.2426282782969817e-05, | |
| "loss": 0.714, | |
| "sparse_loss": 0.714, | |
| "step": 9080 | |
| }, | |
| { | |
| "epoch": 1.438507745810939, | |
| "grad_norm": 2.5815060138702393, | |
| "learning_rate": 1.2300250685883045e-05, | |
| "loss": 0.6459, | |
| "sparse_loss": 0.6459, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 1.4416693012962378, | |
| "grad_norm": 90.20246887207031, | |
| "learning_rate": 1.2174652113235651e-05, | |
| "loss": 0.7376, | |
| "sparse_loss": 0.7376, | |
| "step": 9120 | |
| }, | |
| { | |
| "epoch": 1.4448308567815364, | |
| "grad_norm": 4.301577568054199, | |
| "learning_rate": 1.2049491352517866e-05, | |
| "loss": 0.7237, | |
| "sparse_loss": 0.7237, | |
| "step": 9140 | |
| }, | |
| { | |
| "epoch": 1.4479924122668353, | |
| "grad_norm": 3.5579445362091064, | |
| "learning_rate": 1.1924772676274546e-05, | |
| "loss": 0.7621, | |
| "sparse_loss": 0.7621, | |
| "step": 9160 | |
| }, | |
| { | |
| "epoch": 1.4511539677521341, | |
| "grad_norm": 2.510053873062134, | |
| "learning_rate": 1.1800500341959317e-05, | |
| "loss": 0.7033, | |
| "sparse_loss": 0.7033, | |
| "step": 9180 | |
| }, | |
| { | |
| "epoch": 1.4543155232374327, | |
| "grad_norm": 2.4674551486968994, | |
| "learning_rate": 1.1676678591789341e-05, | |
| "loss": 0.7039, | |
| "sparse_loss": 0.7039, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 1.4574770787227316, | |
| "grad_norm": 2.625556468963623, | |
| "learning_rate": 1.155331165260038e-05, | |
| "loss": 0.7147, | |
| "sparse_loss": 0.7147, | |
| "step": 9220 | |
| }, | |
| { | |
| "epoch": 1.4606386342080304, | |
| "grad_norm": 3.8454997539520264, | |
| "learning_rate": 1.1430403735702599e-05, | |
| "loss": 0.7117, | |
| "sparse_loss": 0.7117, | |
| "step": 9240 | |
| }, | |
| { | |
| "epoch": 1.463800189693329, | |
| "grad_norm": 4.563284873962402, | |
| "learning_rate": 1.1307959036736754e-05, | |
| "loss": 0.6743, | |
| "sparse_loss": 0.6743, | |
| "step": 9260 | |
| }, | |
| { | |
| "epoch": 1.4669617451786279, | |
| "grad_norm": 9.20923137664795, | |
| "learning_rate": 1.1185981735530945e-05, | |
| "loss": 0.7482, | |
| "sparse_loss": 0.7482, | |
| "step": 9280 | |
| }, | |
| { | |
| "epoch": 1.4701233006639267, | |
| "grad_norm": 4.920022964477539, | |
| "learning_rate": 1.1064475995958035e-05, | |
| "loss": 0.6762, | |
| "sparse_loss": 0.6762, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 1.4732848561492253, | |
| "grad_norm": 5.852681636810303, | |
| "learning_rate": 1.0943445965793391e-05, | |
| "loss": 0.8167, | |
| "sparse_loss": 0.8167, | |
| "step": 9320 | |
| }, | |
| { | |
| "epoch": 1.4764464116345242, | |
| "grad_norm": 2.162567138671875, | |
| "learning_rate": 1.0822895776573386e-05, | |
| "loss": 0.7235, | |
| "sparse_loss": 0.7235, | |
| "step": 9340 | |
| }, | |
| { | |
| "epoch": 1.479607967119823, | |
| "grad_norm": 7.650509357452393, | |
| "learning_rate": 1.0702829543454295e-05, | |
| "loss": 0.6727, | |
| "sparse_loss": 0.6727, | |
| "step": 9360 | |
| }, | |
| { | |
| "epoch": 1.4827695226051216, | |
| "grad_norm": 30.643972396850586, | |
| "learning_rate": 1.0583251365071856e-05, | |
| "loss": 0.7458, | |
| "sparse_loss": 0.7458, | |
| "step": 9380 | |
| }, | |
| { | |
| "epoch": 1.4859310780904205, | |
| "grad_norm": 2.6235289573669434, | |
| "learning_rate": 1.0464165323401348e-05, | |
| "loss": 0.6345, | |
| "sparse_loss": 0.6345, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 1.4890926335757193, | |
| "grad_norm": 4.027590751647949, | |
| "learning_rate": 1.0345575483618236e-05, | |
| "loss": 0.7367, | |
| "sparse_loss": 0.7367, | |
| "step": 9420 | |
| }, | |
| { | |
| "epoch": 1.492254189061018, | |
| "grad_norm": 5.625932216644287, | |
| "learning_rate": 1.022748589395944e-05, | |
| "loss": 0.7412, | |
| "sparse_loss": 0.7412, | |
| "step": 9440 | |
| }, | |
| { | |
| "epoch": 1.4954157445463168, | |
| "grad_norm": 3.3360841274261475, | |
| "learning_rate": 1.0109900585585089e-05, | |
| "loss": 0.6915, | |
| "sparse_loss": 0.6915, | |
| "step": 9460 | |
| }, | |
| { | |
| "epoch": 1.4985773000316156, | |
| "grad_norm": 3.0357189178466797, | |
| "learning_rate": 9.992823572440936e-06, | |
| "loss": 0.707, | |
| "sparse_loss": 0.707, | |
| "step": 9480 | |
| }, | |
| { | |
| "epoch": 1.5017388555169142, | |
| "grad_norm": 2.9004428386688232, | |
| "learning_rate": 9.876258851121342e-06, | |
| "loss": 0.6771, | |
| "sparse_loss": 0.6771, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.504900411002213, | |
| "grad_norm": 3.5283262729644775, | |
| "learning_rate": 9.760210400732837e-06, | |
| "loss": 0.75, | |
| "sparse_loss": 0.75, | |
| "step": 9520 | |
| }, | |
| { | |
| "epoch": 1.508061966487512, | |
| "grad_norm": 4.224815845489502, | |
| "learning_rate": 9.644682182758306e-06, | |
| "loss": 0.7227, | |
| "sparse_loss": 0.7227, | |
| "step": 9540 | |
| }, | |
| { | |
| "epoch": 1.5112235219728105, | |
| "grad_norm": 2.3723509311676025, | |
| "learning_rate": 9.529678140921721e-06, | |
| "loss": 0.6999, | |
| "sparse_loss": 0.6999, | |
| "step": 9560 | |
| }, | |
| { | |
| "epoch": 1.5143850774581094, | |
| "grad_norm": 4.764114856719971, | |
| "learning_rate": 9.415202201053553e-06, | |
| "loss": 0.7451, | |
| "sparse_loss": 0.7451, | |
| "step": 9580 | |
| }, | |
| { | |
| "epoch": 1.5175466329434082, | |
| "grad_norm": 2.386294364929199, | |
| "learning_rate": 9.301258270956733e-06, | |
| "loss": 0.7274, | |
| "sparse_loss": 0.7274, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 1.5207081884287068, | |
| "grad_norm": 2.7732250690460205, | |
| "learning_rate": 9.187850240273263e-06, | |
| "loss": 0.6657, | |
| "sparse_loss": 0.6657, | |
| "step": 9620 | |
| }, | |
| { | |
| "epoch": 1.5238697439140056, | |
| "grad_norm": 4.688830852508545, | |
| "learning_rate": 9.074981980351461e-06, | |
| "loss": 0.7123, | |
| "sparse_loss": 0.7123, | |
| "step": 9640 | |
| }, | |
| { | |
| "epoch": 1.5270312993993045, | |
| "grad_norm": 2.6799607276916504, | |
| "learning_rate": 8.962657344113756e-06, | |
| "loss": 0.76, | |
| "sparse_loss": 0.76, | |
| "step": 9660 | |
| }, | |
| { | |
| "epoch": 1.530192854884603, | |
| "grad_norm": 3.9289462566375732, | |
| "learning_rate": 8.850880165925198e-06, | |
| "loss": 0.6947, | |
| "sparse_loss": 0.6947, | |
| "step": 9680 | |
| }, | |
| { | |
| "epoch": 1.533354410369902, | |
| "grad_norm": 4.236504554748535, | |
| "learning_rate": 8.73965426146257e-06, | |
| "loss": 0.6947, | |
| "sparse_loss": 0.6947, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 1.5365159658552008, | |
| "grad_norm": 2.219120740890503, | |
| "learning_rate": 8.628983427584104e-06, | |
| "loss": 0.6093, | |
| "sparse_loss": 0.6093, | |
| "step": 9720 | |
| }, | |
| { | |
| "epoch": 1.5396775213404994, | |
| "grad_norm": 2.3669049739837646, | |
| "learning_rate": 8.518871442199916e-06, | |
| "loss": 0.7508, | |
| "sparse_loss": 0.7508, | |
| "step": 9740 | |
| }, | |
| { | |
| "epoch": 1.5428390768257982, | |
| "grad_norm": 3.3307414054870605, | |
| "learning_rate": 8.40932206414299e-06, | |
| "loss": 0.6202, | |
| "sparse_loss": 0.6202, | |
| "step": 9760 | |
| }, | |
| { | |
| "epoch": 1.546000632311097, | |
| "grad_norm": 3.576195001602173, | |
| "learning_rate": 8.300339033040908e-06, | |
| "loss": 0.6884, | |
| "sparse_loss": 0.6884, | |
| "step": 9780 | |
| }, | |
| { | |
| "epoch": 1.5491621877963957, | |
| "grad_norm": 3.074108362197876, | |
| "learning_rate": 8.191926069188155e-06, | |
| "loss": 0.6555, | |
| "sparse_loss": 0.6555, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 1.5523237432816948, | |
| "grad_norm": 2.822364330291748, | |
| "learning_rate": 8.084086873419144e-06, | |
| "loss": 0.6959, | |
| "sparse_loss": 0.6959, | |
| "step": 9820 | |
| }, | |
| { | |
| "epoch": 1.5554852987669934, | |
| "grad_norm": 3.8235299587249756, | |
| "learning_rate": 7.976825126981907e-06, | |
| "loss": 0.7085, | |
| "sparse_loss": 0.7085, | |
| "step": 9840 | |
| }, | |
| { | |
| "epoch": 1.558646854252292, | |
| "grad_norm": 3.4154646396636963, | |
| "learning_rate": 7.87014449141236e-06, | |
| "loss": 0.7891, | |
| "sparse_loss": 0.7891, | |
| "step": 9860 | |
| }, | |
| { | |
| "epoch": 1.561808409737591, | |
| "grad_norm": 2.6401171684265137, | |
| "learning_rate": 7.764048608409394e-06, | |
| "loss": 0.7563, | |
| "sparse_loss": 0.7563, | |
| "step": 9880 | |
| }, | |
| { | |
| "epoch": 1.5649699652228897, | |
| "grad_norm": 14.986632347106934, | |
| "learning_rate": 7.65854109971048e-06, | |
| "loss": 0.655, | |
| "sparse_loss": 0.655, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 1.5681315207081883, | |
| "grad_norm": 2.6173582077026367, | |
| "learning_rate": 7.553625566968092e-06, | |
| "loss": 0.6957, | |
| "sparse_loss": 0.6957, | |
| "step": 9920 | |
| }, | |
| { | |
| "epoch": 1.5712930761934873, | |
| "grad_norm": 2.3389580249786377, | |
| "learning_rate": 7.44930559162676e-06, | |
| "loss": 0.7026, | |
| "sparse_loss": 0.7026, | |
| "step": 9940 | |
| }, | |
| { | |
| "epoch": 1.574454631678786, | |
| "grad_norm": 3.2184102535247803, | |
| "learning_rate": 7.345584734800764e-06, | |
| "loss": 0.7599, | |
| "sparse_loss": 0.7599, | |
| "step": 9960 | |
| }, | |
| { | |
| "epoch": 1.5776161871640846, | |
| "grad_norm": 2.2282955646514893, | |
| "learning_rate": 7.242466537152639e-06, | |
| "loss": 0.6167, | |
| "sparse_loss": 0.6167, | |
| "step": 9980 | |
| }, | |
| { | |
| "epoch": 1.5807777426493836, | |
| "grad_norm": 4.351361274719238, | |
| "learning_rate": 7.139954518772227e-06, | |
| "loss": 0.808, | |
| "sparse_loss": 0.808, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.5839392981346823, | |
| "grad_norm": 2.3593902587890625, | |
| "learning_rate": 7.038052179056573e-06, | |
| "loss": 0.6325, | |
| "sparse_loss": 0.6325, | |
| "step": 10020 | |
| }, | |
| { | |
| "epoch": 1.5871008536199809, | |
| "grad_norm": 2.93237566947937, | |
| "learning_rate": 6.936762996590482e-06, | |
| "loss": 0.7235, | |
| "sparse_loss": 0.7235, | |
| "step": 10040 | |
| }, | |
| { | |
| "epoch": 1.59026240910528, | |
| "grad_norm": 8.080977439880371, | |
| "learning_rate": 6.8360904290276975e-06, | |
| "loss": 0.7299, | |
| "sparse_loss": 0.7299, | |
| "step": 10060 | |
| }, | |
| { | |
| "epoch": 1.5934239645905786, | |
| "grad_norm": 2.9335365295410156, | |
| "learning_rate": 6.736037912972967e-06, | |
| "loss": 0.6473, | |
| "sparse_loss": 0.6473, | |
| "step": 10080 | |
| }, | |
| { | |
| "epoch": 1.5965855200758772, | |
| "grad_norm": 4.444180965423584, | |
| "learning_rate": 6.6366088638646154e-06, | |
| "loss": 0.5555, | |
| "sparse_loss": 0.5555, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 1.5997470755611762, | |
| "grad_norm": 2.5193140506744385, | |
| "learning_rate": 6.537806675858066e-06, | |
| "loss": 0.6496, | |
| "sparse_loss": 0.6496, | |
| "step": 10120 | |
| }, | |
| { | |
| "epoch": 1.6029086310464749, | |
| "grad_norm": 4.642623424530029, | |
| "learning_rate": 6.439634721709905e-06, | |
| "loss": 0.6155, | |
| "sparse_loss": 0.6155, | |
| "step": 10140 | |
| }, | |
| { | |
| "epoch": 1.6060701865317735, | |
| "grad_norm": 2.8118245601654053, | |
| "learning_rate": 6.34209635266276e-06, | |
| "loss": 0.7121, | |
| "sparse_loss": 0.7121, | |
| "step": 10160 | |
| }, | |
| { | |
| "epoch": 1.6092317420170725, | |
| "grad_norm": 3.5233209133148193, | |
| "learning_rate": 6.245194898330933e-06, | |
| "loss": 0.6628, | |
| "sparse_loss": 0.6628, | |
| "step": 10180 | |
| }, | |
| { | |
| "epoch": 1.6123932975023711, | |
| "grad_norm": 3.2448441982269287, | |
| "learning_rate": 6.148933666586693e-06, | |
| "loss": 0.7382, | |
| "sparse_loss": 0.7382, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 1.61555485298767, | |
| "grad_norm": 4.133220672607422, | |
| "learning_rate": 6.0533159434473825e-06, | |
| "loss": 0.6039, | |
| "sparse_loss": 0.6039, | |
| "step": 10220 | |
| }, | |
| { | |
| "epoch": 1.6187164084729688, | |
| "grad_norm": 3.2340219020843506, | |
| "learning_rate": 5.958344992963247e-06, | |
| "loss": 0.737, | |
| "sparse_loss": 0.737, | |
| "step": 10240 | |
| }, | |
| { | |
| "epoch": 1.6218779639582674, | |
| "grad_norm": 3.0795609951019287, | |
| "learning_rate": 5.864024057105993e-06, | |
| "loss": 0.6226, | |
| "sparse_loss": 0.6226, | |
| "step": 10260 | |
| }, | |
| { | |
| "epoch": 1.6250395194435663, | |
| "grad_norm": 2.284395217895508, | |
| "learning_rate": 5.770356355658155e-06, | |
| "loss": 0.7065, | |
| "sparse_loss": 0.7065, | |
| "step": 10280 | |
| }, | |
| { | |
| "epoch": 1.6282010749288651, | |
| "grad_norm": 2.981004238128662, | |
| "learning_rate": 5.6773450861031365e-06, | |
| "loss": 0.6393, | |
| "sparse_loss": 0.6393, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 1.6313626304141637, | |
| "grad_norm": 2.6331748962402344, | |
| "learning_rate": 5.584993423516088e-06, | |
| "loss": 0.6835, | |
| "sparse_loss": 0.6835, | |
| "step": 10320 | |
| }, | |
| { | |
| "epoch": 1.6345241858994626, | |
| "grad_norm": 3.171314001083374, | |
| "learning_rate": 5.49330452045552e-06, | |
| "loss": 0.7525, | |
| "sparse_loss": 0.7525, | |
| "step": 10340 | |
| }, | |
| { | |
| "epoch": 1.6376857413847614, | |
| "grad_norm": 3.8974101543426514, | |
| "learning_rate": 5.402281506855672e-06, | |
| "loss": 0.7167, | |
| "sparse_loss": 0.7167, | |
| "step": 10360 | |
| }, | |
| { | |
| "epoch": 1.64084729687006, | |
| "grad_norm": 3.284602403640747, | |
| "learning_rate": 5.3119274899196965e-06, | |
| "loss": 0.6216, | |
| "sparse_loss": 0.6216, | |
| "step": 10380 | |
| }, | |
| { | |
| "epoch": 1.6440088523553589, | |
| "grad_norm": 3.199734926223755, | |
| "learning_rate": 5.222245554013552e-06, | |
| "loss": 0.7085, | |
| "sparse_loss": 0.7085, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 1.6471704078406577, | |
| "grad_norm": 4.375881671905518, | |
| "learning_rate": 5.133238760560735e-06, | |
| "loss": 0.5973, | |
| "sparse_loss": 0.5973, | |
| "step": 10420 | |
| }, | |
| { | |
| "epoch": 1.6503319633259563, | |
| "grad_norm": 3.2240214347839355, | |
| "learning_rate": 5.044910147937778e-06, | |
| "loss": 0.5963, | |
| "sparse_loss": 0.5963, | |
| "step": 10440 | |
| }, | |
| { | |
| "epoch": 1.6534935188112552, | |
| "grad_norm": 3.3825221061706543, | |
| "learning_rate": 4.95726273137051e-06, | |
| "loss": 0.7874, | |
| "sparse_loss": 0.7874, | |
| "step": 10460 | |
| }, | |
| { | |
| "epoch": 1.656655074296554, | |
| "grad_norm": 6.872448444366455, | |
| "learning_rate": 4.870299502831163e-06, | |
| "loss": 0.6904, | |
| "sparse_loss": 0.6904, | |
| "step": 10480 | |
| }, | |
| { | |
| "epoch": 1.6598166297818526, | |
| "grad_norm": 2.23103666305542, | |
| "learning_rate": 4.784023430936193e-06, | |
| "loss": 0.7489, | |
| "sparse_loss": 0.7489, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.6629781852671515, | |
| "grad_norm": 2.302872657775879, | |
| "learning_rate": 4.698437460844976e-06, | |
| "loss": 0.7505, | |
| "sparse_loss": 0.7505, | |
| "step": 10520 | |
| }, | |
| { | |
| "epoch": 1.6661397407524503, | |
| "grad_norm": 10.811643600463867, | |
| "learning_rate": 4.613544514159246e-06, | |
| "loss": 0.6442, | |
| "sparse_loss": 0.6442, | |
| "step": 10540 | |
| }, | |
| { | |
| "epoch": 1.669301296237749, | |
| "grad_norm": 6.4289021492004395, | |
| "learning_rate": 4.52934748882338e-06, | |
| "loss": 0.7088, | |
| "sparse_loss": 0.7088, | |
| "step": 10560 | |
| }, | |
| { | |
| "epoch": 1.6724628517230478, | |
| "grad_norm": 4.472427845001221, | |
| "learning_rate": 4.445849259025475e-06, | |
| "loss": 0.717, | |
| "sparse_loss": 0.717, | |
| "step": 10580 | |
| }, | |
| { | |
| "epoch": 1.6756244072083466, | |
| "grad_norm": 7.740157127380371, | |
| "learning_rate": 4.363052675099213e-06, | |
| "loss": 0.6795, | |
| "sparse_loss": 0.6795, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 1.6787859626936452, | |
| "grad_norm": 3.1040191650390625, | |
| "learning_rate": 4.2809605634265755e-06, | |
| "loss": 0.6909, | |
| "sparse_loss": 0.6909, | |
| "step": 10620 | |
| }, | |
| { | |
| "epoch": 1.681947518178944, | |
| "grad_norm": 18.65782356262207, | |
| "learning_rate": 4.199575726341346e-06, | |
| "loss": 0.6712, | |
| "sparse_loss": 0.6712, | |
| "step": 10640 | |
| }, | |
| { | |
| "epoch": 1.685109073664243, | |
| "grad_norm": 2.706319808959961, | |
| "learning_rate": 4.118900942033491e-06, | |
| "loss": 0.7309, | |
| "sparse_loss": 0.7309, | |
| "step": 10660 | |
| }, | |
| { | |
| "epoch": 1.6882706291495415, | |
| "grad_norm": 2.1650848388671875, | |
| "learning_rate": 4.0389389644542586e-06, | |
| "loss": 0.599, | |
| "sparse_loss": 0.599, | |
| "step": 10680 | |
| }, | |
| { | |
| "epoch": 1.6914321846348404, | |
| "grad_norm": 5.536136150360107, | |
| "learning_rate": 3.9596925232222196e-06, | |
| "loss": 0.6251, | |
| "sparse_loss": 0.6251, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 1.6945937401201392, | |
| "grad_norm": 4.87489652633667, | |
| "learning_rate": 3.881164323530062e-06, | |
| "loss": 0.6247, | |
| "sparse_loss": 0.6247, | |
| "step": 10720 | |
| }, | |
| { | |
| "epoch": 1.6977552956054378, | |
| "grad_norm": 4.932882308959961, | |
| "learning_rate": 3.8033570460522498e-06, | |
| "loss": 0.7014, | |
| "sparse_loss": 0.7014, | |
| "step": 10740 | |
| }, | |
| { | |
| "epoch": 1.7009168510907366, | |
| "grad_norm": 2.601973056793213, | |
| "learning_rate": 3.7262733468535317e-06, | |
| "loss": 0.651, | |
| "sparse_loss": 0.651, | |
| "step": 10760 | |
| }, | |
| { | |
| "epoch": 1.7040784065760355, | |
| "grad_norm": 2.757931709289551, | |
| "learning_rate": 3.649915857298242e-06, | |
| "loss": 0.6175, | |
| "sparse_loss": 0.6175, | |
| "step": 10780 | |
| }, | |
| { | |
| "epoch": 1.707239962061334, | |
| "grad_norm": 10.078351974487305, | |
| "learning_rate": 3.5742871839605006e-06, | |
| "loss": 0.6855, | |
| "sparse_loss": 0.6855, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 1.710401517546633, | |
| "grad_norm": 3.487138509750366, | |
| "learning_rate": 3.499389908535222e-06, | |
| "loss": 0.6298, | |
| "sparse_loss": 0.6298, | |
| "step": 10820 | |
| }, | |
| { | |
| "epoch": 1.7135630730319318, | |
| "grad_norm": 5.3016133308410645, | |
| "learning_rate": 3.425226587749977e-06, | |
| "loss": 0.6735, | |
| "sparse_loss": 0.6735, | |
| "step": 10840 | |
| }, | |
| { | |
| "epoch": 1.7167246285172304, | |
| "grad_norm": 3.6436634063720703, | |
| "learning_rate": 3.3517997532777485e-06, | |
| "loss": 0.6128, | |
| "sparse_loss": 0.6128, | |
| "step": 10860 | |
| }, | |
| { | |
| "epoch": 1.7198861840025292, | |
| "grad_norm": 4.625311374664307, | |
| "learning_rate": 3.2791119116504703e-06, | |
| "loss": 0.6725, | |
| "sparse_loss": 0.6725, | |
| "step": 10880 | |
| }, | |
| { | |
| "epoch": 1.723047739487828, | |
| "grad_norm": 3.579132318496704, | |
| "learning_rate": 3.207165544173482e-06, | |
| "loss": 0.7013, | |
| "sparse_loss": 0.7013, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 1.7262092949731267, | |
| "grad_norm": 4.380514144897461, | |
| "learning_rate": 3.1359631068408224e-06, | |
| "loss": 0.6835, | |
| "sparse_loss": 0.6835, | |
| "step": 10920 | |
| }, | |
| { | |
| "epoch": 1.7293708504584255, | |
| "grad_norm": 3.3078274726867676, | |
| "learning_rate": 3.0655070302513884e-06, | |
| "loss": 0.6052, | |
| "sparse_loss": 0.6052, | |
| "step": 10940 | |
| }, | |
| { | |
| "epoch": 1.7325324059437244, | |
| "grad_norm": 3.0710272789001465, | |
| "learning_rate": 2.9957997195259796e-06, | |
| "loss": 0.7106, | |
| "sparse_loss": 0.7106, | |
| "step": 10960 | |
| }, | |
| { | |
| "epoch": 1.735693961429023, | |
| "grad_norm": 3.607321262359619, | |
| "learning_rate": 2.926843554225167e-06, | |
| "loss": 0.7583, | |
| "sparse_loss": 0.7583, | |
| "step": 10980 | |
| }, | |
| { | |
| "epoch": 1.7388555169143218, | |
| "grad_norm": 2.6866536140441895, | |
| "learning_rate": 2.8586408882680827e-06, | |
| "loss": 0.8333, | |
| "sparse_loss": 0.8333, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.7420170723996207, | |
| "grad_norm": 3.9435837268829346, | |
| "learning_rate": 2.791194049852075e-06, | |
| "loss": 0.6172, | |
| "sparse_loss": 0.6172, | |
| "step": 11020 | |
| }, | |
| { | |
| "epoch": 1.7451786278849193, | |
| "grad_norm": 3.074887275695801, | |
| "learning_rate": 2.7245053413731876e-06, | |
| "loss": 0.6502, | |
| "sparse_loss": 0.6502, | |
| "step": 11040 | |
| }, | |
| { | |
| "epoch": 1.7483401833702181, | |
| "grad_norm": 4.271072864532471, | |
| "learning_rate": 2.6585770393476288e-06, | |
| "loss": 0.6979, | |
| "sparse_loss": 0.6979, | |
| "step": 11060 | |
| }, | |
| { | |
| "epoch": 1.751501738855517, | |
| "grad_norm": 2.711543083190918, | |
| "learning_rate": 2.593411394334e-06, | |
| "loss": 0.5692, | |
| "sparse_loss": 0.5692, | |
| "step": 11080 | |
| }, | |
| { | |
| "epoch": 1.7546632943408156, | |
| "grad_norm": 2.817941904067993, | |
| "learning_rate": 2.529010630856507e-06, | |
| "loss": 0.6522, | |
| "sparse_loss": 0.6522, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 1.7578248498261144, | |
| "grad_norm": 2.2273244857788086, | |
| "learning_rate": 2.465376947329015e-06, | |
| "loss": 0.6217, | |
| "sparse_loss": 0.6217, | |
| "step": 11120 | |
| }, | |
| { | |
| "epoch": 1.7609864053114133, | |
| "grad_norm": 9.07394027709961, | |
| "learning_rate": 2.402512515979974e-06, | |
| "loss": 0.656, | |
| "sparse_loss": 0.656, | |
| "step": 11140 | |
| }, | |
| { | |
| "epoch": 1.7641479607967119, | |
| "grad_norm": 5.390905857086182, | |
| "learning_rate": 2.3404194827783223e-06, | |
| "loss": 0.7755, | |
| "sparse_loss": 0.7755, | |
| "step": 11160 | |
| }, | |
| { | |
| "epoch": 1.7673095162820107, | |
| "grad_norm": 4.0121941566467285, | |
| "learning_rate": 2.2790999673601736e-06, | |
| "loss": 0.6861, | |
| "sparse_loss": 0.6861, | |
| "step": 11180 | |
| }, | |
| { | |
| "epoch": 1.7704710717673096, | |
| "grad_norm": 4.383898735046387, | |
| "learning_rate": 2.218556062956506e-06, | |
| "loss": 0.6274, | |
| "sparse_loss": 0.6274, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 1.7736326272526082, | |
| "grad_norm": 6.343504428863525, | |
| "learning_rate": 2.158789836321673e-06, | |
| "loss": 0.6694, | |
| "sparse_loss": 0.6694, | |
| "step": 11220 | |
| }, | |
| { | |
| "epoch": 1.776794182737907, | |
| "grad_norm": 3.90850567817688, | |
| "learning_rate": 2.0998033276628525e-06, | |
| "loss": 0.5824, | |
| "sparse_loss": 0.5824, | |
| "step": 11240 | |
| }, | |
| { | |
| "epoch": 1.7799557382232059, | |
| "grad_norm": 3.579479694366455, | |
| "learning_rate": 2.0415985505704476e-06, | |
| "loss": 0.6302, | |
| "sparse_loss": 0.6302, | |
| "step": 11260 | |
| }, | |
| { | |
| "epoch": 1.7831172937085045, | |
| "grad_norm": 2.917056083679199, | |
| "learning_rate": 1.984177491949285e-06, | |
| "loss": 0.6404, | |
| "sparse_loss": 0.6404, | |
| "step": 11280 | |
| }, | |
| { | |
| "epoch": 1.7862788491938033, | |
| "grad_norm": 3.728649139404297, | |
| "learning_rate": 1.927542111950836e-06, | |
| "loss": 0.5912, | |
| "sparse_loss": 0.5912, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 1.7894404046791021, | |
| "grad_norm": 3.583627700805664, | |
| "learning_rate": 1.8716943439062883e-06, | |
| "loss": 0.6079, | |
| "sparse_loss": 0.6079, | |
| "step": 11320 | |
| }, | |
| { | |
| "epoch": 1.7926019601644008, | |
| "grad_norm": 3.6497068405151367, | |
| "learning_rate": 1.8166360942605348e-06, | |
| "loss": 0.6869, | |
| "sparse_loss": 0.6869, | |
| "step": 11340 | |
| }, | |
| { | |
| "epoch": 1.7957635156496996, | |
| "grad_norm": 3.265042304992676, | |
| "learning_rate": 1.7623692425071225e-06, | |
| "loss": 0.6614, | |
| "sparse_loss": 0.6614, | |
| "step": 11360 | |
| }, | |
| { | |
| "epoch": 1.7989250711349984, | |
| "grad_norm": 4.365682125091553, | |
| "learning_rate": 1.708895641124064e-06, | |
| "loss": 0.7749, | |
| "sparse_loss": 0.7749, | |
| "step": 11380 | |
| }, | |
| { | |
| "epoch": 1.802086626620297, | |
| "grad_norm": 3.0027592182159424, | |
| "learning_rate": 1.656217115510636e-06, | |
| "loss": 0.665, | |
| "sparse_loss": 0.665, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 1.805248182105596, | |
| "grad_norm": 1.8759933710098267, | |
| "learning_rate": 1.6043354639250301e-06, | |
| "loss": 0.6043, | |
| "sparse_loss": 0.6043, | |
| "step": 11420 | |
| }, | |
| { | |
| "epoch": 1.8084097375908947, | |
| "grad_norm": 4.52070951461792, | |
| "learning_rate": 1.553252457422985e-06, | |
| "loss": 0.6661, | |
| "sparse_loss": 0.6661, | |
| "step": 11440 | |
| }, | |
| { | |
| "epoch": 1.8115712930761934, | |
| "grad_norm": 5.161223888397217, | |
| "learning_rate": 1.5029698397973274e-06, | |
| "loss": 0.6998, | |
| "sparse_loss": 0.6998, | |
| "step": 11460 | |
| }, | |
| { | |
| "epoch": 1.8147328485614924, | |
| "grad_norm": 13.196478843688965, | |
| "learning_rate": 1.4534893275184397e-06, | |
| "loss": 0.7442, | |
| "sparse_loss": 0.7442, | |
| "step": 11480 | |
| }, | |
| { | |
| "epoch": 1.817894404046791, | |
| "grad_norm": 2.45226788520813, | |
| "learning_rate": 1.4048126096756847e-06, | |
| "loss": 0.6336, | |
| "sparse_loss": 0.6336, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 1.8210559595320897, | |
| "grad_norm": 14.519777297973633, | |
| "learning_rate": 1.3569413479197129e-06, | |
| "loss": 0.7014, | |
| "sparse_loss": 0.7014, | |
| "step": 11520 | |
| }, | |
| { | |
| "epoch": 1.8242175150173887, | |
| "grad_norm": 3.6050455570220947, | |
| "learning_rate": 1.3098771764057715e-06, | |
| "loss": 0.7002, | |
| "sparse_loss": 0.7002, | |
| "step": 11540 | |
| }, | |
| { | |
| "epoch": 1.8273790705026873, | |
| "grad_norm": 2.05651593208313, | |
| "learning_rate": 1.2636217017378992e-06, | |
| "loss": 0.6105, | |
| "sparse_loss": 0.6105, | |
| "step": 11560 | |
| }, | |
| { | |
| "epoch": 1.830540625987986, | |
| "grad_norm": 5.284204483032227, | |
| "learning_rate": 1.2181765029140868e-06, | |
| "loss": 0.6764, | |
| "sparse_loss": 0.6764, | |
| "step": 11580 | |
| }, | |
| { | |
| "epoch": 1.833702181473285, | |
| "grad_norm": 2.5336947441101074, | |
| "learning_rate": 1.173543131272395e-06, | |
| "loss": 0.6703, | |
| "sparse_loss": 0.6703, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 1.8368637369585836, | |
| "grad_norm": 2.767129421234131, | |
| "learning_rate": 1.1297231104379691e-06, | |
| "loss": 0.7235, | |
| "sparse_loss": 0.7235, | |
| "step": 11620 | |
| }, | |
| { | |
| "epoch": 1.8400252924438822, | |
| "grad_norm": 3.1908180713653564, | |
| "learning_rate": 1.0867179362710367e-06, | |
| "loss": 0.7188, | |
| "sparse_loss": 0.7188, | |
| "step": 11640 | |
| }, | |
| { | |
| "epoch": 1.8431868479291813, | |
| "grad_norm": 3.3988351821899414, | |
| "learning_rate": 1.0445290768158561e-06, | |
| "loss": 0.6119, | |
| "sparse_loss": 0.6119, | |
| "step": 11660 | |
| }, | |
| { | |
| "epoch": 1.84634840341448, | |
| "grad_norm": 3.3586833477020264, | |
| "learning_rate": 1.0031579722505902e-06, | |
| "loss": 0.5684, | |
| "sparse_loss": 0.5684, | |
| "step": 11680 | |
| }, | |
| { | |
| "epoch": 1.8495099588997785, | |
| "grad_norm": 2.7259163856506348, | |
| "learning_rate": 9.626060348381482e-07, | |
| "loss": 0.7029, | |
| "sparse_loss": 0.7029, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 1.8526715143850776, | |
| "grad_norm": 4.455019474029541, | |
| "learning_rate": 9.228746488779777e-07, | |
| "loss": 0.6075, | |
| "sparse_loss": 0.6075, | |
| "step": 11720 | |
| }, | |
| { | |
| "epoch": 1.8558330698703762, | |
| "grad_norm": 3.4989333152770996, | |
| "learning_rate": 8.839651706588042e-07, | |
| "loss": 0.6807, | |
| "sparse_loss": 0.6807, | |
| "step": 11740 | |
| }, | |
| { | |
| "epoch": 1.8589946253556748, | |
| "grad_norm": 3.912893295288086, | |
| "learning_rate": 8.458789284123359e-07, | |
| "loss": 0.6755, | |
| "sparse_loss": 0.6755, | |
| "step": 11760 | |
| }, | |
| { | |
| "epoch": 1.862156180840974, | |
| "grad_norm": 3.059116840362549, | |
| "learning_rate": 8.086172222679184e-07, | |
| "loss": 0.6841, | |
| "sparse_loss": 0.6841, | |
| "step": 11780 | |
| }, | |
| { | |
| "epoch": 1.8653177363262725, | |
| "grad_norm": 5.806337356567383, | |
| "learning_rate": 7.721813242081682e-07, | |
| "loss": 0.6623, | |
| "sparse_loss": 0.6623, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 1.8684792918115711, | |
| "grad_norm": 3.4639861583709717, | |
| "learning_rate": 7.365724780255239e-07, | |
| "loss": 0.7208, | |
| "sparse_loss": 0.7208, | |
| "step": 11820 | |
| }, | |
| { | |
| "epoch": 1.8716408472968702, | |
| "grad_norm": 7.740152359008789, | |
| "learning_rate": 7.017918992798272e-07, | |
| "loss": 0.6936, | |
| "sparse_loss": 0.6936, | |
| "step": 11840 | |
| }, | |
| { | |
| "epoch": 1.8748024027821688, | |
| "grad_norm": 2.8128342628479004, | |
| "learning_rate": 6.678407752567756e-07, | |
| "loss": 0.615, | |
| "sparse_loss": 0.615, | |
| "step": 11860 | |
| }, | |
| { | |
| "epoch": 1.8779639582674676, | |
| "grad_norm": 4.114563465118408, | |
| "learning_rate": 6.34720264927438e-07, | |
| "loss": 0.635, | |
| "sparse_loss": 0.635, | |
| "step": 11880 | |
| }, | |
| { | |
| "epoch": 1.8811255137527665, | |
| "grad_norm": 3.8733608722686768, | |
| "learning_rate": 6.024314989086788e-07, | |
| "loss": 0.6929, | |
| "sparse_loss": 0.6929, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 1.884287069238065, | |
| "grad_norm": 5.899589538574219, | |
| "learning_rate": 5.709755794245458e-07, | |
| "loss": 0.6765, | |
| "sparse_loss": 0.6765, | |
| "step": 11920 | |
| }, | |
| { | |
| "epoch": 1.887448624723364, | |
| "grad_norm": 2.3009846210479736, | |
| "learning_rate": 5.403535802686738e-07, | |
| "loss": 0.6189, | |
| "sparse_loss": 0.6189, | |
| "step": 11940 | |
| }, | |
| { | |
| "epoch": 1.8906101802086628, | |
| "grad_norm": 2.454423427581787, | |
| "learning_rate": 5.105665467675963e-07, | |
| "loss": 0.6262, | |
| "sparse_loss": 0.6262, | |
| "step": 11960 | |
| }, | |
| { | |
| "epoch": 1.8937717356939614, | |
| "grad_norm": 3.036147356033325, | |
| "learning_rate": 4.816154957450831e-07, | |
| "loss": 0.5448, | |
| "sparse_loss": 0.5448, | |
| "step": 11980 | |
| }, | |
| { | |
| "epoch": 1.8969332911792602, | |
| "grad_norm": 4.604625701904297, | |
| "learning_rate": 4.53501415487434e-07, | |
| "loss": 0.6672, | |
| "sparse_loss": 0.6672, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 1.900094846664559, | |
| "grad_norm": 3.325061798095703, | |
| "learning_rate": 4.2622526570972044e-07, | |
| "loss": 0.6019, | |
| "sparse_loss": 0.6019, | |
| "step": 12020 | |
| }, | |
| { | |
| "epoch": 1.9032564021498577, | |
| "grad_norm": 4.258634090423584, | |
| "learning_rate": 3.997879775230445e-07, | |
| "loss": 0.6101, | |
| "sparse_loss": 0.6101, | |
| "step": 12040 | |
| }, | |
| { | |
| "epoch": 1.9064179576351565, | |
| "grad_norm": 2.745493173599243, | |
| "learning_rate": 3.741904534027424e-07, | |
| "loss": 0.6053, | |
| "sparse_loss": 0.6053, | |
| "step": 12060 | |
| }, | |
| { | |
| "epoch": 1.9095795131204554, | |
| "grad_norm": 3.0191283226013184, | |
| "learning_rate": 3.494335671575755e-07, | |
| "loss": 0.6834, | |
| "sparse_loss": 0.6834, | |
| "step": 12080 | |
| }, | |
| { | |
| "epoch": 1.912741068605754, | |
| "grad_norm": 2.9336516857147217, | |
| "learning_rate": 3.255181638999211e-07, | |
| "loss": 0.6392, | |
| "sparse_loss": 0.6392, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 1.9159026240910528, | |
| "grad_norm": 3.190945625305176, | |
| "learning_rate": 3.0244506001689543e-07, | |
| "loss": 0.6366, | |
| "sparse_loss": 0.6366, | |
| "step": 12120 | |
| }, | |
| { | |
| "epoch": 1.9190641795763517, | |
| "grad_norm": 3.285252571105957, | |
| "learning_rate": 2.8021504314250934e-07, | |
| "loss": 0.7014, | |
| "sparse_loss": 0.7014, | |
| "step": 12140 | |
| }, | |
| { | |
| "epoch": 1.9222257350616503, | |
| "grad_norm": 7.00942850112915, | |
| "learning_rate": 2.588288721307619e-07, | |
| "loss": 0.618, | |
| "sparse_loss": 0.618, | |
| "step": 12160 | |
| }, | |
| { | |
| "epoch": 1.9253872905469491, | |
| "grad_norm": 5.004833698272705, | |
| "learning_rate": 2.3828727702975007e-07, | |
| "loss": 0.6935, | |
| "sparse_loss": 0.6935, | |
| "step": 12180 | |
| }, | |
| { | |
| "epoch": 1.928548846032248, | |
| "grad_norm": 4.934511184692383, | |
| "learning_rate": 2.1859095905674143e-07, | |
| "loss": 0.6286, | |
| "sparse_loss": 0.6286, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 1.9317104015175466, | |
| "grad_norm": 2.8532612323760986, | |
| "learning_rate": 1.9974059057423223e-07, | |
| "loss": 0.661, | |
| "sparse_loss": 0.661, | |
| "step": 12220 | |
| }, | |
| { | |
| "epoch": 1.9348719570028454, | |
| "grad_norm": 3.2379302978515625, | |
| "learning_rate": 1.8173681506701013e-07, | |
| "loss": 0.6822, | |
| "sparse_loss": 0.6822, | |
| "step": 12240 | |
| }, | |
| { | |
| "epoch": 1.9380335124881443, | |
| "grad_norm": 3.928940534591675, | |
| "learning_rate": 1.6458024712017182e-07, | |
| "loss": 0.6936, | |
| "sparse_loss": 0.6936, | |
| "step": 12260 | |
| }, | |
| { | |
| "epoch": 1.9411950679734429, | |
| "grad_norm": 2.6900010108947754, | |
| "learning_rate": 1.4827147239815097e-07, | |
| "loss": 0.6284, | |
| "sparse_loss": 0.6284, | |
| "step": 12280 | |
| }, | |
| { | |
| "epoch": 1.9443566234587417, | |
| "grad_norm": 4.513027667999268, | |
| "learning_rate": 1.328110476247285e-07, | |
| "loss": 0.6469, | |
| "sparse_loss": 0.6469, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 1.9475181789440406, | |
| "grad_norm": 16.359464645385742, | |
| "learning_rate": 1.181995005640174e-07, | |
| "loss": 0.737, | |
| "sparse_loss": 0.737, | |
| "step": 12320 | |
| }, | |
| { | |
| "epoch": 1.9506797344293392, | |
| "grad_norm": 2.822321653366089, | |
| "learning_rate": 1.0443733000246037e-07, | |
| "loss": 0.7011, | |
| "sparse_loss": 0.7011, | |
| "step": 12340 | |
| }, | |
| { | |
| "epoch": 1.953841289914638, | |
| "grad_norm": 2.8069210052490234, | |
| "learning_rate": 9.152500573179345e-08, | |
| "loss": 0.6338, | |
| "sparse_loss": 0.6338, | |
| "step": 12360 | |
| }, | |
| { | |
| "epoch": 1.9570028453999369, | |
| "grad_norm": 2.046774387359619, | |
| "learning_rate": 7.946296853300895e-08, | |
| "loss": 0.6635, | |
| "sparse_loss": 0.6635, | |
| "step": 12380 | |
| }, | |
| { | |
| "epoch": 1.9601644008852355, | |
| "grad_norm": 3.2035796642303467, | |
| "learning_rate": 6.825163016132007e-08, | |
| "loss": 0.6019, | |
| "sparse_loss": 0.6019, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 1.9633259563705343, | |
| "grad_norm": 3.2717444896698, | |
| "learning_rate": 5.78913733320835e-08, | |
| "loss": 0.6897, | |
| "sparse_loss": 0.6897, | |
| "step": 12420 | |
| }, | |
| { | |
| "epoch": 1.9664875118558331, | |
| "grad_norm": 3.2161619663238525, | |
| "learning_rate": 4.8382551707762403e-08, | |
| "loss": 0.6334, | |
| "sparse_loss": 0.6334, | |
| "step": 12440 | |
| }, | |
| { | |
| "epoch": 1.9696490673411318, | |
| "grad_norm": 3.822545051574707, | |
| "learning_rate": 3.972548988582792e-08, | |
| "loss": 0.7097, | |
| "sparse_loss": 0.7097, | |
| "step": 12460 | |
| }, | |
| { | |
| "epoch": 1.9728106228264306, | |
| "grad_norm": 3.633744239807129, | |
| "learning_rate": 3.192048338769293e-08, | |
| "loss": 0.758, | |
| "sparse_loss": 0.758, | |
| "step": 12480 | |
| }, | |
| { | |
| "epoch": 1.9759721783117294, | |
| "grad_norm": 3.19952392578125, | |
| "learning_rate": 2.496779864862575e-08, | |
| "loss": 0.6445, | |
| "sparse_loss": 0.6445, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 1.979133733797028, | |
| "grad_norm": 2.4408233165740967, | |
| "learning_rate": 1.886767300864345e-08, | |
| "loss": 0.6809, | |
| "sparse_loss": 0.6809, | |
| "step": 12520 | |
| }, | |
| { | |
| "epoch": 1.982295289282327, | |
| "grad_norm": 2.962395191192627, | |
| "learning_rate": 1.362031470441838e-08, | |
| "loss": 0.6987, | |
| "sparse_loss": 0.6987, | |
| "step": 12540 | |
| }, | |
| { | |
| "epoch": 1.9854568447676257, | |
| "grad_norm": 2.5761423110961914, | |
| "learning_rate": 9.225902862172731e-09, | |
| "loss": 0.6249, | |
| "sparse_loss": 0.6249, | |
| "step": 12560 | |
| }, | |
| { | |
| "epoch": 1.9886184002529244, | |
| "grad_norm": 2.363164186477661, | |
| "learning_rate": 5.684587491550097e-09, | |
| "loss": 0.6844, | |
| "sparse_loss": 0.6844, | |
| "step": 12580 | |
| }, | |
| { | |
| "epoch": 1.9917799557382232, | |
| "grad_norm": 2.3563790321350098, | |
| "learning_rate": 2.996489480514009e-09, | |
| "loss": 0.7159, | |
| "sparse_loss": 0.7159, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 1.994941511223522, | |
| "grad_norm": 2.364633321762085, | |
| "learning_rate": 1.1617005911984668e-09, | |
| "loss": 0.7133, | |
| "sparse_loss": 0.7133, | |
| "step": 12620 | |
| }, | |
| { | |
| "epoch": 1.9981030667088207, | |
| "grad_norm": 8.201303482055664, | |
| "learning_rate": 1.8028345680209946e-10, | |
| "loss": 0.6341, | |
| "sparse_loss": 0.6341, | |
| "step": 12640 | |
| } | |
| ], | |
| "logging_steps": 20, | |
| "max_steps": 12652, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 2000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |