Instructions to use WhiteGiverPlus/Qwen3.5-2B-metamath with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use WhiteGiverPlus/Qwen3.5-2B-metamath with PEFT:
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3.5-2B")
model = PeftModel.from_pretrained(base_model, "WhiteGiverPlus/Qwen3.5-2B-metamath")
- Transformers
How to use WhiteGiverPlus/Qwen3.5-2B-metamath with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="WhiteGiverPlus/Qwen3.5-2B-metamath")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("WhiteGiverPlus/Qwen3.5-2B-metamath", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use WhiteGiverPlus/Qwen3.5-2B-metamath with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "WhiteGiverPlus/Qwen3.5-2B-metamath"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "WhiteGiverPlus/Qwen3.5-2B-metamath",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
Use Docker
docker model run hf.co/WhiteGiverPlus/Qwen3.5-2B-metamath
- SGLang
How to use WhiteGiverPlus/Qwen3.5-2B-metamath with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "WhiteGiverPlus/Qwen3.5-2B-metamath" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "WhiteGiverPlus/Qwen3.5-2B-metamath",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "WhiteGiverPlus/Qwen3.5-2B-metamath" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "WhiteGiverPlus/Qwen3.5-2B-metamath",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
- Docker Model Runner
How to use WhiteGiverPlus/Qwen3.5-2B-metamath with Docker Model Runner:
docker model run hf.co/WhiteGiverPlus/Qwen3.5-2B-metamath
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 250, | |
| "global_step": 2865, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.010479434110558029, | |
| "grad_norm": 0.19915591180324554, | |
| "learning_rate": 1.0465116279069768e-05, | |
| "loss": 1.1350045204162598, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.020958868221116058, | |
| "grad_norm": 0.18158815801143646, | |
| "learning_rate": 2.2093023255813955e-05, | |
| "loss": 1.0580164909362793, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.03143830233167409, | |
| "grad_norm": 0.16481591761112213, | |
| "learning_rate": 3.372093023255814e-05, | |
| "loss": 0.9252842903137207, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.041917736442232116, | |
| "grad_norm": 0.15599584579467773, | |
| "learning_rate": 4.5348837209302326e-05, | |
| "loss": 0.8342072486877441, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.05239717055279015, | |
| "grad_norm": 0.1804327368736267, | |
| "learning_rate": 5.697674418604652e-05, | |
| "loss": 0.7955524921417236, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.06287660466334818, | |
| "grad_norm": 0.16934047639369965, | |
| "learning_rate": 6.86046511627907e-05, | |
| "loss": 0.7358035087585449, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.07335603877390622, | |
| "grad_norm": 0.2234930843114853, | |
| "learning_rate": 8.023255813953489e-05, | |
| "loss": 0.6985861301422119, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.08383547288446423, | |
| "grad_norm": 0.16290400922298431, | |
| "learning_rate": 9.186046511627907e-05, | |
| "loss": 0.599607515335083, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.09431490699502226, | |
| "grad_norm": 0.1660464107990265, | |
| "learning_rate": 9.999971245570617e-05, | |
| "loss": 0.5886398315429687, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.1047943411055803, | |
| "grad_norm": 0.16978025436401367, | |
| "learning_rate": 9.999460064915317e-05, | |
| "loss": 0.5450529098510742, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.11527377521613832, | |
| "grad_norm": 0.21447990834712982, | |
| "learning_rate": 9.998309972134645e-05, | |
| "loss": 0.5072262287139893, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.12575320932669637, | |
| "grad_norm": 0.17418669164180756, | |
| "learning_rate": 9.996521114206116e-05, | |
| "loss": 0.49445347785949706, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.13623264343725439, | |
| "grad_norm": 0.22226351499557495, | |
| "learning_rate": 9.994093719739023e-05, | |
| "loss": 0.47142682075500486, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.14671207754781243, | |
| "grad_norm": 0.1745530068874359, | |
| "learning_rate": 9.991028098945215e-05, | |
| "loss": 0.46663532257080076, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.15719151165837045, | |
| "grad_norm": 0.17074695229530334, | |
| "learning_rate": 9.987324643599459e-05, | |
| "loss": 0.4508847236633301, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.16767094576892846, | |
| "grad_norm": 0.13428406417369843, | |
| "learning_rate": 9.982983826989367e-05, | |
| "loss": 0.40740265846252444, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.1781503798794865, | |
| "grad_norm": 0.17766578495502472, | |
| "learning_rate": 9.978006203854918e-05, | |
| "loss": 0.3998516321182251, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.18862981399004453, | |
| "grad_norm": 0.1672629565000534, | |
| "learning_rate": 9.972392410317562e-05, | |
| "loss": 0.41658673286437986, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.19910924810060257, | |
| "grad_norm": 0.1333673745393753, | |
| "learning_rate": 9.96614316379892e-05, | |
| "loss": 0.37024455070495604, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.2095886822111606, | |
| "grad_norm": 0.18037110567092896, | |
| "learning_rate": 9.959259262929113e-05, | |
| "loss": 0.35086841583251954, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.22006811632171863, | |
| "grad_norm": 0.14616410434246063, | |
| "learning_rate": 9.951741587444683e-05, | |
| "loss": 0.37918968200683595, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.23054755043227665, | |
| "grad_norm": 0.14523574709892273, | |
| "learning_rate": 9.943591098076184e-05, | |
| "loss": 0.32804527282714846, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.2410269845428347, | |
| "grad_norm": 0.14667049050331116, | |
| "learning_rate": 9.934808836425393e-05, | |
| "loss": 0.3480507850646973, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.25150641865339274, | |
| "grad_norm": 0.18156558275222778, | |
| "learning_rate": 9.925395924832198e-05, | |
| "loss": 0.3300448179244995, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.26198585276395076, | |
| "grad_norm": 0.13806430995464325, | |
| "learning_rate": 9.91535356623117e-05, | |
| "loss": 0.3127591609954834, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.26198585276395076, | |
| "eval_loss": 0.3132782578468323, | |
| "eval_runtime": 94.8848, | |
| "eval_samples_per_second": 3.278, | |
| "eval_steps_per_second": 3.278, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.27246528687450877, | |
| "grad_norm": 0.17205959558486938, | |
| "learning_rate": 9.904683043997835e-05, | |
| "loss": 0.3306673288345337, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.2829447209850668, | |
| "grad_norm": 0.12620031833648682, | |
| "learning_rate": 9.893385721784656e-05, | |
| "loss": 0.3011106729507446, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.29342415509562486, | |
| "grad_norm": 0.11466006934642792, | |
| "learning_rate": 9.881463043346768e-05, | |
| "loss": 0.2951968669891357, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.3039035892061829, | |
| "grad_norm": 0.1671207845211029, | |
| "learning_rate": 9.868916532357475e-05, | |
| "loss": 0.2910990953445435, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.3143830233167409, | |
| "grad_norm": 0.1683349907398224, | |
| "learning_rate": 9.855747792213521e-05, | |
| "loss": 0.31409192085266113, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.3248624574272989, | |
| "grad_norm": 0.12934699654579163, | |
| "learning_rate": 9.84195850583019e-05, | |
| "loss": 0.27755858898162844, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.33534189153785693, | |
| "grad_norm": 0.13784605264663696, | |
| "learning_rate": 9.827550435426234e-05, | |
| "loss": 0.2809821605682373, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.345821325648415, | |
| "grad_norm": 0.18590271472930908, | |
| "learning_rate": 9.812525422298664e-05, | |
| "loss": 0.28698866367340087, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.356300759758973, | |
| "grad_norm": 0.1704522967338562, | |
| "learning_rate": 9.796885386587447e-05, | |
| "loss": 0.250814414024353, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.36678019386953103, | |
| "grad_norm": 0.1316167265176773, | |
| "learning_rate": 9.780632327030112e-05, | |
| "loss": 0.25458922386169436, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.37725962798008905, | |
| "grad_norm": 0.16226200759410858, | |
| "learning_rate": 9.763768320706319e-05, | |
| "loss": 0.26563262939453125, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.3877390620906471, | |
| "grad_norm": 0.1297195851802826, | |
| "learning_rate": 9.746295522772424e-05, | |
| "loss": 0.2632328748703003, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.39821849620120514, | |
| "grad_norm": 0.1286139190196991, | |
| "learning_rate": 9.728216166186049e-05, | |
| "loss": 0.2624588251113892, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.40869793031176316, | |
| "grad_norm": 0.1587965339422226, | |
| "learning_rate": 9.709532561420725e-05, | |
| "loss": 0.24741590023040771, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.4191773644223212, | |
| "grad_norm": 0.11963177472352982, | |
| "learning_rate": 9.690247096170615e-05, | |
| "loss": 0.22777397632598878, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.42965679853287925, | |
| "grad_norm": 0.13638927042484283, | |
| "learning_rate": 9.670362235045387e-05, | |
| "loss": 0.23324952125549317, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.44013623264343726, | |
| "grad_norm": 0.1514088362455368, | |
| "learning_rate": 9.649880519255232e-05, | |
| "loss": 0.2505915880203247, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.4506156667539953, | |
| "grad_norm": 0.10994207113981247, | |
| "learning_rate": 9.62880456628612e-05, | |
| "loss": 0.2078850269317627, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.4610951008645533, | |
| "grad_norm": 0.11983369290828705, | |
| "learning_rate": 9.607137069565288e-05, | |
| "loss": 0.21452484130859376, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.47157453497511137, | |
| "grad_norm": 0.12684305012226105, | |
| "learning_rate": 9.58488079811703e-05, | |
| "loss": 0.22002685070037842, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.4820539690856694, | |
| "grad_norm": 0.16841623187065125, | |
| "learning_rate": 9.562038596208828e-05, | |
| "loss": 0.21405396461486817, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.4925334031962274, | |
| "grad_norm": 0.1498555839061737, | |
| "learning_rate": 9.538613382987865e-05, | |
| "loss": 0.20534911155700683, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.5030128373067855, | |
| "grad_norm": 0.13913628458976746, | |
| "learning_rate": 9.514608152107974e-05, | |
| "loss": 0.22248730659484864, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.5134922714173434, | |
| "grad_norm": 0.14408951997756958, | |
| "learning_rate": 9.490025971347047e-05, | |
| "loss": 0.214866042137146, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.5239717055279015, | |
| "grad_norm": 0.1649770438671112, | |
| "learning_rate": 9.464869982215001e-05, | |
| "loss": 0.19965900182724, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.5239717055279015, | |
| "eval_loss": 0.19267401099205017, | |
| "eval_runtime": 95.3374, | |
| "eval_samples_per_second": 3.262, | |
| "eval_steps_per_second": 3.262, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.5344511396384595, | |
| "grad_norm": 0.1305568665266037, | |
| "learning_rate": 9.439143399552291e-05, | |
| "loss": 0.21112546920776368, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.5449305737490175, | |
| "grad_norm": 0.11998175084590912, | |
| "learning_rate": 9.412849511119074e-05, | |
| "loss": 0.21422922611236572, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.5554100078595756, | |
| "grad_norm": 0.15220341086387634, | |
| "learning_rate": 9.385991677175046e-05, | |
| "loss": 0.20999882221221924, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.5658894419701336, | |
| "grad_norm": 0.13170023262500763, | |
| "learning_rate": 9.358573330050004e-05, | |
| "loss": 0.20208392143249512, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.5763688760806917, | |
| "grad_norm": 0.10457764565944672, | |
| "learning_rate": 9.330597973705219e-05, | |
| "loss": 0.1908803701400757, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.5868483101912497, | |
| "grad_norm": 0.12568537890911102, | |
| "learning_rate": 9.302069183285637e-05, | |
| "loss": 0.19316340684890748, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.5973277443018077, | |
| "grad_norm": 0.14824528992176056, | |
| "learning_rate": 9.272990604662988e-05, | |
| "loss": 0.18987581729888917, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.6078071784123658, | |
| "grad_norm": 0.14521734416484833, | |
| "learning_rate": 9.243365953969861e-05, | |
| "loss": 0.19232832193374633, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.6182866125229237, | |
| "grad_norm": 0.1335408091545105, | |
| "learning_rate": 9.213199017124793e-05, | |
| "loss": 0.1758212924003601, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.6287660466334818, | |
| "grad_norm": 0.11143071949481964, | |
| "learning_rate": 9.182493649348447e-05, | |
| "loss": 0.19117680788040162, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.6392454807440399, | |
| "grad_norm": 0.14789296686649323, | |
| "learning_rate": 9.151253774670921e-05, | |
| "loss": 0.184559965133667, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.6497249148545978, | |
| "grad_norm": 0.10541336238384247, | |
| "learning_rate": 9.119483385430283e-05, | |
| "loss": 0.1720304846763611, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.6602043489651559, | |
| "grad_norm": 0.12105975300073624, | |
| "learning_rate": 9.087186541762358e-05, | |
| "loss": 0.17654836177825928, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.6706837830757139, | |
| "grad_norm": 0.13114669919013977, | |
| "learning_rate": 9.054367371081858e-05, | |
| "loss": 0.1696592688560486, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.6811632171862719, | |
| "grad_norm": 0.13745592534542084, | |
| "learning_rate": 9.021030067554919e-05, | |
| "loss": 0.15404462814331055, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.69164265129683, | |
| "grad_norm": 0.15927442908287048, | |
| "learning_rate": 8.987178891563094e-05, | |
| "loss": 0.17024366855621337, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.702122085407388, | |
| "grad_norm": 0.13737429678440094, | |
| "learning_rate": 8.952818169158903e-05, | |
| "loss": 0.1602048397064209, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.712601519517946, | |
| "grad_norm": 0.13941751420497894, | |
| "learning_rate": 8.91795229151297e-05, | |
| "loss": 0.18057082891464232, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.7230809536285041, | |
| "grad_norm": 0.14242954552173615, | |
| "learning_rate": 8.882585714352856e-05, | |
| "loss": 0.14863334894180297, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.7335603877390621, | |
| "grad_norm": 0.15553542971611023, | |
| "learning_rate": 8.846722957393626e-05, | |
| "loss": 0.15701137781143187, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.7440398218496201, | |
| "grad_norm": 0.12901411950588226, | |
| "learning_rate": 8.810368603760249e-05, | |
| "loss": 0.15571318864822387, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.7545192559601781, | |
| "grad_norm": 0.13449430465698242, | |
| "learning_rate": 8.773527299401902e-05, | |
| "loss": 0.16418551206588744, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.7649986900707362, | |
| "grad_norm": 0.10630270838737488, | |
| "learning_rate": 8.736203752498218e-05, | |
| "loss": 0.16800801753997802, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.7754781241812942, | |
| "grad_norm": 0.11299935728311539, | |
| "learning_rate": 8.698402732857611e-05, | |
| "loss": 0.15700833797454833, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.7859575582918522, | |
| "grad_norm": 0.11920930445194244, | |
| "learning_rate": 8.660129071307707e-05, | |
| "loss": 0.15091001987457275, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.7859575582918522, | |
| "eval_loss": 0.1356429010629654, | |
| "eval_runtime": 94.0557, | |
| "eval_samples_per_second": 3.307, | |
| "eval_steps_per_second": 3.307, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.7964369924024103, | |
| "grad_norm": 0.13870343565940857, | |
| "learning_rate": 8.621387659077986e-05, | |
| "loss": 0.1422027826309204, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.8069164265129684, | |
| "grad_norm": 0.12753477692604065, | |
| "learning_rate": 8.582183447174697e-05, | |
| "loss": 0.142450213432312, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.8173958606235263, | |
| "grad_norm": 0.11877496540546417, | |
| "learning_rate": 8.542521445748141e-05, | |
| "loss": 0.15361062288284302, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.8278752947340844, | |
| "grad_norm": 0.1200249195098877, | |
| "learning_rate": 8.502406723452392e-05, | |
| "loss": 0.14647477865219116, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.8383547288446423, | |
| "grad_norm": 0.12913794815540314, | |
| "learning_rate": 8.461844406797543e-05, | |
| "loss": 0.1591552734375, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.8488341629552004, | |
| "grad_norm": 0.17270176112651825, | |
| "learning_rate": 8.420839679494558e-05, | |
| "loss": 0.1495436668395996, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.8593135970657585, | |
| "grad_norm": 0.15545596182346344, | |
| "learning_rate": 8.379397781792808e-05, | |
| "loss": 0.15377395153045653, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.8697930311763165, | |
| "grad_norm": 0.12941111624240875, | |
| "learning_rate": 8.337524009810395e-05, | |
| "loss": 0.14733861684799193, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.8802724652868745, | |
| "grad_norm": 0.13152749836444855, | |
| "learning_rate": 8.295223714857319e-05, | |
| "loss": 0.13980752229690552, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.8907518993974325, | |
| "grad_norm": 0.11208872497081757, | |
| "learning_rate": 8.252502302751612e-05, | |
| "loss": 0.12019969224929809, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.9012313335079906, | |
| "grad_norm": 0.11118603497743607, | |
| "learning_rate": 8.209365233128482e-05, | |
| "loss": 0.13822466135025024, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.9117107676185486, | |
| "grad_norm": 0.11705653369426727, | |
| "learning_rate": 8.165818018742605e-05, | |
| "loss": 0.1439664840698242, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.9221902017291066, | |
| "grad_norm": 0.08817730098962784, | |
| "learning_rate": 8.121866224763606e-05, | |
| "loss": 0.13380355834960939, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.9326696358396647, | |
| "grad_norm": 0.1092257872223854, | |
| "learning_rate": 8.077515468064851e-05, | |
| "loss": 0.12982802391052245, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.9431490699502227, | |
| "grad_norm": 0.12680962681770325, | |
| "learning_rate": 8.032771416505647e-05, | |
| "loss": 0.1489071011543274, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.9536285040607807, | |
| "grad_norm": 0.11953219771385193, | |
| "learning_rate": 7.987639788206888e-05, | |
| "loss": 0.14020267724990845, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.9641079381713388, | |
| "grad_norm": 0.1041467934846878, | |
| "learning_rate": 7.942126350820318e-05, | |
| "loss": 0.1439213275909424, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.9745873722818967, | |
| "grad_norm": 0.1277916431427002, | |
| "learning_rate": 7.896236920791442e-05, | |
| "loss": 0.1468779683113098, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.9850668063924548, | |
| "grad_norm": 0.11245205253362656, | |
| "learning_rate": 7.849977362616201e-05, | |
| "loss": 0.12012372016906739, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.9955462405030129, | |
| "grad_norm": 0.12230483442544937, | |
| "learning_rate": 7.803353588091522e-05, | |
| "loss": 0.1488939881324768, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.005239717055279, | |
| "grad_norm": 0.14185865223407745, | |
| "learning_rate": 7.7563715555598e-05, | |
| "loss": 0.11488113403320313, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.015719151165837, | |
| "grad_norm": 0.10545773804187775, | |
| "learning_rate": 7.709037269147459e-05, | |
| "loss": 0.10712549686431885, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.026198585276395, | |
| "grad_norm": 0.10376274585723877, | |
| "learning_rate": 7.661356777997631e-05, | |
| "loss": 0.11428828239440918, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.0366780193869531, | |
| "grad_norm": 0.09950564056634903, | |
| "learning_rate": 7.613336175497111e-05, | |
| "loss": 0.09823058247566223, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.0471574534975112, | |
| "grad_norm": 0.10412753373384476, | |
| "learning_rate": 7.564981598497643e-05, | |
| "loss": 0.1106558084487915, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.0471574534975112, | |
| "eval_loss": 0.11185819655656815, | |
| "eval_runtime": 93.808, | |
| "eval_samples_per_second": 3.315, | |
| "eval_steps_per_second": 3.315, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.057636887608069, | |
| "grad_norm": 0.10430868715047836, | |
| "learning_rate": 7.516299226531645e-05, | |
| "loss": 0.11168640851974487, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.0681163217186271, | |
| "grad_norm": 0.09646806865930557, | |
| "learning_rate": 7.467295281022501e-05, | |
| "loss": 0.10711305141448975, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.0785957558291852, | |
| "grad_norm": 0.13060614466667175, | |
| "learning_rate": 7.417976024489474e-05, | |
| "loss": 0.10001810789108276, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.0890751899397433, | |
| "grad_norm": 0.10389085114002228, | |
| "learning_rate": 7.368347759747393e-05, | |
| "loss": 0.11893858909606933, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.0995546240503014, | |
| "grad_norm": 0.11291550099849701, | |
| "learning_rate": 7.318416829101164e-05, | |
| "loss": 0.1079628586769104, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.1100340581608594, | |
| "grad_norm": 0.10372598469257355, | |
| "learning_rate": 7.268189613535255e-05, | |
| "loss": 0.10332397222518921, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.1205134922714173, | |
| "grad_norm": 0.12971536815166473, | |
| "learning_rate": 7.217672531898225e-05, | |
| "loss": 0.10804877281188965, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.1309929263819753, | |
| "grad_norm": 0.10902425646781921, | |
| "learning_rate": 7.166872040082431e-05, | |
| "loss": 0.09947454929351807, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.1414723604925334, | |
| "grad_norm": 0.09305932372808456, | |
| "learning_rate": 7.11579463019897e-05, | |
| "loss": 0.09406971335411071, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.1519517946030915, | |
| "grad_norm": 0.11485275626182556, | |
| "learning_rate": 7.064446829748034e-05, | |
| "loss": 0.09943979978561401, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.1624312287136496, | |
| "grad_norm": 0.09556467831134796, | |
| "learning_rate": 7.0128352007847e-05, | |
| "loss": 0.10862170457839966, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.1729106628242074, | |
| "grad_norm": 0.11937833577394485, | |
| "learning_rate": 6.96096633908034e-05, | |
| "loss": 0.10385221242904663, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.1833900969347655, | |
| "grad_norm": 0.11560507863759995, | |
| "learning_rate": 6.908846873279691e-05, | |
| "loss": 0.09252402186393738, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.1938695310453236, | |
| "grad_norm": 0.11119654029607773, | |
| "learning_rate": 6.856483464053758e-05, | |
| "loss": 0.09637172818183899, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.2043489651558816, | |
| "grad_norm": 0.11722644418478012, | |
| "learning_rate": 6.803882803248585e-05, | |
| "loss": 0.09078751802444458, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.2148283992664397, | |
| "grad_norm": 0.10487739741802216, | |
| "learning_rate": 6.751051613030082e-05, | |
| "loss": 0.10334972143173218, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.2253078333769976, | |
| "grad_norm": 0.10202383995056152, | |
| "learning_rate": 6.697996645024937e-05, | |
| "loss": 0.08661433458328247, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.2357872674875556, | |
| "grad_norm": 0.11801143735647202, | |
| "learning_rate": 6.644724679457804e-05, | |
| "loss": 0.0997927188873291, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.2462667015981137, | |
| "grad_norm": 0.10949107259511948, | |
| "learning_rate": 6.591242524284802e-05, | |
| "loss": 0.0977592945098877, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.2567461357086718, | |
| "grad_norm": 0.10221222043037415, | |
| "learning_rate": 6.537557014323487e-05, | |
| "loss": 0.0970361053943634, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.2672255698192298, | |
| "grad_norm": 0.10554748773574829, | |
| "learning_rate": 6.483675010379393e-05, | |
| "loss": 0.09007551074028015, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.2777050039297877, | |
| "grad_norm": 0.11625627428293228, | |
| "learning_rate": 6.429603398369242e-05, | |
| "loss": 0.08734490275382996, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.2881844380403458, | |
| "grad_norm": 0.10624277591705322, | |
| "learning_rate": 6.37534908844095e-05, | |
| "loss": 0.09858485460281372, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.2986638721509038, | |
| "grad_norm": 0.10184557735919952, | |
| "learning_rate": 6.320919014090534e-05, | |
| "loss": 0.09335023164749146, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.309143306261462, | |
| "grad_norm": 0.10787283629179001, | |
| "learning_rate": 6.266320131276051e-05, | |
| "loss": 0.08665563464164734, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.309143306261462, | |
| "eval_loss": 0.08951585739850998, | |
| "eval_runtime": 94.0567, | |
| "eval_samples_per_second": 3.307, | |
| "eval_steps_per_second": 3.307, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.31962274037202, | |
| "grad_norm": 0.10836981981992722, | |
| "learning_rate": 6.211559417528631e-05, | |
| "loss": 0.0933380126953125, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.3301021744825778, | |
| "grad_norm": 0.1397171914577484, | |
| "learning_rate": 6.156643871060795e-05, | |
| "loss": 0.09835371971130372, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.340581608593136, | |
| "grad_norm": 0.11242218315601349, | |
| "learning_rate": 6.101580509872097e-05, | |
| "loss": 0.09398673176765442, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.351061042703694, | |
| "grad_norm": 0.10235017538070679, | |
| "learning_rate": 6.0463763708522536e-05, | |
| "loss": 0.10350929498672486, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.361540476814252, | |
| "grad_norm": 0.09327106177806854, | |
| "learning_rate": 5.99103850888186e-05, | |
| "loss": 0.09580238461494446, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.3720199109248101, | |
| "grad_norm": 0.12995658814907074, | |
| "learning_rate": 5.9355739959307976e-05, | |
| "loss": 0.08437412977218628, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.382499345035368, | |
| "grad_norm": 0.11962983757257462, | |
| "learning_rate": 5.879989920154466e-05, | |
| "loss": 0.08409937620162963, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.392978779145926, | |
| "grad_norm": 0.09431737661361694, | |
| "learning_rate": 5.824293384987941e-05, | |
| "loss": 0.09504773020744324, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.4034582132564841, | |
| "grad_norm": 0.13824374973773956, | |
| "learning_rate": 5.768491508238188e-05, | |
| "loss": 0.09193333983421326, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.4139376473670422, | |
| "grad_norm": 0.10595858097076416, | |
| "learning_rate": 5.712591421174422e-05, | |
| "loss": 0.08976472616195678, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.4244170814776003, | |
| "grad_norm": 0.09911809861660004, | |
| "learning_rate": 5.6566002676167725e-05, | |
| "loss": 0.07597061395645141, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.4348965155881581, | |
| "grad_norm": 0.09723466634750366, | |
| "learning_rate": 5.60052520302332e-05, | |
| "loss": 0.10513757467269898, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.4453759496987162, | |
| "grad_norm": 0.11331687867641449, | |
| "learning_rate": 5.5443733935756615e-05, | |
| "loss": 0.09019948840141297, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.4558553838092743, | |
| "grad_norm": 0.13363589346408844, | |
| "learning_rate": 5.4881520152630886e-05, | |
| "loss": 0.08314153552055359, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.4663348179198323, | |
| "grad_norm": 0.14111892879009247, | |
| "learning_rate": 5.4318682529655404e-05, | |
| "loss": 0.07892010807991028, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.4768142520303904, | |
| "grad_norm": 0.13948485255241394, | |
| "learning_rate": 5.3755292995353913e-05, | |
| "loss": 0.0840128481388092, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.4872936861409483, | |
| "grad_norm": 0.12535949051380157, | |
| "learning_rate": 5.31914235487823e-05, | |
| "loss": 0.07869629859924317, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.4977731202515066, | |
| "grad_norm": 0.10041694343090057, | |
| "learning_rate": 5.2627146250327484e-05, | |
| "loss": 0.08074848055839538, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.5082525543620644, | |
| "grad_norm": 0.10112891346216202, | |
| "learning_rate": 5.2062533212498275e-05, | |
| "loss": 0.0860810935497284, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.5187319884726225, | |
| "grad_norm": 0.11297477036714554, | |
| "learning_rate": 5.149765659070973e-05, | |
| "loss": 0.08794642686843872, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.5292114225831805, | |
| "grad_norm": 0.10511091351509094, | |
| "learning_rate": 5.0932588574061945e-05, | |
| "loss": 0.07854819297790527, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.5396908566937384, | |
| "grad_norm": 0.09333530068397522, | |
| "learning_rate": 5.036740137611453e-05, | |
| "loss": 0.08821435570716858, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.5501702908042967, | |
| "grad_norm": 0.11480343341827393, | |
| "learning_rate": 4.980216722565804e-05, | |
| "loss": 0.08062278628349304, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.5606497249148545, | |
| "grad_norm": 0.08406255394220352, | |
| "learning_rate": 4.923695835748338e-05, | |
| "loss": 0.0940588355064392, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.5711291590254126, | |
| "grad_norm": 0.12927693128585815, | |
| "learning_rate": 4.8671847003150447e-05, | |
| "loss": 0.0775177538394928, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.5711291590254126, | |
| "eval_loss": 0.07877222448587418, | |
| "eval_runtime": 34.4389, | |
| "eval_samples_per_second": 9.03, | |
| "eval_steps_per_second": 9.03, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.5816085931359707, | |
| "grad_norm": 0.1255076378583908, | |
| "learning_rate": 4.810690538175728e-05, | |
| "loss": 0.09362970590591431, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.5920880272465285, | |
| "grad_norm": 0.1326853185892105, | |
| "learning_rate": 4.754220569071068e-05, | |
| "loss": 0.08364834189414978, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.6025674613570868, | |
| "grad_norm": 0.10229979455471039, | |
| "learning_rate": 4.697782009649962e-05, | |
| "loss": 0.0725843846797943, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.6130468954676447, | |
| "grad_norm": 0.11407258361577988, | |
| "learning_rate": 4.641382072547272e-05, | |
| "loss": 0.07566151022911072, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.6235263295782028, | |
| "grad_norm": 0.09398165345191956, | |
| "learning_rate": 4.585027965462075e-05, | |
| "loss": 0.087736576795578, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.6340057636887608, | |
| "grad_norm": 0.11289424449205399, | |
| "learning_rate": 4.528726890236544e-05, | |
| "loss": 0.08366051316261292, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.6444851977993187, | |
| "grad_norm": 0.09478718787431717, | |
| "learning_rate": 4.4724860419355746e-05, | |
| "loss": 0.0885531723499298, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.654964631909877, | |
| "grad_norm": 0.09163404256105423, | |
| "learning_rate": 4.416312607927295e-05, | |
| "loss": 0.08392030596733094, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.6654440660204348, | |
| "grad_norm": 0.11422222852706909, | |
| "learning_rate": 4.360213766964542e-05, | |
| "loss": 0.08059985041618348, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.675923500130993, | |
| "grad_norm": 0.08131479471921921, | |
| "learning_rate": 4.304196688267438e-05, | |
| "loss": 0.07613803148269653, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.686402934241551, | |
| "grad_norm": 0.09615079313516617, | |
| "learning_rate": 4.248268530607199e-05, | |
| "loss": 0.07764078378677368, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.696882368352109, | |
| "grad_norm": 0.09730526059865952, | |
| "learning_rate": 4.192436441391271e-05, | |
| "loss": 0.07644452452659607, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.707361802462667, | |
| "grad_norm": 0.09649327397346497, | |
| "learning_rate": 4.136707555749907e-05, | |
| "loss": 0.07866159081459045, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.717841236573225, | |
| "grad_norm": 0.11804413050413132, | |
| "learning_rate": 4.0810889956243415e-05, | |
| "loss": 0.06996130347251892, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.728320670683783, | |
| "grad_norm": 0.09874672442674637, | |
| "learning_rate": 4.025587868856622e-05, | |
| "loss": 0.07877404093742371, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.738800104794341, | |
| "grad_norm": 0.11149467527866364, | |
| "learning_rate": 3.9702112682812544e-05, | |
| "loss": 0.07241421341896057, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.7492795389048992, | |
| "grad_norm": 0.08748896420001984, | |
| "learning_rate": 3.914966270818766e-05, | |
| "loss": 0.07336459755897522, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.7597589730154573, | |
| "grad_norm": 0.1172696202993393, | |
| "learning_rate": 3.859859936571307e-05, | |
| "loss": 0.07742337584495544, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.770238407126015, | |
| "grad_norm": 0.0719197615981102, | |
| "learning_rate": 3.8048993079203925e-05, | |
| "loss": 0.06242966651916504, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.7807178412365732, | |
| "grad_norm": 0.12380168586969376, | |
| "learning_rate": 3.750091408626907e-05, | |
| "loss": 0.07270430326461792, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.7911972753471312, | |
| "grad_norm": 0.1587221622467041, | |
| "learning_rate": 3.6954432429335015e-05, | |
| "loss": 0.06409866213798524, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.8016767094576893, | |
| "grad_norm": 0.10983912646770477, | |
| "learning_rate": 3.640961794669482e-05, | |
| "loss": 0.06610031127929687, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.8121561435682474, | |
| "grad_norm": 0.11023026704788208, | |
| "learning_rate": 3.586654026358287e-05, | |
| "loss": 0.06866579055786133, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 1.8226355776788052, | |
| "grad_norm": 0.11857719719409943, | |
| "learning_rate": 3.532526878327719e-05, | |
| "loss": 0.06734356880187989, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.8331150117893635, | |
| "grad_norm": 0.09280339628458023, | |
| "learning_rate": 3.478587267822987e-05, | |
| "loss": 0.06897796392440796, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.8331150117893635, | |
| "eval_loss": 0.06596127897500992, | |
| "eval_runtime": 35.5001, | |
| "eval_samples_per_second": 8.761, | |
| "eval_steps_per_second": 8.761, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.8435944458999214, | |
| "grad_norm": 0.1175367683172226, | |
| "learning_rate": 3.424842088122716e-05, | |
| "loss": 0.08288194537162781, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.8540738800104795, | |
| "grad_norm": 0.10271462798118591, | |
| "learning_rate": 3.371298207658003e-05, | |
| "loss": 0.05643013119697571, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.8645533141210375, | |
| "grad_norm": 0.11965195834636688, | |
| "learning_rate": 3.3179624691346654e-05, | |
| "loss": 0.07403092980384826, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.8750327482315954, | |
| "grad_norm": 0.09981680661439896, | |
| "learning_rate": 3.2648416886587686e-05, | |
| "loss": 0.07118859887123108, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 1.8855121823421537, | |
| "grad_norm": 0.07787375897169113, | |
| "learning_rate": 3.2119426548655435e-05, | |
| "loss": 0.07219682335853576, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.8959916164527115, | |
| "grad_norm": 0.1303507387638092, | |
| "learning_rate": 3.1592721280518404e-05, | |
| "loss": 0.07636030912399291, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.9064710505632696, | |
| "grad_norm": 0.09162267297506332, | |
| "learning_rate": 3.106836839312175e-05, | |
| "loss": 0.06230143308639526, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.9169504846738277, | |
| "grad_norm": 0.11375878751277924, | |
| "learning_rate": 3.054643489678526e-05, | |
| "loss": 0.060506826639175414, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.9274299187843855, | |
| "grad_norm": 0.1377716213464737, | |
| "learning_rate": 3.0026987492639668e-05, | |
| "loss": 0.08148540854454041, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.9379093528949438, | |
| "grad_norm": 0.10483554750680923, | |
| "learning_rate": 2.951009256410255e-05, | |
| "loss": 0.07040726542472839, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.9483887870055017, | |
| "grad_norm": 0.08736151456832886, | |
| "learning_rate": 2.8995816168394702e-05, | |
| "loss": 0.04931557774543762, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.9588682211160597, | |
| "grad_norm": 0.11461569368839264, | |
| "learning_rate": 2.848422402809828e-05, | |
| "loss": 0.057559752464294435, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.9693476552266178, | |
| "grad_norm": 0.09060918539762497, | |
| "learning_rate": 2.7975381522757803e-05, | |
| "loss": 0.06379705667495728, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.9798270893371757, | |
| "grad_norm": 0.07104971259832382, | |
| "learning_rate": 2.746935368052477e-05, | |
| "loss": 0.05813115239143372, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.990306523447734, | |
| "grad_norm": 0.10802938044071198, | |
| "learning_rate": 2.696620516984733e-05, | |
| "loss": 0.07732833027839661, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.16884952783584595, | |
| "learning_rate": 2.6466000291206004e-05, | |
| "loss": 0.06166202425956726, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 2.010479434110558, | |
| "grad_norm": 0.08582179993391037, | |
| "learning_rate": 2.5968802968896228e-05, | |
| "loss": 0.04766199886798859, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 2.020958868221116, | |
| "grad_norm": 0.1457364708185196, | |
| "learning_rate": 2.5474676742859048e-05, | |
| "loss": 0.03826354146003723, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 2.031438302331674, | |
| "grad_norm": 0.09275342524051666, | |
| "learning_rate": 2.4983684760561023e-05, | |
| "loss": 0.045059433579444884, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 2.0419177364422323, | |
| "grad_norm": 0.09085927903652191, | |
| "learning_rate": 2.44958897689242e-05, | |
| "loss": 0.04904903173446655, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 2.05239717055279, | |
| "grad_norm": 0.11733179539442062, | |
| "learning_rate": 2.401135410630731e-05, | |
| "loss": 0.05008396506309509, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 2.062876604663348, | |
| "grad_norm": 0.0894237607717514, | |
| "learning_rate": 2.3530139694539095e-05, | |
| "loss": 0.04057626128196716, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 2.0733560387739063, | |
| "grad_norm": 0.08560927212238312, | |
| "learning_rate": 2.305230803100496e-05, | |
| "loss": 0.04843136668205261, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 2.083835472884464, | |
| "grad_norm": 0.07991836220026016, | |
| "learning_rate": 2.257792018078793e-05, | |
| "loss": 0.0544127106666565, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 2.0943149069950224, | |
| "grad_norm": 0.08846250921487808, | |
| "learning_rate": 2.210703676886461e-05, | |
| "loss": 0.0459000825881958, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.0943149069950224, | |
| "eval_loss": 0.060011014342308044, | |
| "eval_runtime": 36.3755, | |
| "eval_samples_per_second": 8.55, | |
| "eval_steps_per_second": 8.55, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.1047943411055803, | |
| "grad_norm": 0.10082945972681046, | |
| "learning_rate": 2.1639717972357678e-05, | |
| "loss": 0.038090622425079344, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 2.115273775216138, | |
| "grad_norm": 0.05712248757481575, | |
| "learning_rate": 2.1176023512845376e-05, | |
| "loss": 0.04598597884178161, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 2.1257532093266964, | |
| "grad_norm": 0.11628362536430359, | |
| "learning_rate": 2.0716012648729353e-05, | |
| "loss": 0.04984880685806274, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 2.1362326434372543, | |
| "grad_norm": 0.10635484755039215, | |
| "learning_rate": 2.025974416766171e-05, | |
| "loss": 0.04293925166130066, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 2.1467120775478126, | |
| "grad_norm": 0.1017381027340889, | |
| "learning_rate": 1.9807276379032113e-05, | |
| "loss": 0.04305694401264191, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 2.1571915116583704, | |
| "grad_norm": 0.13550882041454315, | |
| "learning_rate": 1.9358667106516055e-05, | |
| "loss": 0.04478869140148163, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 2.1676709457689283, | |
| "grad_norm": 0.08526366949081421, | |
| "learning_rate": 1.8913973680685226e-05, | |
| "loss": 0.036646312475204466, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 2.1781503798794866, | |
| "grad_norm": 0.10932011157274246, | |
| "learning_rate": 1.8473252931680928e-05, | |
| "loss": 0.042200219631195066, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 2.1886298139900444, | |
| "grad_norm": 0.08768360316753387, | |
| "learning_rate": 1.803656118195136e-05, | |
| "loss": 0.0437488317489624, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 2.1991092481006027, | |
| "grad_norm": 0.08362651616334915, | |
| "learning_rate": 1.760395423905379e-05, | |
| "loss": 0.04669668078422547, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.2095886822111606, | |
| "grad_norm": 0.08554034680128098, | |
| "learning_rate": 1.7175487388522588e-05, | |
| "loss": 0.034989356994628906, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 2.220068116321719, | |
| "grad_norm": 0.08215561509132385, | |
| "learning_rate": 1.6751215386803986e-05, | |
| "loss": 0.040298929810523985, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 2.2305475504322767, | |
| "grad_norm": 0.0840689167380333, | |
| "learning_rate": 1.6331192454258337e-05, | |
| "loss": 0.041704925894737246, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 2.2410269845428346, | |
| "grad_norm": 0.06530614197254181, | |
| "learning_rate": 1.5915472268231018e-05, | |
| "loss": 0.03651900887489319, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 2.251506418653393, | |
| "grad_norm": 0.12431822717189789, | |
| "learning_rate": 1.550410795619261e-05, | |
| "loss": 0.04806804955005646, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 2.2619858527639507, | |
| "grad_norm": 0.09592410176992416, | |
| "learning_rate": 1.509715208894949e-05, | |
| "loss": 0.0454313725233078, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 2.2724652868745085, | |
| "grad_norm": 0.07589780539274216, | |
| "learning_rate": 1.469465667392536e-05, | |
| "loss": 0.03574602603912354, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 2.282944720985067, | |
| "grad_norm": 0.09734483063220978, | |
| "learning_rate": 1.4296673148515038e-05, | |
| "loss": 0.04358702301979065, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 2.2934241550956247, | |
| "grad_norm": 0.0974339172244072, | |
| "learning_rate": 1.3903252373510838e-05, | |
| "loss": 0.04603351950645447, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 2.303903589206183, | |
| "grad_norm": 0.09025271981954575, | |
| "learning_rate": 1.3514444626602773e-05, | |
| "loss": 0.040065237879753114, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.314383023316741, | |
| "grad_norm": 0.07625086605548859, | |
| "learning_rate": 1.3130299595953338e-05, | |
| "loss": 0.044061675667762756, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 2.324862457427299, | |
| "grad_norm": 0.07306221127510071, | |
| "learning_rate": 1.2750866373847465e-05, | |
| "loss": 0.03366467654705048, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 2.335341891537857, | |
| "grad_norm": 0.08357638120651245, | |
| "learning_rate": 1.2376193450418715e-05, | |
| "loss": 0.041424044966697694, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 2.345821325648415, | |
| "grad_norm": 0.09153921157121658, | |
| "learning_rate": 1.2006328707452459e-05, | |
| "loss": 0.03938372135162353, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 2.356300759758973, | |
| "grad_norm": 0.09109660983085632, | |
| "learning_rate": 1.1641319412266765e-05, | |
| "loss": 0.04015985131263733, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 2.356300759758973, | |
| "eval_loss": 0.05486458167433739, | |
| "eval_runtime": 36.8119, | |
| "eval_samples_per_second": 8.448, | |
| "eval_steps_per_second": 8.448, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 2.366780193869531, | |
| "grad_norm": 0.052502721548080444, | |
| "learning_rate": 1.1281212211671822e-05, | |
| "loss": 0.0270554780960083, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 2.377259627980089, | |
| "grad_norm": 0.07931812107563019, | |
| "learning_rate": 1.0926053126008584e-05, | |
| "loss": 0.0417300134897232, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 2.387739062090647, | |
| "grad_norm": 0.08996254205703735, | |
| "learning_rate": 1.0575887543267609e-05, | |
| "loss": 0.037659955024719236, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 2.398218496201205, | |
| "grad_norm": 0.08800788223743439, | |
| "learning_rate": 1.023076021328867e-05, | |
| "loss": 0.048437944054603575, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 2.4086979303117633, | |
| "grad_norm": 0.10572271049022675, | |
| "learning_rate": 9.890715242041787e-06, | |
| "loss": 0.04166909456253052, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.419177364422321, | |
| "grad_norm": 0.10573071986436844, | |
| "learning_rate": 9.555796085990781e-06, | |
| "loss": 0.03919607996940613, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 2.4296567985328794, | |
| "grad_norm": 0.09714583307504654, | |
| "learning_rate": 9.226045546539608e-06, | |
| "loss": 0.03530588150024414, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 2.4401362326434373, | |
| "grad_norm": 0.09436199069023132, | |
| "learning_rate": 8.901505764562518e-06, | |
| "loss": 0.05111382007598877, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 2.450615666753995, | |
| "grad_norm": 0.06353961676359177, | |
| "learning_rate": 8.582218215018656e-06, | |
| "loss": 0.03805697858333588, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 2.4610951008645534, | |
| "grad_norm": 0.08853815495967865, | |
| "learning_rate": 8.268223701651684e-06, | |
| "loss": 0.04815975427627563, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 2.4715745349751113, | |
| "grad_norm": 0.07472016662359238, | |
| "learning_rate": 7.959562351775196e-06, | |
| "loss": 0.042247459292411804, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 2.4820539690856696, | |
| "grad_norm": 0.12121549248695374, | |
| "learning_rate": 7.656273611144632e-06, | |
| "loss": 0.040102115273475646, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 2.4925334031962274, | |
| "grad_norm": 0.08667747676372528, | |
| "learning_rate": 7.358396238916254e-06, | |
| "loss": 0.03656341433525086, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 2.5030128373067857, | |
| "grad_norm": 0.1162872165441513, | |
| "learning_rate": 7.065968302693882e-06, | |
| "loss": 0.04052766263484955, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 2.5134922714173435, | |
| "grad_norm": 0.07924140989780426, | |
| "learning_rate": 6.7790271736639595e-06, | |
| "loss": 0.03394221067428589, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.5239717055279014, | |
| "grad_norm": 0.09523408859968185, | |
| "learning_rate": 6.497609521819681e-06, | |
| "loss": 0.04119439423084259, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 2.5344511396384597, | |
| "grad_norm": 0.12182598561048508, | |
| "learning_rate": 6.221751311274731e-06, | |
| "loss": 0.05154783725738525, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 2.5449305737490175, | |
| "grad_norm": 0.09359873831272125, | |
| "learning_rate": 5.951487795667149e-06, | |
| "loss": 0.035483264923095705, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 2.5554100078595754, | |
| "grad_norm": 0.08514095097780228, | |
| "learning_rate": 5.686853513654117e-06, | |
| "loss": 0.03830339312553406, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 2.5658894419701337, | |
| "grad_norm": 0.10625084489583969, | |
| "learning_rate": 5.4278822844979705e-06, | |
| "loss": 0.034111028909683226, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 2.5763688760806915, | |
| "grad_norm": 0.1004003956913948, | |
| "learning_rate": 5.174607203744286e-06, | |
| "loss": 0.04465605318546295, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 2.58684831019125, | |
| "grad_norm": 0.0962519720196724, | |
| "learning_rate": 4.927060638992382e-06, | |
| "loss": 0.041056016087532045, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 2.5973277443018077, | |
| "grad_norm": 0.06380607187747955, | |
| "learning_rate": 4.685274225758846e-06, | |
| "loss": 0.03880062401294708, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 2.607807178412366, | |
| "grad_norm": 0.07326535880565643, | |
| "learning_rate": 4.449278863434647e-06, | |
| "loss": 0.03194461762905121, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 2.618286612522924, | |
| "grad_norm": 0.12218596786260605, | |
| "learning_rate": 4.2191047113362854e-06, | |
| "loss": 0.04258840978145599, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.618286612522924, | |
| "eval_loss": 0.05223666876554489, | |
| "eval_runtime": 37.7234, | |
| "eval_samples_per_second": 8.244, | |
| "eval_steps_per_second": 8.244, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.6287660466334817, | |
| "grad_norm": 0.08594664931297302, | |
| "learning_rate": 3.994781184851598e-06, | |
| "loss": 0.04302787780761719, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 2.63924548074404, | |
| "grad_norm": 0.08187596499919891, | |
| "learning_rate": 3.776336951680548e-06, | |
| "loss": 0.0341387003660202, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 2.649724914854598, | |
| "grad_norm": 0.10216796398162842, | |
| "learning_rate": 3.563799928171596e-06, | |
| "loss": 0.04289879500865936, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 2.6602043489651557, | |
| "grad_norm": 0.11215174198150635, | |
| "learning_rate": 3.3571972757540814e-06, | |
| "loss": 0.04055049121379852, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 2.670683783075714, | |
| "grad_norm": 0.07941269129514694, | |
| "learning_rate": 3.156555397467176e-06, | |
| "loss": 0.04118689000606537, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 2.681163217186272, | |
| "grad_norm": 0.09404437988996506, | |
| "learning_rate": 2.9618999345855547e-06, | |
| "loss": 0.03079705536365509, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 2.69164265129683, | |
| "grad_norm": 0.1109817698597908, | |
| "learning_rate": 2.773255763342647e-06, | |
| "loss": 0.038885954022407535, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 2.702122085407388, | |
| "grad_norm": 0.09431962668895721, | |
| "learning_rate": 2.590646991751472e-06, | |
| "loss": 0.043543145060539246, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 2.7126015195179463, | |
| "grad_norm": 0.08184763044118881, | |
| "learning_rate": 2.414096956523776e-06, | |
| "loss": 0.03256987631320953, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 2.723080953628504, | |
| "grad_norm": 0.08390141278505325, | |
| "learning_rate": 2.2436282200876458e-06, | |
| "loss": 0.03908055424690247, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.733560387739062, | |
| "grad_norm": 0.0762532502412796, | |
| "learning_rate": 2.07926256770416e-06, | |
| "loss": 0.04899201393127441, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 2.7440398218496203, | |
| "grad_norm": 0.08239631354808807, | |
| "learning_rate": 1.9210210046832768e-06, | |
| "loss": 0.048707082867622375, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 2.754519255960178, | |
| "grad_norm": 0.09619107842445374, | |
| "learning_rate": 1.7689237536994364e-06, | |
| "loss": 0.0372231125831604, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 2.764998690070736, | |
| "grad_norm": 0.07099667191505432, | |
| "learning_rate": 1.6229902522072293e-06, | |
| "loss": 0.03421170711517334, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 2.7754781241812942, | |
| "grad_norm": 0.10154753923416138, | |
| "learning_rate": 1.4832391499572996e-06, | |
| "loss": 0.03656705319881439, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 2.785957558291852, | |
| "grad_norm": 0.09349387139081955, | |
| "learning_rate": 1.3496883066130173e-06, | |
| "loss": 0.03710306882858276, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 2.7964369924024104, | |
| "grad_norm": 0.061091430485248566, | |
| "learning_rate": 1.2223547894680443e-06, | |
| "loss": 0.0308389812707901, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 2.8069164265129682, | |
| "grad_norm": 0.09838075935840607, | |
| "learning_rate": 1.101254871265256e-06, | |
| "loss": 0.03703555166721344, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 2.8173958606235265, | |
| "grad_norm": 0.10046928375959396, | |
| "learning_rate": 9.864040281170938e-07, | |
| "loss": 0.04500553905963898, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 2.8278752947340844, | |
| "grad_norm": 0.06770773977041245, | |
| "learning_rate": 8.778169375277978e-07, | |
| "loss": 0.03823737502098083, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 2.8383547288446422, | |
| "grad_norm": 0.08373535424470901, | |
| "learning_rate": 7.755074765176618e-07, | |
| "loss": 0.03961678743362427, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 2.8488341629552005, | |
| "grad_norm": 0.07590050995349884, | |
| "learning_rate": 6.794887198496413e-07, | |
| "loss": 0.03221273124217987, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 2.8593135970657584, | |
| "grad_norm": 0.08507678657770157, | |
| "learning_rate": 5.897729383583906e-07, | |
| "loss": 0.04571912884712219, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 2.8697930311763162, | |
| "grad_norm": 0.06584763526916504, | |
| "learning_rate": 5.063715973821659e-07, | |
| "loss": 0.03794914484024048, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 2.8802724652868745, | |
| "grad_norm": 0.07312892377376556, | |
| "learning_rate": 4.292953552975154e-07, | |
| "loss": 0.036365586519241336, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 2.8802724652868745, | |
| "eval_loss": 0.05090421438217163, | |
| "eval_runtime": 85.293, | |
| "eval_samples_per_second": 3.646, | |
| "eval_steps_per_second": 3.646, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 2.8907518993974324, | |
| "grad_norm": 0.08459606021642685, | |
| "learning_rate": 3.5855406215725697e-07, | |
| "loss": 0.03068857192993164, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 2.9012313335079907, | |
| "grad_norm": 0.06866376101970673, | |
| "learning_rate": 2.9415675843163515e-07, | |
| "loss": 0.03265829384326935, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 2.9117107676185485, | |
| "grad_norm": 0.09082643687725067, | |
| "learning_rate": 2.361116738529956e-07, | |
| "loss": 0.03418546915054321, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 2.922190201729107, | |
| "grad_norm": 0.10772739350795746, | |
| "learning_rate": 1.8442622636404284e-07, | |
| "loss": 0.03810786008834839, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 2.9326696358396647, | |
| "grad_norm": 0.08321297913789749, | |
| "learning_rate": 1.391070211698764e-07, | |
| "loss": 0.04068491756916046, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.9431490699502225, | |
| "grad_norm": 0.11239277571439743, | |
| "learning_rate": 1.0015984989385496e-07, | |
| "loss": 0.041029155254364014, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 2.953628504060781, | |
| "grad_norm": 0.07199843227863312, | |
| "learning_rate": 6.758968983747171e-08, | |
| "loss": 0.037902483344078065, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 2.9641079381713387, | |
| "grad_norm": 0.08249279856681824, | |
| "learning_rate": 4.140070334422985e-08, | |
| "loss": 0.03996126651763916, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 2.9745873722818965, | |
| "grad_norm": 0.0852220207452774, | |
| "learning_rate": 2.1596237267751396e-08, | |
| "loss": 0.04228667616844177, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 2.985066806392455, | |
| "grad_norm": 0.0858582928776741, | |
| "learning_rate": 8.178822544052666e-09, | |
| "loss": 0.03813594281673431, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 2.995546240503013, | |
| "grad_norm": 0.06642451137304306, | |
| "learning_rate": 1.1501738680919084e-09, | |
| "loss": 0.033472076058387756, | |
| "step": 2860 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 2865, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 250, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 9.031737271887514e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |