Text Generation
PEFT
Safetensors
Transformers
qwen2
axolotl
lora
conversational
text-generation-inference
Instructions to use felixwangg/Qwen2.5-Coder-7B-evol-66k-stage1 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use felixwangg/Qwen2.5-Coder-7B-evol-66k-stage1 with PEFT:
```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Coder-7B-Instruct")
model = PeftModel.from_pretrained(base_model, "felixwangg/Qwen2.5-Coder-7B-evol-66k-stage1")
```
- Transformers
How to use felixwangg/Qwen2.5-Coder-7B-evol-66k-stage1 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="felixwangg/Qwen2.5-Coder-7B-evol-66k-stage1")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```
```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("felixwangg/Qwen2.5-Coder-7B-evol-66k-stage1")
model = AutoModelForCausalLM.from_pretrained("felixwangg/Qwen2.5-Coder-7B-evol-66k-stage1")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use felixwangg/Qwen2.5-Coder-7B-evol-66k-stage1 with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "felixwangg/Qwen2.5-Coder-7B-evol-66k-stage1"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "felixwangg/Qwen2.5-Coder-7B-evol-66k-stage1",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker
docker model run hf.co/felixwangg/Qwen2.5-Coder-7B-evol-66k-stage1
- SGLang
How to use felixwangg/Qwen2.5-Coder-7B-evol-66k-stage1 with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "felixwangg/Qwen2.5-Coder-7B-evol-66k-stage1" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "felixwangg/Qwen2.5-Coder-7B-evol-66k-stage1",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "felixwangg/Qwen2.5-Coder-7B-evol-66k-stage1" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "felixwangg/Qwen2.5-Coder-7B-evol-66k-stage1",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
- Docker Model Runner
How to use felixwangg/Qwen2.5-Coder-7B-evol-66k-stage1 with Docker Model Runner:
docker model run hf.co/felixwangg/Qwen2.5-Coder-7B-evol-66k-stage1
| { | |
| "best_global_step": 750, | |
| "best_metric": 0.7697240710258484, | |
| "best_model_checkpoint": "/home/tkwang/scratch/SecSteer-v2/axolotl-outputs/lora/Qwen2.5-Coder-7B-evol-stage1/checkpoint-750", | |
| "epoch": 0.7976601967561818, | |
| "eval_steps": 150, | |
| "global_step": 750, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0, | |
| "eval_loss": 0.804951012134552, | |
| "eval_ppl": 2.23659, | |
| "eval_runtime": 237.4236, | |
| "eval_samples_per_second": 28.161, | |
| "eval_steps_per_second": 1.761, | |
| "memory/device_reserved (GiB)": 41.82, | |
| "memory/max_active (GiB)": 37.85, | |
| "memory/max_allocated (GiB)": 37.85, | |
| "step": 0 | |
| }, | |
| { | |
| "epoch": 0.0010635469290082426, | |
| "grad_norm": 0.0461450070142746, | |
| "learning_rate": 0.0, | |
| "loss": 0.7683508396148682, | |
| "memory/device_reserved (GiB)": 51.3, | |
| "memory/max_active (GiB)": 45.83, | |
| "memory/max_allocated (GiB)": 45.83, | |
| "ppl": 2.15621, | |
| "step": 1, | |
| "tokens/total": 262144, | |
| "tokens/train_per_sec_per_gpu": 143.71, | |
| "tokens/trainable": 21968 | |
| }, | |
| { | |
| "epoch": 0.002127093858016485, | |
| "grad_norm": 0.053678449243307114, | |
| "learning_rate": 4.2553191489361704e-07, | |
| "loss": 0.8233645558357239, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.27815, | |
| "step": 2, | |
| "tokens/total": 524288, | |
| "tokens/train_per_sec_per_gpu": 206.75, | |
| "tokens/trainable": 44841 | |
| }, | |
| { | |
| "epoch": 0.0031906407870247273, | |
| "grad_norm": 0.05005470663309097, | |
| "learning_rate": 8.510638297872341e-07, | |
| "loss": 0.8364737033843994, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.30821, | |
| "step": 3, | |
| "tokens/total": 786432, | |
| "tokens/train_per_sec_per_gpu": 190.31, | |
| "tokens/trainable": 67815 | |
| }, | |
| { | |
| "epoch": 0.00425418771603297, | |
| "grad_norm": 0.04871873930096626, | |
| "learning_rate": 1.276595744680851e-06, | |
| "loss": 0.7923524975776672, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.20859, | |
| "step": 4, | |
| "tokens/total": 1048576, | |
| "tokens/train_per_sec_per_gpu": 172.64, | |
| "tokens/trainable": 89239 | |
| }, | |
| { | |
| "epoch": 0.0053177346450412125, | |
| "grad_norm": 0.052344731986522675, | |
| "learning_rate": 1.7021276595744682e-06, | |
| "loss": 0.7642413973808289, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.14736, | |
| "step": 5, | |
| "tokens/total": 1310720, | |
| "tokens/train_per_sec_per_gpu": 185.1, | |
| "tokens/trainable": 109336 | |
| }, | |
| { | |
| "epoch": 0.006381281574049455, | |
| "grad_norm": 0.04826882481575012, | |
| "learning_rate": 2.1276595744680853e-06, | |
| "loss": 0.8659416437149048, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.37724, | |
| "step": 6, | |
| "tokens/total": 1572864, | |
| "tokens/train_per_sec_per_gpu": 183.2, | |
| "tokens/trainable": 130625 | |
| }, | |
| { | |
| "epoch": 0.007444828503057698, | |
| "grad_norm": 0.05040327087044716, | |
| "learning_rate": 2.553191489361702e-06, | |
| "loss": 0.8249338865280151, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.28173, | |
| "step": 7, | |
| "tokens/total": 1835008, | |
| "tokens/train_per_sec_per_gpu": 209.68, | |
| "tokens/trainable": 153061 | |
| }, | |
| { | |
| "epoch": 0.00850837543206594, | |
| "grad_norm": 0.05111980810761452, | |
| "learning_rate": 2.978723404255319e-06, | |
| "loss": 0.7967497706413269, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.21832, | |
| "step": 8, | |
| "tokens/total": 2097152, | |
| "tokens/train_per_sec_per_gpu": 185.83, | |
| "tokens/trainable": 173852 | |
| }, | |
| { | |
| "epoch": 0.009571922361074183, | |
| "grad_norm": 0.04268274083733559, | |
| "learning_rate": 3.4042553191489363e-06, | |
| "loss": 0.7484039068222046, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11362, | |
| "step": 9, | |
| "tokens/total": 2359296, | |
| "tokens/train_per_sec_per_gpu": 214.69, | |
| "tokens/trainable": 196255 | |
| }, | |
| { | |
| "epoch": 0.010635469290082425, | |
| "grad_norm": 0.04418900981545448, | |
| "learning_rate": 3.8297872340425535e-06, | |
| "loss": 0.8072069883346558, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.24164, | |
| "step": 10, | |
| "tokens/total": 2621440, | |
| "tokens/train_per_sec_per_gpu": 168.89, | |
| "tokens/trainable": 217227 | |
| }, | |
| { | |
| "epoch": 0.011699016219090667, | |
| "grad_norm": 0.04787033051252365, | |
| "learning_rate": 4.255319148936171e-06, | |
| "loss": 0.7684656381607056, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15645, | |
| "step": 11, | |
| "tokens/total": 2883584, | |
| "tokens/train_per_sec_per_gpu": 187.36, | |
| "tokens/trainable": 240990 | |
| }, | |
| { | |
| "epoch": 0.01276256314809891, | |
| "grad_norm": 0.04942560940980911, | |
| "learning_rate": 4.680851063829788e-06, | |
| "loss": 0.8147498369216919, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.25861, | |
| "step": 12, | |
| "tokens/total": 3145728, | |
| "tokens/train_per_sec_per_gpu": 146.78, | |
| "tokens/trainable": 263455 | |
| }, | |
| { | |
| "epoch": 0.013826110077107153, | |
| "grad_norm": 0.05288751795887947, | |
| "learning_rate": 5.106382978723404e-06, | |
| "loss": 0.8472910523414612, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.33332, | |
| "step": 13, | |
| "tokens/total": 3407872, | |
| "tokens/train_per_sec_per_gpu": 179.19, | |
| "tokens/trainable": 286052 | |
| }, | |
| { | |
| "epoch": 0.014889657006115395, | |
| "grad_norm": 0.05669346824288368, | |
| "learning_rate": 5.531914893617022e-06, | |
| "loss": 0.7845062017440796, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19132, | |
| "step": 14, | |
| "tokens/total": 3670016, | |
| "tokens/train_per_sec_per_gpu": 147.62, | |
| "tokens/trainable": 305257 | |
| }, | |
| { | |
| "epoch": 0.015953203935123637, | |
| "grad_norm": 0.04507856070995331, | |
| "learning_rate": 5.957446808510638e-06, | |
| "loss": 0.7491350769996643, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11517, | |
| "step": 15, | |
| "tokens/total": 3932160, | |
| "tokens/train_per_sec_per_gpu": 254.85, | |
| "tokens/trainable": 328628 | |
| }, | |
| { | |
| "epoch": 0.01701675086413188, | |
| "grad_norm": 0.04711790010333061, | |
| "learning_rate": 6.382978723404256e-06, | |
| "loss": 0.7844012975692749, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19109, | |
| "step": 16, | |
| "tokens/total": 4194304, | |
| "tokens/train_per_sec_per_gpu": 210.21, | |
| "tokens/trainable": 351162 | |
| }, | |
| { | |
| "epoch": 0.01808029779314012, | |
| "grad_norm": 0.04914192110300064, | |
| "learning_rate": 6.808510638297873e-06, | |
| "loss": 0.8149253129959106, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.25901, | |
| "step": 17, | |
| "tokens/total": 4456448, | |
| "tokens/train_per_sec_per_gpu": 172.85, | |
| "tokens/trainable": 373818 | |
| }, | |
| { | |
| "epoch": 0.019143844722148366, | |
| "grad_norm": 0.057912107557058334, | |
| "learning_rate": 7.234042553191491e-06, | |
| "loss": 0.7840430736541748, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19031, | |
| "step": 18, | |
| "tokens/total": 4718592, | |
| "tokens/train_per_sec_per_gpu": 154.54, | |
| "tokens/trainable": 393528 | |
| }, | |
| { | |
| "epoch": 0.020207391651156606, | |
| "grad_norm": 0.05160650238394737, | |
| "learning_rate": 7.659574468085107e-06, | |
| "loss": 0.8152034282684326, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.25964, | |
| "step": 19, | |
| "tokens/total": 4980736, | |
| "tokens/train_per_sec_per_gpu": 188.44, | |
| "tokens/trainable": 415305 | |
| }, | |
| { | |
| "epoch": 0.02127093858016485, | |
| "grad_norm": 0.052523426711559296, | |
| "learning_rate": 8.085106382978723e-06, | |
| "loss": 0.816782534122467, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.26321, | |
| "step": 20, | |
| "tokens/total": 5242880, | |
| "tokens/train_per_sec_per_gpu": 223.4, | |
| "tokens/trainable": 438205 | |
| }, | |
| { | |
| "epoch": 0.022334485509173094, | |
| "grad_norm": 0.06776182353496552, | |
| "learning_rate": 8.510638297872341e-06, | |
| "loss": 0.8410882949829102, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.31889, | |
| "step": 21, | |
| "tokens/total": 5505024, | |
| "tokens/train_per_sec_per_gpu": 177.12, | |
| "tokens/trainable": 457521 | |
| }, | |
| { | |
| "epoch": 0.023398032438181334, | |
| "grad_norm": 0.05495529994368553, | |
| "learning_rate": 8.936170212765958e-06, | |
| "loss": 0.7947180271148682, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.21382, | |
| "step": 22, | |
| "tokens/total": 5767168, | |
| "tokens/train_per_sec_per_gpu": 175.57, | |
| "tokens/trainable": 478475 | |
| }, | |
| { | |
| "epoch": 0.024461579367189578, | |
| "grad_norm": 0.061899591237306595, | |
| "learning_rate": 9.361702127659576e-06, | |
| "loss": 0.8852798342704773, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.42366, | |
| "step": 23, | |
| "tokens/total": 6029312, | |
| "tokens/train_per_sec_per_gpu": 142.65, | |
| "tokens/trainable": 498659 | |
| }, | |
| { | |
| "epoch": 0.02552512629619782, | |
| "grad_norm": 0.0498763844370842, | |
| "learning_rate": 9.787234042553192e-06, | |
| "loss": 0.672831654548645, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 1.95978, | |
| "step": 24, | |
| "tokens/total": 6291456, | |
| "tokens/train_per_sec_per_gpu": 214.6, | |
| "tokens/trainable": 521341 | |
| }, | |
| { | |
| "epoch": 0.026588673225206062, | |
| "grad_norm": 0.056778695434331894, | |
| "learning_rate": 1.0212765957446808e-05, | |
| "loss": 0.7445163726806641, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10542, | |
| "step": 25, | |
| "tokens/total": 6553600, | |
| "tokens/train_per_sec_per_gpu": 191.7, | |
| "tokens/trainable": 542902 | |
| }, | |
| { | |
| "epoch": 0.027652220154214306, | |
| "grad_norm": 0.05652826279401779, | |
| "learning_rate": 1.0638297872340426e-05, | |
| "loss": 0.7965201139450073, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.21781, | |
| "step": 26, | |
| "tokens/total": 6815744, | |
| "tokens/train_per_sec_per_gpu": 173.66, | |
| "tokens/trainable": 565280 | |
| }, | |
| { | |
| "epoch": 0.028715767083222547, | |
| "grad_norm": 0.05963267385959625, | |
| "learning_rate": 1.1063829787234044e-05, | |
| "loss": 0.8641867637634277, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.37308, | |
| "step": 27, | |
| "tokens/total": 7077888, | |
| "tokens/train_per_sec_per_gpu": 197.23, | |
| "tokens/trainable": 587539 | |
| }, | |
| { | |
| "epoch": 0.02977931401223079, | |
| "grad_norm": 0.06136506423354149, | |
| "learning_rate": 1.1489361702127662e-05, | |
| "loss": 0.8187180161476135, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.26759, | |
| "step": 28, | |
| "tokens/total": 7340032, | |
| "tokens/train_per_sec_per_gpu": 222.81, | |
| "tokens/trainable": 609839 | |
| }, | |
| { | |
| "epoch": 0.03084286094123903, | |
| "grad_norm": 0.0565866194665432, | |
| "learning_rate": 1.1914893617021277e-05, | |
| "loss": 0.8024689555168152, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.23104, | |
| "step": 29, | |
| "tokens/total": 7602176, | |
| "tokens/train_per_sec_per_gpu": 204.38, | |
| "tokens/trainable": 631280 | |
| }, | |
| { | |
| "epoch": 0.031906407870247275, | |
| "grad_norm": 0.06343540549278259, | |
| "learning_rate": 1.2340425531914895e-05, | |
| "loss": 0.7662019729614258, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15158, | |
| "step": 30, | |
| "tokens/total": 7864320, | |
| "tokens/train_per_sec_per_gpu": 228.13, | |
| "tokens/trainable": 652766 | |
| }, | |
| { | |
| "epoch": 0.03296995479925552, | |
| "grad_norm": 0.056531310081481934, | |
| "learning_rate": 1.2765957446808513e-05, | |
| "loss": 0.8442375659942627, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.3262, | |
| "step": 31, | |
| "tokens/total": 8126464, | |
| "tokens/train_per_sec_per_gpu": 190.64, | |
| "tokens/trainable": 674612 | |
| }, | |
| { | |
| "epoch": 0.03403350172826376, | |
| "grad_norm": 0.06304491311311722, | |
| "learning_rate": 1.3191489361702127e-05, | |
| "loss": 0.8129785060882568, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.25461, | |
| "step": 32, | |
| "tokens/total": 8388608, | |
| "tokens/train_per_sec_per_gpu": 214.77, | |
| "tokens/trainable": 695779 | |
| }, | |
| { | |
| "epoch": 0.035097048657272, | |
| "grad_norm": 0.05908438190817833, | |
| "learning_rate": 1.3617021276595745e-05, | |
| "loss": 0.7475928068161011, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11191, | |
| "step": 33, | |
| "tokens/total": 8650752, | |
| "tokens/train_per_sec_per_gpu": 177.7, | |
| "tokens/trainable": 716063 | |
| }, | |
| { | |
| "epoch": 0.03616059558628024, | |
| "grad_norm": 0.049326092004776, | |
| "learning_rate": 1.4042553191489363e-05, | |
| "loss": 0.7267792224884033, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.06841, | |
| "step": 34, | |
| "tokens/total": 8912896, | |
| "tokens/train_per_sec_per_gpu": 201.94, | |
| "tokens/trainable": 737753 | |
| }, | |
| { | |
| "epoch": 0.03722414251528849, | |
| "grad_norm": 0.04339035972952843, | |
| "learning_rate": 1.4468085106382981e-05, | |
| "loss": 0.8321285247802734, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.29821, | |
| "step": 35, | |
| "tokens/total": 9175040, | |
| "tokens/train_per_sec_per_gpu": 238.51, | |
| "tokens/trainable": 761270 | |
| }, | |
| { | |
| "epoch": 0.03828768944429673, | |
| "grad_norm": 0.040435630828142166, | |
| "learning_rate": 1.4893617021276596e-05, | |
| "loss": 0.7482062578201294, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11321, | |
| "step": 36, | |
| "tokens/total": 9437184, | |
| "tokens/train_per_sec_per_gpu": 235.13, | |
| "tokens/trainable": 783478 | |
| }, | |
| { | |
| "epoch": 0.039351236373304975, | |
| "grad_norm": 0.04054463654756546, | |
| "learning_rate": 1.5319148936170214e-05, | |
| "loss": 0.7387034296989441, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09322, | |
| "step": 37, | |
| "tokens/total": 9699328, | |
| "tokens/train_per_sec_per_gpu": 173.41, | |
| "tokens/trainable": 805224 | |
| }, | |
| { | |
| "epoch": 0.04041478330231321, | |
| "grad_norm": 0.03981109336018562, | |
| "learning_rate": 1.5744680851063832e-05, | |
| "loss": 0.7435116767883301, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10331, | |
| "step": 38, | |
| "tokens/total": 9961472, | |
| "tokens/train_per_sec_per_gpu": 185.32, | |
| "tokens/trainable": 828977 | |
| }, | |
| { | |
| "epoch": 0.041478330231321456, | |
| "grad_norm": 0.04639929160475731, | |
| "learning_rate": 1.6170212765957446e-05, | |
| "loss": 0.8203743696212769, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.27135, | |
| "step": 39, | |
| "tokens/total": 10223616, | |
| "tokens/train_per_sec_per_gpu": 183.52, | |
| "tokens/trainable": 850008 | |
| }, | |
| { | |
| "epoch": 0.0425418771603297, | |
| "grad_norm": 0.04261818155646324, | |
| "learning_rate": 1.6595744680851064e-05, | |
| "loss": 0.8682478070259094, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.38273, | |
| "step": 40, | |
| "tokens/total": 10485760, | |
| "tokens/train_per_sec_per_gpu": 219.05, | |
| "tokens/trainable": 872900 | |
| }, | |
| { | |
| "epoch": 0.043605424089337944, | |
| "grad_norm": 0.04111519455909729, | |
| "learning_rate": 1.7021276595744682e-05, | |
| "loss": 0.7811744213104248, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.18404, | |
| "step": 41, | |
| "tokens/total": 10747904, | |
| "tokens/train_per_sec_per_gpu": 204.48, | |
| "tokens/trainable": 895505 | |
| }, | |
| { | |
| "epoch": 0.04466897101834619, | |
| "grad_norm": 0.03414495289325714, | |
| "learning_rate": 1.74468085106383e-05, | |
| "loss": 0.8004995584487915, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.22665, | |
| "step": 42, | |
| "tokens/total": 11010048, | |
| "tokens/train_per_sec_per_gpu": 198.04, | |
| "tokens/trainable": 917814 | |
| }, | |
| { | |
| "epoch": 0.045732517947354424, | |
| "grad_norm": 0.030925795435905457, | |
| "learning_rate": 1.7872340425531915e-05, | |
| "loss": 0.7756137251853943, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.17192, | |
| "step": 43, | |
| "tokens/total": 11272192, | |
| "tokens/train_per_sec_per_gpu": 195.24, | |
| "tokens/trainable": 939811 | |
| }, | |
| { | |
| "epoch": 0.04679606487636267, | |
| "grad_norm": 0.026804521679878235, | |
| "learning_rate": 1.8297872340425533e-05, | |
| "loss": 0.6872485876083374, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 1.98824, | |
| "step": 44, | |
| "tokens/total": 11534336, | |
| "tokens/train_per_sec_per_gpu": 228.04, | |
| "tokens/trainable": 963796 | |
| }, | |
| { | |
| "epoch": 0.04785961180537091, | |
| "grad_norm": 0.02924325503408909, | |
| "learning_rate": 1.872340425531915e-05, | |
| "loss": 0.7919371128082275, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.20767, | |
| "step": 45, | |
| "tokens/total": 11796480, | |
| "tokens/train_per_sec_per_gpu": 193.49, | |
| "tokens/trainable": 984430 | |
| }, | |
| { | |
| "epoch": 0.048923158734379156, | |
| "grad_norm": 0.030018026009202003, | |
| "learning_rate": 1.914893617021277e-05, | |
| "loss": 0.7972186803817749, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.21936, | |
| "step": 46, | |
| "tokens/total": 12058624, | |
| "tokens/train_per_sec_per_gpu": 180.38, | |
| "tokens/trainable": 1004277 | |
| }, | |
| { | |
| "epoch": 0.0499867056633874, | |
| "grad_norm": 0.030266476795077324, | |
| "learning_rate": 1.9574468085106384e-05, | |
| "loss": 0.7901904582977295, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.20382, | |
| "step": 47, | |
| "tokens/total": 12320768, | |
| "tokens/train_per_sec_per_gpu": 185.52, | |
| "tokens/trainable": 1026347 | |
| }, | |
| { | |
| "epoch": 0.05105025259239564, | |
| "grad_norm": 0.028692839667201042, | |
| "learning_rate": 2e-05, | |
| "loss": 0.7853357791900635, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19314, | |
| "step": 48, | |
| "tokens/total": 12582912, | |
| "tokens/train_per_sec_per_gpu": 167.93, | |
| "tokens/trainable": 1048721 | |
| }, | |
| { | |
| "epoch": 0.05211379952140388, | |
| "grad_norm": 0.026348290964961052, | |
| "learning_rate": 2.0425531914893616e-05, | |
| "loss": 0.8077329397201538, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.24282, | |
| "step": 49, | |
| "tokens/total": 12845056, | |
| "tokens/train_per_sec_per_gpu": 185.28, | |
| "tokens/trainable": 1071780 | |
| }, | |
| { | |
| "epoch": 0.053177346450412125, | |
| "grad_norm": 0.029625559225678444, | |
| "learning_rate": 2.0851063829787238e-05, | |
| "loss": 0.7733415365219116, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.167, | |
| "step": 50, | |
| "tokens/total": 13107200, | |
| "tokens/train_per_sec_per_gpu": 189.79, | |
| "tokens/trainable": 1091760 | |
| }, | |
| { | |
| "epoch": 0.05424089337942037, | |
| "grad_norm": 0.027405593544244766, | |
| "learning_rate": 2.1276595744680852e-05, | |
| "loss": 0.7173340916633606, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.04896, | |
| "step": 51, | |
| "tokens/total": 13369344, | |
| "tokens/train_per_sec_per_gpu": 188.87, | |
| "tokens/trainable": 1113687 | |
| }, | |
| { | |
| "epoch": 0.05530444030842861, | |
| "grad_norm": 0.02946804091334343, | |
| "learning_rate": 2.1702127659574467e-05, | |
| "loss": 0.7727050185203552, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.16562, | |
| "step": 52, | |
| "tokens/total": 13631488, | |
| "tokens/train_per_sec_per_gpu": 209.22, | |
| "tokens/trainable": 1135391 | |
| }, | |
| { | |
| "epoch": 0.05636798723743685, | |
| "grad_norm": 0.02892529033124447, | |
| "learning_rate": 2.2127659574468088e-05, | |
| "loss": 0.8007056713104248, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.22711, | |
| "step": 53, | |
| "tokens/total": 13893632, | |
| "tokens/train_per_sec_per_gpu": 202.13, | |
| "tokens/trainable": 1159518 | |
| }, | |
| { | |
| "epoch": 0.05743153416644509, | |
| "grad_norm": 0.031362369656562805, | |
| "learning_rate": 2.2553191489361703e-05, | |
| "loss": 0.7692879438400269, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15823, | |
| "step": 54, | |
| "tokens/total": 14155776, | |
| "tokens/train_per_sec_per_gpu": 151.27, | |
| "tokens/trainable": 1178964 | |
| }, | |
| { | |
| "epoch": 0.05849508109545334, | |
| "grad_norm": 0.027873003855347633, | |
| "learning_rate": 2.2978723404255324e-05, | |
| "loss": 0.7864505052566528, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19559, | |
| "step": 55, | |
| "tokens/total": 14417920, | |
| "tokens/train_per_sec_per_gpu": 213.4, | |
| "tokens/trainable": 1201830 | |
| }, | |
| { | |
| "epoch": 0.05955862802446158, | |
| "grad_norm": 0.030442189425230026, | |
| "learning_rate": 2.340425531914894e-05, | |
| "loss": 0.7171883583068848, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.04866, | |
| "step": 56, | |
| "tokens/total": 14680064, | |
| "tokens/train_per_sec_per_gpu": 163.06, | |
| "tokens/trainable": 1221040 | |
| }, | |
| { | |
| "epoch": 0.060622174953469825, | |
| "grad_norm": 0.030432693660259247, | |
| "learning_rate": 2.3829787234042553e-05, | |
| "loss": 0.8123354911804199, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.25316, | |
| "step": 57, | |
| "tokens/total": 14942208, | |
| "tokens/train_per_sec_per_gpu": 191.04, | |
| "tokens/trainable": 1245037 | |
| }, | |
| { | |
| "epoch": 0.06168572188247806, | |
| "grad_norm": 0.030456526204943657, | |
| "learning_rate": 2.4255319148936175e-05, | |
| "loss": 0.7400133609771729, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09596, | |
| "step": 58, | |
| "tokens/total": 15204352, | |
| "tokens/train_per_sec_per_gpu": 173.09, | |
| "tokens/trainable": 1265410 | |
| }, | |
| { | |
| "epoch": 0.0627492688114863, | |
| "grad_norm": 0.028698932379484177, | |
| "learning_rate": 2.468085106382979e-05, | |
| "loss": 0.7431353330612183, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10252, | |
| "step": 59, | |
| "tokens/total": 15466496, | |
| "tokens/train_per_sec_per_gpu": 204.25, | |
| "tokens/trainable": 1287471 | |
| }, | |
| { | |
| "epoch": 0.06381281574049455, | |
| "grad_norm": 0.029805311933159828, | |
| "learning_rate": 2.5106382978723404e-05, | |
| "loss": 0.785997748374939, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.1946, | |
| "step": 60, | |
| "tokens/total": 15728640, | |
| "tokens/train_per_sec_per_gpu": 224.54, | |
| "tokens/trainable": 1309972 | |
| }, | |
| { | |
| "epoch": 0.0648763626695028, | |
| "grad_norm": 0.03344248980283737, | |
| "learning_rate": 2.5531914893617025e-05, | |
| "loss": 0.7236359119415283, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.06192, | |
| "step": 61, | |
| "tokens/total": 15990784, | |
| "tokens/train_per_sec_per_gpu": 162.36, | |
| "tokens/trainable": 1329223 | |
| }, | |
| { | |
| "epoch": 0.06593990959851104, | |
| "grad_norm": 0.03220194950699806, | |
| "learning_rate": 2.595744680851064e-05, | |
| "loss": 0.7307531833648682, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.07664, | |
| "step": 62, | |
| "tokens/total": 16252928, | |
| "tokens/train_per_sec_per_gpu": 161.66, | |
| "tokens/trainable": 1350124 | |
| }, | |
| { | |
| "epoch": 0.06700345652751928, | |
| "grad_norm": 0.032156504690647125, | |
| "learning_rate": 2.6382978723404255e-05, | |
| "loss": 0.8302059173583984, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.29379, | |
| "step": 63, | |
| "tokens/total": 16515072, | |
| "tokens/train_per_sec_per_gpu": 174.14, | |
| "tokens/trainable": 1372459 | |
| }, | |
| { | |
| "epoch": 0.06806700345652753, | |
| "grad_norm": 0.031544484198093414, | |
| "learning_rate": 2.6808510638297876e-05, | |
| "loss": 0.8317389488220215, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.29731, | |
| "step": 64, | |
| "tokens/total": 16777216, | |
| "tokens/train_per_sec_per_gpu": 203.61, | |
| "tokens/trainable": 1394135 | |
| }, | |
| { | |
| "epoch": 0.06913055038553576, | |
| "grad_norm": 0.028723040595650673, | |
| "learning_rate": 2.723404255319149e-05, | |
| "loss": 0.7596557140350342, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13754, | |
| "step": 65, | |
| "tokens/total": 17039360, | |
| "tokens/train_per_sec_per_gpu": 175.65, | |
| "tokens/trainable": 1416653 | |
| }, | |
| { | |
| "epoch": 0.070194097314544, | |
| "grad_norm": 0.03393164649605751, | |
| "learning_rate": 2.7659574468085112e-05, | |
| "loss": 0.8652482032775879, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.3756, | |
| "step": 66, | |
| "tokens/total": 17301504, | |
| "tokens/train_per_sec_per_gpu": 195.94, | |
| "tokens/trainable": 1437390 | |
| }, | |
| { | |
| "epoch": 0.07125764424355224, | |
| "grad_norm": 0.030688602477312088, | |
| "learning_rate": 2.8085106382978727e-05, | |
| "loss": 0.7935420274734497, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.21121, | |
| "step": 67, | |
| "tokens/total": 17563648, | |
| "tokens/train_per_sec_per_gpu": 220.73, | |
| "tokens/trainable": 1461784 | |
| }, | |
| { | |
| "epoch": 0.07232119117256049, | |
| "grad_norm": 0.03269756957888603, | |
| "learning_rate": 2.851063829787234e-05, | |
| "loss": 0.7466378211975098, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10989, | |
| "step": 68, | |
| "tokens/total": 17825792, | |
| "tokens/train_per_sec_per_gpu": 172.64, | |
| "tokens/trainable": 1482296 | |
| }, | |
| { | |
| "epoch": 0.07338473810156873, | |
| "grad_norm": 0.03261660039424896, | |
| "learning_rate": 2.8936170212765963e-05, | |
| "loss": 0.7684181928634644, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15635, | |
| "step": 69, | |
| "tokens/total": 18087936, | |
| "tokens/train_per_sec_per_gpu": 153.82, | |
| "tokens/trainable": 1503447 | |
| }, | |
| { | |
| "epoch": 0.07444828503057697, | |
| "grad_norm": 0.029810158535838127, | |
| "learning_rate": 2.9361702127659577e-05, | |
| "loss": 0.745851993560791, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10824, | |
| "step": 70, | |
| "tokens/total": 18350080, | |
| "tokens/train_per_sec_per_gpu": 176.59, | |
| "tokens/trainable": 1526617 | |
| }, | |
| { | |
| "epoch": 0.07551183195958522, | |
| "grad_norm": 0.03327067568898201, | |
| "learning_rate": 2.9787234042553192e-05, | |
| "loss": 0.8320407867431641, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.298, | |
| "step": 71, | |
| "tokens/total": 18612224, | |
| "tokens/train_per_sec_per_gpu": 172.54, | |
| "tokens/trainable": 1549503 | |
| }, | |
| { | |
| "epoch": 0.07657537888859346, | |
| "grad_norm": 0.030294055119156837, | |
| "learning_rate": 3.0212765957446813e-05, | |
| "loss": 0.7923359870910645, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.20855, | |
| "step": 72, | |
| "tokens/total": 18874368, | |
| "tokens/train_per_sec_per_gpu": 186.06, | |
| "tokens/trainable": 1572991 | |
| }, | |
| { | |
| "epoch": 0.0776389258176017, | |
| "grad_norm": 0.03210108354687691, | |
| "learning_rate": 3.063829787234043e-05, | |
| "loss": 0.746250569820404, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10908, | |
| "step": 73, | |
| "tokens/total": 19136512, | |
| "tokens/train_per_sec_per_gpu": 140.36, | |
| "tokens/trainable": 1594140 | |
| }, | |
| { | |
| "epoch": 0.07870247274660995, | |
| "grad_norm": 0.030283037573099136, | |
| "learning_rate": 3.1063829787234046e-05, | |
| "loss": 0.7411618232727051, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09837, | |
| "step": 74, | |
| "tokens/total": 19398656, | |
| "tokens/train_per_sec_per_gpu": 208.45, | |
| "tokens/trainable": 1616672 | |
| }, | |
| { | |
| "epoch": 0.07976601967561818, | |
| "grad_norm": 0.03764800727367401, | |
| "learning_rate": 3.1489361702127664e-05, | |
| "loss": 0.7435256242752075, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10334, | |
| "step": 75, | |
| "tokens/total": 19660800, | |
| "tokens/train_per_sec_per_gpu": 167.25, | |
| "tokens/trainable": 1635193 | |
| }, | |
| { | |
| "epoch": 0.08082956660462642, | |
| "grad_norm": 0.03491177409887314, | |
| "learning_rate": 3.191489361702128e-05, | |
| "loss": 0.7323366403579712, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.07993, | |
| "step": 76, | |
| "tokens/total": 19922944, | |
| "tokens/train_per_sec_per_gpu": 214.58, | |
| "tokens/trainable": 1656375 | |
| }, | |
| { | |
| "epoch": 0.08189311353363467, | |
| "grad_norm": 0.03278028592467308, | |
| "learning_rate": 3.234042553191489e-05, | |
| "loss": 0.7500340938568115, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11707, | |
| "step": 77, | |
| "tokens/total": 20185088, | |
| "tokens/train_per_sec_per_gpu": 187.04, | |
| "tokens/trainable": 1678155 | |
| }, | |
| { | |
| "epoch": 0.08295666046264291, | |
| "grad_norm": 0.032096248120069504, | |
| "learning_rate": 3.276595744680851e-05, | |
| "loss": 0.7423413991928101, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10085, | |
| "step": 78, | |
| "tokens/total": 20447232, | |
| "tokens/train_per_sec_per_gpu": 209.49, | |
| "tokens/trainable": 1701373 | |
| }, | |
| { | |
| "epoch": 0.08402020739165116, | |
| "grad_norm": 0.03354285657405853, | |
| "learning_rate": 3.319148936170213e-05, | |
| "loss": 0.8002707958221436, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.22614, | |
| "step": 79, | |
| "tokens/total": 20709376, | |
| "tokens/train_per_sec_per_gpu": 178.24, | |
| "tokens/trainable": 1721916 | |
| }, | |
| { | |
| "epoch": 0.0850837543206594, | |
| "grad_norm": 0.03523889556527138, | |
| "learning_rate": 3.361702127659575e-05, | |
| "loss": 0.7944124937057495, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.21314, | |
| "step": 80, | |
| "tokens/total": 20971520, | |
| "tokens/train_per_sec_per_gpu": 221.61, | |
| "tokens/trainable": 1744950 | |
| }, | |
| { | |
| "epoch": 0.08614730124966764, | |
| "grad_norm": 0.03261874243617058, | |
| "learning_rate": 3.4042553191489365e-05, | |
| "loss": 0.7720386981964111, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.16417, | |
| "step": 81, | |
| "tokens/total": 21233664, | |
| "tokens/train_per_sec_per_gpu": 167.21, | |
| "tokens/trainable": 1765422 | |
| }, | |
| { | |
| "epoch": 0.08721084817867589, | |
| "grad_norm": 0.033395156264305115, | |
| "learning_rate": 3.446808510638298e-05, | |
| "loss": 0.779296875, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.17994, | |
| "step": 82, | |
| "tokens/total": 21495808, | |
| "tokens/train_per_sec_per_gpu": 217.82, | |
| "tokens/trainable": 1787402 | |
| }, | |
| { | |
| "epoch": 0.08827439510768413, | |
| "grad_norm": 0.032813675701618195, | |
| "learning_rate": 3.48936170212766e-05, | |
| "loss": 0.7013646364212036, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.0165, | |
| "step": 83, | |
| "tokens/total": 21757952, | |
| "tokens/train_per_sec_per_gpu": 165.9, | |
| "tokens/trainable": 1807444 | |
| }, | |
| { | |
| "epoch": 0.08933794203669237, | |
| "grad_norm": 0.035501375794410706, | |
| "learning_rate": 3.531914893617022e-05, | |
| "loss": 0.7295072674751282, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.07406, | |
| "step": 84, | |
| "tokens/total": 22020096, | |
| "tokens/train_per_sec_per_gpu": 182.67, | |
| "tokens/trainable": 1827505 | |
| }, | |
| { | |
| "epoch": 0.0904014889657006, | |
| "grad_norm": 0.0353703536093235, | |
| "learning_rate": 3.574468085106383e-05, | |
| "loss": 0.7775543928146362, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.17614, | |
| "step": 85, | |
| "tokens/total": 22282240, | |
| "tokens/train_per_sec_per_gpu": 188.9, | |
| "tokens/trainable": 1847467 | |
| }, | |
| { | |
| "epoch": 0.09146503589470885, | |
| "grad_norm": 0.03491484373807907, | |
| "learning_rate": 3.617021276595745e-05, | |
| "loss": 0.8319449424743652, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.29778, | |
| "step": 86, | |
| "tokens/total": 22544384, | |
| "tokens/train_per_sec_per_gpu": 214.28, | |
| "tokens/trainable": 1868200 | |
| }, | |
| { | |
| "epoch": 0.09252858282371709, | |
| "grad_norm": 0.032434333115816116, | |
| "learning_rate": 3.6595744680851066e-05, | |
| "loss": 0.8519909977912903, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.34431, | |
| "step": 87, | |
| "tokens/total": 22806528, | |
| "tokens/train_per_sec_per_gpu": 259.84, | |
| "tokens/trainable": 1893531 | |
| }, | |
| { | |
| "epoch": 0.09359212975272534, | |
| "grad_norm": 0.04186626523733139, | |
| "learning_rate": 3.7021276595744684e-05, | |
| "loss": 0.8195874094963074, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.26956, | |
| "step": 88, | |
| "tokens/total": 23068672, | |
| "tokens/train_per_sec_per_gpu": 178.33, | |
| "tokens/trainable": 1914870 | |
| }, | |
| { | |
| "epoch": 0.09465567668173358, | |
| "grad_norm": 0.03298460692167282, | |
| "learning_rate": 3.74468085106383e-05, | |
| "loss": 0.7469631433486938, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11058, | |
| "step": 89, | |
| "tokens/total": 23330816, | |
| "tokens/train_per_sec_per_gpu": 200.02, | |
| "tokens/trainable": 1937963 | |
| }, | |
| { | |
| "epoch": 0.09571922361074182, | |
| "grad_norm": 0.03386974707245827, | |
| "learning_rate": 3.787234042553192e-05, | |
| "loss": 0.7484230399131775, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11366, | |
| "step": 90, | |
| "tokens/total": 23592960, | |
| "tokens/train_per_sec_per_gpu": 193.44, | |
| "tokens/trainable": 1961198 | |
| }, | |
| { | |
| "epoch": 0.09678277053975007, | |
| "grad_norm": 0.039303258061409, | |
| "learning_rate": 3.829787234042554e-05, | |
| "loss": 0.75224769115448, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12176, | |
| "step": 91, | |
| "tokens/total": 23855104, | |
| "tokens/train_per_sec_per_gpu": 176.98, | |
| "tokens/trainable": 1982479 | |
| }, | |
| { | |
| "epoch": 0.09784631746875831, | |
| "grad_norm": 0.03529525175690651, | |
| "learning_rate": 3.872340425531915e-05, | |
| "loss": 0.7571150064468384, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13212, | |
| "step": 92, | |
| "tokens/total": 24117248, | |
| "tokens/train_per_sec_per_gpu": 227.43, | |
| "tokens/trainable": 2004994 | |
| }, | |
| { | |
| "epoch": 0.09890986439776656, | |
| "grad_norm": 0.034970078617334366, | |
| "learning_rate": 3.914893617021277e-05, | |
| "loss": 0.7881733179092407, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19938, | |
| "step": 93, | |
| "tokens/total": 24379392, | |
| "tokens/train_per_sec_per_gpu": 180.97, | |
| "tokens/trainable": 2028105 | |
| }, | |
| { | |
| "epoch": 0.0999734113267748, | |
| "grad_norm": 0.036846473813056946, | |
| "learning_rate": 3.9574468085106385e-05, | |
| "loss": 0.7719080448150635, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.16389, | |
| "step": 94, | |
| "tokens/total": 24641536, | |
| "tokens/train_per_sec_per_gpu": 186.35, | |
| "tokens/trainable": 2051020 | |
| }, | |
| { | |
| "epoch": 0.10103695825578303, | |
| "grad_norm": 0.03919777274131775, | |
| "learning_rate": 4e-05, | |
| "loss": 0.7920703887939453, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.20796, | |
| "step": 95, | |
| "tokens/total": 24903680, | |
| "tokens/train_per_sec_per_gpu": 147.2, | |
| "tokens/trainable": 2070277 | |
| }, | |
| { | |
| "epoch": 0.10210050518479127, | |
| "grad_norm": 0.03782414644956589, | |
| "learning_rate": 3.9999862427247416e-05, | |
| "loss": 0.8352775573730469, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.30545, | |
| "step": 96, | |
| "tokens/total": 25165824, | |
| "tokens/train_per_sec_per_gpu": 181.92, | |
| "tokens/trainable": 2090839 | |
| }, | |
| { | |
| "epoch": 0.10316405211379952, | |
| "grad_norm": 0.03704574331641197, | |
| "learning_rate": 3.999944971088228e-05, | |
| "loss": 0.761550784111023, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.14159, | |
| "step": 97, | |
| "tokens/total": 25427968, | |
| "tokens/train_per_sec_per_gpu": 206.15, | |
| "tokens/trainable": 2112256 | |
| }, | |
| { | |
| "epoch": 0.10422759904280776, | |
| "grad_norm": 0.03385859355330467, | |
| "learning_rate": 3.999876185658244e-05, | |
| "loss": 0.7179139852523804, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.05015, | |
| "step": 98, | |
| "tokens/total": 25690112, | |
| "tokens/train_per_sec_per_gpu": 187.02, | |
| "tokens/trainable": 2136667 | |
| }, | |
| { | |
| "epoch": 0.105291145971816, | |
| "grad_norm": 0.03678734600543976, | |
| "learning_rate": 3.99977988738109e-05, | |
| "loss": 0.7438210248947144, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10396, | |
| "step": 99, | |
| "tokens/total": 25952256, | |
| "tokens/train_per_sec_per_gpu": 178.09, | |
| "tokens/trainable": 2157962 | |
| }, | |
| { | |
| "epoch": 0.10635469290082425, | |
| "grad_norm": 0.03856838122010231, | |
| "learning_rate": 3.999656077581569e-05, | |
| "loss": 0.7466105222702026, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10984, | |
| "step": 100, | |
| "tokens/total": 26214400, | |
| "tokens/train_per_sec_per_gpu": 184.35, | |
| "tokens/trainable": 2178396 | |
| }, | |
| { | |
| "epoch": 0.1074182398298325, | |
| "grad_norm": 0.03785452991724014, | |
| "learning_rate": 3.9995047579629654e-05, | |
| "loss": 0.7475836873054504, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11189, | |
| "step": 101, | |
| "tokens/total": 26476544, | |
| "tokens/train_per_sec_per_gpu": 201.31, | |
| "tokens/trainable": 2201039 | |
| }, | |
| { | |
| "epoch": 0.10848178675884074, | |
| "grad_norm": 0.03467912971973419, | |
| "learning_rate": 3.9993259306070256e-05, | |
| "loss": 0.7737405300140381, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.16786, | |
| "step": 102, | |
| "tokens/total": 26738688, | |
| "tokens/train_per_sec_per_gpu": 185.99, | |
| "tokens/trainable": 2223019 | |
| }, | |
| { | |
| "epoch": 0.10954533368784898, | |
| "grad_norm": 0.04308745265007019, | |
| "learning_rate": 3.999119597973925e-05, | |
| "loss": 0.8207772970199585, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.27227, | |
| "step": 103, | |
| "tokens/total": 27000832, | |
| "tokens/train_per_sec_per_gpu": 147.51, | |
| "tokens/trainable": 2242785 | |
| }, | |
| { | |
| "epoch": 0.11060888061685722, | |
| "grad_norm": 0.036378778517246246, | |
| "learning_rate": 3.998885762902241e-05, | |
| "loss": 0.7338327169418335, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.08305, | |
| "step": 104, | |
| "tokens/total": 27262976, | |
| "tokens/train_per_sec_per_gpu": 179.24, | |
| "tokens/trainable": 2264578 | |
| }, | |
| { | |
| "epoch": 0.11167242754586545, | |
| "grad_norm": 0.04125402122735977, | |
| "learning_rate": 3.998624428608906e-05, | |
| "loss": 0.8683584332466125, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.383, | |
| "step": 105, | |
| "tokens/total": 27525120, | |
| "tokens/train_per_sec_per_gpu": 195.93, | |
| "tokens/trainable": 2285624 | |
| }, | |
| { | |
| "epoch": 0.1127359744748737, | |
| "grad_norm": 0.03740216791629791, | |
| "learning_rate": 3.9983355986891664e-05, | |
| "loss": 0.7756333947181702, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.17197, | |
| "step": 106, | |
| "tokens/total": 27787264, | |
| "tokens/train_per_sec_per_gpu": 246.08, | |
| "tokens/trainable": 2308345 | |
| }, | |
| { | |
| "epoch": 0.11379952140388194, | |
| "grad_norm": 0.04036470502614975, | |
| "learning_rate": 3.9980192771165364e-05, | |
| "loss": 0.7976692914962769, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.22036, | |
| "step": 107, | |
| "tokens/total": 28049408, | |
| "tokens/train_per_sec_per_gpu": 215.98, | |
| "tokens/trainable": 2328867 | |
| }, | |
| { | |
| "epoch": 0.11486306833289019, | |
| "grad_norm": 0.03836773335933685, | |
| "learning_rate": 3.997675468242738e-05, | |
| "loss": 0.7081190943717957, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.03017, | |
| "step": 108, | |
| "tokens/total": 28311552, | |
| "tokens/train_per_sec_per_gpu": 191.07, | |
| "tokens/trainable": 2349916 | |
| }, | |
| { | |
| "epoch": 0.11592661526189843, | |
| "grad_norm": 0.035974569618701935, | |
| "learning_rate": 3.9973041767976466e-05, | |
| "loss": 0.7658606767654419, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15084, | |
| "step": 109, | |
| "tokens/total": 28573696, | |
| "tokens/train_per_sec_per_gpu": 200.69, | |
| "tokens/trainable": 2374197 | |
| }, | |
| { | |
| "epoch": 0.11699016219090667, | |
| "grad_norm": 0.0417025052011013, | |
| "learning_rate": 3.9969054078892185e-05, | |
| "loss": 0.8230124711990356, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.27735, | |
| "step": 110, | |
| "tokens/total": 28835840, | |
| "tokens/train_per_sec_per_gpu": 195.73, | |
| "tokens/trainable": 2395345 | |
| }, | |
| { | |
| "epoch": 0.11805370911991492, | |
| "grad_norm": 0.03637392073869705, | |
| "learning_rate": 3.996479167003428e-05, | |
| "loss": 0.7655156254768372, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.1501, | |
| "step": 111, | |
| "tokens/total": 29097984, | |
| "tokens/train_per_sec_per_gpu": 214.49, | |
| "tokens/trainable": 2418145 | |
| }, | |
| { | |
| "epoch": 0.11911725604892316, | |
| "grad_norm": 0.04198000580072403, | |
| "learning_rate": 3.996025460004189e-05, | |
| "loss": 0.8185654878616333, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.26725, | |
| "step": 112, | |
| "tokens/total": 29360128, | |
| "tokens/train_per_sec_per_gpu": 236.86, | |
| "tokens/trainable": 2439031 | |
| }, | |
| { | |
| "epoch": 0.1201808029779314, | |
| "grad_norm": 0.041592370718717575, | |
| "learning_rate": 3.995544293133273e-05, | |
| "loss": 0.8015573024749756, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.22901, | |
| "step": 113, | |
| "tokens/total": 29622272, | |
| "tokens/train_per_sec_per_gpu": 186.73, | |
| "tokens/trainable": 2458819 | |
| }, | |
| { | |
| "epoch": 0.12124434990693965, | |
| "grad_norm": 0.04079896956682205, | |
| "learning_rate": 3.995035673010225e-05, | |
| "loss": 0.7219120264053345, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.05837, | |
| "step": 114, | |
| "tokens/total": 29884416, | |
| "tokens/train_per_sec_per_gpu": 184.15, | |
| "tokens/trainable": 2480736 | |
| }, | |
| { | |
| "epoch": 0.12230789683594788, | |
| "grad_norm": 0.041768353432416916, | |
| "learning_rate": 3.994499606632272e-05, | |
| "loss": 0.8270866870880127, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.28665, | |
| "step": 115, | |
| "tokens/total": 30146560, | |
| "tokens/train_per_sec_per_gpu": 193.64, | |
| "tokens/trainable": 2504003 | |
| }, | |
| { | |
| "epoch": 0.12337144376495612, | |
| "grad_norm": 0.04556523263454437, | |
| "learning_rate": 3.9939361013742275e-05, | |
| "loss": 0.7384425401687622, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09267, | |
| "step": 116, | |
| "tokens/total": 30408704, | |
| "tokens/train_per_sec_per_gpu": 213.94, | |
| "tokens/trainable": 2524722 | |
| }, | |
| { | |
| "epoch": 0.12443499069396437, | |
| "grad_norm": 0.03868886083364487, | |
| "learning_rate": 3.9933451649883866e-05, | |
| "loss": 0.709857165813446, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.0337, | |
| "step": 117, | |
| "tokens/total": 30670848, | |
| "tokens/train_per_sec_per_gpu": 231.95, | |
| "tokens/trainable": 2546913 | |
| }, | |
| { | |
| "epoch": 0.1254985376229726, | |
| "grad_norm": 0.04056168347597122, | |
| "learning_rate": 3.9927268056044266e-05, | |
| "loss": 0.7398765087127686, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09568, | |
| "step": 118, | |
| "tokens/total": 30932992, | |
| "tokens/train_per_sec_per_gpu": 183.97, | |
| "tokens/trainable": 2568114 | |
| }, | |
| { | |
| "epoch": 0.12656208455198087, | |
| "grad_norm": 0.04197125881910324, | |
| "learning_rate": 3.992081031729285e-05, | |
| "loss": 0.7923115491867065, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.2085, | |
| "step": 119, | |
| "tokens/total": 31195136, | |
| "tokens/train_per_sec_per_gpu": 199.13, | |
| "tokens/trainable": 2588738 | |
| }, | |
| { | |
| "epoch": 0.1276256314809891, | |
| "grad_norm": 0.037061259150505066, | |
| "learning_rate": 3.9914078522470526e-05, | |
| "loss": 0.8101846575737, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.24832, | |
| "step": 120, | |
| "tokens/total": 31457280, | |
| "tokens/train_per_sec_per_gpu": 193.31, | |
| "tokens/trainable": 2613736 | |
| }, | |
| { | |
| "epoch": 0.12868917840999733, | |
| "grad_norm": 0.04570484906435013, | |
| "learning_rate": 3.9907072764188435e-05, | |
| "loss": 0.7499140501022339, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11682, | |
| "step": 121, | |
| "tokens/total": 31719424, | |
| "tokens/train_per_sec_per_gpu": 180.47, | |
| "tokens/trainable": 2634223 | |
| }, | |
| { | |
| "epoch": 0.1297527253390056, | |
| "grad_norm": 0.04544052109122276, | |
| "learning_rate": 3.9899793138826736e-05, | |
| "loss": 0.8020647764205933, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.23014, | |
| "step": 122, | |
| "tokens/total": 31981568, | |
| "tokens/train_per_sec_per_gpu": 198.92, | |
| "tokens/trainable": 2655869 | |
| }, | |
| { | |
| "epoch": 0.13081627226801382, | |
| "grad_norm": 0.04469464346766472, | |
| "learning_rate": 3.989223974653323e-05, | |
| "loss": 0.7518518567085266, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12092, | |
| "step": 123, | |
| "tokens/total": 32243712, | |
| "tokens/train_per_sec_per_gpu": 208.31, | |
| "tokens/trainable": 2679088 | |
| }, | |
| { | |
| "epoch": 0.13187981919702207, | |
| "grad_norm": 0.0395895391702652, | |
| "learning_rate": 3.9884412691222016e-05, | |
| "loss": 0.7855230569839478, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19355, | |
| "step": 124, | |
| "tokens/total": 32505856, | |
| "tokens/train_per_sec_per_gpu": 195.72, | |
| "tokens/trainable": 2702201 | |
| }, | |
| { | |
| "epoch": 0.1329433661260303, | |
| "grad_norm": 0.04236849397420883, | |
| "learning_rate": 3.987631208057205e-05, | |
| "loss": 0.775454044342041, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.17158, | |
| "step": 125, | |
| "tokens/total": 32768000, | |
| "tokens/train_per_sec_per_gpu": 169.99, | |
| "tokens/trainable": 2724593 | |
| }, | |
| { | |
| "epoch": 0.13400691305503856, | |
| "grad_norm": 0.04403228312730789, | |
| "learning_rate": 3.986793802602566e-05, | |
| "loss": 0.7912722229957581, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.2062, | |
| "step": 126, | |
| "tokens/total": 33030144, | |
| "tokens/train_per_sec_per_gpu": 178.51, | |
| "tokens/trainable": 2747546 | |
| }, | |
| { | |
| "epoch": 0.1350704599840468, | |
| "grad_norm": 0.0415693037211895, | |
| "learning_rate": 3.985929064278701e-05, | |
| "loss": 0.803294837474823, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.23289, | |
| "step": 127, | |
| "tokens/total": 33292288, | |
| "tokens/train_per_sec_per_gpu": 199.75, | |
| "tokens/trainable": 2769876 | |
| }, | |
| { | |
| "epoch": 0.13613400691305505, | |
| "grad_norm": 0.043098073452711105, | |
| "learning_rate": 3.985037004982056e-05, | |
| "loss": 0.8380795121192932, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.31192, | |
| "step": 128, | |
| "tokens/total": 33554432, | |
| "tokens/train_per_sec_per_gpu": 197.32, | |
| "tokens/trainable": 2791990 | |
| }, | |
| { | |
| "epoch": 0.13719755384206328, | |
| "grad_norm": 0.04367615282535553, | |
| "learning_rate": 3.984117636984933e-05, | |
| "loss": 0.7381528615951538, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09207, | |
| "step": 129, | |
| "tokens/total": 33816576, | |
| "tokens/train_per_sec_per_gpu": 157.27, | |
| "tokens/trainable": 2812157 | |
| }, | |
| { | |
| "epoch": 0.1382611007710715, | |
| "grad_norm": 0.03902239724993706, | |
| "learning_rate": 3.983170972935333e-05, | |
| "loss": 0.6622740030288696, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 1.9392, | |
| "step": 130, | |
| "tokens/total": 34078720, | |
| "tokens/train_per_sec_per_gpu": 199.7, | |
| "tokens/trainable": 2834450 | |
| }, | |
| { | |
| "epoch": 0.13932464770007977, | |
| "grad_norm": 0.04401889070868492, | |
| "learning_rate": 3.982197025856772e-05, | |
| "loss": 0.8131764531135559, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.25506, | |
| "step": 131, | |
| "tokens/total": 34340864, | |
| "tokens/train_per_sec_per_gpu": 210.08, | |
| "tokens/trainable": 2857868 | |
| }, | |
| { | |
| "epoch": 0.140388194629088, | |
| "grad_norm": 0.055351078510284424, | |
| "learning_rate": 3.98119580914811e-05, | |
| "loss": 0.8648597002029419, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.37467, | |
| "step": 132, | |
| "tokens/total": 34603008, | |
| "tokens/train_per_sec_per_gpu": 164.9, | |
| "tokens/trainable": 2877451 | |
| }, | |
| { | |
| "epoch": 0.14145174155809626, | |
| "grad_norm": 0.04292495548725128, | |
| "learning_rate": 3.980167336583359e-05, | |
| "loss": 0.8202415108680725, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.27105, | |
| "step": 133, | |
| "tokens/total": 34865152, | |
| "tokens/train_per_sec_per_gpu": 179.62, | |
| "tokens/trainable": 2898969 | |
| }, | |
| { | |
| "epoch": 0.14251528848710449, | |
| "grad_norm": 0.039902154356241226, | |
| "learning_rate": 3.979111622311501e-05, | |
| "loss": 0.7659401893615723, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15102, | |
| "step": 134, | |
| "tokens/total": 35127296, | |
| "tokens/train_per_sec_per_gpu": 204.35, | |
| "tokens/trainable": 2922244 | |
| }, | |
| { | |
| "epoch": 0.14357883541611274, | |
| "grad_norm": 0.04879293963313103, | |
| "learning_rate": 3.978028680856286e-05, | |
| "loss": 0.7666274309158325, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15249, | |
| "step": 135, | |
| "tokens/total": 35389440, | |
| "tokens/train_per_sec_per_gpu": 203.21, | |
| "tokens/trainable": 2941098 | |
| }, | |
| { | |
| "epoch": 0.14464238234512097, | |
| "grad_norm": 0.04211945831775665, | |
| "learning_rate": 3.97691852711604e-05, | |
| "loss": 0.7596578001976013, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13754, | |
| "step": 136, | |
| "tokens/total": 35651584, | |
| "tokens/train_per_sec_per_gpu": 182.99, | |
| "tokens/trainable": 2962728 | |
| }, | |
| { | |
| "epoch": 0.14570592927412923, | |
| "grad_norm": 0.05592913180589676, | |
| "learning_rate": 3.975781176363451e-05, | |
| "loss": 0.8827542066574097, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.41755, | |
| "step": 137, | |
| "tokens/total": 35913728, | |
| "tokens/train_per_sec_per_gpu": 167.07, | |
| "tokens/trainable": 2983012 | |
| }, | |
| { | |
| "epoch": 0.14676947620313746, | |
| "grad_norm": 0.042312368750572205, | |
| "learning_rate": 3.9746166442453667e-05, | |
| "loss": 0.7679699659347534, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15539, | |
| "step": 138, | |
| "tokens/total": 36175872, | |
| "tokens/train_per_sec_per_gpu": 173.16, | |
| "tokens/trainable": 3005801 | |
| }, | |
| { | |
| "epoch": 0.14783302313214572, | |
| "grad_norm": 0.043086566030979156, | |
| "learning_rate": 3.973424946782578e-05, | |
| "loss": 0.7419267892837524, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09998, | |
| "step": 139, | |
| "tokens/total": 36438016, | |
| "tokens/train_per_sec_per_gpu": 179.27, | |
| "tokens/trainable": 3027970 | |
| }, | |
| { | |
| "epoch": 0.14889657006115395, | |
| "grad_norm": 0.044038690626621246, | |
| "learning_rate": 3.972206100369594e-05, | |
| "loss": 0.8171659111976624, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.26407, | |
| "step": 140, | |
| "tokens/total": 36700160, | |
| "tokens/train_per_sec_per_gpu": 156.51, | |
| "tokens/trainable": 3051271 | |
| }, | |
| { | |
| "epoch": 0.14996011699016218, | |
| "grad_norm": 0.04058285430073738, | |
| "learning_rate": 3.970960121774419e-05, | |
| "loss": 0.8079518675804138, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.24331, | |
| "step": 141, | |
| "tokens/total": 36962304, | |
| "tokens/train_per_sec_per_gpu": 167.22, | |
| "tokens/trainable": 3074209 | |
| }, | |
| { | |
| "epoch": 0.15102366391917044, | |
| "grad_norm": 0.04522034898400307, | |
| "learning_rate": 3.9696870281383255e-05, | |
| "loss": 0.8723236322402954, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.39246, | |
| "step": 142, | |
| "tokens/total": 37224448, | |
| "tokens/train_per_sec_per_gpu": 158.71, | |
| "tokens/trainable": 3095651 | |
| }, | |
| { | |
| "epoch": 0.15208721084817867, | |
| "grad_norm": 0.04753715172410011, | |
| "learning_rate": 3.968386836975611e-05, | |
| "loss": 0.7392692565917969, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.0944, | |
| "step": 143, | |
| "tokens/total": 37486592, | |
| "tokens/train_per_sec_per_gpu": 189.77, | |
| "tokens/trainable": 3118220 | |
| }, | |
| { | |
| "epoch": 0.15315075777718692, | |
| "grad_norm": 0.045635782182216644, | |
| "learning_rate": 3.9670595661733654e-05, | |
| "loss": 0.8149927854537964, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.25916, | |
| "step": 144, | |
| "tokens/total": 37748736, | |
| "tokens/train_per_sec_per_gpu": 209.61, | |
| "tokens/trainable": 3143149 | |
| }, | |
| { | |
| "epoch": 0.15421430470619515, | |
| "grad_norm": 0.049446720629930496, | |
| "learning_rate": 3.9657052339912166e-05, | |
| "loss": 0.8048349022865295, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.23633, | |
| "step": 145, | |
| "tokens/total": 38010880, | |
| "tokens/train_per_sec_per_gpu": 200.16, | |
| "tokens/trainable": 3164817 | |
| }, | |
| { | |
| "epoch": 0.1552778516352034, | |
| "grad_norm": 0.04811964929103851, | |
| "learning_rate": 3.9643238590610864e-05, | |
| "loss": 0.7713128328323364, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.1626, | |
| "step": 146, | |
| "tokens/total": 38273024, | |
| "tokens/train_per_sec_per_gpu": 210.98, | |
| "tokens/trainable": 3185683 | |
| }, | |
| { | |
| "epoch": 0.15634139856421164, | |
| "grad_norm": 0.04874229058623314, | |
| "learning_rate": 3.9629154603869294e-05, | |
| "loss": 0.7867254614830017, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19619, | |
| "step": 147, | |
| "tokens/total": 38535168, | |
| "tokens/train_per_sec_per_gpu": 161.76, | |
| "tokens/trainable": 3207493 | |
| }, | |
| { | |
| "epoch": 0.1574049454932199, | |
| "grad_norm": 0.04509029909968376, | |
| "learning_rate": 3.961480057344474e-05, | |
| "loss": 0.8230168223381042, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.27736, | |
| "step": 148, | |
| "tokens/total": 38797312, | |
| "tokens/train_per_sec_per_gpu": 165.98, | |
| "tokens/trainable": 3227269 | |
| }, | |
| { | |
| "epoch": 0.15846849242222813, | |
| "grad_norm": 0.048180241137742996, | |
| "learning_rate": 3.9600176696809555e-05, | |
| "loss": 0.7925543785095215, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.20903, | |
| "step": 149, | |
| "tokens/total": 39059456, | |
| "tokens/train_per_sec_per_gpu": 178.87, | |
| "tokens/trainable": 3247693 | |
| }, | |
| { | |
| "epoch": 0.15953203935123636, | |
| "grad_norm": 0.044965874403715134, | |
| "learning_rate": 3.9585283175148425e-05, | |
| "loss": 0.7305552959442139, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.07623, | |
| "step": 150, | |
| "tokens/total": 39321600, | |
| "tokens/train_per_sec_per_gpu": 185.73, | |
| "tokens/trainable": 3271323 | |
| }, | |
| { | |
| "epoch": 0.15953203935123636, | |
| "eval_loss": 0.7756121754646301, | |
| "eval_ppl": 2.17192, | |
| "eval_runtime": 237.3162, | |
| "eval_samples_per_second": 28.173, | |
| "eval_steps_per_second": 1.761, | |
| "memory/device_reserved (GiB)": 51.31, | |
| "memory/max_active (GiB)": 38.19, | |
| "memory/max_allocated (GiB)": 38.19, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.16059558628024462, | |
| "grad_norm": 0.05009883642196655, | |
| "learning_rate": 3.9570120213355636e-05, | |
| "loss": 0.7295466661453247, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.07414, | |
| "step": 151, | |
| "tokens/total": 39583744, | |
| "tokens/train_per_sec_per_gpu": 212.53, | |
| "tokens/trainable": 3293376 | |
| }, | |
| { | |
| "epoch": 0.16165913320925285, | |
| "grad_norm": 0.05454389378428459, | |
| "learning_rate": 3.955468802003222e-05, | |
| "loss": 0.8171148300170898, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.26396, | |
| "step": 152, | |
| "tokens/total": 39845888, | |
| "tokens/train_per_sec_per_gpu": 160.49, | |
| "tokens/trainable": 3314003 | |
| }, | |
| { | |
| "epoch": 0.1627226801382611, | |
| "grad_norm": 0.05227701738476753, | |
| "learning_rate": 3.953898680748311e-05, | |
| "loss": 0.8315908908843994, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.29697, | |
| "step": 153, | |
| "tokens/total": 40108032, | |
| "tokens/train_per_sec_per_gpu": 178.92, | |
| "tokens/trainable": 3335783 | |
| }, | |
| { | |
| "epoch": 0.16378622706726934, | |
| "grad_norm": 0.04624287411570549, | |
| "learning_rate": 3.952301679171421e-05, | |
| "loss": 0.7561501860618591, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13006, | |
| "step": 154, | |
| "tokens/total": 40370176, | |
| "tokens/train_per_sec_per_gpu": 164.87, | |
| "tokens/trainable": 3355886 | |
| }, | |
| { | |
| "epoch": 0.1648497739962776, | |
| "grad_norm": 0.04997319355607033, | |
| "learning_rate": 3.950677819242943e-05, | |
| "loss": 0.788512110710144, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.20012, | |
| "step": 155, | |
| "tokens/total": 40632320, | |
| "tokens/train_per_sec_per_gpu": 188.8, | |
| "tokens/trainable": 3378024 | |
| }, | |
| { | |
| "epoch": 0.16591332092528582, | |
| "grad_norm": 0.05008501932024956, | |
| "learning_rate": 3.949027123302764e-05, | |
| "loss": 0.8327994346618652, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.29975, | |
| "step": 156, | |
| "tokens/total": 40894464, | |
| "tokens/train_per_sec_per_gpu": 163.0, | |
| "tokens/trainable": 3399925 | |
| }, | |
| { | |
| "epoch": 0.16697686785429408, | |
| "grad_norm": 0.05233265459537506, | |
| "learning_rate": 3.9473496140599626e-05, | |
| "loss": 0.8238826394081116, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.27933, | |
| "step": 157, | |
| "tokens/total": 41156608, | |
| "tokens/train_per_sec_per_gpu": 182.71, | |
| "tokens/trainable": 3420640 | |
| }, | |
| { | |
| "epoch": 0.1680404147833023, | |
| "grad_norm": 0.05217234417796135, | |
| "learning_rate": 3.945645314592495e-05, | |
| "loss": 0.7473776340484619, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11146, | |
| "step": 158, | |
| "tokens/total": 41418752, | |
| "tokens/train_per_sec_per_gpu": 223.76, | |
| "tokens/trainable": 3442988 | |
| }, | |
| { | |
| "epoch": 0.16910396171231057, | |
| "grad_norm": 0.04567525163292885, | |
| "learning_rate": 3.943914248346879e-05, | |
| "loss": 0.7227488160133362, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.06009, | |
| "step": 159, | |
| "tokens/total": 41680896, | |
| "tokens/train_per_sec_per_gpu": 177.03, | |
| "tokens/trainable": 3464101 | |
| }, | |
| { | |
| "epoch": 0.1701675086413188, | |
| "grad_norm": 0.05548242852091789, | |
| "learning_rate": 3.9421564391378685e-05, | |
| "loss": 0.7955631017684937, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.21569, | |
| "step": 160, | |
| "tokens/total": 41943040, | |
| "tokens/train_per_sec_per_gpu": 152.75, | |
| "tokens/trainable": 3483419 | |
| }, | |
| { | |
| "epoch": 0.17123105557032703, | |
| "grad_norm": 0.04398762434720993, | |
| "learning_rate": 3.9403719111481295e-05, | |
| "loss": 0.7600826025009155, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13845, | |
| "step": 161, | |
| "tokens/total": 42205184, | |
| "tokens/train_per_sec_per_gpu": 205.76, | |
| "tokens/trainable": 3504602 | |
| }, | |
| { | |
| "epoch": 0.1722946024993353, | |
| "grad_norm": 0.04506729915738106, | |
| "learning_rate": 3.9385606889279035e-05, | |
| "loss": 0.7480685114860535, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11292, | |
| "step": 162, | |
| "tokens/total": 42467328, | |
| "tokens/train_per_sec_per_gpu": 227.39, | |
| "tokens/trainable": 3526406 | |
| }, | |
| { | |
| "epoch": 0.17335814942834352, | |
| "grad_norm": 0.044185835868120193, | |
| "learning_rate": 3.9367227973946745e-05, | |
| "loss": 0.7433359026908875, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10294, | |
| "step": 163, | |
| "tokens/total": 42729472, | |
| "tokens/train_per_sec_per_gpu": 182.07, | |
| "tokens/trainable": 3548313 | |
| }, | |
| { | |
| "epoch": 0.17442169635735177, | |
| "grad_norm": 0.05059230327606201, | |
| "learning_rate": 3.934858261832822e-05, | |
| "loss": 0.6835325956344604, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 1.98086, | |
| "step": 164, | |
| "tokens/total": 42991616, | |
| "tokens/train_per_sec_per_gpu": 169.05, | |
| "tokens/trainable": 3568190 | |
| }, | |
| { | |
| "epoch": 0.17548524328636, | |
| "grad_norm": 0.050207290798425674, | |
| "learning_rate": 3.932967107893274e-05, | |
| "loss": 0.7899980545043945, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.20339, | |
| "step": 165, | |
| "tokens/total": 43253760, | |
| "tokens/train_per_sec_per_gpu": 205.5, | |
| "tokens/trainable": 3588948 | |
| }, | |
| { | |
| "epoch": 0.17654879021536826, | |
| "grad_norm": 0.04827320948243141, | |
| "learning_rate": 3.931049361593157e-05, | |
| "loss": 0.7980469465255737, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.2212, | |
| "step": 166, | |
| "tokens/total": 43515904, | |
| "tokens/train_per_sec_per_gpu": 173.52, | |
| "tokens/trainable": 3609829 | |
| }, | |
| { | |
| "epoch": 0.1776123371443765, | |
| "grad_norm": 0.048085663467645645, | |
| "learning_rate": 3.9291050493154336e-05, | |
| "loss": 0.7630643844604492, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.14484, | |
| "step": 167, | |
| "tokens/total": 43778048, | |
| "tokens/train_per_sec_per_gpu": 200.45, | |
| "tokens/trainable": 3631205 | |
| }, | |
| { | |
| "epoch": 0.17867588407338475, | |
| "grad_norm": 0.05210770294070244, | |
| "learning_rate": 3.927134197808544e-05, | |
| "loss": 0.7417425513267517, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09959, | |
| "step": 168, | |
| "tokens/total": 44040192, | |
| "tokens/train_per_sec_per_gpu": 138.92, | |
| "tokens/trainable": 3651025 | |
| }, | |
| { | |
| "epoch": 0.17973943100239298, | |
| "grad_norm": 0.04291163384914398, | |
| "learning_rate": 3.9251368341860343e-05, | |
| "loss": 0.7509276866912842, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11896, | |
| "step": 169, | |
| "tokens/total": 44302336, | |
| "tokens/train_per_sec_per_gpu": 189.85, | |
| "tokens/trainable": 3674343 | |
| }, | |
| { | |
| "epoch": 0.1808029779314012, | |
| "grad_norm": 0.05076931044459343, | |
| "learning_rate": 3.923112985926185e-05, | |
| "loss": 0.7864252328872681, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19553, | |
| "step": 170, | |
| "tokens/total": 44564480, | |
| "tokens/train_per_sec_per_gpu": 173.58, | |
| "tokens/trainable": 3694079 | |
| }, | |
| { | |
| "epoch": 0.18186652486040947, | |
| "grad_norm": 0.046801142394542694, | |
| "learning_rate": 3.921062680871635e-05, | |
| "loss": 0.6877319812774658, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 1.9892, | |
| "step": 171, | |
| "tokens/total": 44826624, | |
| "tokens/train_per_sec_per_gpu": 176.69, | |
| "tokens/trainable": 3715203 | |
| }, | |
| { | |
| "epoch": 0.1829300717894177, | |
| "grad_norm": 0.05101482570171356, | |
| "learning_rate": 3.9189859472289956e-05, | |
| "loss": 0.800000786781311, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.22554, | |
| "step": 172, | |
| "tokens/total": 45088768, | |
| "tokens/train_per_sec_per_gpu": 150.34, | |
| "tokens/trainable": 3737632 | |
| }, | |
| { | |
| "epoch": 0.18399361871842596, | |
| "grad_norm": 0.04480605199933052, | |
| "learning_rate": 3.916882813568461e-05, | |
| "loss": 0.7518149614334106, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12085, | |
| "step": 173, | |
| "tokens/total": 45350912, | |
| "tokens/train_per_sec_per_gpu": 216.08, | |
| "tokens/trainable": 3761754 | |
| }, | |
| { | |
| "epoch": 0.18505716564743419, | |
| "grad_norm": 0.05060945823788643, | |
| "learning_rate": 3.914753308823422e-05, | |
| "loss": 0.8080068230628967, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.24343, | |
| "step": 174, | |
| "tokens/total": 45613056, | |
| "tokens/train_per_sec_per_gpu": 147.42, | |
| "tokens/trainable": 3784061 | |
| }, | |
| { | |
| "epoch": 0.18612071257644244, | |
| "grad_norm": 0.04640955105423927, | |
| "learning_rate": 3.9125974622900596e-05, | |
| "loss": 0.8282898664474487, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.2894, | |
| "step": 175, | |
| "tokens/total": 45875200, | |
| "tokens/train_per_sec_per_gpu": 208.37, | |
| "tokens/trainable": 3808216 | |
| }, | |
| { | |
| "epoch": 0.18718425950545067, | |
| "grad_norm": 0.04194442555308342, | |
| "learning_rate": 3.91041530362695e-05, | |
| "loss": 0.7835493087768555, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.18923, | |
| "step": 176, | |
| "tokens/total": 46137344, | |
| "tokens/train_per_sec_per_gpu": 198.59, | |
| "tokens/trainable": 3832481 | |
| }, | |
| { | |
| "epoch": 0.18824780643445893, | |
| "grad_norm": 0.049947503954172134, | |
| "learning_rate": 3.90820686285465e-05, | |
| "loss": 0.8008949756622314, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.22753, | |
| "step": 177, | |
| "tokens/total": 46399488, | |
| "tokens/train_per_sec_per_gpu": 151.75, | |
| "tokens/trainable": 3852693 | |
| }, | |
| { | |
| "epoch": 0.18931135336346716, | |
| "grad_norm": 0.04894804581999779, | |
| "learning_rate": 3.905972170355286e-05, | |
| "loss": 0.711793065071106, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.03764, | |
| "step": 178, | |
| "tokens/total": 46661632, | |
| "tokens/train_per_sec_per_gpu": 178.97, | |
| "tokens/trainable": 3873846 | |
| }, | |
| { | |
| "epoch": 0.19037490029247542, | |
| "grad_norm": 0.04731186851859093, | |
| "learning_rate": 3.903711256872139e-05, | |
| "loss": 0.7765140533447266, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.17388, | |
| "step": 179, | |
| "tokens/total": 46923776, | |
| "tokens/train_per_sec_per_gpu": 206.26, | |
| "tokens/trainable": 3898641 | |
| }, | |
| { | |
| "epoch": 0.19143844722148365, | |
| "grad_norm": 0.055336493998765945, | |
| "learning_rate": 3.901424153509218e-05, | |
| "loss": 0.8252753019332886, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.28251, | |
| "step": 180, | |
| "tokens/total": 47185920, | |
| "tokens/train_per_sec_per_gpu": 170.2, | |
| "tokens/trainable": 3919863 | |
| }, | |
| { | |
| "epoch": 0.19250199415049188, | |
| "grad_norm": 0.04609975218772888, | |
| "learning_rate": 3.899110891730834e-05, | |
| "loss": 0.7351381778717041, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.08577, | |
| "step": 181, | |
| "tokens/total": 47448064, | |
| "tokens/train_per_sec_per_gpu": 177.94, | |
| "tokens/trainable": 3942514 | |
| }, | |
| { | |
| "epoch": 0.19356554107950014, | |
| "grad_norm": 0.05186214670538902, | |
| "learning_rate": 3.896771503361165e-05, | |
| "loss": 0.7917764186859131, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.20731, | |
| "step": 182, | |
| "tokens/total": 47710208, | |
| "tokens/train_per_sec_per_gpu": 200.82, | |
| "tokens/trainable": 3964229 | |
| }, | |
| { | |
| "epoch": 0.19462908800850837, | |
| "grad_norm": 0.049349937587976456, | |
| "learning_rate": 3.8944060205838204e-05, | |
| "loss": 0.7680513858795166, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15556, | |
| "step": 183, | |
| "tokens/total": 47972352, | |
| "tokens/train_per_sec_per_gpu": 190.57, | |
| "tokens/trainable": 3983779 | |
| }, | |
| { | |
| "epoch": 0.19569263493751662, | |
| "grad_norm": 0.053863126784563065, | |
| "learning_rate": 3.892014475941399e-05, | |
| "loss": 0.7624801397323608, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.14359, | |
| "step": 184, | |
| "tokens/total": 48234496, | |
| "tokens/train_per_sec_per_gpu": 165.8, | |
| "tokens/trainable": 4004641 | |
| }, | |
| { | |
| "epoch": 0.19675618186652485, | |
| "grad_norm": 0.04809142276644707, | |
| "learning_rate": 3.8895969023350384e-05, | |
| "loss": 0.8235425353050232, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.27856, | |
| "step": 185, | |
| "tokens/total": 48496640, | |
| "tokens/train_per_sec_per_gpu": 198.93, | |
| "tokens/trainable": 4025071 | |
| }, | |
| { | |
| "epoch": 0.1978197287955331, | |
| "grad_norm": 0.04834391921758652, | |
| "learning_rate": 3.8871533330239646e-05, | |
| "loss": 0.7338411211967468, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.08307, | |
| "step": 186, | |
| "tokens/total": 48758784, | |
| "tokens/train_per_sec_per_gpu": 185.51, | |
| "tokens/trainable": 4048197 | |
| }, | |
| { | |
| "epoch": 0.19888327572454134, | |
| "grad_norm": 0.052105050534009933, | |
| "learning_rate": 3.884683801625032e-05, | |
| "loss": 0.7286123633384705, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.0722, | |
| "step": 187, | |
| "tokens/total": 49020928, | |
| "tokens/train_per_sec_per_gpu": 175.75, | |
| "tokens/trainable": 4069621 | |
| }, | |
| { | |
| "epoch": 0.1999468226535496, | |
| "grad_norm": 0.04482823610305786, | |
| "learning_rate": 3.8821883421122645e-05, | |
| "loss": 0.7568373680114746, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13152, | |
| "step": 188, | |
| "tokens/total": 49283072, | |
| "tokens/train_per_sec_per_gpu": 203.94, | |
| "tokens/trainable": 4093663 | |
| }, | |
| { | |
| "epoch": 0.20101036958255783, | |
| "grad_norm": 0.0523945651948452, | |
| "learning_rate": 3.879666988816386e-05, | |
| "loss": 0.755517840385437, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12871, | |
| "step": 189, | |
| "tokens/total": 49545216, | |
| "tokens/train_per_sec_per_gpu": 160.35, | |
| "tokens/trainable": 4113650 | |
| }, | |
| { | |
| "epoch": 0.20207391651156606, | |
| "grad_norm": 0.04911473020911217, | |
| "learning_rate": 3.877119776424347e-05, | |
| "loss": 0.7191125154495239, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.05261, | |
| "step": 190, | |
| "tokens/total": 49807360, | |
| "tokens/train_per_sec_per_gpu": 221.01, | |
| "tokens/trainable": 4136190 | |
| }, | |
| { | |
| "epoch": 0.20313746344057432, | |
| "grad_norm": 0.053953029215335846, | |
| "learning_rate": 3.8745467399788506e-05, | |
| "loss": 0.7884220480918884, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19992, | |
| "step": 191, | |
| "tokens/total": 50069504, | |
| "tokens/train_per_sec_per_gpu": 154.41, | |
| "tokens/trainable": 4158226 | |
| }, | |
| { | |
| "epoch": 0.20420101036958255, | |
| "grad_norm": 0.05675153061747551, | |
| "learning_rate": 3.871947914877866e-05, | |
| "loss": 0.7461360692977905, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10884, | |
| "step": 192, | |
| "tokens/total": 50331648, | |
| "tokens/train_per_sec_per_gpu": 179.47, | |
| "tokens/trainable": 4179916 | |
| }, | |
| { | |
| "epoch": 0.2052645572985908, | |
| "grad_norm": 0.05521610751748085, | |
| "learning_rate": 3.869323336874146e-05, | |
| "loss": 0.7196043729782104, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.05362, | |
| "step": 193, | |
| "tokens/total": 50593792, | |
| "tokens/train_per_sec_per_gpu": 201.18, | |
| "tokens/trainable": 4200779 | |
| }, | |
| { | |
| "epoch": 0.20632810422759904, | |
| "grad_norm": 0.056510064750909805, | |
| "learning_rate": 3.8666730420747336e-05, | |
| "loss": 0.8237625360488892, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.27906, | |
| "step": 194, | |
| "tokens/total": 50855936, | |
| "tokens/train_per_sec_per_gpu": 171.77, | |
| "tokens/trainable": 4222217 | |
| }, | |
| { | |
| "epoch": 0.2073916511566073, | |
| "grad_norm": 0.0467703752219677, | |
| "learning_rate": 3.863997066940463e-05, | |
| "loss": 0.7923108339309692, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.20849, | |
| "step": 195, | |
| "tokens/total": 51118080, | |
| "tokens/train_per_sec_per_gpu": 193.25, | |
| "tokens/trainable": 4244857 | |
| }, | |
| { | |
| "epoch": 0.20845519808561552, | |
| "grad_norm": 0.04992164671421051, | |
| "learning_rate": 3.8612954482854606e-05, | |
| "loss": 0.7622380256652832, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.14307, | |
| "step": 196, | |
| "tokens/total": 51380224, | |
| "tokens/train_per_sec_per_gpu": 216.77, | |
| "tokens/trainable": 4267572 | |
| }, | |
| { | |
| "epoch": 0.20951874501462378, | |
| "grad_norm": 0.049939945340156555, | |
| "learning_rate": 3.8585682232766385e-05, | |
| "loss": 0.7296082973480225, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.07427, | |
| "step": 197, | |
| "tokens/total": 51642368, | |
| "tokens/train_per_sec_per_gpu": 214.99, | |
| "tokens/trainable": 4290348 | |
| }, | |
| { | |
| "epoch": 0.210582291943632, | |
| "grad_norm": 0.05264829471707344, | |
| "learning_rate": 3.8558154294331807e-05, | |
| "loss": 0.7297487258911133, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.07456, | |
| "step": 198, | |
| "tokens/total": 51904512, | |
| "tokens/train_per_sec_per_gpu": 178.25, | |
| "tokens/trainable": 4310984 | |
| }, | |
| { | |
| "epoch": 0.21164583887264027, | |
| "grad_norm": 0.05678649619221687, | |
| "learning_rate": 3.853037104626031e-05, | |
| "loss": 0.8022236227989197, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.2305, | |
| "step": 199, | |
| "tokens/total": 52166656, | |
| "tokens/train_per_sec_per_gpu": 212.29, | |
| "tokens/trainable": 4332640 | |
| }, | |
| { | |
| "epoch": 0.2127093858016485, | |
| "grad_norm": 0.04627032205462456, | |
| "learning_rate": 3.8502332870773675e-05, | |
| "loss": 0.8206828832626343, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.27205, | |
| "step": 200, | |
| "tokens/total": 52428800, | |
| "tokens/train_per_sec_per_gpu": 173.49, | |
| "tokens/trainable": 4356101 | |
| }, | |
| { | |
| "epoch": 0.21377293273065673, | |
| "grad_norm": 0.04949687048792839, | |
| "learning_rate": 3.847404015360081e-05, | |
| "loss": 0.7801845073699951, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.18187, | |
| "step": 201, | |
| "tokens/total": 52690944, | |
| "tokens/train_per_sec_per_gpu": 202.37, | |
| "tokens/trainable": 4378733 | |
| }, | |
| { | |
| "epoch": 0.214836479659665, | |
| "grad_norm": 0.05199093371629715, | |
| "learning_rate": 3.8445493283972414e-05, | |
| "loss": 0.7552693486213684, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12818, | |
| "step": 202, | |
| "tokens/total": 52953088, | |
| "tokens/train_per_sec_per_gpu": 165.43, | |
| "tokens/trainable": 4401167 | |
| }, | |
| { | |
| "epoch": 0.21590002658867322, | |
| "grad_norm": 0.060018111020326614, | |
| "learning_rate": 3.841669265461562e-05, | |
| "loss": 0.8178205490112305, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.26556, | |
| "step": 203, | |
| "tokens/total": 53215232, | |
| "tokens/train_per_sec_per_gpu": 192.79, | |
| "tokens/trainable": 4421155 | |
| }, | |
| { | |
| "epoch": 0.21696357351768147, | |
| "grad_norm": 0.0454607792198658, | |
| "learning_rate": 3.838763866174862e-05, | |
| "loss": 0.798674464225769, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.22259, | |
| "step": 204, | |
| "tokens/total": 53477376, | |
| "tokens/train_per_sec_per_gpu": 181.78, | |
| "tokens/trainable": 4443402 | |
| }, | |
| { | |
| "epoch": 0.2180271204466897, | |
| "grad_norm": 0.05475523695349693, | |
| "learning_rate": 3.835833170507519e-05, | |
| "loss": 0.7501173615455627, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11725, | |
| "step": 205, | |
| "tokens/total": 53739520, | |
| "tokens/train_per_sec_per_gpu": 174.06, | |
| "tokens/trainable": 4465823 | |
| }, | |
| { | |
| "epoch": 0.21909066737569796, | |
| "grad_norm": 0.04717683792114258, | |
| "learning_rate": 3.8328772187779196e-05, | |
| "loss": 0.7843440771102905, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19097, | |
| "step": 206, | |
| "tokens/total": 54001664, | |
| "tokens/train_per_sec_per_gpu": 172.19, | |
| "tokens/trainable": 4490092 | |
| }, | |
| { | |
| "epoch": 0.2201542143047062, | |
| "grad_norm": 0.04792032018303871, | |
| "learning_rate": 3.829896051651907e-05, | |
| "loss": 0.7995268702507019, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.22449, | |
| "step": 207, | |
| "tokens/total": 54263808, | |
| "tokens/train_per_sec_per_gpu": 198.34, | |
| "tokens/trainable": 4513112 | |
| }, | |
| { | |
| "epoch": 0.22121776123371445, | |
| "grad_norm": 0.05625506862998009, | |
| "learning_rate": 3.8268897101422154e-05, | |
| "loss": 0.7814656496047974, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.18467, | |
| "step": 208, | |
| "tokens/total": 54525952, | |
| "tokens/train_per_sec_per_gpu": 178.5, | |
| "tokens/trainable": 4533887 | |
| }, | |
| { | |
| "epoch": 0.22228130816272268, | |
| "grad_norm": 0.05093343183398247, | |
| "learning_rate": 3.823858235607915e-05, | |
| "loss": 0.8371798396110535, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.30984, | |
| "step": 209, | |
| "tokens/total": 54788096, | |
| "tokens/train_per_sec_per_gpu": 179.89, | |
| "tokens/trainable": 4557864 | |
| }, | |
| { | |
| "epoch": 0.2233448550917309, | |
| "grad_norm": 0.051436666399240494, | |
| "learning_rate": 3.820801669753833e-05, | |
| "loss": 0.8444880247116089, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.32679, | |
| "step": 210, | |
| "tokens/total": 55050240, | |
| "tokens/train_per_sec_per_gpu": 208.06, | |
| "tokens/trainable": 4579529 | |
| }, | |
| { | |
| "epoch": 0.22440840202073917, | |
| "grad_norm": 0.046146344393491745, | |
| "learning_rate": 3.8177200546299894e-05, | |
| "loss": 0.7421606779098511, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10047, | |
| "step": 211, | |
| "tokens/total": 55312384, | |
| "tokens/train_per_sec_per_gpu": 198.81, | |
| "tokens/trainable": 4602512 | |
| }, | |
| { | |
| "epoch": 0.2254719489497474, | |
| "grad_norm": 0.06141940504312515, | |
| "learning_rate": 3.81461343263101e-05, | |
| "loss": 0.7741906642913818, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.16884, | |
| "step": 212, | |
| "tokens/total": 55574528, | |
| "tokens/train_per_sec_per_gpu": 196.43, | |
| "tokens/trainable": 4623604 | |
| }, | |
| { | |
| "epoch": 0.22653549587875565, | |
| "grad_norm": 0.061457011848688126, | |
| "learning_rate": 3.81148184649555e-05, | |
| "loss": 0.7711528539657593, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.16226, | |
| "step": 213, | |
| "tokens/total": 55836672, | |
| "tokens/train_per_sec_per_gpu": 148.28, | |
| "tokens/trainable": 4645354 | |
| }, | |
| { | |
| "epoch": 0.22759904280776388, | |
| "grad_norm": 0.05289280042052269, | |
| "learning_rate": 3.8083253393057006e-05, | |
| "loss": 0.7671029567718506, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15352, | |
| "step": 214, | |
| "tokens/total": 56098816, | |
| "tokens/train_per_sec_per_gpu": 198.49, | |
| "tokens/trainable": 4666565 | |
| }, | |
| { | |
| "epoch": 0.22866258973677214, | |
| "grad_norm": 0.05258488655090332, | |
| "learning_rate": 3.805143954486401e-05, | |
| "loss": 0.7617560029029846, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.14203, | |
| "step": 215, | |
| "tokens/total": 56360960, | |
| "tokens/train_per_sec_per_gpu": 177.48, | |
| "tokens/trainable": 4688241 | |
| }, | |
| { | |
| "epoch": 0.22972613666578037, | |
| "grad_norm": 0.05477464199066162, | |
| "learning_rate": 3.801937735804838e-05, | |
| "loss": 0.736034095287323, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.08764, | |
| "step": 216, | |
| "tokens/total": 56623104, | |
| "tokens/train_per_sec_per_gpu": 164.79, | |
| "tokens/trainable": 4710328 | |
| }, | |
| { | |
| "epoch": 0.23078968359478863, | |
| "grad_norm": 0.058859411627054214, | |
| "learning_rate": 3.798706727369845e-05, | |
| "loss": 0.7572994232177734, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13251, | |
| "step": 217, | |
| "tokens/total": 56885248, | |
| "tokens/train_per_sec_per_gpu": 155.41, | |
| "tokens/trainable": 4730784 | |
| }, | |
| { | |
| "epoch": 0.23185323052379686, | |
| "grad_norm": 0.055905554443597794, | |
| "learning_rate": 3.795450973631293e-05, | |
| "loss": 0.7484654188156128, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11375, | |
| "step": 218, | |
| "tokens/total": 57147392, | |
| "tokens/train_per_sec_per_gpu": 195.23, | |
| "tokens/trainable": 4752708 | |
| }, | |
| { | |
| "epoch": 0.23291677745280512, | |
| "grad_norm": 0.05177122727036476, | |
| "learning_rate": 3.792170519379482e-05, | |
| "loss": 0.6985906958580017, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.01092, | |
| "step": 219, | |
| "tokens/total": 57409536, | |
| "tokens/train_per_sec_per_gpu": 151.36, | |
| "tokens/trainable": 4776585 | |
| }, | |
| { | |
| "epoch": 0.23398032438181335, | |
| "grad_norm": 0.05102040618658066, | |
| "learning_rate": 3.788865409744527e-05, | |
| "loss": 0.7429978847503662, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10223, | |
| "step": 220, | |
| "tokens/total": 57671680, | |
| "tokens/train_per_sec_per_gpu": 161.92, | |
| "tokens/trainable": 4798099 | |
| }, | |
| { | |
| "epoch": 0.23504387131082158, | |
| "grad_norm": 0.04876242205500603, | |
| "learning_rate": 3.785535690195728e-05, | |
| "loss": 0.8290475010871887, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.29114, | |
| "step": 221, | |
| "tokens/total": 57933824, | |
| "tokens/train_per_sec_per_gpu": 215.34, | |
| "tokens/trainable": 4821746 | |
| }, | |
| { | |
| "epoch": 0.23610741823982984, | |
| "grad_norm": 0.04731612280011177, | |
| "learning_rate": 3.782181406540954e-05, | |
| "loss": 0.7661755084991455, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15152, | |
| "step": 222, | |
| "tokens/total": 58195968, | |
| "tokens/train_per_sec_per_gpu": 213.65, | |
| "tokens/trainable": 4845142 | |
| }, | |
| { | |
| "epoch": 0.23717096516883807, | |
| "grad_norm": 0.058613162487745285, | |
| "learning_rate": 3.77880260492601e-05, | |
| "loss": 0.7483052015304565, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11342, | |
| "step": 223, | |
| "tokens/total": 58458112, | |
| "tokens/train_per_sec_per_gpu": 219.8, | |
| "tokens/trainable": 4866360 | |
| }, | |
| { | |
| "epoch": 0.23823451209784632, | |
| "grad_norm": 0.05162626504898071, | |
| "learning_rate": 3.775399331833998e-05, | |
| "loss": 0.798062264919281, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.22123, | |
| "step": 224, | |
| "tokens/total": 58720256, | |
| "tokens/train_per_sec_per_gpu": 144.23, | |
| "tokens/trainable": 4886502 | |
| }, | |
| { | |
| "epoch": 0.23929805902685455, | |
| "grad_norm": 0.04998771846294403, | |
| "learning_rate": 3.7719716340846845e-05, | |
| "loss": 0.8206250667572021, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.27192, | |
| "step": 225, | |
| "tokens/total": 58982400, | |
| "tokens/train_per_sec_per_gpu": 195.94, | |
| "tokens/trainable": 4910020 | |
| }, | |
| { | |
| "epoch": 0.2403616059558628, | |
| "grad_norm": 0.056194525212049484, | |
| "learning_rate": 3.768519558833849e-05, | |
| "loss": 0.878259539604187, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.40671, | |
| "step": 226, | |
| "tokens/total": 59244544, | |
| "tokens/train_per_sec_per_gpu": 157.32, | |
| "tokens/trainable": 4931232 | |
| }, | |
| { | |
| "epoch": 0.24142515288487104, | |
| "grad_norm": 0.0521056093275547, | |
| "learning_rate": 3.765043153572643e-05, | |
| "loss": 0.759453535079956, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13711, | |
| "step": 227, | |
| "tokens/total": 59506688, | |
| "tokens/train_per_sec_per_gpu": 193.4, | |
| "tokens/trainable": 4954744 | |
| }, | |
| { | |
| "epoch": 0.2424886998138793, | |
| "grad_norm": 0.04911046847701073, | |
| "learning_rate": 3.761542466126929e-05, | |
| "loss": 0.7336410284042358, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.08265, | |
| "step": 228, | |
| "tokens/total": 59768832, | |
| "tokens/train_per_sec_per_gpu": 207.37, | |
| "tokens/trainable": 4975992 | |
| }, | |
| { | |
| "epoch": 0.24355224674288753, | |
| "grad_norm": 0.06128966435790062, | |
| "learning_rate": 3.758017544656628e-05, | |
| "loss": 0.7942535877227783, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.21279, | |
| "step": 229, | |
| "tokens/total": 60030976, | |
| "tokens/train_per_sec_per_gpu": 185.07, | |
| "tokens/trainable": 4996744 | |
| }, | |
| { | |
| "epoch": 0.24461579367189576, | |
| "grad_norm": 0.057662662118673325, | |
| "learning_rate": 3.754468437655056e-05, | |
| "loss": 0.7774747014045715, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.17597, | |
| "step": 230, | |
| "tokens/total": 60293120, | |
| "tokens/train_per_sec_per_gpu": 144.76, | |
| "tokens/trainable": 5015784 | |
| }, | |
| { | |
| "epoch": 0.24567934060090402, | |
| "grad_norm": 0.0500815324485302, | |
| "learning_rate": 3.7508951939482543e-05, | |
| "loss": 0.701805591583252, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.01739, | |
| "step": 231, | |
| "tokens/total": 60555264, | |
| "tokens/train_per_sec_per_gpu": 198.76, | |
| "tokens/trainable": 5037125 | |
| }, | |
| { | |
| "epoch": 0.24674288752991225, | |
| "grad_norm": 0.05346173420548439, | |
| "learning_rate": 3.74729786269432e-05, | |
| "loss": 0.7507196068763733, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11852, | |
| "step": 232, | |
| "tokens/total": 60817408, | |
| "tokens/train_per_sec_per_gpu": 197.05, | |
| "tokens/trainable": 5059497 | |
| }, | |
| { | |
| "epoch": 0.2478064344589205, | |
| "grad_norm": 0.05386090278625488, | |
| "learning_rate": 3.7436764933827284e-05, | |
| "loss": 0.7338147163391113, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.08301, | |
| "step": 233, | |
| "tokens/total": 61079552, | |
| "tokens/train_per_sec_per_gpu": 157.02, | |
| "tokens/trainable": 5080212 | |
| }, | |
| { | |
| "epoch": 0.24886998138792873, | |
| "grad_norm": 0.05847088247537613, | |
| "learning_rate": 3.7400311358336555e-05, | |
| "loss": 0.7185083627700806, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.05137, | |
| "step": 234, | |
| "tokens/total": 61341696, | |
| "tokens/train_per_sec_per_gpu": 178.9, | |
| "tokens/trainable": 5101917 | |
| }, | |
| { | |
| "epoch": 0.249933528316937, | |
| "grad_norm": 0.05917196720838547, | |
| "learning_rate": 3.736361840197288e-05, | |
| "loss": 0.7877013087272644, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19834, | |
| "step": 235, | |
| "tokens/total": 61603840, | |
| "tokens/train_per_sec_per_gpu": 172.85, | |
| "tokens/trainable": 5122988 | |
| }, | |
| { | |
| "epoch": 0.2509970752459452, | |
| "grad_norm": 0.05370993912220001, | |
| "learning_rate": 3.732668656953136e-05, | |
| "loss": 0.7264862656593323, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.0678, | |
| "step": 236, | |
| "tokens/total": 61865984, | |
| "tokens/train_per_sec_per_gpu": 215.97, | |
| "tokens/trainable": 5145194 | |
| }, | |
| { | |
| "epoch": 0.25206062217495345, | |
| "grad_norm": 0.05220884829759598, | |
| "learning_rate": 3.728951636909338e-05, | |
| "loss": 0.7533116340637207, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12402, | |
| "step": 237, | |
| "tokens/total": 62128128, | |
| "tokens/train_per_sec_per_gpu": 211.82, | |
| "tokens/trainable": 5167987 | |
| }, | |
| { | |
| "epoch": 0.25312416910396174, | |
| "grad_norm": 0.054722413420677185, | |
| "learning_rate": 3.725210831201961e-05, | |
| "loss": 0.7575439810752869, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13303, | |
| "step": 238, | |
| "tokens/total": 62390272, | |
| "tokens/train_per_sec_per_gpu": 154.07, | |
| "tokens/trainable": 5189725 | |
| }, | |
| { | |
| "epoch": 0.25418771603296997, | |
| "grad_norm": 0.05407283455133438, | |
| "learning_rate": 3.721446291294301e-05, | |
| "loss": 0.7419638633728027, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10006, | |
| "step": 239, | |
| "tokens/total": 62652416, | |
| "tokens/train_per_sec_per_gpu": 241.74, | |
| "tokens/trainable": 5213041 | |
| }, | |
| { | |
| "epoch": 0.2552512629619782, | |
| "grad_norm": 0.05353840813040733, | |
| "learning_rate": 3.717658068976168e-05, | |
| "loss": 0.7568483352661133, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13155, | |
| "step": 240, | |
| "tokens/total": 62914560, | |
| "tokens/train_per_sec_per_gpu": 170.27, | |
| "tokens/trainable": 5232929 | |
| }, | |
| { | |
| "epoch": 0.25631480989098643, | |
| "grad_norm": 0.05174530670046806, | |
| "learning_rate": 3.713846216363179e-05, | |
| "loss": 0.6827611923217773, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 1.97934, | |
| "step": 241, | |
| "tokens/total": 63176704, | |
| "tokens/train_per_sec_per_gpu": 212.44, | |
| "tokens/trainable": 5253894 | |
| }, | |
| { | |
| "epoch": 0.25737835681999466, | |
| "grad_norm": 0.050874270498752594, | |
| "learning_rate": 3.7100107858960404e-05, | |
| "loss": 0.7131105661392212, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.04033, | |
| "step": 242, | |
| "tokens/total": 63438848, | |
| "tokens/train_per_sec_per_gpu": 167.64, | |
| "tokens/trainable": 5275682 | |
| }, | |
| { | |
| "epoch": 0.25844190374900294, | |
| "grad_norm": 0.06289295852184296, | |
| "learning_rate": 3.7061518303398244e-05, | |
| "loss": 0.814382791519165, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.25778, | |
| "step": 243, | |
| "tokens/total": 63700992, | |
| "tokens/train_per_sec_per_gpu": 181.58, | |
| "tokens/trainable": 5296823 | |
| }, | |
| { | |
| "epoch": 0.2595054506780112, | |
| "grad_norm": 0.05226941406726837, | |
| "learning_rate": 3.7022694027832456e-05, | |
| "loss": 0.716779887676239, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.04783, | |
| "step": 244, | |
| "tokens/total": 63963136, | |
| "tokens/train_per_sec_per_gpu": 202.14, | |
| "tokens/trainable": 5318716 | |
| }, | |
| { | |
| "epoch": 0.2605689976070194, | |
| "grad_norm": 0.04937303066253662, | |
| "learning_rate": 3.698363556637927e-05, | |
| "loss": 0.7856250405311584, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19378, | |
| "step": 245, | |
| "tokens/total": 64225280, | |
| "tokens/train_per_sec_per_gpu": 229.48, | |
| "tokens/trainable": 5344987 | |
| }, | |
| { | |
| "epoch": 0.26163254453602763, | |
| "grad_norm": 0.049020808190107346, | |
| "learning_rate": 3.694434345637671e-05, | |
| "loss": 0.7429791688919067, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10219, | |
| "step": 246, | |
| "tokens/total": 64487424, | |
| "tokens/train_per_sec_per_gpu": 226.82, | |
| "tokens/trainable": 5369834 | |
| }, | |
| { | |
| "epoch": 0.2626960914650359, | |
| "grad_norm": 0.052014704793691635, | |
| "learning_rate": 3.690481823837714e-05, | |
| "loss": 0.7686535120010376, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15686, | |
| "step": 247, | |
| "tokens/total": 64749568, | |
| "tokens/train_per_sec_per_gpu": 199.34, | |
| "tokens/trainable": 5392410 | |
| }, | |
| { | |
| "epoch": 0.26375963839404415, | |
| "grad_norm": 0.060707978904247284, | |
| "learning_rate": 3.686506045613986e-05, | |
| "loss": 0.7760209441184998, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.17281, | |
| "step": 248, | |
| "tokens/total": 65011712, | |
| "tokens/train_per_sec_per_gpu": 157.58, | |
| "tokens/trainable": 5414349 | |
| }, | |
| { | |
| "epoch": 0.2648231853230524, | |
| "grad_norm": 0.06462915241718292, | |
| "learning_rate": 3.6825070656623626e-05, | |
| "loss": 0.7831145524978638, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.18828, | |
| "step": 249, | |
| "tokens/total": 65273856, | |
| "tokens/train_per_sec_per_gpu": 159.08, | |
| "tokens/trainable": 5433875 | |
| }, | |
| { | |
| "epoch": 0.2658867322520606, | |
| "grad_norm": 0.057429276406764984, | |
| "learning_rate": 3.678484938997912e-05, | |
| "loss": 0.6994718909263611, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.01269, | |
| "step": 250, | |
| "tokens/total": 65536000, | |
| "tokens/train_per_sec_per_gpu": 166.54, | |
| "tokens/trainable": 5454358 | |
| }, | |
| { | |
| "epoch": 0.26695027918106884, | |
| "grad_norm": 0.05674952268600464, | |
| "learning_rate": 3.674439720954138e-05, | |
| "loss": 0.7693842649459839, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15844, | |
| "step": 251, | |
| "tokens/total": 65798144, | |
| "tokens/train_per_sec_per_gpu": 199.24, | |
| "tokens/trainable": 5476733 | |
| }, | |
| { | |
| "epoch": 0.2680138261100771, | |
| "grad_norm": 0.06037837266921997, | |
| "learning_rate": 3.670371467182219e-05, | |
| "loss": 0.7689650058746338, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15753, | |
| "step": 252, | |
| "tokens/total": 66060288, | |
| "tokens/train_per_sec_per_gpu": 184.65, | |
| "tokens/trainable": 5498250 | |
| }, | |
| { | |
| "epoch": 0.26907737303908535, | |
| "grad_norm": 0.055501531809568405, | |
| "learning_rate": 3.66628023365024e-05, | |
| "loss": 0.7238588333129883, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.06238, | |
| "step": 253, | |
| "tokens/total": 66322432, | |
| "tokens/train_per_sec_per_gpu": 227.0, | |
| "tokens/trainable": 5520172 | |
| }, | |
| { | |
| "epoch": 0.2701409199680936, | |
| "grad_norm": 0.05900080129504204, | |
| "learning_rate": 3.66216607664243e-05, | |
| "loss": 0.7000092267990112, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.01377, | |
| "step": 254, | |
| "tokens/total": 66584576, | |
| "tokens/train_per_sec_per_gpu": 153.45, | |
| "tokens/trainable": 5539604 | |
| }, | |
| { | |
| "epoch": 0.2712044668971018, | |
| "grad_norm": 0.054649997502565384, | |
| "learning_rate": 3.658029052758377e-05, | |
| "loss": 0.7665979266166687, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15243, | |
| "step": 255, | |
| "tokens/total": 66846720, | |
| "tokens/train_per_sec_per_gpu": 173.48, | |
| "tokens/trainable": 5561266 | |
| }, | |
| { | |
| "epoch": 0.2722680138261101, | |
| "grad_norm": 0.05519971251487732, | |
| "learning_rate": 3.653869218912258e-05, | |
| "loss": 0.7569284439086914, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13172, | |
| "step": 256, | |
| "tokens/total": 67108864, | |
| "tokens/train_per_sec_per_gpu": 171.2, | |
| "tokens/trainable": 5583034 | |
| }, | |
| { | |
| "epoch": 0.27333156075511833, | |
| "grad_norm": 0.055965524166822433, | |
| "learning_rate": 3.649686632332052e-05, | |
| "loss": 0.7072951793670654, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.0285, | |
| "step": 257, | |
| "tokens/total": 67371008, | |
| "tokens/train_per_sec_per_gpu": 175.76, | |
| "tokens/trainable": 5604839 | |
| }, | |
| { | |
| "epoch": 0.27439510768412656, | |
| "grad_norm": 0.05398353934288025, | |
| "learning_rate": 3.645481350558754e-05, | |
| "loss": 0.7658364176750183, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15079, | |
| "step": 258, | |
| "tokens/total": 67633152, | |
| "tokens/train_per_sec_per_gpu": 153.96, | |
| "tokens/trainable": 5626766 | |
| }, | |
| { | |
| "epoch": 0.2754586546131348, | |
| "grad_norm": 0.04775004833936691, | |
| "learning_rate": 3.6412534314455836e-05, | |
| "loss": 0.6940434575080872, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.00179, | |
| "step": 259, | |
| "tokens/total": 67895296, | |
| "tokens/train_per_sec_per_gpu": 142.3, | |
| "tokens/trainable": 5647569 | |
| }, | |
| { | |
| "epoch": 0.276522201542143, | |
| "grad_norm": 0.05857420340180397, | |
| "learning_rate": 3.637002933157187e-05, | |
| "loss": 0.8444321155548096, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.32666, | |
| "step": 260, | |
| "tokens/total": 68157440, | |
| "tokens/train_per_sec_per_gpu": 169.5, | |
| "tokens/trainable": 5667784 | |
| }, | |
| { | |
| "epoch": 0.2775857484711513, | |
| "grad_norm": 0.05263550207018852, | |
| "learning_rate": 3.6327299141688396e-05, | |
| "loss": 0.8108729720115662, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.24987, | |
| "step": 261, | |
| "tokens/total": 68419584, | |
| "tokens/train_per_sec_per_gpu": 143.48, | |
| "tokens/trainable": 5689804 | |
| }, | |
| { | |
| "epoch": 0.27864929540015954, | |
| "grad_norm": 0.05478999391198158, | |
| "learning_rate": 3.6284344332656396e-05, | |
| "loss": 0.8421679735183716, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.32139, | |
| "step": 262, | |
| "tokens/total": 68681728, | |
| "tokens/train_per_sec_per_gpu": 246.12, | |
| "tokens/trainable": 5714024 | |
| }, | |
| { | |
| "epoch": 0.27971284232916777, | |
| "grad_norm": 0.06489334255456924, | |
| "learning_rate": 3.6241165495417006e-05, | |
| "loss": 0.8724418878555298, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.39275, | |
| "step": 263, | |
| "tokens/total": 68943872, | |
| "tokens/train_per_sec_per_gpu": 166.84, | |
| "tokens/trainable": 5734370 | |
| }, | |
| { | |
| "epoch": 0.280776389258176, | |
| "grad_norm": 0.06463494151830673, | |
| "learning_rate": 3.619776322399336e-05, | |
| "loss": 0.7543226480484009, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12617, | |
| "step": 264, | |
| "tokens/total": 69206016, | |
| "tokens/train_per_sec_per_gpu": 214.95, | |
| "tokens/trainable": 5757030 | |
| }, | |
| { | |
| "epoch": 0.2818399361871843, | |
| "grad_norm": 0.05572579428553581, | |
| "learning_rate": 3.615413811548247e-05, | |
| "loss": 0.7700116634368896, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15979, | |
| "step": 265, | |
| "tokens/total": 69468160, | |
| "tokens/train_per_sec_per_gpu": 187.77, | |
| "tokens/trainable": 5777899 | |
| }, | |
| { | |
| "epoch": 0.2829034831161925, | |
| "grad_norm": 0.05551740154623985, | |
| "learning_rate": 3.6110290770046954e-05, | |
| "loss": 0.7438211441040039, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10396, | |
| "step": 266, | |
| "tokens/total": 69730304, | |
| "tokens/train_per_sec_per_gpu": 154.69, | |
| "tokens/trainable": 5798783 | |
| }, | |
| { | |
| "epoch": 0.28396703004520074, | |
| "grad_norm": 0.06058730185031891, | |
| "learning_rate": 3.606622179090682e-05, | |
| "loss": 0.7585455179214478, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13517, | |
| "step": 267, | |
| "tokens/total": 69992448, | |
| "tokens/train_per_sec_per_gpu": 180.57, | |
| "tokens/trainable": 5819730 | |
| }, | |
| { | |
| "epoch": 0.28503057697420897, | |
| "grad_norm": 0.056012120097875595, | |
| "learning_rate": 3.6021931784331136e-05, | |
| "loss": 0.7486584186553955, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11416, | |
| "step": 268, | |
| "tokens/total": 70254592, | |
| "tokens/train_per_sec_per_gpu": 156.75, | |
| "tokens/trainable": 5842124 | |
| }, | |
| { | |
| "epoch": 0.28609412390321726, | |
| "grad_norm": 0.06160522252321243, | |
| "learning_rate": 3.5977421359629715e-05, | |
| "loss": 0.6910536289215088, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 1.99582, | |
| "step": 269, | |
| "tokens/total": 70516736, | |
| "tokens/train_per_sec_per_gpu": 171.0, | |
| "tokens/trainable": 5863426 | |
| }, | |
| { | |
| "epoch": 0.2871576708322255, | |
| "grad_norm": 0.061156366020441055, | |
| "learning_rate": 3.593269112914472e-05, | |
| "loss": 0.7586344480514526, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13536, | |
| "step": 270, | |
| "tokens/total": 70778880, | |
| "tokens/train_per_sec_per_gpu": 165.2, | |
| "tokens/trainable": 5884974 | |
| }, | |
| { | |
| "epoch": 0.2882212177612337, | |
| "grad_norm": 0.061479438096284866, | |
| "learning_rate": 3.588774170824225e-05, | |
| "loss": 0.8126254677772522, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.25382, | |
| "step": 271, | |
| "tokens/total": 71041024, | |
| "tokens/train_per_sec_per_gpu": 213.13, | |
| "tokens/trainable": 5906918 | |
| }, | |
| { | |
| "epoch": 0.28928476469024195, | |
| "grad_norm": 0.05486295372247696, | |
| "learning_rate": 3.584257371530386e-05, | |
| "loss": 0.7505637407302856, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11819, | |
| "step": 272, | |
| "tokens/total": 71303168, | |
| "tokens/train_per_sec_per_gpu": 202.73, | |
| "tokens/trainable": 5929901 | |
| }, | |
| { | |
| "epoch": 0.2903483116192502, | |
| "grad_norm": 0.05428226664662361, | |
| "learning_rate": 3.579718777171806e-05, | |
| "loss": 0.7298543453216553, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.07478, | |
| "step": 273, | |
| "tokens/total": 71565312, | |
| "tokens/train_per_sec_per_gpu": 187.93, | |
| "tokens/trainable": 5949798 | |
| }, | |
| { | |
| "epoch": 0.29141185854825846, | |
| "grad_norm": 0.06736662238836288, | |
| "learning_rate": 3.5751584501871766e-05, | |
| "loss": 0.7698936462402344, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15954, | |
| "step": 274, | |
| "tokens/total": 71827456, | |
| "tokens/train_per_sec_per_gpu": 179.77, | |
| "tokens/trainable": 5971772 | |
| }, | |
| { | |
| "epoch": 0.2924754054772667, | |
| "grad_norm": 0.05799878388643265, | |
| "learning_rate": 3.570576453314172e-05, | |
| "loss": 0.7739330530166626, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.16828, | |
| "step": 275, | |
| "tokens/total": 72089600, | |
| "tokens/train_per_sec_per_gpu": 179.84, | |
| "tokens/trainable": 5994035 | |
| }, | |
| { | |
| "epoch": 0.2935389524062749, | |
| "grad_norm": 0.051827434450387955, | |
| "learning_rate": 3.565972849588584e-05, | |
| "loss": 0.7918999791145325, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.20759, | |
| "step": 276, | |
| "tokens/total": 72351744, | |
| "tokens/train_per_sec_per_gpu": 219.16, | |
| "tokens/trainable": 6017969 | |
| }, | |
| { | |
| "epoch": 0.29460249933528315, | |
| "grad_norm": 0.05260028690099716, | |
| "learning_rate": 3.561347702343456e-05, | |
| "loss": 0.7098696827888489, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.03373, | |
| "step": 277, | |
| "tokens/total": 72613888, | |
| "tokens/train_per_sec_per_gpu": 208.13, | |
| "tokens/trainable": 6042126 | |
| }, | |
| { | |
| "epoch": 0.29566604626429144, | |
| "grad_norm": 0.05379115045070648, | |
| "learning_rate": 3.556701075208213e-05, | |
| "loss": 0.7800576090812683, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.1816, | |
| "step": 278, | |
| "tokens/total": 72876032, | |
| "tokens/train_per_sec_per_gpu": 207.75, | |
| "tokens/trainable": 6065523 | |
| }, | |
| { | |
| "epoch": 0.29672959319329967, | |
| "grad_norm": 0.05740763247013092, | |
| "learning_rate": 3.5520330321077815e-05, | |
| "loss": 0.8003265857696533, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.22627, | |
| "step": 279, | |
| "tokens/total": 73138176, | |
| "tokens/train_per_sec_per_gpu": 204.22, | |
| "tokens/trainable": 6088540 | |
| }, | |
| { | |
| "epoch": 0.2977931401223079, | |
| "grad_norm": 0.05483856424689293, | |
| "learning_rate": 3.547343637261717e-05, | |
| "loss": 0.7692792415618896, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15821, | |
| "step": 280, | |
| "tokens/total": 73400320, | |
| "tokens/train_per_sec_per_gpu": 227.55, | |
| "tokens/trainable": 6111049 | |
| }, | |
| { | |
| "epoch": 0.29885668705131613, | |
| "grad_norm": 0.06599462777376175, | |
| "learning_rate": 3.5426329551833145e-05, | |
| "loss": 0.7712551355361938, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.16248, | |
| "step": 281, | |
| "tokens/total": 73662464, | |
| "tokens/train_per_sec_per_gpu": 194.72, | |
| "tokens/trainable": 6131721 | |
| }, | |
| { | |
| "epoch": 0.29992023398032436, | |
| "grad_norm": 0.055346276611089706, | |
| "learning_rate": 3.537901050678724e-05, | |
| "loss": 0.7576462030410767, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13325, | |
| "step": 282, | |
| "tokens/total": 73924608, | |
| "tokens/train_per_sec_per_gpu": 200.29, | |
| "tokens/trainable": 6154147 | |
| }, | |
| { | |
| "epoch": 0.30098378090933264, | |
| "grad_norm": 0.06298188120126724, | |
| "learning_rate": 3.533147988846059e-05, | |
| "loss": 0.7945112586021423, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.21336, | |
| "step": 283, | |
| "tokens/total": 74186752, | |
| "tokens/train_per_sec_per_gpu": 228.94, | |
| "tokens/trainable": 6178704 | |
| }, | |
| { | |
| "epoch": 0.3020473278383409, | |
| "grad_norm": 0.05894119665026665, | |
| "learning_rate": 3.5283738350744986e-05, | |
| "loss": 0.7516214847564697, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12044, | |
| "step": 284, | |
| "tokens/total": 74448896, | |
| "tokens/train_per_sec_per_gpu": 212.98, | |
| "tokens/trainable": 6200161 | |
| }, | |
| { | |
| "epoch": 0.3031108747673491, | |
| "grad_norm": 0.05261611193418503, | |
| "learning_rate": 3.5235786550433906e-05, | |
| "loss": 0.7629417777061462, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.14458, | |
| "step": 285, | |
| "tokens/total": 74711040, | |
| "tokens/train_per_sec_per_gpu": 219.51, | |
| "tokens/trainable": 6222600 | |
| }, | |
| { | |
| "epoch": 0.30417442169635733, | |
| "grad_norm": 0.059760384261608124, | |
| "learning_rate": 3.518762514721346e-05, | |
| "loss": 0.7162789106369019, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.0468, | |
| "step": 286, | |
| "tokens/total": 74973184, | |
| "tokens/train_per_sec_per_gpu": 235.97, | |
| "tokens/trainable": 6246346 | |
| }, | |
| { | |
| "epoch": 0.3052379686253656, | |
| "grad_norm": 0.06057070195674896, | |
| "learning_rate": 3.5139254803653346e-05, | |
| "loss": 0.7634356021881104, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.14564, | |
| "step": 287, | |
| "tokens/total": 75235328, | |
| "tokens/train_per_sec_per_gpu": 152.93, | |
| "tokens/trainable": 6267246 | |
| }, | |
| { | |
| "epoch": 0.30630151555437385, | |
| "grad_norm": 0.05959314480423927, | |
| "learning_rate": 3.509067618519768e-05, | |
| "loss": 0.8070247173309326, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.24123, | |
| "step": 288, | |
| "tokens/total": 75497472, | |
| "tokens/train_per_sec_per_gpu": 186.48, | |
| "tokens/trainable": 6289062 | |
| }, | |
| { | |
| "epoch": 0.3073650624833821, | |
| "grad_norm": 0.06659425795078278, | |
| "learning_rate": 3.5041889960155895e-05, | |
| "loss": 0.6915267705917358, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 1.99676, | |
| "step": 289, | |
| "tokens/total": 75759616, | |
| "tokens/train_per_sec_per_gpu": 194.38, | |
| "tokens/trainable": 6310916 | |
| }, | |
| { | |
| "epoch": 0.3084286094123903, | |
| "grad_norm": 0.05240656062960625, | |
| "learning_rate": 3.499289679969351e-05, | |
| "loss": 0.8021942377090454, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.23043, | |
| "step": 290, | |
| "tokens/total": 76021760, | |
| "tokens/train_per_sec_per_gpu": 207.96, | |
| "tokens/trainable": 6334881 | |
| }, | |
| { | |
| "epoch": 0.30949215634139854, | |
| "grad_norm": 0.06096180900931358, | |
| "learning_rate": 3.494369737782293e-05, | |
| "loss": 0.7638937830924988, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.14662, | |
| "step": 291, | |
| "tokens/total": 76283904, | |
| "tokens/train_per_sec_per_gpu": 205.29, | |
| "tokens/trainable": 6357654 | |
| }, | |
| { | |
| "epoch": 0.3105557032704068, | |
| "grad_norm": 0.062094077467918396, | |
| "learning_rate": 3.489429237139414e-05, | |
| "loss": 0.7336180210113525, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.0826, | |
| "step": 292, | |
| "tokens/total": 76546048, | |
| "tokens/train_per_sec_per_gpu": 178.46, | |
| "tokens/trainable": 6377841 | |
| }, | |
| { | |
| "epoch": 0.31161925019941505, | |
| "grad_norm": 0.055953506380319595, | |
| "learning_rate": 3.48446824600854e-05, | |
| "loss": 0.7609624862670898, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.14034, | |
| "step": 293, | |
| "tokens/total": 76808192, | |
| "tokens/train_per_sec_per_gpu": 168.65, | |
| "tokens/trainable": 6400076 | |
| }, | |
| { | |
| "epoch": 0.3126827971284233, | |
| "grad_norm": 0.05196288600564003, | |
| "learning_rate": 3.4794868326393935e-05, | |
| "loss": 0.7413825988769531, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09884, | |
| "step": 294, | |
| "tokens/total": 77070336, | |
| "tokens/train_per_sec_per_gpu": 172.78, | |
| "tokens/trainable": 6422449 | |
| }, | |
| { | |
| "epoch": 0.3137463440574315, | |
| "grad_norm": 0.05353325977921486, | |
| "learning_rate": 3.474485065562648e-05, | |
| "loss": 0.7769887447357178, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.17491, | |
| "step": 295, | |
| "tokens/total": 77332480, | |
| "tokens/train_per_sec_per_gpu": 221.08, | |
| "tokens/trainable": 6445481 | |
| }, | |
| { | |
| "epoch": 0.3148098909864398, | |
| "grad_norm": 0.054653119295835495, | |
| "learning_rate": 3.469463013588991e-05, | |
| "loss": 0.7785749435424805, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.17837, | |
| "step": 296, | |
| "tokens/total": 77594624, | |
| "tokens/train_per_sec_per_gpu": 210.56, | |
| "tokens/trainable": 6467586 | |
| }, | |
| { | |
| "epoch": 0.31587343791544803, | |
| "grad_norm": 0.06027977168560028, | |
| "learning_rate": 3.4644207458081735e-05, | |
| "loss": 0.7743946313858032, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.16928, | |
| "step": 297, | |
| "tokens/total": 77856768, | |
| "tokens/train_per_sec_per_gpu": 204.77, | |
| "tokens/trainable": 6489041 | |
| }, | |
| { | |
| "epoch": 0.31693698484445626, | |
| "grad_norm": 0.05732357129454613, | |
| "learning_rate": 3.45935833158806e-05, | |
| "loss": 0.8058252334594727, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.23854, | |
| "step": 298, | |
| "tokens/total": 78118912, | |
| "tokens/train_per_sec_per_gpu": 171.13, | |
| "tokens/trainable": 6510717 | |
| }, | |
| { | |
| "epoch": 0.3180005317734645, | |
| "grad_norm": 0.05821244791150093, | |
| "learning_rate": 3.454275840573679e-05, | |
| "loss": 0.7749941945075989, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.17058, | |
| "step": 299, | |
| "tokens/total": 78381056, | |
| "tokens/train_per_sec_per_gpu": 223.81, | |
| "tokens/trainable": 6533545 | |
| }, | |
| { | |
| "epoch": 0.3190640787024727, | |
| "grad_norm": 0.05597531050443649, | |
| "learning_rate": 3.4491733426862556e-05, | |
| "loss": 0.7812941670417786, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.1843, | |
| "step": 300, | |
| "tokens/total": 78643200, | |
| "tokens/train_per_sec_per_gpu": 185.6, | |
| "tokens/trainable": 6556460 | |
| }, | |
| { | |
| "epoch": 0.3190640787024727, | |
| "eval_loss": 0.7725370526313782, | |
| "eval_ppl": 2.16525, | |
| "eval_runtime": 237.2784, | |
| "eval_samples_per_second": 28.178, | |
| "eval_steps_per_second": 1.762, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 38.19, | |
| "memory/max_allocated (GiB)": 38.19, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.320127625631481, | |
| "grad_norm": 0.054633188992738724, | |
| "learning_rate": 3.44405090812226e-05, | |
| "loss": 0.7383944988250732, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09257, | |
| "step": 301, | |
| "tokens/total": 78905344, | |
| "tokens/train_per_sec_per_gpu": 192.32, | |
| "tokens/trainable": 6579346 | |
| }, | |
| { | |
| "epoch": 0.32119117256048924, | |
| "grad_norm": 0.05523587390780449, | |
| "learning_rate": 3.438908607352433e-05, | |
| "loss": 0.6943072080612183, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.00232, | |
| "step": 302, | |
| "tokens/total": 79167488, | |
| "tokens/train_per_sec_per_gpu": 187.81, | |
| "tokens/trainable": 6599223 | |
| }, | |
| { | |
| "epoch": 0.32225471948949747, | |
| "grad_norm": 0.058179691433906555, | |
| "learning_rate": 3.433746511120823e-05, | |
| "loss": 0.7615541815757751, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.1416, | |
| "step": 303, | |
| "tokens/total": 79429632, | |
| "tokens/train_per_sec_per_gpu": 207.76, | |
| "tokens/trainable": 6622095 | |
| }, | |
| { | |
| "epoch": 0.3233182664185057, | |
| "grad_norm": 0.06251095235347748, | |
| "learning_rate": 3.428564690443807e-05, | |
| "loss": 0.7761749029159546, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.17314, | |
| "step": 304, | |
| "tokens/total": 79691776, | |
| "tokens/train_per_sec_per_gpu": 212.28, | |
| "tokens/trainable": 6643820 | |
| }, | |
| { | |
| "epoch": 0.324381813347514, | |
| "grad_norm": 0.06721773743629456, | |
| "learning_rate": 3.4233632166091205e-05, | |
| "loss": 0.7868590354919434, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19649, | |
| "step": 305, | |
| "tokens/total": 79953920, | |
| "tokens/train_per_sec_per_gpu": 171.72, | |
| "tokens/trainable": 6664523 | |
| }, | |
| { | |
| "epoch": 0.3254453602765222, | |
| "grad_norm": 0.05866523087024689, | |
| "learning_rate": 3.41814216117487e-05, | |
| "loss": 0.7207814455032349, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.05604, | |
| "step": 306, | |
| "tokens/total": 80216064, | |
| "tokens/train_per_sec_per_gpu": 128.88, | |
| "tokens/trainable": 6685386 | |
| }, | |
| { | |
| "epoch": 0.32650890720553044, | |
| "grad_norm": 0.05545097589492798, | |
| "learning_rate": 3.412901595968551e-05, | |
| "loss": 0.7210918068885803, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.05668, | |
| "step": 307, | |
| "tokens/total": 80478208, | |
| "tokens/train_per_sec_per_gpu": 193.0, | |
| "tokens/trainable": 6706345 | |
| }, | |
| { | |
| "epoch": 0.32757245413453867, | |
| "grad_norm": 0.063844695687294, | |
| "learning_rate": 3.407641593086063e-05, | |
| "loss": 0.7825930118560791, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.18714, | |
| "step": 308, | |
| "tokens/total": 80740352, | |
| "tokens/train_per_sec_per_gpu": 201.52, | |
| "tokens/trainable": 6728441 | |
| }, | |
| { | |
| "epoch": 0.32863600106354696, | |
| "grad_norm": 0.05788377299904823, | |
| "learning_rate": 3.4023622248907134e-05, | |
| "loss": 0.7766852974891663, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.17425, | |
| "step": 309, | |
| "tokens/total": 81002496, | |
| "tokens/train_per_sec_per_gpu": 240.97, | |
| "tokens/trainable": 6754793 | |
| }, | |
| { | |
| "epoch": 0.3296995479925552, | |
| "grad_norm": 0.060298677533864975, | |
| "learning_rate": 3.397063564012223e-05, | |
| "loss": 0.7567377090454102, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13131, | |
| "step": 310, | |
| "tokens/total": 81264640, | |
| "tokens/train_per_sec_per_gpu": 192.69, | |
| "tokens/trainable": 6776213 | |
| }, | |
| { | |
| "epoch": 0.3307630949215634, | |
| "grad_norm": 0.05781788378953934, | |
| "learning_rate": 3.391745683345729e-05, | |
| "loss": 0.8195350766181946, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.26944, | |
| "step": 311, | |
| "tokens/total": 81526784, | |
| "tokens/train_per_sec_per_gpu": 182.74, | |
| "tokens/trainable": 6798309 | |
| }, | |
| { | |
| "epoch": 0.33182664185057165, | |
| "grad_norm": 0.05640830472111702, | |
| "learning_rate": 3.3864086560507785e-05, | |
| "loss": 0.8057565689086914, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.23839, | |
| "step": 312, | |
| "tokens/total": 81788928, | |
| "tokens/train_per_sec_per_gpu": 297.01, | |
| "tokens/trainable": 6823595 | |
| }, | |
| { | |
| "epoch": 0.3328901887795799, | |
| "grad_norm": 0.0735592171549797, | |
| "learning_rate": 3.3810525555503254e-05, | |
| "loss": 0.8060101270675659, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.23896, | |
| "step": 313, | |
| "tokens/total": 82051072, | |
| "tokens/train_per_sec_per_gpu": 196.76, | |
| "tokens/trainable": 6845473 | |
| }, | |
| { | |
| "epoch": 0.33395373570858816, | |
| "grad_norm": 0.06115385517477989, | |
| "learning_rate": 3.3756774555297186e-05, | |
| "loss": 0.782099723815918, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.18606, | |
| "step": 314, | |
| "tokens/total": 82313216, | |
| "tokens/train_per_sec_per_gpu": 159.99, | |
| "tokens/trainable": 6866224 | |
| }, | |
| { | |
| "epoch": 0.3350172826375964, | |
| "grad_norm": 0.0663752555847168, | |
| "learning_rate": 3.3702834299356885e-05, | |
| "loss": 0.7522889375686646, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12185, | |
| "step": 315, | |
| "tokens/total": 82575360, | |
| "tokens/train_per_sec_per_gpu": 157.51, | |
| "tokens/trainable": 6885515 | |
| }, | |
| { | |
| "epoch": 0.3360808295666046, | |
| "grad_norm": 0.05871765688061714, | |
| "learning_rate": 3.3648705529753306e-05, | |
| "loss": 0.7267637252807617, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.06838, | |
| "step": 316, | |
| "tokens/total": 82837504, | |
| "tokens/train_per_sec_per_gpu": 176.94, | |
| "tokens/trainable": 6906219 | |
| }, | |
| { | |
| "epoch": 0.33714437649561285, | |
| "grad_norm": 0.05956491827964783, | |
| "learning_rate": 3.3594388991150825e-05, | |
| "loss": 0.821346640586853, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.27356, | |
| "step": 317, | |
| "tokens/total": 83099648, | |
| "tokens/train_per_sec_per_gpu": 222.29, | |
| "tokens/trainable": 6929025 | |
| }, | |
| { | |
| "epoch": 0.33820792342462114, | |
| "grad_norm": 0.059474050998687744, | |
| "learning_rate": 3.353988543079702e-05, | |
| "loss": 0.8273679614067078, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.28729, | |
| "step": 318, | |
| "tokens/total": 83361792, | |
| "tokens/train_per_sec_per_gpu": 228.13, | |
| "tokens/trainable": 6951873 | |
| }, | |
| { | |
| "epoch": 0.33927147035362937, | |
| "grad_norm": 0.06504053622484207, | |
| "learning_rate": 3.3485195598512365e-05, | |
| "loss": 0.7731481790542603, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.16658, | |
| "step": 319, | |
| "tokens/total": 83623936, | |
| "tokens/train_per_sec_per_gpu": 183.37, | |
| "tokens/trainable": 6973435 | |
| }, | |
| { | |
| "epoch": 0.3403350172826376, | |
| "grad_norm": 0.0615144707262516, | |
| "learning_rate": 3.343032024667994e-05, | |
| "loss": 0.7920888662338257, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.208, | |
| "step": 320, | |
| "tokens/total": 83886080, | |
| "tokens/train_per_sec_per_gpu": 168.97, | |
| "tokens/trainable": 6993689 | |
| }, | |
| { | |
| "epoch": 0.3413985642116458, | |
| "grad_norm": 0.058340758085250854, | |
| "learning_rate": 3.337526013023507e-05, | |
| "loss": 0.7120217084884644, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.03811, | |
| "step": 321, | |
| "tokens/total": 84148224, | |
| "tokens/train_per_sec_per_gpu": 174.54, | |
| "tokens/trainable": 7015311 | |
| }, | |
| { | |
| "epoch": 0.34246211114065406, | |
| "grad_norm": 0.06086525321006775, | |
| "learning_rate": 3.332001600665494e-05, | |
| "loss": 0.8457379341125488, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.3297, | |
| "step": 322, | |
| "tokens/total": 84410368, | |
| "tokens/train_per_sec_per_gpu": 205.39, | |
| "tokens/trainable": 7038147 | |
| }, | |
| { | |
| "epoch": 0.34352565806966234, | |
| "grad_norm": 0.06164320930838585, | |
| "learning_rate": 3.326458863594814e-05, | |
| "loss": 0.8334550261497498, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.30126, | |
| "step": 323, | |
| "tokens/total": 84672512, | |
| "tokens/train_per_sec_per_gpu": 198.45, | |
| "tokens/trainable": 7059704 | |
| }, | |
| { | |
| "epoch": 0.3445892049986706, | |
| "grad_norm": 0.057081304490566254, | |
| "learning_rate": 3.320897878064428e-05, | |
| "loss": 0.838066041469574, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.31189, | |
| "step": 324, | |
| "tokens/total": 84934656, | |
| "tokens/train_per_sec_per_gpu": 209.4, | |
| "tokens/trainable": 7083870 | |
| }, | |
| { | |
| "epoch": 0.3456527519276788, | |
| "grad_norm": 0.05457906052470207, | |
| "learning_rate": 3.3153187205783454e-05, | |
| "loss": 0.7599472403526306, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13816, | |
| "step": 325, | |
| "tokens/total": 85196800, | |
| "tokens/train_per_sec_per_gpu": 186.44, | |
| "tokens/trainable": 7108954 | |
| }, | |
| { | |
| "epoch": 0.34671629885668703, | |
| "grad_norm": 0.061806946992874146, | |
| "learning_rate": 3.309721467890571e-05, | |
| "loss": 0.7315384149551392, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.07828, | |
| "step": 326, | |
| "tokens/total": 85458944, | |
| "tokens/train_per_sec_per_gpu": 169.83, | |
| "tokens/trainable": 7130972 | |
| }, | |
| { | |
| "epoch": 0.3477798457856953, | |
| "grad_norm": 0.05989091843366623, | |
| "learning_rate": 3.3041061970040486e-05, | |
| "loss": 0.7060387134552002, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.02595, | |
| "step": 327, | |
| "tokens/total": 85721088, | |
| "tokens/train_per_sec_per_gpu": 191.48, | |
| "tokens/trainable": 7152519 | |
| }, | |
| { | |
| "epoch": 0.34884339271470355, | |
| "grad_norm": 0.06088387221097946, | |
| "learning_rate": 3.298472985169609e-05, | |
| "loss": 0.7296034097671509, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.07426, | |
| "step": 328, | |
| "tokens/total": 85983232, | |
| "tokens/train_per_sec_per_gpu": 237.52, | |
| "tokens/trainable": 7173660 | |
| }, | |
| { | |
| "epoch": 0.3499069396437118, | |
| "grad_norm": 0.059276383370161057, | |
| "learning_rate": 3.2928219098848955e-05, | |
| "loss": 0.7553350329399109, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12832, | |
| "step": 329, | |
| "tokens/total": 86245376, | |
| "tokens/train_per_sec_per_gpu": 148.74, | |
| "tokens/trainable": 7196866 | |
| }, | |
| { | |
| "epoch": 0.35097048657272, | |
| "grad_norm": 0.06169452145695686, | |
| "learning_rate": 3.287153048893307e-05, | |
| "loss": 0.8173863887786865, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.26457, | |
| "step": 330, | |
| "tokens/total": 86507520, | |
| "tokens/train_per_sec_per_gpu": 198.89, | |
| "tokens/trainable": 7218044 | |
| }, | |
| { | |
| "epoch": 0.35203403350172824, | |
| "grad_norm": 0.06478651612997055, | |
| "learning_rate": 3.281466480182925e-05, | |
| "loss": 0.7393308877944946, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09453, | |
| "step": 331, | |
| "tokens/total": 86769664, | |
| "tokens/train_per_sec_per_gpu": 189.28, | |
| "tokens/trainable": 7240085 | |
| }, | |
| { | |
| "epoch": 0.3530975804307365, | |
| "grad_norm": 0.0714036375284195, | |
| "learning_rate": 3.27576228198544e-05, | |
| "loss": 0.7848330140113831, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19204, | |
| "step": 332, | |
| "tokens/total": 87031808, | |
| "tokens/train_per_sec_per_gpu": 180.05, | |
| "tokens/trainable": 7263289 | |
| }, | |
| { | |
| "epoch": 0.35416112735974475, | |
| "grad_norm": 0.059255450963974, | |
| "learning_rate": 3.270040532775077e-05, | |
| "loss": 0.7574397921562195, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13281, | |
| "step": 333, | |
| "tokens/total": 87293952, | |
| "tokens/train_per_sec_per_gpu": 169.3, | |
| "tokens/trainable": 7283911 | |
| }, | |
| { | |
| "epoch": 0.355224674288753, | |
| "grad_norm": 0.0516078807413578, | |
| "learning_rate": 3.264301311267515e-05, | |
| "loss": 0.7163474559783936, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.04694, | |
| "step": 334, | |
| "tokens/total": 87556096, | |
| "tokens/train_per_sec_per_gpu": 223.05, | |
| "tokens/trainable": 7307672 | |
| }, | |
| { | |
| "epoch": 0.3562882212177612, | |
| "grad_norm": 0.06229817494750023, | |
| "learning_rate": 3.2585446964188026e-05, | |
| "loss": 0.715316116809845, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.04483, | |
| "step": 335, | |
| "tokens/total": 87818240, | |
| "tokens/train_per_sec_per_gpu": 195.29, | |
| "tokens/trainable": 7330420 | |
| }, | |
| { | |
| "epoch": 0.3573517681467695, | |
| "grad_norm": 0.06110849231481552, | |
| "learning_rate": 3.252770767424277e-05, | |
| "loss": 0.6982554197311401, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.01024, | |
| "step": 336, | |
| "tokens/total": 88080384, | |
| "tokens/train_per_sec_per_gpu": 207.65, | |
| "tokens/trainable": 7351877 | |
| }, | |
| { | |
| "epoch": 0.35841531507577773, | |
| "grad_norm": 0.05932268872857094, | |
| "learning_rate": 3.246979603717467e-05, | |
| "loss": 0.7535157203674316, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12446, | |
| "step": 337, | |
| "tokens/total": 88342528, | |
| "tokens/train_per_sec_per_gpu": 239.51, | |
| "tokens/trainable": 7377554 | |
| }, | |
| { | |
| "epoch": 0.35947886200478596, | |
| "grad_norm": 0.06964396685361862, | |
| "learning_rate": 3.2411712849690076e-05, | |
| "loss": 0.802024781703949, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.23005, | |
| "step": 338, | |
| "tokens/total": 88604672, | |
| "tokens/train_per_sec_per_gpu": 223.69, | |
| "tokens/trainable": 7399496 | |
| }, | |
| { | |
| "epoch": 0.3605424089337942, | |
| "grad_norm": 0.06441716849803925, | |
| "learning_rate": 3.235345891085536e-05, | |
| "loss": 0.7387241125106812, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09326, | |
| "step": 339, | |
| "tokens/total": 88866816, | |
| "tokens/train_per_sec_per_gpu": 195.97, | |
| "tokens/trainable": 7421514 | |
| }, | |
| { | |
| "epoch": 0.3616059558628024, | |
| "grad_norm": 0.05845661088824272, | |
| "learning_rate": 3.229503502208602e-05, | |
| "loss": 0.7207450866699219, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.05596, | |
| "step": 340, | |
| "tokens/total": 89128960, | |
| "tokens/train_per_sec_per_gpu": 171.19, | |
| "tokens/trainable": 7442572 | |
| }, | |
| { | |
| "epoch": 0.3626695027918107, | |
| "grad_norm": 0.06204557791352272, | |
| "learning_rate": 3.2236441987135565e-05, | |
| "loss": 0.757001519203186, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13187, | |
| "step": 341, | |
| "tokens/total": 89391104, | |
| "tokens/train_per_sec_per_gpu": 177.89, | |
| "tokens/trainable": 7463164 | |
| }, | |
| { | |
| "epoch": 0.36373304972081894, | |
| "grad_norm": 0.05783051997423172, | |
| "learning_rate": 3.2177680612084494e-05, | |
| "loss": 0.7507032752037048, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11849, | |
| "step": 342, | |
| "tokens/total": 89653248, | |
| "tokens/train_per_sec_per_gpu": 187.61, | |
| "tokens/trainable": 7485037 | |
| }, | |
| { | |
| "epoch": 0.36479659664982717, | |
| "grad_norm": 0.06585251539945602, | |
| "learning_rate": 3.211875170532924e-05, | |
| "loss": 0.7124658823013306, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.03901, | |
| "step": 343, | |
| "tokens/total": 89915392, | |
| "tokens/train_per_sec_per_gpu": 169.85, | |
| "tokens/trainable": 7505178 | |
| }, | |
| { | |
| "epoch": 0.3658601435788354, | |
| "grad_norm": 0.06433013081550598, | |
| "learning_rate": 3.205965607757097e-05, | |
| "loss": 0.755608081817627, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12891, | |
| "step": 344, | |
| "tokens/total": 90177536, | |
| "tokens/train_per_sec_per_gpu": 186.29, | |
| "tokens/trainable": 7526734 | |
| }, | |
| { | |
| "epoch": 0.3669236905078437, | |
| "grad_norm": 0.06355367600917816, | |
| "learning_rate": 3.200039454180452e-05, | |
| "loss": 0.8150879740715027, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.25937, | |
| "step": 345, | |
| "tokens/total": 90439680, | |
| "tokens/train_per_sec_per_gpu": 179.78, | |
| "tokens/trainable": 7548681 | |
| }, | |
| { | |
| "epoch": 0.3679872374368519, | |
| "grad_norm": 0.058385591953992844, | |
| "learning_rate": 3.1940967913307144e-05, | |
| "loss": 0.6909693479537964, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 1.99565, | |
| "step": 346, | |
| "tokens/total": 90701824, | |
| "tokens/train_per_sec_per_gpu": 181.15, | |
| "tokens/trainable": 7569898 | |
| }, | |
| { | |
| "epoch": 0.36905078436586014, | |
| "grad_norm": 0.055889613926410675, | |
| "learning_rate": 3.188137700962733e-05, | |
| "loss": 0.7137737274169922, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.04168, | |
| "step": 347, | |
| "tokens/total": 90963968, | |
| "tokens/train_per_sec_per_gpu": 225.36, | |
| "tokens/trainable": 7592560 | |
| }, | |
| { | |
| "epoch": 0.37011433129486837, | |
| "grad_norm": 0.07224101573228836, | |
| "learning_rate": 3.1821622650573536e-05, | |
| "loss": 0.7986935973167419, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.22264, | |
| "step": 348, | |
| "tokens/total": 91226112, | |
| "tokens/train_per_sec_per_gpu": 204.66, | |
| "tokens/trainable": 7614191 | |
| }, | |
| { | |
| "epoch": 0.37117787822387666, | |
| "grad_norm": 0.06277446448802948, | |
| "learning_rate": 3.176170565820293e-05, | |
| "loss": 0.7567167282104492, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13127, | |
| "step": 349, | |
| "tokens/total": 91488256, | |
| "tokens/train_per_sec_per_gpu": 165.46, | |
| "tokens/trainable": 7634778 | |
| }, | |
| { | |
| "epoch": 0.3722414251528849, | |
| "grad_norm": 0.06595855206251144, | |
| "learning_rate": 3.170162685681007e-05, | |
| "loss": 0.8263660073280334, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.285, | |
| "step": 350, | |
| "tokens/total": 91750400, | |
| "tokens/train_per_sec_per_gpu": 180.47, | |
| "tokens/trainable": 7655328 | |
| }, | |
| { | |
| "epoch": 0.3733049720818931, | |
| "grad_norm": 0.053407274186611176, | |
| "learning_rate": 3.1641387072915574e-05, | |
| "loss": 0.7117317318916321, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.03752, | |
| "step": 351, | |
| "tokens/total": 92012544, | |
| "tokens/train_per_sec_per_gpu": 209.35, | |
| "tokens/trainable": 7678377 | |
| }, | |
| { | |
| "epoch": 0.37436851901090135, | |
| "grad_norm": 0.0668676570057869, | |
| "learning_rate": 3.1580987135254715e-05, | |
| "loss": 0.7980103492736816, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.22112, | |
| "step": 352, | |
| "tokens/total": 92274688, | |
| "tokens/train_per_sec_per_gpu": 227.01, | |
| "tokens/trainable": 7700711 | |
| }, | |
| { | |
| "epoch": 0.3754320659399096, | |
| "grad_norm": 0.06859572231769562, | |
| "learning_rate": 3.1520427874766064e-05, | |
| "loss": 0.7606133818626404, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13959, | |
| "step": 353, | |
| "tokens/total": 92536832, | |
| "tokens/train_per_sec_per_gpu": 178.09, | |
| "tokens/trainable": 7720570 | |
| }, | |
| { | |
| "epoch": 0.37649561286891786, | |
| "grad_norm": 0.06196228042244911, | |
| "learning_rate": 3.145971012458005e-05, | |
| "loss": 0.7603709101676941, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13907, | |
| "step": 354, | |
| "tokens/total": 92798976, | |
| "tokens/train_per_sec_per_gpu": 174.2, | |
| "tokens/trainable": 7743422 | |
| }, | |
| { | |
| "epoch": 0.3775591597979261, | |
| "grad_norm": 0.05902267247438431, | |
| "learning_rate": 3.139883472000745e-05, | |
| "loss": 0.7486381530761719, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11412, | |
| "step": 355, | |
| "tokens/total": 93061120, | |
| "tokens/train_per_sec_per_gpu": 257.11, | |
| "tokens/trainable": 7768328 | |
| }, | |
| { | |
| "epoch": 0.3786227067269343, | |
| "grad_norm": 0.05966390669345856, | |
| "learning_rate": 3.133780249852799e-05, | |
| "loss": 0.7499316930770874, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11686, | |
| "step": 356, | |
| "tokens/total": 93323264, | |
| "tokens/train_per_sec_per_gpu": 218.24, | |
| "tokens/trainable": 7791928 | |
| }, | |
| { | |
| "epoch": 0.37968625365594255, | |
| "grad_norm": 0.06466083228588104, | |
| "learning_rate": 3.127661429977872e-05, | |
| "loss": 0.754686713218689, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12695, | |
| "step": 357, | |
| "tokens/total": 93585408, | |
| "tokens/train_per_sec_per_gpu": 196.44, | |
| "tokens/trainable": 7812051 | |
| }, | |
| { | |
| "epoch": 0.38074980058495084, | |
| "grad_norm": 0.058210499584674835, | |
| "learning_rate": 3.1215270965542544e-05, | |
| "loss": 0.7765249013900757, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.1739, | |
| "step": 358, | |
| "tokens/total": 93847552, | |
| "tokens/train_per_sec_per_gpu": 207.04, | |
| "tokens/trainable": 7833084 | |
| }, | |
| { | |
| "epoch": 0.38181334751395907, | |
| "grad_norm": 0.057726673781871796, | |
| "learning_rate": 3.115377333973659e-05, | |
| "loss": 0.7482200264930725, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11324, | |
| "step": 359, | |
| "tokens/total": 94109696, | |
| "tokens/train_per_sec_per_gpu": 228.28, | |
| "tokens/trainable": 7857441 | |
| }, | |
| { | |
| "epoch": 0.3828768944429673, | |
| "grad_norm": 0.06571833789348602, | |
| "learning_rate": 3.109212226840063e-05, | |
| "loss": 0.8272304534912109, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.28698, | |
| "step": 360, | |
| "tokens/total": 94371840, | |
| "tokens/train_per_sec_per_gpu": 168.79, | |
| "tokens/trainable": 7879005 | |
| }, | |
| { | |
| "epoch": 0.3839404413719755, | |
| "grad_norm": 0.06610347330570221, | |
| "learning_rate": 3.103031859968542e-05, | |
| "loss": 0.7621959447860718, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.14298, | |
| "step": 361, | |
| "tokens/total": 94633984, | |
| "tokens/train_per_sec_per_gpu": 174.29, | |
| "tokens/trainable": 7899540 | |
| }, | |
| { | |
| "epoch": 0.38500398830098376, | |
| "grad_norm": 0.05712476745247841, | |
| "learning_rate": 3.096836318384103e-05, | |
| "loss": 0.7930388450622559, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.2101, | |
| "step": 362, | |
| "tokens/total": 94896128, | |
| "tokens/train_per_sec_per_gpu": 220.27, | |
| "tokens/trainable": 7922542 | |
| }, | |
| { | |
| "epoch": 0.38606753522999204, | |
| "grad_norm": 0.06603259593248367, | |
| "learning_rate": 3.0906256873205193e-05, | |
| "loss": 0.7841721773147583, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19059, | |
| "step": 363, | |
| "tokens/total": 95158272, | |
| "tokens/train_per_sec_per_gpu": 176.27, | |
| "tokens/trainable": 7945013 | |
| }, | |
| { | |
| "epoch": 0.3871310821590003, | |
| "grad_norm": 0.056947011500597, | |
| "learning_rate": 3.08440005221915e-05, | |
| "loss": 0.7464731335639954, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10955, | |
| "step": 364, | |
| "tokens/total": 95420416, | |
| "tokens/train_per_sec_per_gpu": 169.25, | |
| "tokens/trainable": 7965994 | |
| }, | |
| { | |
| "epoch": 0.3881946290880085, | |
| "grad_norm": 0.06810449063777924, | |
| "learning_rate": 3.0781594987277724e-05, | |
| "loss": 0.8506579399108887, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.34119, | |
| "step": 365, | |
| "tokens/total": 95682560, | |
| "tokens/train_per_sec_per_gpu": 215.35, | |
| "tokens/trainable": 7987278 | |
| }, | |
| { | |
| "epoch": 0.38925817601701673, | |
| "grad_norm": 0.05948546528816223, | |
| "learning_rate": 3.071904112699397e-05, | |
| "loss": 0.8312458395957947, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.29618, | |
| "step": 366, | |
| "tokens/total": 95944704, | |
| "tokens/train_per_sec_per_gpu": 215.65, | |
| "tokens/trainable": 8011500 | |
| }, | |
| { | |
| "epoch": 0.390321722946025, | |
| "grad_norm": 0.06788976490497589, | |
| "learning_rate": 3.0656339801910926e-05, | |
| "loss": 0.7707929611206055, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.16148, | |
| "step": 367, | |
| "tokens/total": 96206848, | |
| "tokens/train_per_sec_per_gpu": 205.07, | |
| "tokens/trainable": 8032365 | |
| }, | |
| { | |
| "epoch": 0.39138526987503325, | |
| "grad_norm": 0.058084722608327866, | |
| "learning_rate": 3.059349187462798e-05, | |
| "loss": 0.6599326729774475, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 1.93466, | |
| "step": 368, | |
| "tokens/total": 96468992, | |
| "tokens/train_per_sec_per_gpu": 228.45, | |
| "tokens/trainable": 8055039 | |
| }, | |
| { | |
| "epoch": 0.3924488168040415, | |
| "grad_norm": 0.06115228310227394, | |
| "learning_rate": 3.053049820976135e-05, | |
| "loss": 0.760746955871582, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13987, | |
| "step": 369, | |
| "tokens/total": 96731136, | |
| "tokens/train_per_sec_per_gpu": 189.31, | |
| "tokens/trainable": 8078259 | |
| }, | |
| { | |
| "epoch": 0.3935123637330497, | |
| "grad_norm": 0.07438641041517258, | |
| "learning_rate": 3.0467359673932244e-05, | |
| "loss": 0.7815507650375366, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.18486, | |
| "step": 370, | |
| "tokens/total": 96993280, | |
| "tokens/train_per_sec_per_gpu": 151.93, | |
| "tokens/trainable": 8099819 | |
| }, | |
| { | |
| "epoch": 0.39457591066205794, | |
| "grad_norm": 0.06393261253833771, | |
| "learning_rate": 3.040407713575487e-05, | |
| "loss": 0.7451884746551514, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10684, | |
| "step": 371, | |
| "tokens/total": 97255424, | |
| "tokens/train_per_sec_per_gpu": 177.57, | |
| "tokens/trainable": 8121758 | |
| }, | |
| { | |
| "epoch": 0.3956394575910662, | |
| "grad_norm": 0.058520760387182236, | |
| "learning_rate": 3.034065146582452e-05, | |
| "loss": 0.8240416049957275, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.27969, | |
| "step": 372, | |
| "tokens/total": 97517568, | |
| "tokens/train_per_sec_per_gpu": 214.86, | |
| "tokens/trainable": 8144183 | |
| }, | |
| { | |
| "epoch": 0.39670300452007445, | |
| "grad_norm": 0.06523202359676361, | |
| "learning_rate": 3.0277083536705604e-05, | |
| "loss": 0.7511664032936096, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11947, | |
| "step": 373, | |
| "tokens/total": 97779712, | |
| "tokens/train_per_sec_per_gpu": 172.88, | |
| "tokens/trainable": 8164676 | |
| }, | |
| { | |
| "epoch": 0.3977665514490827, | |
| "grad_norm": 0.06993680447340012, | |
| "learning_rate": 3.0213374222919617e-05, | |
| "loss": 0.7678710222244263, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15517, | |
| "step": 374, | |
| "tokens/total": 98041856, | |
| "tokens/train_per_sec_per_gpu": 161.85, | |
| "tokens/trainable": 8184550 | |
| }, | |
| { | |
| "epoch": 0.3988300983780909, | |
| "grad_norm": 0.05952773615717888, | |
| "learning_rate": 3.0149524400933114e-05, | |
| "loss": 0.7702289819717407, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.16026, | |
| "step": 375, | |
| "tokens/total": 98304000, | |
| "tokens/train_per_sec_per_gpu": 174.86, | |
| "tokens/trainable": 8206571 | |
| }, | |
| { | |
| "epoch": 0.3998936453070992, | |
| "grad_norm": 0.06135401502251625, | |
| "learning_rate": 3.008553494914569e-05, | |
| "loss": 0.6841228008270264, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 1.98203, | |
| "step": 376, | |
| "tokens/total": 98566144, | |
| "tokens/train_per_sec_per_gpu": 194.82, | |
| "tokens/trainable": 8229068 | |
| }, | |
| { | |
| "epoch": 0.40095719223610743, | |
| "grad_norm": 0.0706343874335289, | |
| "learning_rate": 3.002140674787783e-05, | |
| "loss": 0.7617697715759277, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.14206, | |
| "step": 377, | |
| "tokens/total": 98828288, | |
| "tokens/train_per_sec_per_gpu": 145.29, | |
| "tokens/trainable": 8251266 | |
| }, | |
| { | |
| "epoch": 0.40202073916511566, | |
| "grad_norm": 0.0608975812792778, | |
| "learning_rate": 2.995714067935887e-05, | |
| "loss": 0.7577897310256958, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13356, | |
| "step": 378, | |
| "tokens/total": 99090432, | |
| "tokens/train_per_sec_per_gpu": 148.13, | |
| "tokens/trainable": 8273480 | |
| }, | |
| { | |
| "epoch": 0.4030842860941239, | |
| "grad_norm": 0.06372503191232681, | |
| "learning_rate": 2.9892737627714786e-05, | |
| "loss": 0.7569035291671753, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13167, | |
| "step": 379, | |
| "tokens/total": 99352576, | |
| "tokens/train_per_sec_per_gpu": 164.36, | |
| "tokens/trainable": 8293250 | |
| }, | |
| { | |
| "epoch": 0.4041478330231321, | |
| "grad_norm": 0.07930820435285568, | |
| "learning_rate": 2.9828198478956093e-05, | |
| "loss": 0.7412185668945312, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09849, | |
| "step": 380, | |
| "tokens/total": 99614720, | |
| "tokens/train_per_sec_per_gpu": 165.37, | |
| "tokens/trainable": 8313207 | |
| }, | |
| { | |
| "epoch": 0.4052113799521404, | |
| "grad_norm": 0.0618307963013649, | |
| "learning_rate": 2.976352412096563e-05, | |
| "loss": 0.718746542930603, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.05186, | |
| "step": 381, | |
| "tokens/total": 99876864, | |
| "tokens/train_per_sec_per_gpu": 195.0, | |
| "tokens/trainable": 8334762 | |
| }, | |
| { | |
| "epoch": 0.40627492688114863, | |
| "grad_norm": 0.06570852547883987, | |
| "learning_rate": 2.9698715443486338e-05, | |
| "loss": 0.7648171186447144, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.1486, | |
| "step": 382, | |
| "tokens/total": 100139008, | |
| "tokens/train_per_sec_per_gpu": 174.53, | |
| "tokens/trainable": 8354787 | |
| }, | |
| { | |
| "epoch": 0.40733847381015686, | |
| "grad_norm": 0.06311152130365372, | |
| "learning_rate": 2.9633773338109027e-05, | |
| "loss": 0.7888460159301758, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.20086, | |
| "step": 383, | |
| "tokens/total": 100401152, | |
| "tokens/train_per_sec_per_gpu": 164.71, | |
| "tokens/trainable": 8376673 | |
| }, | |
| { | |
| "epoch": 0.4084020207391651, | |
| "grad_norm": 0.06162691116333008, | |
| "learning_rate": 2.9568698698260126e-05, | |
| "loss": 0.7124409675598145, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.03896, | |
| "step": 384, | |
| "tokens/total": 100663296, | |
| "tokens/train_per_sec_per_gpu": 224.89, | |
| "tokens/trainable": 8399066 | |
| }, | |
| { | |
| "epoch": 0.4094655676681734, | |
| "grad_norm": 0.059788983315229416, | |
| "learning_rate": 2.9503492419189366e-05, | |
| "loss": 0.7600404024124146, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13836, | |
| "step": 385, | |
| "tokens/total": 100925440, | |
| "tokens/train_per_sec_per_gpu": 200.39, | |
| "tokens/trainable": 8422560 | |
| }, | |
| { | |
| "epoch": 0.4105291145971816, | |
| "grad_norm": 0.06216095760464668, | |
| "learning_rate": 2.9438155397957474e-05, | |
| "loss": 0.8070495128631592, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.24129, | |
| "step": 386, | |
| "tokens/total": 101187584, | |
| "tokens/train_per_sec_per_gpu": 149.86, | |
| "tokens/trainable": 8443830 | |
| }, | |
| { | |
| "epoch": 0.41159266152618984, | |
| "grad_norm": 0.05838804319500923, | |
| "learning_rate": 2.937268853342383e-05, | |
| "loss": 0.759508490562439, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13723, | |
| "step": 387, | |
| "tokens/total": 101449728, | |
| "tokens/train_per_sec_per_gpu": 136.92, | |
| "tokens/trainable": 8465737 | |
| }, | |
| { | |
| "epoch": 0.41265620845519807, | |
| "grad_norm": 0.05906308814883232, | |
| "learning_rate": 2.9307092726234127e-05, | |
| "loss": 0.7870290279388428, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19686, | |
| "step": 388, | |
| "tokens/total": 101711872, | |
| "tokens/train_per_sec_per_gpu": 176.32, | |
| "tokens/trainable": 8488568 | |
| }, | |
| { | |
| "epoch": 0.41371975538420636, | |
| "grad_norm": 0.06456853449344635, | |
| "learning_rate": 2.9241368878807925e-05, | |
| "loss": 0.7161662578582764, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.04657, | |
| "step": 389, | |
| "tokens/total": 101974016, | |
| "tokens/train_per_sec_per_gpu": 181.97, | |
| "tokens/trainable": 8510960 | |
| }, | |
| { | |
| "epoch": 0.4147833023132146, | |
| "grad_norm": 0.0667351484298706, | |
| "learning_rate": 2.9175517895326292e-05, | |
| "loss": 0.8066511154174805, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.24039, | |
| "step": 390, | |
| "tokens/total": 102236160, | |
| "tokens/train_per_sec_per_gpu": 208.78, | |
| "tokens/trainable": 8532514 | |
| }, | |
| { | |
| "epoch": 0.4158468492422228, | |
| "grad_norm": 0.06246356666088104, | |
| "learning_rate": 2.9109540681719322e-05, | |
| "loss": 0.7846518158912659, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19164, | |
| "step": 391, | |
| "tokens/total": 102498304, | |
| "tokens/train_per_sec_per_gpu": 224.1, | |
| "tokens/trainable": 8555545 | |
| }, | |
| { | |
| "epoch": 0.41691039617123105, | |
| "grad_norm": 0.06540708988904953, | |
| "learning_rate": 2.9043438145653717e-05, | |
| "loss": 0.7380800843238831, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09192, | |
| "step": 392, | |
| "tokens/total": 102760448, | |
| "tokens/train_per_sec_per_gpu": 190.76, | |
| "tokens/trainable": 8577885 | |
| }, | |
| { | |
| "epoch": 0.4179739431002393, | |
| "grad_norm": 0.062018416821956635, | |
| "learning_rate": 2.8977211196520257e-05, | |
| "loss": 0.774357795715332, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.1692, | |
| "step": 393, | |
| "tokens/total": 103022592, | |
| "tokens/train_per_sec_per_gpu": 212.39, | |
| "tokens/trainable": 8600480 | |
| }, | |
| { | |
| "epoch": 0.41903749002924756, | |
| "grad_norm": 0.062351830303668976, | |
| "learning_rate": 2.8910860745421305e-05, | |
| "loss": 0.7095509767532349, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.03308, | |
| "step": 394, | |
| "tokens/total": 103284736, | |
| "tokens/train_per_sec_per_gpu": 166.14, | |
| "tokens/trainable": 8620831 | |
| }, | |
| { | |
| "epoch": 0.4201010369582558, | |
| "grad_norm": 0.06218433752655983, | |
| "learning_rate": 2.884438770515829e-05, | |
| "loss": 0.7856686115264893, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19387, | |
| "step": 395, | |
| "tokens/total": 103546880, | |
| "tokens/train_per_sec_per_gpu": 235.06, | |
| "tokens/trainable": 8646017 | |
| }, | |
| { | |
| "epoch": 0.421164583887264, | |
| "grad_norm": 0.07032019644975662, | |
| "learning_rate": 2.877779299021912e-05, | |
| "loss": 0.7358700037002563, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.0873, | |
| "step": 396, | |
| "tokens/total": 103809024, | |
| "tokens/train_per_sec_per_gpu": 132.47, | |
| "tokens/trainable": 8665893 | |
| }, | |
| { | |
| "epoch": 0.42222813081627225, | |
| "grad_norm": 0.057751379907131195, | |
| "learning_rate": 2.871107751676561e-05, | |
| "loss": 0.7476328015327454, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11199, | |
| "step": 397, | |
| "tokens/total": 104071168, | |
| "tokens/train_per_sec_per_gpu": 216.14, | |
| "tokens/trainable": 8689230 | |
| }, | |
| { | |
| "epoch": 0.42329167774528054, | |
| "grad_norm": 0.060179274529218674, | |
| "learning_rate": 2.8644242202620907e-05, | |
| "loss": 0.7824307084083557, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.18678, | |
| "step": 398, | |
| "tokens/total": 104333312, | |
| "tokens/train_per_sec_per_gpu": 207.33, | |
| "tokens/trainable": 8710606 | |
| }, | |
| { | |
| "epoch": 0.42435522467428877, | |
| "grad_norm": 0.07202541828155518, | |
| "learning_rate": 2.857728796725682e-05, | |
| "loss": 0.7747771739959717, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.17011, | |
| "step": 399, | |
| "tokens/total": 104595456, | |
| "tokens/train_per_sec_per_gpu": 152.9, | |
| "tokens/trainable": 8732079 | |
| }, | |
| { | |
| "epoch": 0.425418771603297, | |
| "grad_norm": 0.06443008780479431, | |
| "learning_rate": 2.8510215731781194e-05, | |
| "loss": 0.7466020584106445, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10982, | |
| "step": 400, | |
| "tokens/total": 104857600, | |
| "tokens/train_per_sec_per_gpu": 166.03, | |
| "tokens/trainable": 8754126 | |
| }, | |
| { | |
| "epoch": 0.4264823185323052, | |
| "grad_norm": 0.05945134907960892, | |
| "learning_rate": 2.844302641892523e-05, | |
| "loss": 0.7561734914779663, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13011, | |
| "step": 401, | |
| "tokens/total": 105119744, | |
| "tokens/train_per_sec_per_gpu": 184.08, | |
| "tokens/trainable": 8775310 | |
| }, | |
| { | |
| "epoch": 0.42754586546131346, | |
| "grad_norm": 0.06570764631032944, | |
| "learning_rate": 2.83757209530308e-05, | |
| "loss": 0.728512167930603, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.072, | |
| "step": 402, | |
| "tokens/total": 105381888, | |
| "tokens/train_per_sec_per_gpu": 160.39, | |
| "tokens/trainable": 8795593 | |
| }, | |
| { | |
| "epoch": 0.42860941239032174, | |
| "grad_norm": 0.06125401705503464, | |
| "learning_rate": 2.8308300260037734e-05, | |
| "loss": 0.754820704460144, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12723, | |
| "step": 403, | |
| "tokens/total": 105644032, | |
| "tokens/train_per_sec_per_gpu": 172.16, | |
| "tokens/trainable": 8817946 | |
| }, | |
| { | |
| "epoch": 0.42967295931933, | |
| "grad_norm": 0.06924828141927719, | |
| "learning_rate": 2.8240765267471056e-05, | |
| "loss": 0.7697207927703857, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15916, | |
| "step": 404, | |
| "tokens/total": 105906176, | |
| "tokens/train_per_sec_per_gpu": 208.57, | |
| "tokens/trainable": 8840440 | |
| }, | |
| { | |
| "epoch": 0.4307365062483382, | |
| "grad_norm": 0.06233648210763931, | |
| "learning_rate": 2.8173116904428242e-05, | |
| "loss": 0.7522628307342529, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.1218, | |
| "step": 405, | |
| "tokens/total": 106168320, | |
| "tokens/train_per_sec_per_gpu": 192.61, | |
| "tokens/trainable": 8862223 | |
| }, | |
| { | |
| "epoch": 0.43180005317734643, | |
| "grad_norm": 0.05627689138054848, | |
| "learning_rate": 2.810535610156646e-05, | |
| "loss": 0.8060315847396851, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.23901, | |
| "step": 406, | |
| "tokens/total": 106430464, | |
| "tokens/train_per_sec_per_gpu": 240.07, | |
| "tokens/trainable": 8888361 | |
| }, | |
| { | |
| "epoch": 0.4328636001063547, | |
| "grad_norm": 0.060442451387643814, | |
| "learning_rate": 2.803748379108972e-05, | |
| "loss": 0.7366658449172974, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.08896, | |
| "step": 407, | |
| "tokens/total": 106692608, | |
| "tokens/train_per_sec_per_gpu": 194.32, | |
| "tokens/trainable": 8911616 | |
| }, | |
| { | |
| "epoch": 0.43392714703536295, | |
| "grad_norm": 0.06363833695650101, | |
| "learning_rate": 2.7969500906736065e-05, | |
| "loss": 0.777472734451294, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.17597, | |
| "step": 408, | |
| "tokens/total": 106954752, | |
| "tokens/train_per_sec_per_gpu": 205.98, | |
| "tokens/trainable": 8935498 | |
| }, | |
| { | |
| "epoch": 0.4349906939643712, | |
| "grad_norm": 0.060264162719249725, | |
| "learning_rate": 2.7901408383764776e-05, | |
| "loss": 0.8076545000076294, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.24264, | |
| "step": 409, | |
| "tokens/total": 107216896, | |
| "tokens/train_per_sec_per_gpu": 191.96, | |
| "tokens/trainable": 8959641 | |
| }, | |
| { | |
| "epoch": 0.4360542408933794, | |
| "grad_norm": 0.0615147240459919, | |
| "learning_rate": 2.783320715894341e-05, | |
| "loss": 0.7697659730911255, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15926, | |
| "step": 410, | |
| "tokens/total": 107479040, | |
| "tokens/train_per_sec_per_gpu": 229.4, | |
| "tokens/trainable": 8981683 | |
| }, | |
| { | |
| "epoch": 0.43711778782238764, | |
| "grad_norm": 0.05955628678202629, | |
| "learning_rate": 2.7764898170534993e-05, | |
| "loss": 0.7136242389678955, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.04138, | |
| "step": 411, | |
| "tokens/total": 107741184, | |
| "tokens/train_per_sec_per_gpu": 197.85, | |
| "tokens/trainable": 9002018 | |
| }, | |
| { | |
| "epoch": 0.4381813347513959, | |
| "grad_norm": 0.05883246660232544, | |
| "learning_rate": 2.76964823582851e-05, | |
| "loss": 0.7270078659057617, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.06888, | |
| "step": 412, | |
| "tokens/total": 108003328, | |
| "tokens/train_per_sec_per_gpu": 209.83, | |
| "tokens/trainable": 9024288 | |
| }, | |
| { | |
| "epoch": 0.43924488168040415, | |
| "grad_norm": 0.06609495729207993, | |
| "learning_rate": 2.7627960663408874e-05, | |
| "loss": 0.7433674335479736, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10301, | |
| "step": 413, | |
| "tokens/total": 108265472, | |
| "tokens/train_per_sec_per_gpu": 205.82, | |
| "tokens/trainable": 9046881 | |
| }, | |
| { | |
| "epoch": 0.4403084286094124, | |
| "grad_norm": 0.07620103657245636, | |
| "learning_rate": 2.7559334028578135e-05, | |
| "loss": 0.7862449884414673, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19514, | |
| "step": 414, | |
| "tokens/total": 108527616, | |
| "tokens/train_per_sec_per_gpu": 181.75, | |
| "tokens/trainable": 9067519 | |
| }, | |
| { | |
| "epoch": 0.4413719755384206, | |
| "grad_norm": 0.06601685285568237, | |
| "learning_rate": 2.7490603397908393e-05, | |
| "loss": 0.7648828029632568, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.14874, | |
| "step": 415, | |
| "tokens/total": 108789760, | |
| "tokens/train_per_sec_per_gpu": 174.3, | |
| "tokens/trainable": 9089133 | |
| }, | |
| { | |
| "epoch": 0.4424355224674289, | |
| "grad_norm": 0.06513883173465729, | |
| "learning_rate": 2.742176971694585e-05, | |
| "loss": 0.7682895660400391, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15608, | |
| "step": 416, | |
| "tokens/total": 109051904, | |
| "tokens/train_per_sec_per_gpu": 177.11, | |
| "tokens/trainable": 9110997 | |
| }, | |
| { | |
| "epoch": 0.44349906939643713, | |
| "grad_norm": 0.06401567906141281, | |
| "learning_rate": 2.7352833932654402e-05, | |
| "loss": 0.7881615161895752, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19935, | |
| "step": 417, | |
| "tokens/total": 109314048, | |
| "tokens/train_per_sec_per_gpu": 176.32, | |
| "tokens/trainable": 9134139 | |
| }, | |
| { | |
| "epoch": 0.44456261632544536, | |
| "grad_norm": 0.07034140825271606, | |
| "learning_rate": 2.7283796993402613e-05, | |
| "loss": 0.8482910990715027, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.33565, | |
| "step": 418, | |
| "tokens/total": 109576192, | |
| "tokens/train_per_sec_per_gpu": 202.73, | |
| "tokens/trainable": 9155746 | |
| }, | |
| { | |
| "epoch": 0.4456261632544536, | |
| "grad_norm": 0.06932256370782852, | |
| "learning_rate": 2.721465984895066e-05, | |
| "loss": 0.7511000633239746, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11933, | |
| "step": 419, | |
| "tokens/total": 109838336, | |
| "tokens/train_per_sec_per_gpu": 151.07, | |
| "tokens/trainable": 9175997 | |
| }, | |
| { | |
| "epoch": 0.4466897101834618, | |
| "grad_norm": 0.05801774561405182, | |
| "learning_rate": 2.714542345043726e-05, | |
| "loss": 0.7160236835479736, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.04628, | |
| "step": 420, | |
| "tokens/total": 110100480, | |
| "tokens/train_per_sec_per_gpu": 206.36, | |
| "tokens/trainable": 9201154 | |
| }, | |
| { | |
| "epoch": 0.4477532571124701, | |
| "grad_norm": 0.0719035267829895, | |
| "learning_rate": 2.7076088750366617e-05, | |
| "loss": 0.7472874522209167, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11127, | |
| "step": 421, | |
| "tokens/total": 110362624, | |
| "tokens/train_per_sec_per_gpu": 221.89, | |
| "tokens/trainable": 9223146 | |
| }, | |
| { | |
| "epoch": 0.44881680404147833, | |
| "grad_norm": 0.06794784218072891, | |
| "learning_rate": 2.700665670259527e-05, | |
| "loss": 0.789624035358429, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.20257, | |
| "step": 422, | |
| "tokens/total": 110624768, | |
| "tokens/train_per_sec_per_gpu": 182.17, | |
| "tokens/trainable": 9245502 | |
| }, | |
| { | |
| "epoch": 0.44988035097048656, | |
| "grad_norm": 0.06335590779781342, | |
| "learning_rate": 2.693712826231903e-05, | |
| "loss": 0.720179557800293, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.0548, | |
| "step": 423, | |
| "tokens/total": 110886912, | |
| "tokens/train_per_sec_per_gpu": 184.76, | |
| "tokens/trainable": 9267573 | |
| }, | |
| { | |
| "epoch": 0.4509438978994948, | |
| "grad_norm": 0.06363626569509506, | |
| "learning_rate": 2.6867504386059776e-05, | |
| "loss": 0.771357536315918, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.1627, | |
| "step": 424, | |
| "tokens/total": 111149056, | |
| "tokens/train_per_sec_per_gpu": 181.22, | |
| "tokens/trainable": 9290024 | |
| }, | |
| { | |
| "epoch": 0.4520074448285031, | |
| "grad_norm": 0.06436197459697723, | |
| "learning_rate": 2.679778603165233e-05, | |
| "loss": 0.8299375772476196, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.29318, | |
| "step": 425, | |
| "tokens/total": 111411200, | |
| "tokens/train_per_sec_per_gpu": 181.49, | |
| "tokens/trainable": 9312089 | |
| }, | |
| { | |
| "epoch": 0.4530709917575113, | |
| "grad_norm": 0.07020284235477448, | |
| "learning_rate": 2.6727974158231312e-05, | |
| "loss": 0.7461998462677002, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10897, | |
| "step": 426, | |
| "tokens/total": 111673344, | |
| "tokens/train_per_sec_per_gpu": 179.65, | |
| "tokens/trainable": 9332072 | |
| }, | |
| { | |
| "epoch": 0.45413453868651954, | |
| "grad_norm": 0.07214612513780594, | |
| "learning_rate": 2.6658069726217863e-05, | |
| "loss": 0.7696465253829956, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.159, | |
| "step": 427, | |
| "tokens/total": 111935488, | |
| "tokens/train_per_sec_per_gpu": 248.27, | |
| "tokens/trainable": 9354855 | |
| }, | |
| { | |
| "epoch": 0.45519808561552777, | |
| "grad_norm": 0.07788221538066864, | |
| "learning_rate": 2.6588073697306494e-05, | |
| "loss": 0.7938302755355835, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.21185, | |
| "step": 428, | |
| "tokens/total": 112197632, | |
| "tokens/train_per_sec_per_gpu": 193.12, | |
| "tokens/trainable": 9376477 | |
| }, | |
| { | |
| "epoch": 0.456261632544536, | |
| "grad_norm": 0.06307144463062286, | |
| "learning_rate": 2.6517987034451846e-05, | |
| "loss": 0.735474705696106, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.08647, | |
| "step": 429, | |
| "tokens/total": 112459776, | |
| "tokens/train_per_sec_per_gpu": 150.92, | |
| "tokens/trainable": 9396690 | |
| }, | |
| { | |
| "epoch": 0.4573251794735443, | |
| "grad_norm": 0.06643529236316681, | |
| "learning_rate": 2.6447810701855436e-05, | |
| "loss": 0.7078378200531006, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.0296, | |
| "step": 430, | |
| "tokens/total": 112721920, | |
| "tokens/train_per_sec_per_gpu": 209.14, | |
| "tokens/trainable": 9418386 | |
| }, | |
| { | |
| "epoch": 0.4583887264025525, | |
| "grad_norm": 0.06323806941509247, | |
| "learning_rate": 2.637754566495238e-05, | |
| "loss": 0.7661035060882568, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15137, | |
| "step": 431, | |
| "tokens/total": 112984064, | |
| "tokens/train_per_sec_per_gpu": 174.47, | |
| "tokens/trainable": 9440604 | |
| }, | |
| { | |
| "epoch": 0.45945227333156075, | |
| "grad_norm": 0.06076245754957199, | |
| "learning_rate": 2.6307192890398126e-05, | |
| "loss": 0.7844895720481873, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19129, | |
| "step": 432, | |
| "tokens/total": 113246208, | |
| "tokens/train_per_sec_per_gpu": 186.77, | |
| "tokens/trainable": 9464422 | |
| }, | |
| { | |
| "epoch": 0.460515820260569, | |
| "grad_norm": 0.0693245679140091, | |
| "learning_rate": 2.6236753346055176e-05, | |
| "loss": 0.6995319724082947, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.01281, | |
| "step": 433, | |
| "tokens/total": 113508352, | |
| "tokens/train_per_sec_per_gpu": 182.76, | |
| "tokens/trainable": 9485760 | |
| }, | |
| { | |
| "epoch": 0.46157936718957726, | |
| "grad_norm": 0.060889989137649536, | |
| "learning_rate": 2.6166228000979726e-05, | |
| "loss": 0.6955425143241882, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.0048, | |
| "step": 434, | |
| "tokens/total": 113770496, | |
| "tokens/train_per_sec_per_gpu": 201.76, | |
| "tokens/trainable": 9508485 | |
| }, | |
| { | |
| "epoch": 0.4626429141185855, | |
| "grad_norm": 0.06848379224538803, | |
| "learning_rate": 2.6095617825408357e-05, | |
| "loss": 0.7852069735527039, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19286, | |
| "step": 435, | |
| "tokens/total": 114032640, | |
| "tokens/train_per_sec_per_gpu": 189.68, | |
| "tokens/trainable": 9530965 | |
| }, | |
| { | |
| "epoch": 0.4637064610475937, | |
| "grad_norm": 0.07383300364017487, | |
| "learning_rate": 2.6024923790744686e-05, | |
| "loss": 0.7191091775894165, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.0526, | |
| "step": 436, | |
| "tokens/total": 114294784, | |
| "tokens/train_per_sec_per_gpu": 196.81, | |
| "tokens/trainable": 9553360 | |
| }, | |
| { | |
| "epoch": 0.46477000797660195, | |
| "grad_norm": 0.06411275267601013, | |
| "learning_rate": 2.5954146869546018e-05, | |
| "loss": 0.7149425148963928, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.04407, | |
| "step": 437, | |
| "tokens/total": 114556928, | |
| "tokens/train_per_sec_per_gpu": 222.64, | |
| "tokens/trainable": 9574783 | |
| }, | |
| { | |
| "epoch": 0.46583355490561024, | |
| "grad_norm": 0.06931298971176147, | |
| "learning_rate": 2.588328803550993e-05, | |
| "loss": 0.8728055953979492, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.39362, | |
| "step": 438, | |
| "tokens/total": 114819072, | |
| "tokens/train_per_sec_per_gpu": 154.42, | |
| "tokens/trainable": 9595109 | |
| }, | |
| { | |
| "epoch": 0.46689710183461847, | |
| "grad_norm": 0.07122842967510223, | |
| "learning_rate": 2.5812348263460916e-05, | |
| "loss": 0.8005967140197754, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.22687, | |
| "step": 439, | |
| "tokens/total": 115081216, | |
| "tokens/train_per_sec_per_gpu": 195.46, | |
| "tokens/trainable": 9615835 | |
| }, | |
| { | |
| "epoch": 0.4679606487636267, | |
| "grad_norm": 0.07328256964683533, | |
| "learning_rate": 2.5741328529336934e-05, | |
| "loss": 0.8034292459487915, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.23319, | |
| "step": 440, | |
| "tokens/total": 115343360, | |
| "tokens/train_per_sec_per_gpu": 232.42, | |
| "tokens/trainable": 9638791 | |
| }, | |
| { | |
| "epoch": 0.4690241956926349, | |
| "grad_norm": 0.07195594161748886, | |
| "learning_rate": 2.5670229810176026e-05, | |
| "loss": 0.8185476064682007, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.2672, | |
| "step": 441, | |
| "tokens/total": 115605504, | |
| "tokens/train_per_sec_per_gpu": 154.16, | |
| "tokens/trainable": 9659990 | |
| }, | |
| { | |
| "epoch": 0.47008774262164316, | |
| "grad_norm": 0.07275015860795975, | |
| "learning_rate": 2.5599053084102838e-05, | |
| "loss": 0.8078755736351013, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.24314, | |
| "step": 442, | |
| "tokens/total": 115867648, | |
| "tokens/train_per_sec_per_gpu": 167.17, | |
| "tokens/trainable": 9680516 | |
| }, | |
| { | |
| "epoch": 0.47115128955065144, | |
| "grad_norm": 0.07171031087636948, | |
| "learning_rate": 2.5527799330315182e-05, | |
| "loss": 0.7240858674049377, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.06284, | |
| "step": 443, | |
| "tokens/total": 116129792, | |
| "tokens/train_per_sec_per_gpu": 189.32, | |
| "tokens/trainable": 9700527 | |
| }, | |
| { | |
| "epoch": 0.47221483647965967, | |
| "grad_norm": 0.05753646418452263, | |
| "learning_rate": 2.5456469529070566e-05, | |
| "loss": 0.7394333481788635, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09475, | |
| "step": 444, | |
| "tokens/total": 116391936, | |
| "tokens/train_per_sec_per_gpu": 215.9, | |
| "tokens/trainable": 9725734 | |
| }, | |
| { | |
| "epoch": 0.4732783834086679, | |
| "grad_norm": 0.0627320408821106, | |
| "learning_rate": 2.5385064661672692e-05, | |
| "loss": 0.7002488970756531, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.01425, | |
| "step": 445, | |
| "tokens/total": 116654080, | |
| "tokens/train_per_sec_per_gpu": 166.84, | |
| "tokens/trainable": 9748076 | |
| }, | |
| { | |
| "epoch": 0.47434193033767613, | |
| "grad_norm": 0.06529032438993454, | |
| "learning_rate": 2.5313585710457985e-05, | |
| "loss": 0.7247512936592102, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.06422, | |
| "step": 446, | |
| "tokens/total": 116916224, | |
| "tokens/train_per_sec_per_gpu": 182.03, | |
| "tokens/trainable": 9770158 | |
| }, | |
| { | |
| "epoch": 0.4754054772666844, | |
| "grad_norm": 0.07498825341463089, | |
| "learning_rate": 2.5242033658782043e-05, | |
| "loss": 0.7564839124679565, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13077, | |
| "step": 447, | |
| "tokens/total": 117178368, | |
| "tokens/train_per_sec_per_gpu": 177.15, | |
| "tokens/trainable": 9791564 | |
| }, | |
| { | |
| "epoch": 0.47646902419569265, | |
| "grad_norm": 0.07310041040182114, | |
| "learning_rate": 2.5170409491006145e-05, | |
| "loss": 0.8869813680648804, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.42779, | |
| "step": 448, | |
| "tokens/total": 117440512, | |
| "tokens/train_per_sec_per_gpu": 153.04, | |
| "tokens/trainable": 9812412 | |
| }, | |
| { | |
| "epoch": 0.4775325711247009, | |
| "grad_norm": 0.06502556055784225, | |
| "learning_rate": 2.5098714192483683e-05, | |
| "loss": 0.733482837677002, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.08232, | |
| "step": 449, | |
| "tokens/total": 117702656, | |
| "tokens/train_per_sec_per_gpu": 234.84, | |
| "tokens/trainable": 9836100 | |
| }, | |
| { | |
| "epoch": 0.4785961180537091, | |
| "grad_norm": 0.0711643397808075, | |
| "learning_rate": 2.50269487495466e-05, | |
| "loss": 0.7789556384086609, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.1792, | |
| "step": 450, | |
| "tokens/total": 117964800, | |
| "tokens/train_per_sec_per_gpu": 198.82, | |
| "tokens/trainable": 9857145 | |
| }, | |
| { | |
| "epoch": 0.4785961180537091, | |
| "eval_loss": 0.7710337042808533, | |
| "eval_ppl": 2.162, | |
| "eval_runtime": 237.2453, | |
| "eval_samples_per_second": 28.182, | |
| "eval_steps_per_second": 1.762, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 38.19, | |
| "memory/max_allocated (GiB)": 38.19, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.47965966498271734, | |
| "grad_norm": 0.06686612218618393, | |
| "learning_rate": 2.4955114149491865e-05, | |
| "loss": 0.7786468267440796, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.17852, | |
| "step": 451, | |
| "tokens/total": 118226944, | |
| "tokens/train_per_sec_per_gpu": 164.67, | |
| "tokens/trainable": 9878218 | |
| }, | |
| { | |
| "epoch": 0.4807232119117256, | |
| "grad_norm": 0.0646248385310173, | |
| "learning_rate": 2.488321138056783e-05, | |
| "loss": 0.7580331563949585, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13407, | |
| "step": 452, | |
| "tokens/total": 118489088, | |
| "tokens/train_per_sec_per_gpu": 244.84, | |
| "tokens/trainable": 9900786 | |
| }, | |
| { | |
| "epoch": 0.48178675884073385, | |
| "grad_norm": 0.06668704003095627, | |
| "learning_rate": 2.481124143196069e-05, | |
| "loss": 0.7126317620277405, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.03935, | |
| "step": 453, | |
| "tokens/total": 118751232, | |
| "tokens/train_per_sec_per_gpu": 175.52, | |
| "tokens/trainable": 9921265 | |
| }, | |
| { | |
| "epoch": 0.4828503057697421, | |
| "grad_norm": 0.06527574360370636, | |
| "learning_rate": 2.473920529378083e-05, | |
| "loss": 0.8261401653289795, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.28448, | |
| "step": 454, | |
| "tokens/total": 119013376, | |
| "tokens/train_per_sec_per_gpu": 182.36, | |
| "tokens/trainable": 9942755 | |
| }, | |
| { | |
| "epoch": 0.4839138526987503, | |
| "grad_norm": 0.07106975466012955, | |
| "learning_rate": 2.4667103957049237e-05, | |
| "loss": 0.7335352897644043, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.08243, | |
| "step": 455, | |
| "tokens/total": 119275520, | |
| "tokens/train_per_sec_per_gpu": 161.14, | |
| "tokens/trainable": 9963450 | |
| }, | |
| { | |
| "epoch": 0.4849773996277586, | |
| "grad_norm": 0.06658606976270676, | |
| "learning_rate": 2.4594938413683842e-05, | |
| "loss": 0.8705431222915649, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.38821, | |
| "step": 456, | |
| "tokens/total": 119537664, | |
| "tokens/train_per_sec_per_gpu": 221.13, | |
| "tokens/trainable": 9986403 | |
| }, | |
| { | |
| "epoch": 0.48604094655676683, | |
| "grad_norm": 0.06788789480924606, | |
| "learning_rate": 2.4522709656485896e-05, | |
| "loss": 0.8447569608688354, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.32741, | |
| "step": 457, | |
| "tokens/total": 119799808, | |
| "tokens/train_per_sec_per_gpu": 181.47, | |
| "tokens/trainable": 10007854 | |
| }, | |
| { | |
| "epoch": 0.48710449348577506, | |
| "grad_norm": 0.06112990900874138, | |
| "learning_rate": 2.445041867912629e-05, | |
| "loss": 0.7580868601799011, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13419, | |
| "step": 458, | |
| "tokens/total": 120061952, | |
| "tokens/train_per_sec_per_gpu": 243.31, | |
| "tokens/trainable": 10033216 | |
| }, | |
| { | |
| "epoch": 0.4881680404147833, | |
| "grad_norm": 0.06946436315774918, | |
| "learning_rate": 2.43780664761319e-05, | |
| "loss": 0.7739719152450562, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.16836, | |
| "step": 459, | |
| "tokens/total": 120324096, | |
| "tokens/train_per_sec_per_gpu": 186.88, | |
| "tokens/trainable": 10054546 | |
| }, | |
| { | |
| "epoch": 0.4892315873437915, | |
| "grad_norm": 0.06711182743310928, | |
| "learning_rate": 2.4305654042871893e-05, | |
| "loss": 0.7676090002059937, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15461, | |
| "step": 460, | |
| "tokens/total": 120586240, | |
| "tokens/train_per_sec_per_gpu": 203.9, | |
| "tokens/trainable": 10077655 | |
| }, | |
| { | |
| "epoch": 0.4902951342727998, | |
| "grad_norm": 0.07618647068738937, | |
| "learning_rate": 2.4233182375544052e-05, | |
| "loss": 0.8108090758323669, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.24973, | |
| "step": 461, | |
| "tokens/total": 120848384, | |
| "tokens/train_per_sec_per_gpu": 203.76, | |
| "tokens/trainable": 10099531 | |
| }, | |
| { | |
| "epoch": 0.49135868120180803, | |
| "grad_norm": 0.062073446810245514, | |
| "learning_rate": 2.4160652471161043e-05, | |
| "loss": 0.7443853616714478, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10515, | |
| "step": 462, | |
| "tokens/total": 121110528, | |
| "tokens/train_per_sec_per_gpu": 172.63, | |
| "tokens/trainable": 10121476 | |
| }, | |
| { | |
| "epoch": 0.49242222813081626, | |
| "grad_norm": 0.06608369201421738, | |
| "learning_rate": 2.408806532753674e-05, | |
| "loss": 0.7803705930709839, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.18228, | |
| "step": 463, | |
| "tokens/total": 121372672, | |
| "tokens/train_per_sec_per_gpu": 193.2, | |
| "tokens/trainable": 10142900 | |
| }, | |
| { | |
| "epoch": 0.4934857750598245, | |
| "grad_norm": 0.06678762286901474, | |
| "learning_rate": 2.4015421943272442e-05, | |
| "loss": 0.7602465152740479, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.1388, | |
| "step": 464, | |
| "tokens/total": 121634816, | |
| "tokens/train_per_sec_per_gpu": 144.23, | |
| "tokens/trainable": 10162519 | |
| }, | |
| { | |
| "epoch": 0.4945493219888328, | |
| "grad_norm": 0.07111706584692001, | |
| "learning_rate": 2.3942723317743194e-05, | |
| "loss": 0.8450040817260742, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.32799, | |
| "step": 465, | |
| "tokens/total": 121896960, | |
| "tokens/train_per_sec_per_gpu": 172.92, | |
| "tokens/trainable": 10184843 | |
| }, | |
| { | |
| "epoch": 0.495612868917841, | |
| "grad_norm": 0.06318546831607819, | |
| "learning_rate": 2.3869970451083996e-05, | |
| "loss": 0.7744694948196411, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.16944, | |
| "step": 466, | |
| "tokens/total": 122159104, | |
| "tokens/train_per_sec_per_gpu": 211.14, | |
| "tokens/trainable": 10208041 | |
| }, | |
| { | |
| "epoch": 0.49667641584684924, | |
| "grad_norm": 0.06685180962085724, | |
| "learning_rate": 2.3797164344176054e-05, | |
| "loss": 0.7870070934295654, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19681, | |
| "step": 467, | |
| "tokens/total": 122421248, | |
| "tokens/train_per_sec_per_gpu": 188.13, | |
| "tokens/trainable": 10229287 | |
| }, | |
| { | |
| "epoch": 0.49773996277585747, | |
| "grad_norm": 0.058472346514463425, | |
| "learning_rate": 2.3724305998633033e-05, | |
| "loss": 0.7486791610717773, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11421, | |
| "step": 468, | |
| "tokens/total": 122683392, | |
| "tokens/train_per_sec_per_gpu": 193.34, | |
| "tokens/trainable": 10253293 | |
| }, | |
| { | |
| "epoch": 0.4988035097048657, | |
| "grad_norm": 0.06563540548086166, | |
| "learning_rate": 2.365139641678724e-05, | |
| "loss": 0.793043851852417, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.21011, | |
| "step": 469, | |
| "tokens/total": 122945536, | |
| "tokens/train_per_sec_per_gpu": 208.04, | |
| "tokens/trainable": 10275964 | |
| }, | |
| { | |
| "epoch": 0.499867056633874, | |
| "grad_norm": 0.06955686956644058, | |
| "learning_rate": 2.3578436601675857e-05, | |
| "loss": 0.767907977104187, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15525, | |
| "step": 470, | |
| "tokens/total": 123207680, | |
| "tokens/train_per_sec_per_gpu": 162.65, | |
| "tokens/trainable": 10294893 | |
| }, | |
| { | |
| "epoch": 0.5009306035628822, | |
| "grad_norm": 0.06527870893478394, | |
| "learning_rate": 2.3505427557027153e-05, | |
| "loss": 0.7281315922737122, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.07121, | |
| "step": 471, | |
| "tokens/total": 123469824, | |
| "tokens/train_per_sec_per_gpu": 200.9, | |
| "tokens/trainable": 10318126 | |
| }, | |
| { | |
| "epoch": 0.5019941504918904, | |
| "grad_norm": 0.05844523012638092, | |
| "learning_rate": 2.3432370287246644e-05, | |
| "loss": 0.8092571496963501, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.24624, | |
| "step": 472, | |
| "tokens/total": 123731968, | |
| "tokens/train_per_sec_per_gpu": 191.12, | |
| "tokens/trainable": 10340892 | |
| }, | |
| { | |
| "epoch": 0.5030576974208987, | |
| "grad_norm": 0.0627407431602478, | |
| "learning_rate": 2.3359265797403297e-05, | |
| "loss": 0.8085892200469971, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.24474, | |
| "step": 473, | |
| "tokens/total": 123994112, | |
| "tokens/train_per_sec_per_gpu": 171.13, | |
| "tokens/trainable": 10364090 | |
| }, | |
| { | |
| "epoch": 0.5041212443499069, | |
| "grad_norm": 0.06694270670413971, | |
| "learning_rate": 2.3286115093215717e-05, | |
| "loss": 0.7958250045776367, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.21627, | |
| "step": 474, | |
| "tokens/total": 124256256, | |
| "tokens/train_per_sec_per_gpu": 199.51, | |
| "tokens/trainable": 10385759 | |
| }, | |
| { | |
| "epoch": 0.5051847912789151, | |
| "grad_norm": 0.06240719184279442, | |
| "learning_rate": 2.3212919181038264e-05, | |
| "loss": 0.7695267200469971, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15874, | |
| "step": 475, | |
| "tokens/total": 124518400, | |
| "tokens/train_per_sec_per_gpu": 229.77, | |
| "tokens/trainable": 10411362 | |
| }, | |
| { | |
| "epoch": 0.5062483382079235, | |
| "grad_norm": 0.06532912701368332, | |
| "learning_rate": 2.313967906784725e-05, | |
| "loss": 0.7777595520019531, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.17659, | |
| "step": 476, | |
| "tokens/total": 124780544, | |
| "tokens/train_per_sec_per_gpu": 235.82, | |
| "tokens/trainable": 10433694 | |
| }, | |
| { | |
| "epoch": 0.5073118851369317, | |
| "grad_norm": 0.07455461472272873, | |
| "learning_rate": 2.306639576122708e-05, | |
| "loss": 0.8379300832748413, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.31158, | |
| "step": 477, | |
| "tokens/total": 125042688, | |
| "tokens/train_per_sec_per_gpu": 163.21, | |
| "tokens/trainable": 10452642 | |
| }, | |
| { | |
| "epoch": 0.5083754320659399, | |
| "grad_norm": 0.06591842323541641, | |
| "learning_rate": 2.2993070269356372e-05, | |
| "loss": 0.8277432322502136, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.28815, | |
| "step": 478, | |
| "tokens/total": 125304832, | |
| "tokens/train_per_sec_per_gpu": 150.23, | |
| "tokens/trainable": 10473380 | |
| }, | |
| { | |
| "epoch": 0.5094389789949482, | |
| "grad_norm": 0.06278102844953537, | |
| "learning_rate": 2.2919703600994096e-05, | |
| "loss": 0.7098827362060547, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.03375, | |
| "step": 479, | |
| "tokens/total": 125566976, | |
| "tokens/train_per_sec_per_gpu": 225.4, | |
| "tokens/trainable": 10496551 | |
| }, | |
| { | |
| "epoch": 0.5105025259239564, | |
| "grad_norm": 0.0822196677327156, | |
| "learning_rate": 2.2846296765465708e-05, | |
| "loss": 0.7754343152046204, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.17154, | |
| "step": 480, | |
| "tokens/total": 125829120, | |
| "tokens/train_per_sec_per_gpu": 189.13, | |
| "tokens/trainable": 10518279 | |
| }, | |
| { | |
| "epoch": 0.5115660728529646, | |
| "grad_norm": 0.06427222490310669, | |
| "learning_rate": 2.2772850772649245e-05, | |
| "loss": 0.7595022916793823, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13721, | |
| "step": 481, | |
| "tokens/total": 126091264, | |
| "tokens/train_per_sec_per_gpu": 199.34, | |
| "tokens/trainable": 10541893 | |
| }, | |
| { | |
| "epoch": 0.5126296197819729, | |
| "grad_norm": 0.07555945217609406, | |
| "learning_rate": 2.269936663296146e-05, | |
| "loss": 0.7030783295631409, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.01996, | |
| "step": 482, | |
| "tokens/total": 126353408, | |
| "tokens/train_per_sec_per_gpu": 131.21, | |
| "tokens/trainable": 10560989 | |
| }, | |
| { | |
| "epoch": 0.5136931667109811, | |
| "grad_norm": 0.06928715854883194, | |
| "learning_rate": 2.262584535734387e-05, | |
| "loss": 0.7435761094093323, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10344, | |
| "step": 483, | |
| "tokens/total": 126615552, | |
| "tokens/train_per_sec_per_gpu": 198.19, | |
| "tokens/trainable": 10582215 | |
| }, | |
| { | |
| "epoch": 0.5147567136399893, | |
| "grad_norm": 0.07140571624040604, | |
| "learning_rate": 2.2552287957248914e-05, | |
| "loss": 0.7427330017089844, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10167, | |
| "step": 484, | |
| "tokens/total": 126877696, | |
| "tokens/train_per_sec_per_gpu": 177.97, | |
| "tokens/trainable": 10603392 | |
| }, | |
| { | |
| "epoch": 0.5158202605689977, | |
| "grad_norm": 0.05892965570092201, | |
| "learning_rate": 2.2478695444625993e-05, | |
| "loss": 0.7203789949417114, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.05521, | |
| "step": 485, | |
| "tokens/total": 127139840, | |
| "tokens/train_per_sec_per_gpu": 178.16, | |
| "tokens/trainable": 10625660 | |
| }, | |
| { | |
| "epoch": 0.5168838074980059, | |
| "grad_norm": 0.06429523229598999, | |
| "learning_rate": 2.240506883190756e-05, | |
| "loss": 0.7731969952583313, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.16668, | |
| "step": 486, | |
| "tokens/total": 127401984, | |
| "tokens/train_per_sec_per_gpu": 219.13, | |
| "tokens/trainable": 10649385 | |
| }, | |
| { | |
| "epoch": 0.5179473544270141, | |
| "grad_norm": 0.07350295782089233, | |
| "learning_rate": 2.2331409131995186e-05, | |
| "loss": 0.7495805621147156, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11611, | |
| "step": 487, | |
| "tokens/total": 127664128, | |
| "tokens/train_per_sec_per_gpu": 234.6, | |
| "tokens/trainable": 10670554 | |
| }, | |
| { | |
| "epoch": 0.5190109013560223, | |
| "grad_norm": 0.06763040274381638, | |
| "learning_rate": 2.2257717358245645e-05, | |
| "loss": 0.7657451629638672, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.1506, | |
| "step": 488, | |
| "tokens/total": 127926272, | |
| "tokens/train_per_sec_per_gpu": 211.17, | |
| "tokens/trainable": 10694102 | |
| }, | |
| { | |
| "epoch": 0.5200744482850306, | |
| "grad_norm": 0.059085726737976074, | |
| "learning_rate": 2.2183994524456946e-05, | |
| "loss": 0.7091976404190063, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.03236, | |
| "step": 489, | |
| "tokens/total": 128188416, | |
| "tokens/train_per_sec_per_gpu": 185.08, | |
| "tokens/trainable": 10716407 | |
| }, | |
| { | |
| "epoch": 0.5211379952140388, | |
| "grad_norm": 0.06827449053525925, | |
| "learning_rate": 2.2110241644854415e-05, | |
| "loss": 0.7978835105895996, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.22084, | |
| "step": 490, | |
| "tokens/total": 128450560, | |
| "tokens/train_per_sec_per_gpu": 241.5, | |
| "tokens/trainable": 10739309 | |
| }, | |
| { | |
| "epoch": 0.522201542143047, | |
| "grad_norm": 0.06967286020517349, | |
| "learning_rate": 2.2036459734076715e-05, | |
| "loss": 0.7917447090148926, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.20724, | |
| "step": 491, | |
| "tokens/total": 128712704, | |
| "tokens/train_per_sec_per_gpu": 183.24, | |
| "tokens/trainable": 10759856 | |
| }, | |
| { | |
| "epoch": 0.5232650890720553, | |
| "grad_norm": 0.06789640337228775, | |
| "learning_rate": 2.196264980716189e-05, | |
| "loss": 0.785992443561554, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19458, | |
| "step": 492, | |
| "tokens/total": 128974848, | |
| "tokens/train_per_sec_per_gpu": 208.01, | |
| "tokens/trainable": 10783453 | |
| }, | |
| { | |
| "epoch": 0.5243286360010635, | |
| "grad_norm": 0.07713824510574341, | |
| "learning_rate": 2.1888812879533438e-05, | |
| "loss": 0.740135908126831, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09622, | |
| "step": 493, | |
| "tokens/total": 129236992, | |
| "tokens/train_per_sec_per_gpu": 193.38, | |
| "tokens/trainable": 10803022 | |
| }, | |
| { | |
| "epoch": 0.5253921829300718, | |
| "grad_norm": 0.06190735474228859, | |
| "learning_rate": 2.1814949966986288e-05, | |
| "loss": 0.7506577968597412, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11839, | |
| "step": 494, | |
| "tokens/total": 129499136, | |
| "tokens/train_per_sec_per_gpu": 186.68, | |
| "tokens/trainable": 10825782 | |
| }, | |
| { | |
| "epoch": 0.5264557298590801, | |
| "grad_norm": 0.07524209469556808, | |
| "learning_rate": 2.174106208567286e-05, | |
| "loss": 0.8435920476913452, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.3247, | |
| "step": 495, | |
| "tokens/total": 129761280, | |
| "tokens/train_per_sec_per_gpu": 209.07, | |
| "tokens/trainable": 10848136 | |
| }, | |
| { | |
| "epoch": 0.5275192767880883, | |
| "grad_norm": 0.0792510136961937, | |
| "learning_rate": 2.166715025208908e-05, | |
| "loss": 0.795114278793335, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.21469, | |
| "step": 496, | |
| "tokens/total": 130023424, | |
| "tokens/train_per_sec_per_gpu": 176.49, | |
| "tokens/trainable": 10869582 | |
| }, | |
| { | |
| "epoch": 0.5285828237170965, | |
| "grad_norm": 0.06474092602729797, | |
| "learning_rate": 2.1593215483060382e-05, | |
| "loss": 0.7897614240646362, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.20287, | |
| "step": 497, | |
| "tokens/total": 130285568, | |
| "tokens/train_per_sec_per_gpu": 209.6, | |
| "tokens/trainable": 10893683 | |
| }, | |
| { | |
| "epoch": 0.5296463706461048, | |
| "grad_norm": 0.06536766141653061, | |
| "learning_rate": 2.151925879572774e-05, | |
| "loss": 0.7543013095855713, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12613, | |
| "step": 498, | |
| "tokens/total": 130547712, | |
| "tokens/train_per_sec_per_gpu": 224.22, | |
| "tokens/trainable": 10916399 | |
| }, | |
| { | |
| "epoch": 0.530709917575113, | |
| "grad_norm": 0.06765280663967133, | |
| "learning_rate": 2.144528120753365e-05, | |
| "loss": 0.7390546202659607, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09395, | |
| "step": 499, | |
| "tokens/total": 130809856, | |
| "tokens/train_per_sec_per_gpu": 169.0, | |
| "tokens/trainable": 10938004 | |
| }, | |
| { | |
| "epoch": 0.5317734645041212, | |
| "grad_norm": 0.07433243840932846, | |
| "learning_rate": 2.137128373620817e-05, | |
| "loss": 0.7480766773223877, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11293, | |
| "step": 500, | |
| "tokens/total": 131072000, | |
| "tokens/train_per_sec_per_gpu": 188.24, | |
| "tokens/trainable": 10960082 | |
| }, | |
| { | |
| "epoch": 0.5328370114331294, | |
| "grad_norm": 0.058342620730400085, | |
| "learning_rate": 2.129726739975486e-05, | |
| "loss": 0.73946613073349, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09482, | |
| "step": 501, | |
| "tokens/total": 131334144, | |
| "tokens/train_per_sec_per_gpu": 230.56, | |
| "tokens/trainable": 10983591 | |
| }, | |
| { | |
| "epoch": 0.5339005583621377, | |
| "grad_norm": 0.06705200672149658, | |
| "learning_rate": 2.1223233216436858e-05, | |
| "loss": 0.7899049520492554, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.20319, | |
| "step": 502, | |
| "tokens/total": 131596288, | |
| "tokens/train_per_sec_per_gpu": 177.49, | |
| "tokens/trainable": 11007036 | |
| }, | |
| { | |
| "epoch": 0.534964105291146, | |
| "grad_norm": 0.07227369397878647, | |
| "learning_rate": 2.114918220476279e-05, | |
| "loss": 0.7793487310409546, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.18005, | |
| "step": 503, | |
| "tokens/total": 131858432, | |
| "tokens/train_per_sec_per_gpu": 173.77, | |
| "tokens/trainable": 11030251 | |
| }, | |
| { | |
| "epoch": 0.5360276522201542, | |
| "grad_norm": 0.0691395178437233, | |
| "learning_rate": 2.1075115383472803e-05, | |
| "loss": 0.7794291973114014, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.18023, | |
| "step": 504, | |
| "tokens/total": 132120576, | |
| "tokens/train_per_sec_per_gpu": 161.28, | |
| "tokens/trainable": 11049765 | |
| }, | |
| { | |
| "epoch": 0.5370911991491625, | |
| "grad_norm": 0.06914931535720825, | |
| "learning_rate": 2.1001033771524556e-05, | |
| "loss": 0.7482678890228271, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11334, | |
| "step": 505, | |
| "tokens/total": 132382720, | |
| "tokens/train_per_sec_per_gpu": 204.45, | |
| "tokens/trainable": 11072121 | |
| }, | |
| { | |
| "epoch": 0.5381547460781707, | |
| "grad_norm": 0.06037479639053345, | |
| "learning_rate": 2.0926938388079168e-05, | |
| "loss": 0.7241630554199219, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.063, | |
| "step": 506, | |
| "tokens/total": 132644864, | |
| "tokens/train_per_sec_per_gpu": 211.36, | |
| "tokens/trainable": 11094232 | |
| }, | |
| { | |
| "epoch": 0.5392182930071789, | |
| "grad_norm": 0.06841259449720383, | |
| "learning_rate": 2.085283025248723e-05, | |
| "loss": 0.7322399616241455, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.07973, | |
| "step": 507, | |
| "tokens/total": 132907008, | |
| "tokens/train_per_sec_per_gpu": 200.68, | |
| "tokens/trainable": 11118027 | |
| }, | |
| { | |
| "epoch": 0.5402818399361872, | |
| "grad_norm": 0.0649460032582283, | |
| "learning_rate": 2.0778710384274757e-05, | |
| "loss": 0.7410999536514282, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09824, | |
| "step": 508, | |
| "tokens/total": 133169152, | |
| "tokens/train_per_sec_per_gpu": 187.65, | |
| "tokens/trainable": 11142472 | |
| }, | |
| { | |
| "epoch": 0.5413453868651954, | |
| "grad_norm": 0.06589141488075256, | |
| "learning_rate": 2.0704579803129184e-05, | |
| "loss": 0.7477791905403137, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.1123, | |
| "step": 509, | |
| "tokens/total": 133431296, | |
| "tokens/train_per_sec_per_gpu": 186.9, | |
| "tokens/trainable": 11164967 | |
| }, | |
| { | |
| "epoch": 0.5424089337942036, | |
| "grad_norm": 0.07234744727611542, | |
| "learning_rate": 2.0630439528885314e-05, | |
| "loss": 0.82126784324646, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.27338, | |
| "step": 510, | |
| "tokens/total": 133693440, | |
| "tokens/train_per_sec_per_gpu": 183.66, | |
| "tokens/trainable": 11187755 | |
| }, | |
| { | |
| "epoch": 0.5434724807232119, | |
| "grad_norm": 0.06735506653785706, | |
| "learning_rate": 2.0556290581511314e-05, | |
| "loss": 0.7757540941238403, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.17223, | |
| "step": 511, | |
| "tokens/total": 133955584, | |
| "tokens/train_per_sec_per_gpu": 200.97, | |
| "tokens/trainable": 11210450 | |
| }, | |
| { | |
| "epoch": 0.5445360276522202, | |
| "grad_norm": 0.06329286843538284, | |
| "learning_rate": 2.0482133981094656e-05, | |
| "loss": 0.7571574449539185, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13221, | |
| "step": 512, | |
| "tokens/total": 134217728, | |
| "tokens/train_per_sec_per_gpu": 193.23, | |
| "tokens/trainable": 11232157 | |
| }, | |
| { | |
| "epoch": 0.5455995745812284, | |
| "grad_norm": 0.06758707016706467, | |
| "learning_rate": 2.0407970747828113e-05, | |
| "loss": 0.7560121417045593, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12977, | |
| "step": 513, | |
| "tokens/total": 134479872, | |
| "tokens/train_per_sec_per_gpu": 171.6, | |
| "tokens/trainable": 11253596 | |
| }, | |
| { | |
| "epoch": 0.5466631215102367, | |
| "grad_norm": 0.067698173224926, | |
| "learning_rate": 2.033380190199569e-05, | |
| "loss": 0.7989984750747681, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.22331, | |
| "step": 514, | |
| "tokens/total": 134742016, | |
| "tokens/train_per_sec_per_gpu": 206.04, | |
| "tokens/trainable": 11276800 | |
| }, | |
| { | |
| "epoch": 0.5477266684392449, | |
| "grad_norm": 0.06352519989013672, | |
| "learning_rate": 2.025962846395862e-05, | |
| "loss": 0.7677520513534546, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15492, | |
| "step": 515, | |
| "tokens/total": 135004160, | |
| "tokens/train_per_sec_per_gpu": 189.51, | |
| "tokens/trainable": 11300134 | |
| }, | |
| { | |
| "epoch": 0.5487902153682531, | |
| "grad_norm": 0.0675693228840828, | |
| "learning_rate": 2.0185451454141307e-05, | |
| "loss": 0.8176583647727966, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.26519, | |
| "step": 516, | |
| "tokens/total": 135266304, | |
| "tokens/train_per_sec_per_gpu": 170.04, | |
| "tokens/trainable": 11322260 | |
| }, | |
| { | |
| "epoch": 0.5498537622972614, | |
| "grad_norm": 0.07476314902305603, | |
| "learning_rate": 2.0111271893017298e-05, | |
| "loss": 0.7549651861190796, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12754, | |
| "step": 517, | |
| "tokens/total": 135528448, | |
| "tokens/train_per_sec_per_gpu": 178.37, | |
| "tokens/trainable": 11342564 | |
| }, | |
| { | |
| "epoch": 0.5509173092262696, | |
| "grad_norm": 0.08098773658275604, | |
| "learning_rate": 2.0037090801095217e-05, | |
| "loss": 0.8205512762069702, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.27175, | |
| "step": 518, | |
| "tokens/total": 135790592, | |
| "tokens/train_per_sec_per_gpu": 195.1, | |
| "tokens/trainable": 11364054 | |
| }, | |
| { | |
| "epoch": 0.5519808561552778, | |
| "grad_norm": 0.07975499331951141, | |
| "learning_rate": 1.9962909198904782e-05, | |
| "loss": 0.822569727897644, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.27634, | |
| "step": 519, | |
| "tokens/total": 136052736, | |
| "tokens/train_per_sec_per_gpu": 185.35, | |
| "tokens/trainable": 11385094 | |
| }, | |
| { | |
| "epoch": 0.553044403084286, | |
| "grad_norm": 0.0654132217168808, | |
| "learning_rate": 1.9888728106982712e-05, | |
| "loss": 0.7287492156028748, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.07249, | |
| "step": 520, | |
| "tokens/total": 136314880, | |
| "tokens/train_per_sec_per_gpu": 188.86, | |
| "tokens/trainable": 11409016 | |
| }, | |
| { | |
| "epoch": 0.5541079500132944, | |
| "grad_norm": 0.06833093613386154, | |
| "learning_rate": 1.98145485458587e-05, | |
| "loss": 0.7531672716140747, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12372, | |
| "step": 521, | |
| "tokens/total": 136577024, | |
| "tokens/train_per_sec_per_gpu": 246.44, | |
| "tokens/trainable": 11431115 | |
| }, | |
| { | |
| "epoch": 0.5551714969423026, | |
| "grad_norm": 0.0704493448138237, | |
| "learning_rate": 1.9740371536041388e-05, | |
| "loss": 0.8314918875694275, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.29674, | |
| "step": 522, | |
| "tokens/total": 136839168, | |
| "tokens/train_per_sec_per_gpu": 211.44, | |
| "tokens/trainable": 11454368 | |
| }, | |
| { | |
| "epoch": 0.5562350438713108, | |
| "grad_norm": 0.07962756603956223, | |
| "learning_rate": 1.966619809800432e-05, | |
| "loss": 0.7510333061218262, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11919, | |
| "step": 523, | |
| "tokens/total": 137101312, | |
| "tokens/train_per_sec_per_gpu": 174.57, | |
| "tokens/trainable": 11474895 | |
| }, | |
| { | |
| "epoch": 0.5572985908003191, | |
| "grad_norm": 0.07758195698261261, | |
| "learning_rate": 1.95920292521719e-05, | |
| "loss": 0.7872558236122131, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19736, | |
| "step": 524, | |
| "tokens/total": 137363456, | |
| "tokens/train_per_sec_per_gpu": 204.94, | |
| "tokens/trainable": 11494814 | |
| }, | |
| { | |
| "epoch": 0.5583621377293273, | |
| "grad_norm": 0.06898010522127151, | |
| "learning_rate": 1.9517866018905347e-05, | |
| "loss": 0.7131535410881042, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.04042, | |
| "step": 525, | |
| "tokens/total": 137625600, | |
| "tokens/train_per_sec_per_gpu": 186.39, | |
| "tokens/trainable": 11516315 | |
| }, | |
| { | |
| "epoch": 0.5594256846583355, | |
| "grad_norm": 0.0707436203956604, | |
| "learning_rate": 1.9443709418488692e-05, | |
| "loss": 0.7908194065093994, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.2052, | |
| "step": 526, | |
| "tokens/total": 137887744, | |
| "tokens/train_per_sec_per_gpu": 178.33, | |
| "tokens/trainable": 11536975 | |
| }, | |
| { | |
| "epoch": 0.5604892315873438, | |
| "grad_norm": 0.06255731731653214, | |
| "learning_rate": 1.9369560471114693e-05, | |
| "loss": 0.7359522581100464, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.08747, | |
| "step": 527, | |
| "tokens/total": 138149888, | |
| "tokens/train_per_sec_per_gpu": 191.7, | |
| "tokens/trainable": 11558976 | |
| }, | |
| { | |
| "epoch": 0.561552778516352, | |
| "grad_norm": 0.06818301230669022, | |
| "learning_rate": 1.9295420196870826e-05, | |
| "loss": 0.759716808795929, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13767, | |
| "step": 528, | |
| "tokens/total": 138412032, | |
| "tokens/train_per_sec_per_gpu": 228.46, | |
| "tokens/trainable": 11580954 | |
| }, | |
| { | |
| "epoch": 0.5626163254453602, | |
| "grad_norm": 0.07047592103481293, | |
| "learning_rate": 1.922128961572525e-05, | |
| "loss": 0.8164031505584717, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.26235, | |
| "step": 529, | |
| "tokens/total": 138674176, | |
| "tokens/train_per_sec_per_gpu": 185.46, | |
| "tokens/trainable": 11603196 | |
| }, | |
| { | |
| "epoch": 0.5636798723743686, | |
| "grad_norm": 0.06535010784864426, | |
| "learning_rate": 1.9147169747512773e-05, | |
| "loss": 0.7326769828796387, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.08064, | |
| "step": 530, | |
| "tokens/total": 138936320, | |
| "tokens/train_per_sec_per_gpu": 192.3, | |
| "tokens/trainable": 11625136 | |
| }, | |
| { | |
| "epoch": 0.5647434193033768, | |
| "grad_norm": 0.20296123623847961, | |
| "learning_rate": 1.9073061611920835e-05, | |
| "loss": 0.8644706606864929, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.37375, | |
| "step": 531, | |
| "tokens/total": 139198464, | |
| "tokens/train_per_sec_per_gpu": 168.12, | |
| "tokens/trainable": 11649882 | |
| }, | |
| { | |
| "epoch": 0.565806966232385, | |
| "grad_norm": 0.07702672481536865, | |
| "learning_rate": 1.899896622847545e-05, | |
| "loss": 0.6990363001823425, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.01181, | |
| "step": 532, | |
| "tokens/total": 139460608, | |
| "tokens/train_per_sec_per_gpu": 188.85, | |
| "tokens/trainable": 11671814 | |
| }, | |
| { | |
| "epoch": 0.5668705131613933, | |
| "grad_norm": 0.06929226219654083, | |
| "learning_rate": 1.89248846165272e-05, | |
| "loss": 0.7941587567329407, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.21258, | |
| "step": 533, | |
| "tokens/total": 139722752, | |
| "tokens/train_per_sec_per_gpu": 194.34, | |
| "tokens/trainable": 11695013 | |
| }, | |
| { | |
| "epoch": 0.5679340600904015, | |
| "grad_norm": 0.07153689116239548, | |
| "learning_rate": 1.885081779523722e-05, | |
| "loss": 0.7691222429275513, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15787, | |
| "step": 534, | |
| "tokens/total": 139984896, | |
| "tokens/train_per_sec_per_gpu": 240.08, | |
| "tokens/trainable": 11717160 | |
| }, | |
| { | |
| "epoch": 0.5689976070194097, | |
| "grad_norm": 0.06492677330970764, | |
| "learning_rate": 1.8776766783563152e-05, | |
| "loss": 0.7174063920974731, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.04911, | |
| "step": 535, | |
| "tokens/total": 140247040, | |
| "tokens/train_per_sec_per_gpu": 162.24, | |
| "tokens/trainable": 11738101 | |
| }, | |
| { | |
| "epoch": 0.5700611539484179, | |
| "grad_norm": 0.06563756614923477, | |
| "learning_rate": 1.8702732600245138e-05, | |
| "loss": 0.7257460355758667, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.06627, | |
| "step": 536, | |
| "tokens/total": 140509184, | |
| "tokens/train_per_sec_per_gpu": 173.62, | |
| "tokens/trainable": 11759125 | |
| }, | |
| { | |
| "epoch": 0.5711247008774262, | |
| "grad_norm": 0.06024221330881119, | |
| "learning_rate": 1.8628716263791837e-05, | |
| "loss": 0.747328519821167, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11135, | |
| "step": 537, | |
| "tokens/total": 140771328, | |
| "tokens/train_per_sec_per_gpu": 207.16, | |
| "tokens/trainable": 11782122 | |
| }, | |
| { | |
| "epoch": 0.5721882478064345, | |
| "grad_norm": 0.06131380796432495, | |
| "learning_rate": 1.8554718792466353e-05, | |
| "loss": 0.7804300785064697, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.18241, | |
| "step": 538, | |
| "tokens/total": 141033472, | |
| "tokens/train_per_sec_per_gpu": 198.93, | |
| "tokens/trainable": 11805081 | |
| }, | |
| { | |
| "epoch": 0.5732517947354427, | |
| "grad_norm": 0.0673721581697464, | |
| "learning_rate": 1.8480741204272268e-05, | |
| "loss": 0.8499374389648438, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.3395, | |
| "step": 539, | |
| "tokens/total": 141295616, | |
| "tokens/train_per_sec_per_gpu": 205.27, | |
| "tokens/trainable": 11827941 | |
| }, | |
| { | |
| "epoch": 0.574315341664451, | |
| "grad_norm": 0.07307814806699753, | |
| "learning_rate": 1.8406784516939628e-05, | |
| "loss": 0.740190863609314, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09634, | |
| "step": 540, | |
| "tokens/total": 141557760, | |
| "tokens/train_per_sec_per_gpu": 200.34, | |
| "tokens/trainable": 11850776 | |
| }, | |
| { | |
| "epoch": 0.5753788885934592, | |
| "grad_norm": 0.06450652331113815, | |
| "learning_rate": 1.8332849747910925e-05, | |
| "loss": 0.7271907329559326, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.06926, | |
| "step": 541, | |
| "tokens/total": 141819904, | |
| "tokens/train_per_sec_per_gpu": 199.39, | |
| "tokens/trainable": 11872489 | |
| }, | |
| { | |
| "epoch": 0.5764424355224674, | |
| "grad_norm": 0.06675565242767334, | |
| "learning_rate": 1.825893791432714e-05, | |
| "loss": 0.7834812998771667, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.18908, | |
| "step": 542, | |
| "tokens/total": 142082048, | |
| "tokens/train_per_sec_per_gpu": 220.61, | |
| "tokens/trainable": 11894441 | |
| }, | |
| { | |
| "epoch": 0.5775059824514757, | |
| "grad_norm": 0.07322388142347336, | |
| "learning_rate": 1.8185050033013715e-05, | |
| "loss": 0.8086245656013489, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.24482, | |
| "step": 543, | |
| "tokens/total": 142344192, | |
| "tokens/train_per_sec_per_gpu": 179.2, | |
| "tokens/trainable": 11915451 | |
| }, | |
| { | |
| "epoch": 0.5785695293804839, | |
| "grad_norm": 0.07201948016881943, | |
| "learning_rate": 1.811118712046657e-05, | |
| "loss": 0.8121210336685181, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.25268, | |
| "step": 544, | |
| "tokens/total": 142606336, | |
| "tokens/train_per_sec_per_gpu": 175.39, | |
| "tokens/trainable": 11937488 | |
| }, | |
| { | |
| "epoch": 0.5796330763094921, | |
| "grad_norm": 0.06886903196573257, | |
| "learning_rate": 1.8037350192838117e-05, | |
| "loss": 0.7545644044876099, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12668, | |
| "step": 545, | |
| "tokens/total": 142868480, | |
| "tokens/train_per_sec_per_gpu": 194.01, | |
| "tokens/trainable": 11959978 | |
| }, | |
| { | |
| "epoch": 0.5806966232385004, | |
| "grad_norm": 0.07075604796409607, | |
| "learning_rate": 1.7963540265923298e-05, | |
| "loss": 0.7854512333869934, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.1934, | |
| "step": 546, | |
| "tokens/total": 143130624, | |
| "tokens/train_per_sec_per_gpu": 187.1, | |
| "tokens/trainable": 11980805 | |
| }, | |
| { | |
| "epoch": 0.5817601701675087, | |
| "grad_norm": 0.06993231177330017, | |
| "learning_rate": 1.788975835514559e-05, | |
| "loss": 0.7716733813285828, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.16338, | |
| "step": 547, | |
| "tokens/total": 143392768, | |
| "tokens/train_per_sec_per_gpu": 176.49, | |
| "tokens/trainable": 12002477 | |
| }, | |
| { | |
| "epoch": 0.5828237170965169, | |
| "grad_norm": 0.06735540181398392, | |
| "learning_rate": 1.7816005475543057e-05, | |
| "loss": 0.7268585562705994, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.06857, | |
| "step": 548, | |
| "tokens/total": 143654912, | |
| "tokens/train_per_sec_per_gpu": 168.14, | |
| "tokens/trainable": 12024394 | |
| }, | |
| { | |
| "epoch": 0.5838872640255252, | |
| "grad_norm": 0.06800015270709991, | |
| "learning_rate": 1.7742282641754362e-05, | |
| "loss": 0.7047498226165771, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.02334, | |
| "step": 549, | |
| "tokens/total": 143917056, | |
| "tokens/train_per_sec_per_gpu": 173.12, | |
| "tokens/trainable": 12045702 | |
| }, | |
| { | |
| "epoch": 0.5849508109545334, | |
| "grad_norm": 0.07099801301956177, | |
| "learning_rate": 1.766859086800482e-05, | |
| "loss": 0.7587050199508667, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13551, | |
| "step": 550, | |
| "tokens/total": 144179200, | |
| "tokens/train_per_sec_per_gpu": 142.06, | |
| "tokens/trainable": 12067322 | |
| }, | |
| { | |
| "epoch": 0.5860143578835416, | |
| "grad_norm": 0.06389941275119781, | |
| "learning_rate": 1.759493116809245e-05, | |
| "loss": 0.7534220814704895, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12426, | |
| "step": 551, | |
| "tokens/total": 144441344, | |
| "tokens/train_per_sec_per_gpu": 208.95, | |
| "tokens/trainable": 12090670 | |
| }, | |
| { | |
| "epoch": 0.5870779048125498, | |
| "grad_norm": 0.08602443337440491, | |
| "learning_rate": 1.7521304555374013e-05, | |
| "loss": 0.7858190536499023, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.1942, | |
| "step": 552, | |
| "tokens/total": 144703488, | |
| "tokens/train_per_sec_per_gpu": 183.64, | |
| "tokens/trainable": 12112909 | |
| }, | |
| { | |
| "epoch": 0.5881414517415581, | |
| "grad_norm": 0.06364187598228455, | |
| "learning_rate": 1.7447712042751086e-05, | |
| "loss": 0.8089983463287354, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.24566, | |
| "step": 553, | |
| "tokens/total": 144965632, | |
| "tokens/train_per_sec_per_gpu": 221.48, | |
| "tokens/trainable": 12136873 | |
| }, | |
| { | |
| "epoch": 0.5892049986705663, | |
| "grad_norm": 0.06705693900585175, | |
| "learning_rate": 1.7374154642656133e-05, | |
| "loss": 0.7520922422409058, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12143, | |
| "step": 554, | |
| "tokens/total": 145227776, | |
| "tokens/train_per_sec_per_gpu": 211.51, | |
| "tokens/trainable": 12158329 | |
| }, | |
| { | |
| "epoch": 0.5902685455995745, | |
| "grad_norm": 0.06293050199747086, | |
| "learning_rate": 1.730063336703855e-05, | |
| "loss": 0.833366870880127, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.30105, | |
| "step": 555, | |
| "tokens/total": 145489920, | |
| "tokens/train_per_sec_per_gpu": 209.36, | |
| "tokens/trainable": 12180630 | |
| }, | |
| { | |
| "epoch": 0.5913320925285829, | |
| "grad_norm": 0.07079198956489563, | |
| "learning_rate": 1.722714922735076e-05, | |
| "loss": 0.6932408213615417, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.00019, | |
| "step": 556, | |
| "tokens/total": 145752064, | |
| "tokens/train_per_sec_per_gpu": 194.23, | |
| "tokens/trainable": 12202488 | |
| }, | |
| { | |
| "epoch": 0.5923956394575911, | |
| "grad_norm": 0.07012391835451126, | |
| "learning_rate": 1.7153703234534302e-05, | |
| "loss": 0.7621327042579651, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.14284, | |
| "step": 557, | |
| "tokens/total": 146014208, | |
| "tokens/train_per_sec_per_gpu": 216.52, | |
| "tokens/trainable": 12226431 | |
| }, | |
| { | |
| "epoch": 0.5934591863865993, | |
| "grad_norm": 0.061280712485313416, | |
| "learning_rate": 1.708029639900591e-05, | |
| "loss": 0.7005034685134888, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.01477, | |
| "step": 558, | |
| "tokens/total": 146276352, | |
| "tokens/train_per_sec_per_gpu": 192.83, | |
| "tokens/trainable": 12247296 | |
| }, | |
| { | |
| "epoch": 0.5945227333156076, | |
| "grad_norm": 0.06636322289705276, | |
| "learning_rate": 1.7006929730643635e-05, | |
| "loss": 0.6887087821960449, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 1.99114, | |
| "step": 559, | |
| "tokens/total": 146538496, | |
| "tokens/train_per_sec_per_gpu": 207.99, | |
| "tokens/trainable": 12268797 | |
| }, | |
| { | |
| "epoch": 0.5955862802446158, | |
| "grad_norm": 0.07723846286535263, | |
| "learning_rate": 1.6933604238772924e-05, | |
| "loss": 0.6889795064926147, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 1.99168, | |
| "step": 560, | |
| "tokens/total": 146800640, | |
| "tokens/train_per_sec_per_gpu": 200.15, | |
| "tokens/trainable": 12289775 | |
| }, | |
| { | |
| "epoch": 0.596649827173624, | |
| "grad_norm": 0.06970304995775223, | |
| "learning_rate": 1.6860320932152755e-05, | |
| "loss": 0.7143691182136536, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.0429, | |
| "step": 561, | |
| "tokens/total": 147062784, | |
| "tokens/train_per_sec_per_gpu": 167.54, | |
| "tokens/trainable": 12309110 | |
| }, | |
| { | |
| "epoch": 0.5977133741026323, | |
| "grad_norm": 0.07083282619714737, | |
| "learning_rate": 1.6787080818961746e-05, | |
| "loss": 0.7616149187088013, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.14173, | |
| "step": 562, | |
| "tokens/total": 147324928, | |
| "tokens/train_per_sec_per_gpu": 170.61, | |
| "tokens/trainable": 12329610 | |
| }, | |
| { | |
| "epoch": 0.5987769210316405, | |
| "grad_norm": 0.06794283539056778, | |
| "learning_rate": 1.6713884906784293e-05, | |
| "loss": 0.7875048518180847, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19791, | |
| "step": 563, | |
| "tokens/total": 147587072, | |
| "tokens/train_per_sec_per_gpu": 160.01, | |
| "tokens/trainable": 12350916 | |
| }, | |
| { | |
| "epoch": 0.5998404679606487, | |
| "grad_norm": 0.07991766929626465, | |
| "learning_rate": 1.6640734202596702e-05, | |
| "loss": 0.7792809009552002, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.1799, | |
| "step": 564, | |
| "tokens/total": 147849216, | |
| "tokens/train_per_sec_per_gpu": 155.73, | |
| "tokens/trainable": 12370256 | |
| }, | |
| { | |
| "epoch": 0.6009040148896571, | |
| "grad_norm": 0.07368163019418716, | |
| "learning_rate": 1.6567629712753363e-05, | |
| "loss": 0.7557001113891602, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.1291, | |
| "step": 565, | |
| "tokens/total": 148111360, | |
| "tokens/train_per_sec_per_gpu": 184.87, | |
| "tokens/trainable": 12391117 | |
| }, | |
| { | |
| "epoch": 0.6019675618186653, | |
| "grad_norm": 0.07445425540208817, | |
| "learning_rate": 1.6494572442972857e-05, | |
| "loss": 0.7838120460510254, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.1898, | |
| "step": 566, | |
| "tokens/total": 148373504, | |
| "tokens/train_per_sec_per_gpu": 188.91, | |
| "tokens/trainable": 12412378 | |
| }, | |
| { | |
| "epoch": 0.6030311087476735, | |
| "grad_norm": 0.0670490711927414, | |
| "learning_rate": 1.642156339832415e-05, | |
| "loss": 0.7417568564414978, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09962, | |
| "step": 567, | |
| "tokens/total": 148635648, | |
| "tokens/train_per_sec_per_gpu": 170.03, | |
| "tokens/trainable": 12433097 | |
| }, | |
| { | |
| "epoch": 0.6040946556766817, | |
| "grad_norm": 0.06880825757980347, | |
| "learning_rate": 1.634860358321277e-05, | |
| "loss": 0.7300165891647339, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.07512, | |
| "step": 568, | |
| "tokens/total": 148897792, | |
| "tokens/train_per_sec_per_gpu": 195.18, | |
| "tokens/trainable": 12455206 | |
| }, | |
| { | |
| "epoch": 0.60515820260569, | |
| "grad_norm": 0.07096575200557709, | |
| "learning_rate": 1.627569400136697e-05, | |
| "loss": 0.8089407682418823, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.24553, | |
| "step": 569, | |
| "tokens/total": 149159936, | |
| "tokens/train_per_sec_per_gpu": 191.16, | |
| "tokens/trainable": 12478007 | |
| }, | |
| { | |
| "epoch": 0.6062217495346982, | |
| "grad_norm": 0.0826595202088356, | |
| "learning_rate": 1.620283565582395e-05, | |
| "loss": 0.7439614534378052, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10425, | |
| "step": 570, | |
| "tokens/total": 149422080, | |
| "tokens/train_per_sec_per_gpu": 186.55, | |
| "tokens/trainable": 12498799 | |
| }, | |
| { | |
| "epoch": 0.6072852964637064, | |
| "grad_norm": 0.08035894483327866, | |
| "learning_rate": 1.6130029548916007e-05, | |
| "loss": 0.7608100175857544, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.14001, | |
| "step": 571, | |
| "tokens/total": 149684224, | |
| "tokens/train_per_sec_per_gpu": 177.96, | |
| "tokens/trainable": 12518407 | |
| }, | |
| { | |
| "epoch": 0.6083488433927147, | |
| "grad_norm": 0.07164735347032547, | |
| "learning_rate": 1.605727668225681e-05, | |
| "loss": 0.8451493978500366, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.32833, | |
| "step": 572, | |
| "tokens/total": 149946368, | |
| "tokens/train_per_sec_per_gpu": 219.11, | |
| "tokens/trainable": 12541561 | |
| }, | |
| { | |
| "epoch": 0.6094123903217229, | |
| "grad_norm": 0.07182083278894424, | |
| "learning_rate": 1.5984578056727564e-05, | |
| "loss": 0.8233068585395813, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.27802, | |
| "step": 573, | |
| "tokens/total": 150208512, | |
| "tokens/train_per_sec_per_gpu": 208.4, | |
| "tokens/trainable": 12562762 | |
| }, | |
| { | |
| "epoch": 0.6104759372507312, | |
| "grad_norm": 0.06765095144510269, | |
| "learning_rate": 1.591193467246327e-05, | |
| "loss": 0.7087922096252441, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.03154, | |
| "step": 574, | |
| "tokens/total": 150470656, | |
| "tokens/train_per_sec_per_gpu": 149.32, | |
| "tokens/trainable": 12583197 | |
| }, | |
| { | |
| "epoch": 0.6115394841797395, | |
| "grad_norm": 0.07710757106542587, | |
| "learning_rate": 1.5839347528838957e-05, | |
| "loss": 0.7562744617462158, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13032, | |
| "step": 575, | |
| "tokens/total": 150732800, | |
| "tokens/train_per_sec_per_gpu": 121.82, | |
| "tokens/trainable": 12601739 | |
| }, | |
| { | |
| "epoch": 0.6126030311087477, | |
| "grad_norm": 0.07722500711679459, | |
| "learning_rate": 1.5766817624455954e-05, | |
| "loss": 0.8186255693435669, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.26738, | |
| "step": 576, | |
| "tokens/total": 150994944, | |
| "tokens/train_per_sec_per_gpu": 177.23, | |
| "tokens/trainable": 12623589 | |
| }, | |
| { | |
| "epoch": 0.6136665780377559, | |
| "grad_norm": 0.06252503395080566, | |
| "learning_rate": 1.569434595712811e-05, | |
| "loss": 0.7135177850723267, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.04116, | |
| "step": 577, | |
| "tokens/total": 151257088, | |
| "tokens/train_per_sec_per_gpu": 216.51, | |
| "tokens/trainable": 12646370 | |
| }, | |
| { | |
| "epoch": 0.6147301249667642, | |
| "grad_norm": 0.07124295085668564, | |
| "learning_rate": 1.5621933523868106e-05, | |
| "loss": 0.7426387667655945, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10147, | |
| "step": 578, | |
| "tokens/total": 151519232, | |
| "tokens/train_per_sec_per_gpu": 164.29, | |
| "tokens/trainable": 12666523 | |
| }, | |
| { | |
| "epoch": 0.6157936718957724, | |
| "grad_norm": 0.06870193779468536, | |
| "learning_rate": 1.5549581320873715e-05, | |
| "loss": 0.7106361389160156, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.03529, | |
| "step": 579, | |
| "tokens/total": 151781376, | |
| "tokens/train_per_sec_per_gpu": 203.55, | |
| "tokens/trainable": 12689727 | |
| }, | |
| { | |
| "epoch": 0.6168572188247806, | |
| "grad_norm": 0.06538794189691544, | |
| "learning_rate": 1.5477290343514108e-05, | |
| "loss": 0.7434192299842834, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10311, | |
| "step": 580, | |
| "tokens/total": 152043520, | |
| "tokens/train_per_sec_per_gpu": 220.1, | |
| "tokens/trainable": 12713675 | |
| }, | |
| { | |
| "epoch": 0.6179207657537888, | |
| "grad_norm": 0.08144285529851913, | |
| "learning_rate": 1.5405061586316158e-05, | |
| "loss": 0.7463376522064209, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10926, | |
| "step": 581, | |
| "tokens/total": 152305664, | |
| "tokens/train_per_sec_per_gpu": 199.52, | |
| "tokens/trainable": 12734720 | |
| }, | |
| { | |
| "epoch": 0.6189843126827971, | |
| "grad_norm": 0.0673174113035202, | |
| "learning_rate": 1.533289604295077e-05, | |
| "loss": 0.6784200668334961, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 1.97076, | |
| "step": 582, | |
| "tokens/total": 152567808, | |
| "tokens/train_per_sec_per_gpu": 185.45, | |
| "tokens/trainable": 12757403 | |
| }, | |
| { | |
| "epoch": 0.6200478596118054, | |
| "grad_norm": 0.07269315421581268, | |
| "learning_rate": 1.5260794706219176e-05, | |
| "loss": 0.769637942314148, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15898, | |
| "step": 583, | |
| "tokens/total": 152829952, | |
| "tokens/train_per_sec_per_gpu": 198.93, | |
| "tokens/trainable": 12778686 | |
| }, | |
| { | |
| "epoch": 0.6211114065408136, | |
| "grad_norm": 0.07265086472034454, | |
| "learning_rate": 1.5188758568039318e-05, | |
| "loss": 0.783679723739624, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.18951, | |
| "step": 584, | |
| "tokens/total": 153092096, | |
| "tokens/train_per_sec_per_gpu": 172.98, | |
| "tokens/trainable": 12799072 | |
| }, | |
| { | |
| "epoch": 0.6221749534698219, | |
| "grad_norm": 0.06607359647750854, | |
| "learning_rate": 1.5116788619432177e-05, | |
| "loss": 0.7359644770622253, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.08749, | |
| "step": 585, | |
| "tokens/total": 153354240, | |
| "tokens/train_per_sec_per_gpu": 217.63, | |
| "tokens/trainable": 12823018 | |
| }, | |
| { | |
| "epoch": 0.6232385003988301, | |
| "grad_norm": 0.07835246622562408, | |
| "learning_rate": 1.5044885850508137e-05, | |
| "loss": 0.7655049562454224, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15008, | |
| "step": 586, | |
| "tokens/total": 153616384, | |
| "tokens/train_per_sec_per_gpu": 172.13, | |
| "tokens/trainable": 12843433 | |
| }, | |
| { | |
| "epoch": 0.6243020473278383, | |
| "grad_norm": 0.06266583502292633, | |
| "learning_rate": 1.4973051250453399e-05, | |
| "loss": 0.784702718257904, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19176, | |
| "step": 587, | |
| "tokens/total": 153878528, | |
| "tokens/train_per_sec_per_gpu": 181.28, | |
| "tokens/trainable": 12866501 | |
| }, | |
| { | |
| "epoch": 0.6253655942568466, | |
| "grad_norm": 0.07751967012882233, | |
| "learning_rate": 1.4901285807516326e-05, | |
| "loss": 0.7583497762680054, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13475, | |
| "step": 588, | |
| "tokens/total": 154140672, | |
| "tokens/train_per_sec_per_gpu": 186.01, | |
| "tokens/trainable": 12887753 | |
| }, | |
| { | |
| "epoch": 0.6264291411858548, | |
| "grad_norm": 0.06350097805261612, | |
| "learning_rate": 1.4829590508993859e-05, | |
| "loss": 0.7606030702590942, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13957, | |
| "step": 589, | |
| "tokens/total": 154402816, | |
| "tokens/train_per_sec_per_gpu": 173.15, | |
| "tokens/trainable": 12910246 | |
| }, | |
| { | |
| "epoch": 0.627492688114863, | |
| "grad_norm": 0.0697028860449791, | |
| "learning_rate": 1.4757966341217963e-05, | |
| "loss": 0.7563662528991699, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13052, | |
| "step": 590, | |
| "tokens/total": 154664960, | |
| "tokens/train_per_sec_per_gpu": 205.69, | |
| "tokens/trainable": 12933119 | |
| }, | |
| { | |
| "epoch": 0.6285562350438713, | |
| "grad_norm": 0.07811742275953293, | |
| "learning_rate": 1.4686414289542023e-05, | |
| "loss": 0.8478161096572876, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.33454, | |
| "step": 591, | |
| "tokens/total": 154927104, | |
| "tokens/train_per_sec_per_gpu": 200.8, | |
| "tokens/trainable": 12953870 | |
| }, | |
| { | |
| "epoch": 0.6296197819728796, | |
| "grad_norm": 0.06654322147369385, | |
| "learning_rate": 1.461493533832731e-05, | |
| "loss": 0.7657591104507446, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15063, | |
| "step": 592, | |
| "tokens/total": 155189248, | |
| "tokens/train_per_sec_per_gpu": 193.4, | |
| "tokens/trainable": 12976578 | |
| }, | |
| { | |
| "epoch": 0.6306833289018878, | |
| "grad_norm": 0.07657228410243988, | |
| "learning_rate": 1.454353047092944e-05, | |
| "loss": 0.839828372001648, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.31597, | |
| "step": 593, | |
| "tokens/total": 155451392, | |
| "tokens/train_per_sec_per_gpu": 228.66, | |
| "tokens/trainable": 13000674 | |
| }, | |
| { | |
| "epoch": 0.6317468758308961, | |
| "grad_norm": 0.07358460128307343, | |
| "learning_rate": 1.4472200669684821e-05, | |
| "loss": 0.6963605284690857, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.00644, | |
| "step": 594, | |
| "tokens/total": 155713536, | |
| "tokens/train_per_sec_per_gpu": 199.12, | |
| "tokens/trainable": 13021140 | |
| }, | |
| { | |
| "epoch": 0.6328104227599043, | |
| "grad_norm": 0.07676289230585098, | |
| "learning_rate": 1.4400946915897168e-05, | |
| "loss": 0.8120332956314087, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.25248, | |
| "step": 595, | |
| "tokens/total": 155975680, | |
| "tokens/train_per_sec_per_gpu": 172.52, | |
| "tokens/trainable": 13041462 | |
| }, | |
| { | |
| "epoch": 0.6338739696889125, | |
| "grad_norm": 0.07129766792058945, | |
| "learning_rate": 1.4329770189823982e-05, | |
| "loss": 0.7258169054985046, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.06642, | |
| "step": 596, | |
| "tokens/total": 156237824, | |
| "tokens/train_per_sec_per_gpu": 199.58, | |
| "tokens/trainable": 13062706 | |
| }, | |
| { | |
| "epoch": 0.6349375166179207, | |
| "grad_norm": 0.0697442814707756, | |
| "learning_rate": 1.4258671470663075e-05, | |
| "loss": 0.7802278995513916, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.18197, | |
| "step": 597, | |
| "tokens/total": 156499968, | |
| "tokens/train_per_sec_per_gpu": 205.89, | |
| "tokens/trainable": 13087819 | |
| }, | |
| { | |
| "epoch": 0.636001063546929, | |
| "grad_norm": 0.07785540819168091, | |
| "learning_rate": 1.4187651736539092e-05, | |
| "loss": 0.756123423576355, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13, | |
| "step": 598, | |
| "tokens/total": 156762112, | |
| "tokens/train_per_sec_per_gpu": 196.25, | |
| "tokens/trainable": 13109448 | |
| }, | |
| { | |
| "epoch": 0.6370646104759372, | |
| "grad_norm": 0.07135733217000961, | |
| "learning_rate": 1.4116711964490076e-05, | |
| "loss": 0.760696530342102, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13977, | |
| "step": 599, | |
| "tokens/total": 157024256, | |
| "tokens/train_per_sec_per_gpu": 169.38, | |
| "tokens/trainable": 13130856 | |
| }, | |
| { | |
| "epoch": 0.6381281574049454, | |
| "grad_norm": 0.07068773359060287, | |
| "learning_rate": 1.404585313045399e-05, | |
| "loss": 0.7768102884292603, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.17453, | |
| "step": 600, | |
| "tokens/total": 157286400, | |
| "tokens/train_per_sec_per_gpu": 200.62, | |
| "tokens/trainable": 13154412 | |
| }, | |
| { | |
| "epoch": 0.6381281574049454, | |
| "eval_loss": 0.7701326012611389, | |
| "eval_ppl": 2.16005, | |
| "eval_runtime": 237.2245, | |
| "eval_samples_per_second": 28.184, | |
| "eval_steps_per_second": 1.762, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 38.19, | |
| "memory/max_allocated (GiB)": 38.19, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.6391917043339538, | |
| "grad_norm": 0.06947837024927139, | |
| "learning_rate": 1.3975076209255321e-05, | |
| "loss": 0.7434956431388855, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10327, | |
| "step": 601, | |
| "tokens/total": 157548544, | |
| "tokens/train_per_sec_per_gpu": 185.08, | |
| "tokens/trainable": 13176877 | |
| }, | |
| { | |
| "epoch": 0.640255251262962, | |
| "grad_norm": 0.08173263818025589, | |
| "learning_rate": 1.3904382174591654e-05, | |
| "loss": 0.7505627870559692, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11819, | |
| "step": 602, | |
| "tokens/total": 157810688, | |
| "tokens/train_per_sec_per_gpu": 157.36, | |
| "tokens/trainable": 13197398 | |
| }, | |
| { | |
| "epoch": 0.6413187981919702, | |
| "grad_norm": 0.07958182692527771, | |
| "learning_rate": 1.3833771999020274e-05, | |
| "loss": 0.7926914095878601, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.20933, | |
| "step": 603, | |
| "tokens/total": 158072832, | |
| "tokens/train_per_sec_per_gpu": 143.13, | |
| "tokens/trainable": 13219479 | |
| }, | |
| { | |
| "epoch": 0.6423823451209785, | |
| "grad_norm": 0.08310385793447495, | |
| "learning_rate": 1.3763246653944824e-05, | |
| "loss": 0.7650701999664307, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.14915, | |
| "step": 604, | |
| "tokens/total": 158334976, | |
| "tokens/train_per_sec_per_gpu": 156.87, | |
| "tokens/trainable": 13239934 | |
| }, | |
| { | |
| "epoch": 0.6434458920499867, | |
| "grad_norm": 0.07890634983778, | |
| "learning_rate": 1.3692807109601875e-05, | |
| "loss": 0.7053734064102173, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.0246, | |
| "step": 605, | |
| "tokens/total": 158597120, | |
| "tokens/train_per_sec_per_gpu": 183.26, | |
| "tokens/trainable": 13259732 | |
| }, | |
| { | |
| "epoch": 0.6445094389789949, | |
| "grad_norm": 0.06404001265764236, | |
| "learning_rate": 1.3622454335047631e-05, | |
| "loss": 0.737473726272583, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09065, | |
| "step": 606, | |
| "tokens/total": 158859264, | |
| "tokens/train_per_sec_per_gpu": 235.57, | |
| "tokens/trainable": 13284062 | |
| }, | |
| { | |
| "epoch": 0.6455729859080032, | |
| "grad_norm": 0.07135722041130066, | |
| "learning_rate": 1.3552189298144573e-05, | |
| "loss": 0.7033129334449768, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.02044, | |
| "step": 607, | |
| "tokens/total": 159121408, | |
| "tokens/train_per_sec_per_gpu": 176.01, | |
| "tokens/trainable": 13306868 | |
| }, | |
| { | |
| "epoch": 0.6466365328370114, | |
| "grad_norm": 0.0779723972082138, | |
| "learning_rate": 1.3482012965548161e-05, | |
| "loss": 0.8055770993232727, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.23799, | |
| "step": 608, | |
| "tokens/total": 159383552, | |
| "tokens/train_per_sec_per_gpu": 189.42, | |
| "tokens/trainable": 13327922 | |
| }, | |
| { | |
| "epoch": 0.6477000797660196, | |
| "grad_norm": 0.08110717684030533, | |
| "learning_rate": 1.341192630269351e-05, | |
| "loss": 0.817200779914856, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.26415, | |
| "step": 609, | |
| "tokens/total": 159645696, | |
| "tokens/train_per_sec_per_gpu": 170.71, | |
| "tokens/trainable": 13347786 | |
| }, | |
| { | |
| "epoch": 0.648763626695028, | |
| "grad_norm": 0.06666215509176254, | |
| "learning_rate": 1.3341930273782144e-05, | |
| "loss": 0.704579770565033, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.023, | |
| "step": 610, | |
| "tokens/total": 159907840, | |
| "tokens/train_per_sec_per_gpu": 200.63, | |
| "tokens/trainable": 13371502 | |
| }, | |
| { | |
| "epoch": 0.6498271736240362, | |
| "grad_norm": 0.06857079267501831, | |
| "learning_rate": 1.3272025841768693e-05, | |
| "loss": 0.8532009720802307, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.34715, | |
| "step": 611, | |
| "tokens/total": 160169984, | |
| "tokens/train_per_sec_per_gpu": 191.0, | |
| "tokens/trainable": 13393637 | |
| }, | |
| { | |
| "epoch": 0.6508907205530444, | |
| "grad_norm": 0.0732354000210762, | |
| "learning_rate": 1.320221396834767e-05, | |
| "loss": 0.7995498180389404, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.22454, | |
| "step": 612, | |
| "tokens/total": 160432128, | |
| "tokens/train_per_sec_per_gpu": 189.51, | |
| "tokens/trainable": 13415692 | |
| }, | |
| { | |
| "epoch": 0.6519542674820527, | |
| "grad_norm": 0.06430143862962723, | |
| "learning_rate": 1.3132495613940237e-05, | |
| "loss": 0.76482093334198, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.14861, | |
| "step": 613, | |
| "tokens/total": 160694272, | |
| "tokens/train_per_sec_per_gpu": 207.14, | |
| "tokens/trainable": 13438366 | |
| }, | |
| { | |
| "epoch": 0.6530178144110609, | |
| "grad_norm": 0.06737808883190155, | |
| "learning_rate": 1.3062871737680976e-05, | |
| "loss": 0.7430492043495178, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10234, | |
| "step": 614, | |
| "tokens/total": 160956416, | |
| "tokens/train_per_sec_per_gpu": 188.74, | |
| "tokens/trainable": 13458353 | |
| }, | |
| { | |
| "epoch": 0.6540813613400691, | |
| "grad_norm": 0.0812523365020752, | |
| "learning_rate": 1.2993343297404732e-05, | |
| "loss": 0.8373923897743225, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.31033, | |
| "step": 615, | |
| "tokens/total": 161218560, | |
| "tokens/train_per_sec_per_gpu": 198.52, | |
| "tokens/trainable": 13479819 | |
| }, | |
| { | |
| "epoch": 0.6551449082690773, | |
| "grad_norm": 0.07273576408624649, | |
| "learning_rate": 1.2923911249633391e-05, | |
| "loss": 0.753447413444519, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12431, | |
| "step": 616, | |
| "tokens/total": 161480704, | |
| "tokens/train_per_sec_per_gpu": 187.46, | |
| "tokens/trainable": 13500295 | |
| }, | |
| { | |
| "epoch": 0.6562084551980856, | |
| "grad_norm": 0.07487501949071884, | |
| "learning_rate": 1.2854576549562743e-05, | |
| "loss": 0.7411309480667114, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09831, | |
| "step": 617, | |
| "tokens/total": 161742848, | |
| "tokens/train_per_sec_per_gpu": 158.41, | |
| "tokens/trainable": 13518908 | |
| }, | |
| { | |
| "epoch": 0.6572720021270939, | |
| "grad_norm": 0.060535646975040436, | |
| "learning_rate": 1.2785340151049348e-05, | |
| "loss": 0.6992688179016113, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.01228, | |
| "step": 618, | |
| "tokens/total": 162004992, | |
| "tokens/train_per_sec_per_gpu": 195.11, | |
| "tokens/trainable": 13542123 | |
| }, | |
| { | |
| "epoch": 0.6583355490561021, | |
| "grad_norm": 0.07925450801849365, | |
| "learning_rate": 1.2716203006597389e-05, | |
| "loss": 0.7158269882202148, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.04588, | |
| "step": 619, | |
| "tokens/total": 162267136, | |
| "tokens/train_per_sec_per_gpu": 154.73, | |
| "tokens/trainable": 13560742 | |
| }, | |
| { | |
| "epoch": 0.6593990959851104, | |
| "grad_norm": 0.07715223729610443, | |
| "learning_rate": 1.2647166067345598e-05, | |
| "loss": 0.7587981224060059, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13571, | |
| "step": 620, | |
| "tokens/total": 162529280, | |
| "tokens/train_per_sec_per_gpu": 190.53, | |
| "tokens/trainable": 13581062 | |
| }, | |
| { | |
| "epoch": 0.6604626429141186, | |
| "grad_norm": 0.0761185884475708, | |
| "learning_rate": 1.2578230283054153e-05, | |
| "loss": 0.7682688236236572, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15603, | |
| "step": 621, | |
| "tokens/total": 162791424, | |
| "tokens/train_per_sec_per_gpu": 193.3, | |
| "tokens/trainable": 13602325 | |
| }, | |
| { | |
| "epoch": 0.6615261898431268, | |
| "grad_norm": 0.0685892179608345, | |
| "learning_rate": 1.2509396602091612e-05, | |
| "loss": 0.7407702207565308, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09755, | |
| "step": 622, | |
| "tokens/total": 163053568, | |
| "tokens/train_per_sec_per_gpu": 178.64, | |
| "tokens/trainable": 13624055 | |
| }, | |
| { | |
| "epoch": 0.6625897367721351, | |
| "grad_norm": 0.07496423274278641, | |
| "learning_rate": 1.2440665971421872e-05, | |
| "loss": 0.7656638622283936, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15042, | |
| "step": 623, | |
| "tokens/total": 163315712, | |
| "tokens/train_per_sec_per_gpu": 182.21, | |
| "tokens/trainable": 13643250 | |
| }, | |
| { | |
| "epoch": 0.6636532837011433, | |
| "grad_norm": 0.07262540608644485, | |
| "learning_rate": 1.2372039336591137e-05, | |
| "loss": 0.801539421081543, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.22897, | |
| "step": 624, | |
| "tokens/total": 163577856, | |
| "tokens/train_per_sec_per_gpu": 196.93, | |
| "tokens/trainable": 13664253 | |
| }, | |
| { | |
| "epoch": 0.6647168306301515, | |
| "grad_norm": 0.07125604152679443, | |
| "learning_rate": 1.230351764171491e-05, | |
| "loss": 0.7554821968078613, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12864, | |
| "step": 625, | |
| "tokens/total": 163840000, | |
| "tokens/train_per_sec_per_gpu": 174.72, | |
| "tokens/trainable": 13685345 | |
| }, | |
| { | |
| "epoch": 0.6657803775591598, | |
| "grad_norm": 0.071174755692482, | |
| "learning_rate": 1.2235101829465003e-05, | |
| "loss": 0.8233163952827454, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.27804, | |
| "step": 626, | |
| "tokens/total": 164102144, | |
| "tokens/train_per_sec_per_gpu": 235.38, | |
| "tokens/trainable": 13708541 | |
| }, | |
| { | |
| "epoch": 0.6668439244881681, | |
| "grad_norm": 0.07615090906620026, | |
| "learning_rate": 1.2166792841056596e-05, | |
| "loss": 0.7587050199508667, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13551, | |
| "step": 627, | |
| "tokens/total": 164364288, | |
| "tokens/train_per_sec_per_gpu": 220.58, | |
| "tokens/trainable": 13730305 | |
| }, | |
| { | |
| "epoch": 0.6679074714171763, | |
| "grad_norm": 0.08138352632522583, | |
| "learning_rate": 1.2098591616235231e-05, | |
| "loss": 0.7529855370521545, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12333, | |
| "step": 628, | |
| "tokens/total": 164626432, | |
| "tokens/train_per_sec_per_gpu": 193.64, | |
| "tokens/trainable": 13752153 | |
| }, | |
| { | |
| "epoch": 0.6689710183461846, | |
| "grad_norm": 0.07478975504636765, | |
| "learning_rate": 1.2030499093263938e-05, | |
| "loss": 0.7286227941513062, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.07222, | |
| "step": 629, | |
| "tokens/total": 164888576, | |
| "tokens/train_per_sec_per_gpu": 193.85, | |
| "tokens/trainable": 13774483 | |
| }, | |
| { | |
| "epoch": 0.6700345652751928, | |
| "grad_norm": 0.07285770773887634, | |
| "learning_rate": 1.1962516208910295e-05, | |
| "loss": 0.733648955821991, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.08267, | |
| "step": 630, | |
| "tokens/total": 165150720, | |
| "tokens/train_per_sec_per_gpu": 184.41, | |
| "tokens/trainable": 13795589 | |
| }, | |
| { | |
| "epoch": 0.671098112204201, | |
| "grad_norm": 0.0695388987660408, | |
| "learning_rate": 1.1894643898433541e-05, | |
| "loss": 0.7216265201568604, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.05778, | |
| "step": 631, | |
| "tokens/total": 165412864, | |
| "tokens/train_per_sec_per_gpu": 204.39, | |
| "tokens/trainable": 13817860 | |
| }, | |
| { | |
| "epoch": 0.6721616591332092, | |
| "grad_norm": 0.0669473186135292, | |
| "learning_rate": 1.1826883095571758e-05, | |
| "loss": 0.8506951332092285, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.34127, | |
| "step": 632, | |
| "tokens/total": 165675008, | |
| "tokens/train_per_sec_per_gpu": 251.79, | |
| "tokens/trainable": 13842305 | |
| }, | |
| { | |
| "epoch": 0.6732252060622175, | |
| "grad_norm": 0.07681821286678314, | |
| "learning_rate": 1.1759234732528952e-05, | |
| "loss": 0.7686042785644531, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15675, | |
| "step": 633, | |
| "tokens/total": 165937152, | |
| "tokens/train_per_sec_per_gpu": 177.33, | |
| "tokens/trainable": 13861902 | |
| }, | |
| { | |
| "epoch": 0.6742887529912257, | |
| "grad_norm": 0.06518401950597763, | |
| "learning_rate": 1.1691699739962275e-05, | |
| "loss": 0.8041465878486633, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.23479, | |
| "step": 634, | |
| "tokens/total": 166199296, | |
| "tokens/train_per_sec_per_gpu": 207.88, | |
| "tokens/trainable": 13884726 | |
| }, | |
| { | |
| "epoch": 0.6753522999202339, | |
| "grad_norm": 0.06772468984127045, | |
| "learning_rate": 1.1624279046969208e-05, | |
| "loss": 0.7493684887886047, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11566, | |
| "step": 635, | |
| "tokens/total": 166461440, | |
| "tokens/train_per_sec_per_gpu": 221.3, | |
| "tokens/trainable": 13908358 | |
| }, | |
| { | |
| "epoch": 0.6764158468492423, | |
| "grad_norm": 0.07777251303195953, | |
| "learning_rate": 1.1556973581074784e-05, | |
| "loss": 0.7796363830566406, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.18068, | |
| "step": 636, | |
| "tokens/total": 166723584, | |
| "tokens/train_per_sec_per_gpu": 158.72, | |
| "tokens/trainable": 13930851 | |
| }, | |
| { | |
| "epoch": 0.6774793937782505, | |
| "grad_norm": 0.07164981216192245, | |
| "learning_rate": 1.1489784268218811e-05, | |
| "loss": 0.7849699258804321, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19234, | |
| "step": 637, | |
| "tokens/total": 166985728, | |
| "tokens/train_per_sec_per_gpu": 226.45, | |
| "tokens/trainable": 13952042 | |
| }, | |
| { | |
| "epoch": 0.6785429407072587, | |
| "grad_norm": 0.07446952164173126, | |
| "learning_rate": 1.1422712032743186e-05, | |
| "loss": 0.7363911867141724, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.08839, | |
| "step": 638, | |
| "tokens/total": 167247872, | |
| "tokens/train_per_sec_per_gpu": 176.2, | |
| "tokens/trainable": 13972004 | |
| }, | |
| { | |
| "epoch": 0.679606487636267, | |
| "grad_norm": 0.07111480087041855, | |
| "learning_rate": 1.1355757797379093e-05, | |
| "loss": 0.7901080846786499, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.20363, | |
| "step": 639, | |
| "tokens/total": 167510016, | |
| "tokens/train_per_sec_per_gpu": 204.49, | |
| "tokens/trainable": 13994787 | |
| }, | |
| { | |
| "epoch": 0.6806700345652752, | |
| "grad_norm": 0.07409324496984482, | |
| "learning_rate": 1.1288922483234395e-05, | |
| "loss": 0.7688755393028259, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15734, | |
| "step": 640, | |
| "tokens/total": 167772160, | |
| "tokens/train_per_sec_per_gpu": 183.2, | |
| "tokens/trainable": 14016072 | |
| }, | |
| { | |
| "epoch": 0.6817335814942834, | |
| "grad_norm": 0.06080978736281395, | |
| "learning_rate": 1.1222207009780888e-05, | |
| "loss": 0.731904923915863, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.07904, | |
| "step": 641, | |
| "tokens/total": 168034304, | |
| "tokens/train_per_sec_per_gpu": 193.71, | |
| "tokens/trainable": 14039858 | |
| }, | |
| { | |
| "epoch": 0.6827971284232917, | |
| "grad_norm": 0.07461950182914734, | |
| "learning_rate": 1.1155612294841713e-05, | |
| "loss": 0.7744308710098267, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.16936, | |
| "step": 642, | |
| "tokens/total": 168296448, | |
| "tokens/train_per_sec_per_gpu": 190.26, | |
| "tokens/trainable": 14062445 | |
| }, | |
| { | |
| "epoch": 0.6838606753522999, | |
| "grad_norm": 0.06637416779994965, | |
| "learning_rate": 1.10891392545787e-05, | |
| "loss": 0.6965582370758057, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.00683, | |
| "step": 643, | |
| "tokens/total": 168558592, | |
| "tokens/train_per_sec_per_gpu": 202.21, | |
| "tokens/trainable": 14085386 | |
| }, | |
| { | |
| "epoch": 0.6849242222813081, | |
| "grad_norm": 0.08194036036729813, | |
| "learning_rate": 1.1022788803479747e-05, | |
| "loss": 0.7541095018386841, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12572, | |
| "step": 644, | |
| "tokens/total": 168820736, | |
| "tokens/train_per_sec_per_gpu": 200.39, | |
| "tokens/trainable": 14107399 | |
| }, | |
| { | |
| "epoch": 0.6859877692103165, | |
| "grad_norm": 0.07168550044298172, | |
| "learning_rate": 1.095656185434629e-05, | |
| "loss": 0.7705248594284058, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.1609, | |
| "step": 645, | |
| "tokens/total": 169082880, | |
| "tokens/train_per_sec_per_gpu": 179.15, | |
| "tokens/trainable": 14129284 | |
| }, | |
| { | |
| "epoch": 0.6870513161393247, | |
| "grad_norm": 0.0729442834854126, | |
| "learning_rate": 1.0890459318280681e-05, | |
| "loss": 0.7964296340942383, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.21761, | |
| "step": 646, | |
| "tokens/total": 169345024, | |
| "tokens/train_per_sec_per_gpu": 210.01, | |
| "tokens/trainable": 14151413 | |
| }, | |
| { | |
| "epoch": 0.6881148630683329, | |
| "grad_norm": 0.07156070321798325, | |
| "learning_rate": 1.0824482104673723e-05, | |
| "loss": 0.7291166186332703, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.07325, | |
| "step": 647, | |
| "tokens/total": 169607168, | |
| "tokens/train_per_sec_per_gpu": 187.04, | |
| "tokens/trainable": 14172388 | |
| }, | |
| { | |
| "epoch": 0.6891784099973411, | |
| "grad_norm": 0.07664386928081512, | |
| "learning_rate": 1.0758631121192075e-05, | |
| "loss": 0.847728967666626, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.33434, | |
| "step": 648, | |
| "tokens/total": 169869312, | |
| "tokens/train_per_sec_per_gpu": 183.41, | |
| "tokens/trainable": 14194504 | |
| }, | |
| { | |
| "epoch": 0.6902419569263494, | |
| "grad_norm": 0.06855987012386322, | |
| "learning_rate": 1.0692907273765878e-05, | |
| "loss": 0.809911847114563, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.24771, | |
| "step": 649, | |
| "tokens/total": 170131456, | |
| "tokens/train_per_sec_per_gpu": 175.96, | |
| "tokens/trainable": 14217186 | |
| }, | |
| { | |
| "epoch": 0.6913055038553576, | |
| "grad_norm": 0.069715715944767, | |
| "learning_rate": 1.0627311466576167e-05, | |
| "loss": 0.7290323972702026, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.07307, | |
| "step": 650, | |
| "tokens/total": 170393600, | |
| "tokens/train_per_sec_per_gpu": 226.27, | |
| "tokens/trainable": 14240939 | |
| }, | |
| { | |
| "epoch": 0.6923690507843658, | |
| "grad_norm": 0.06665818393230438, | |
| "learning_rate": 1.0561844602042535e-05, | |
| "loss": 0.696631908416748, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.00698, | |
| "step": 651, | |
| "tokens/total": 170655744, | |
| "tokens/train_per_sec_per_gpu": 165.96, | |
| "tokens/trainable": 14262378 | |
| }, | |
| { | |
| "epoch": 0.6934325977133741, | |
| "grad_norm": 0.06637638807296753, | |
| "learning_rate": 1.0496507580810637e-05, | |
| "loss": 0.7380032539367676, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09175, | |
| "step": 652, | |
| "tokens/total": 170917888, | |
| "tokens/train_per_sec_per_gpu": 210.8, | |
| "tokens/trainable": 14284893 | |
| }, | |
| { | |
| "epoch": 0.6944961446423823, | |
| "grad_norm": 0.07084432989358902, | |
| "learning_rate": 1.0431301301739882e-05, | |
| "loss": 0.8238184452056885, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.27919, | |
| "step": 653, | |
| "tokens/total": 171180032, | |
| "tokens/train_per_sec_per_gpu": 161.41, | |
| "tokens/trainable": 14305618 | |
| }, | |
| { | |
| "epoch": 0.6955596915713906, | |
| "grad_norm": 0.07500351220369339, | |
| "learning_rate": 1.0366226661890976e-05, | |
| "loss": 0.8070303201675415, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.24124, | |
| "step": 654, | |
| "tokens/total": 171442176, | |
| "tokens/train_per_sec_per_gpu": 198.07, | |
| "tokens/trainable": 14327779 | |
| }, | |
| { | |
| "epoch": 0.6966232385003989, | |
| "grad_norm": 0.06748203933238983, | |
| "learning_rate": 1.0301284556513669e-05, | |
| "loss": 0.7480711340904236, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11292, | |
| "step": 655, | |
| "tokens/total": 171704320, | |
| "tokens/train_per_sec_per_gpu": 198.9, | |
| "tokens/trainable": 14350224 | |
| }, | |
| { | |
| "epoch": 0.6976867854294071, | |
| "grad_norm": 0.07086233794689178, | |
| "learning_rate": 1.023647587903438e-05, | |
| "loss": 0.7530485391616821, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12346, | |
| "step": 656, | |
| "tokens/total": 171966464, | |
| "tokens/train_per_sec_per_gpu": 186.03, | |
| "tokens/trainable": 14370865 | |
| }, | |
| { | |
| "epoch": 0.6987503323584153, | |
| "grad_norm": 0.07139495760202408, | |
| "learning_rate": 1.017180152104391e-05, | |
| "loss": 0.7517107725143433, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12062, | |
| "step": 657, | |
| "tokens/total": 172228608, | |
| "tokens/train_per_sec_per_gpu": 208.93, | |
| "tokens/trainable": 14393067 | |
| }, | |
| { | |
| "epoch": 0.6998138792874236, | |
| "grad_norm": 0.07312899082899094, | |
| "learning_rate": 1.0107262372285224e-05, | |
| "loss": 0.737115740776062, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.0899, | |
| "step": 658, | |
| "tokens/total": 172490752, | |
| "tokens/train_per_sec_per_gpu": 211.06, | |
| "tokens/trainable": 14414022 | |
| }, | |
| { | |
| "epoch": 0.7008774262164318, | |
| "grad_norm": 0.06322194635868073, | |
| "learning_rate": 1.004285932064113e-05, | |
| "loss": 0.7515996098518372, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12039, | |
| "step": 659, | |
| "tokens/total": 172752896, | |
| "tokens/train_per_sec_per_gpu": 264.01, | |
| "tokens/trainable": 14441310 | |
| }, | |
| { | |
| "epoch": 0.70194097314544, | |
| "grad_norm": 0.06637128442525864, | |
| "learning_rate": 9.978593252122168e-06, | |
| "loss": 0.7236040830612183, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.06185, | |
| "step": 660, | |
| "tokens/total": 173015040, | |
| "tokens/train_per_sec_per_gpu": 205.79, | |
| "tokens/trainable": 14462654 | |
| }, | |
| { | |
| "epoch": 0.7030045200744482, | |
| "grad_norm": 0.07252184301614761, | |
| "learning_rate": 9.914465050854312e-06, | |
| "loss": 0.7753311395645142, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.17131, | |
| "step": 661, | |
| "tokens/total": 173277184, | |
| "tokens/train_per_sec_per_gpu": 186.41, | |
| "tokens/trainable": 14484643 | |
| }, | |
| { | |
| "epoch": 0.7040680670034565, | |
| "grad_norm": 0.06677009165287018, | |
| "learning_rate": 9.85047559906689e-06, | |
| "loss": 0.7773078680038452, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.17561, | |
| "step": 662, | |
| "tokens/total": 173539328, | |
| "tokens/train_per_sec_per_gpu": 200.95, | |
| "tokens/trainable": 14507742 | |
| }, | |
| { | |
| "epoch": 0.7051316139324648, | |
| "grad_norm": 0.06489443778991699, | |
| "learning_rate": 9.78662577708039e-06, | |
| "loss": 0.8098626136779785, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.2476, | |
| "step": 663, | |
| "tokens/total": 173801472, | |
| "tokens/train_per_sec_per_gpu": 209.29, | |
| "tokens/trainable": 14531656 | |
| }, | |
| { | |
| "epoch": 0.706195160861473, | |
| "grad_norm": 0.0692375972867012, | |
| "learning_rate": 9.722916463294405e-06, | |
| "loss": 0.7761327624320984, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.17305, | |
| "step": 664, | |
| "tokens/total": 174063616, | |
| "tokens/train_per_sec_per_gpu": 180.0, | |
| "tokens/trainable": 14553153 | |
| }, | |
| { | |
| "epoch": 0.7072587077904813, | |
| "grad_norm": 0.07724090665578842, | |
| "learning_rate": 9.659348534175484e-06, | |
| "loss": 0.7831587195396423, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.18837, | |
| "step": 665, | |
| "tokens/total": 174325760, | |
| "tokens/train_per_sec_per_gpu": 183.64, | |
| "tokens/trainable": 14575227 | |
| }, | |
| { | |
| "epoch": 0.7083222547194895, | |
| "grad_norm": 0.07470756769180298, | |
| "learning_rate": 9.595922864245135e-06, | |
| "loss": 0.799434244632721, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.22428, | |
| "step": 666, | |
| "tokens/total": 174587904, | |
| "tokens/train_per_sec_per_gpu": 224.55, | |
| "tokens/trainable": 14598217 | |
| }, | |
| { | |
| "epoch": 0.7093858016484977, | |
| "grad_norm": 0.07464490085840225, | |
| "learning_rate": 9.532640326067764e-06, | |
| "loss": 0.7516946792602539, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12059, | |
| "step": 667, | |
| "tokens/total": 174850048, | |
| "tokens/train_per_sec_per_gpu": 200.11, | |
| "tokens/trainable": 14620170 | |
| }, | |
| { | |
| "epoch": 0.710449348577506, | |
| "grad_norm": 0.06981492787599564, | |
| "learning_rate": 9.469501790238654e-06, | |
| "loss": 0.7612972259521484, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.14105, | |
| "step": 668, | |
| "tokens/total": 175112192, | |
| "tokens/train_per_sec_per_gpu": 176.73, | |
| "tokens/trainable": 14641941 | |
| }, | |
| { | |
| "epoch": 0.7115128955065142, | |
| "grad_norm": 0.06430601328611374, | |
| "learning_rate": 9.406508125372034e-06, | |
| "loss": 0.7262794971466064, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.06737, | |
| "step": 669, | |
| "tokens/total": 175374336, | |
| "tokens/train_per_sec_per_gpu": 186.94, | |
| "tokens/trainable": 14664035 | |
| }, | |
| { | |
| "epoch": 0.7125764424355224, | |
| "grad_norm": 0.06467308104038239, | |
| "learning_rate": 9.343660198089072e-06, | |
| "loss": 0.751503586769104, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12019, | |
| "step": 670, | |
| "tokens/total": 175636480, | |
| "tokens/train_per_sec_per_gpu": 171.75, | |
| "tokens/trainable": 14684884 | |
| }, | |
| { | |
| "epoch": 0.7136399893645307, | |
| "grad_norm": 0.07673154026269913, | |
| "learning_rate": 9.280958873006032e-06, | |
| "loss": 0.771912693977356, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.1639, | |
| "step": 671, | |
| "tokens/total": 175898624, | |
| "tokens/train_per_sec_per_gpu": 185.49, | |
| "tokens/trainable": 14705843 | |
| }, | |
| { | |
| "epoch": 0.714703536293539, | |
| "grad_norm": 0.06743910163640976, | |
| "learning_rate": 9.21840501272228e-06, | |
| "loss": 0.7066195011138916, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.02713, | |
| "step": 672, | |
| "tokens/total": 176160768, | |
| "tokens/train_per_sec_per_gpu": 202.95, | |
| "tokens/trainable": 14728558 | |
| }, | |
| { | |
| "epoch": 0.7157670832225472, | |
| "grad_norm": 0.07574694603681564, | |
| "learning_rate": 9.155999477808503e-06, | |
| "loss": 0.7232800126075745, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.06118, | |
| "step": 673, | |
| "tokens/total": 176422912, | |
| "tokens/train_per_sec_per_gpu": 203.45, | |
| "tokens/trainable": 14748880 | |
| }, | |
| { | |
| "epoch": 0.7168306301515555, | |
| "grad_norm": 0.06880473345518112, | |
| "learning_rate": 9.093743126794818e-06, | |
| "loss": 0.7625008225440979, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.14363, | |
| "step": 674, | |
| "tokens/total": 176685056, | |
| "tokens/train_per_sec_per_gpu": 202.57, | |
| "tokens/trainable": 14770901 | |
| }, | |
| { | |
| "epoch": 0.7178941770805637, | |
| "grad_norm": 0.07242193818092346, | |
| "learning_rate": 9.031636816158974e-06, | |
| "loss": 0.7549704909324646, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12755, | |
| "step": 675, | |
| "tokens/total": 176947200, | |
| "tokens/train_per_sec_per_gpu": 170.41, | |
| "tokens/trainable": 14791194 | |
| }, | |
| { | |
| "epoch": 0.7189577240095719, | |
| "grad_norm": 0.07365458458662033, | |
| "learning_rate": 8.969681400314589e-06, | |
| "loss": 0.8114491701126099, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.25117, | |
| "step": 676, | |
| "tokens/total": 177209344, | |
| "tokens/train_per_sec_per_gpu": 170.56, | |
| "tokens/trainable": 14811680 | |
| }, | |
| { | |
| "epoch": 0.7200212709385801, | |
| "grad_norm": 0.08069625496864319, | |
| "learning_rate": 8.907877731599372e-06, | |
| "loss": 0.7868974208831787, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19657, | |
| "step": 677, | |
| "tokens/total": 177471488, | |
| "tokens/train_per_sec_per_gpu": 214.41, | |
| "tokens/trainable": 14831836 | |
| }, | |
| { | |
| "epoch": 0.7210848178675884, | |
| "grad_norm": 0.07136721163988113, | |
| "learning_rate": 8.846226660263415e-06, | |
| "loss": 0.7520813345909119, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12141, | |
| "step": 678, | |
| "tokens/total": 177733632, | |
| "tokens/train_per_sec_per_gpu": 165.9, | |
| "tokens/trainable": 14852202 | |
| }, | |
| { | |
| "epoch": 0.7221483647965966, | |
| "grad_norm": 0.06927739828824997, | |
| "learning_rate": 8.78472903445746e-06, | |
| "loss": 0.7342413067817688, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.0839, | |
| "step": 679, | |
| "tokens/total": 177995776, | |
| "tokens/train_per_sec_per_gpu": 199.05, | |
| "tokens/trainable": 14873216 | |
| }, | |
| { | |
| "epoch": 0.7232119117256048, | |
| "grad_norm": 0.06386187672615051, | |
| "learning_rate": 8.723385700221288e-06, | |
| "loss": 0.7768787145614624, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.17467, | |
| "step": 680, | |
| "tokens/total": 178257920, | |
| "tokens/train_per_sec_per_gpu": 263.1, | |
| "tokens/trainable": 14895853 | |
| }, | |
| { | |
| "epoch": 0.7242754586546132, | |
| "grad_norm": 0.08089049160480499, | |
| "learning_rate": 8.662197501472016e-06, | |
| "loss": 0.7622289061546326, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.14305, | |
| "step": 681, | |
| "tokens/total": 178520064, | |
| "tokens/train_per_sec_per_gpu": 179.05, | |
| "tokens/trainable": 14917187 | |
| }, | |
| { | |
| "epoch": 0.7253390055836214, | |
| "grad_norm": 0.07812239229679108, | |
| "learning_rate": 8.601165279992549e-06, | |
| "loss": 0.7714396715164185, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.16288, | |
| "step": 682, | |
| "tokens/total": 178782208, | |
| "tokens/train_per_sec_per_gpu": 157.1, | |
| "tokens/trainable": 14936575 | |
| }, | |
| { | |
| "epoch": 0.7264025525126296, | |
| "grad_norm": 0.07455753535032272, | |
| "learning_rate": 8.540289875419962e-06, | |
| "loss": 0.763314962387085, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.14538, | |
| "step": 683, | |
| "tokens/total": 179044352, | |
| "tokens/train_per_sec_per_gpu": 199.99, | |
| "tokens/trainable": 14959566 | |
| }, | |
| { | |
| "epoch": 0.7274660994416379, | |
| "grad_norm": 0.06646628677845001, | |
| "learning_rate": 8.47957212523394e-06, | |
| "loss": 0.7563636302947998, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13051, | |
| "step": 684, | |
| "tokens/total": 179306496, | |
| "tokens/train_per_sec_per_gpu": 202.47, | |
| "tokens/trainable": 14981954 | |
| }, | |
| { | |
| "epoch": 0.7285296463706461, | |
| "grad_norm": 0.08660077303647995, | |
| "learning_rate": 8.419012864745297e-06, | |
| "loss": 0.7826834917068481, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.18733, | |
| "step": 685, | |
| "tokens/total": 179568640, | |
| "tokens/train_per_sec_per_gpu": 169.37, | |
| "tokens/trainable": 15001703 | |
| }, | |
| { | |
| "epoch": 0.7295931932996543, | |
| "grad_norm": 0.07473118603229523, | |
| "learning_rate": 8.358612927084435e-06, | |
| "loss": 0.8132219910621643, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.25516, | |
| "step": 686, | |
| "tokens/total": 179830784, | |
| "tokens/train_per_sec_per_gpu": 176.67, | |
| "tokens/trainable": 15024105 | |
| }, | |
| { | |
| "epoch": 0.7306567402286626, | |
| "grad_norm": 0.06741170585155487, | |
| "learning_rate": 8.29837314318993e-06, | |
| "loss": 0.6950328946113586, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.00377, | |
| "step": 687, | |
| "tokens/total": 180092928, | |
| "tokens/train_per_sec_per_gpu": 190.47, | |
| "tokens/trainable": 15045856 | |
| }, | |
| { | |
| "epoch": 0.7317202871576708, | |
| "grad_norm": 0.07606185227632523, | |
| "learning_rate": 8.23829434179707e-06, | |
| "loss": 0.8332937955856323, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.30088, | |
| "step": 688, | |
| "tokens/total": 180355072, | |
| "tokens/train_per_sec_per_gpu": 194.57, | |
| "tokens/trainable": 15067803 | |
| }, | |
| { | |
| "epoch": 0.732783834086679, | |
| "grad_norm": 0.07419081032276154, | |
| "learning_rate": 8.178377349426471e-06, | |
| "loss": 0.81829434633255, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.26663, | |
| "step": 689, | |
| "tokens/total": 180617216, | |
| "tokens/train_per_sec_per_gpu": 179.35, | |
| "tokens/trainable": 15090573 | |
| }, | |
| { | |
| "epoch": 0.7338473810156874, | |
| "grad_norm": 0.0726039931178093, | |
| "learning_rate": 8.118622990372676e-06, | |
| "loss": 0.8616635203361511, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.3671, | |
| "step": 690, | |
| "tokens/total": 180879360, | |
| "tokens/train_per_sec_per_gpu": 200.44, | |
| "tokens/trainable": 15112666 | |
| }, | |
| { | |
| "epoch": 0.7349109279446956, | |
| "grad_norm": 0.07402276247739792, | |
| "learning_rate": 8.059032086692864e-06, | |
| "loss": 0.7540690302848816, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12563, | |
| "step": 691, | |
| "tokens/total": 181141504, | |
| "tokens/train_per_sec_per_gpu": 168.0, | |
| "tokens/trainable": 15133098 | |
| }, | |
| { | |
| "epoch": 0.7359744748737038, | |
| "grad_norm": 0.07309621572494507, | |
| "learning_rate": 7.999605458195486e-06, | |
| "loss": 0.7695388793945312, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15877, | |
| "step": 692, | |
| "tokens/total": 181403648, | |
| "tokens/train_per_sec_per_gpu": 181.09, | |
| "tokens/trainable": 15153899 | |
| }, | |
| { | |
| "epoch": 0.737038021802712, | |
| "grad_norm": 0.06912367045879364, | |
| "learning_rate": 7.94034392242903e-06, | |
| "loss": 0.7535648345947266, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12456, | |
| "step": 693, | |
| "tokens/total": 181665792, | |
| "tokens/train_per_sec_per_gpu": 183.77, | |
| "tokens/trainable": 15176330 | |
| }, | |
| { | |
| "epoch": 0.7381015687317203, | |
| "grad_norm": 0.07674102485179901, | |
| "learning_rate": 7.881248294670771e-06, | |
| "loss": 0.7670722007751465, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15345, | |
| "step": 694, | |
| "tokens/total": 181927936, | |
| "tokens/train_per_sec_per_gpu": 172.46, | |
| "tokens/trainable": 15198450 | |
| }, | |
| { | |
| "epoch": 0.7391651156607285, | |
| "grad_norm": 0.07187400758266449, | |
| "learning_rate": 7.82231938791551e-06, | |
| "loss": 0.7742888331413269, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.16905, | |
| "step": 695, | |
| "tokens/total": 182190080, | |
| "tokens/train_per_sec_per_gpu": 209.81, | |
| "tokens/trainable": 15222309 | |
| }, | |
| { | |
| "epoch": 0.7402286625897367, | |
| "grad_norm": 0.07647348195314407, | |
| "learning_rate": 7.763558012864446e-06, | |
| "loss": 0.8115613460540771, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.25142, | |
| "step": 696, | |
| "tokens/total": 182452224, | |
| "tokens/train_per_sec_per_gpu": 160.85, | |
| "tokens/trainable": 15245790 | |
| }, | |
| { | |
| "epoch": 0.741292209518745, | |
| "grad_norm": 0.07111165672540665, | |
| "learning_rate": 7.704964977913984e-06, | |
| "loss": 0.8241356611251831, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.27991, | |
| "step": 697, | |
| "tokens/total": 182714368, | |
| "tokens/train_per_sec_per_gpu": 222.33, | |
| "tokens/trainable": 15268790 | |
| }, | |
| { | |
| "epoch": 0.7423557564477533, | |
| "grad_norm": 0.07145415991544724, | |
| "learning_rate": 7.646541089144638e-06, | |
| "loss": 0.7397055625915527, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09532, | |
| "step": 698, | |
| "tokens/total": 182976512, | |
| "tokens/train_per_sec_per_gpu": 197.27, | |
| "tokens/trainable": 15289822 | |
| }, | |
| { | |
| "epoch": 0.7434193033767615, | |
| "grad_norm": 0.08257947117090225, | |
| "learning_rate": 7.588287150309928e-06, | |
| "loss": 0.7899596691131592, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.20331, | |
| "step": 699, | |
| "tokens/total": 183238656, | |
| "tokens/train_per_sec_per_gpu": 172.65, | |
| "tokens/trainable": 15310952 | |
| }, | |
| { | |
| "epoch": 0.7444828503057698, | |
| "grad_norm": 0.07342710345983505, | |
| "learning_rate": 7.530203962825331e-06, | |
| "loss": 0.7988142371177673, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.2229, | |
| "step": 700, | |
| "tokens/total": 183500800, | |
| "tokens/train_per_sec_per_gpu": 202.61, | |
| "tokens/trainable": 15335255 | |
| }, | |
| { | |
| "epoch": 0.745546397234778, | |
| "grad_norm": 0.07142513245344162, | |
| "learning_rate": 7.4722923257572335e-06, | |
| "loss": 0.8055736422538757, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.23798, | |
| "step": 701, | |
| "tokens/total": 183762944, | |
| "tokens/train_per_sec_per_gpu": 234.19, | |
| "tokens/trainable": 15359283 | |
| }, | |
| { | |
| "epoch": 0.7466099441637862, | |
| "grad_norm": 0.0752025619149208, | |
| "learning_rate": 7.414553035811978e-06, | |
| "loss": 0.7718970775604248, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.16387, | |
| "step": 702, | |
| "tokens/total": 184025088, | |
| "tokens/train_per_sec_per_gpu": 222.69, | |
| "tokens/trainable": 15380449 | |
| }, | |
| { | |
| "epoch": 0.7476734910927945, | |
| "grad_norm": 0.07547293603420258, | |
| "learning_rate": 7.35698688732486e-06, | |
| "loss": 0.7342941761016846, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.08401, | |
| "step": 703, | |
| "tokens/total": 184287232, | |
| "tokens/train_per_sec_per_gpu": 152.07, | |
| "tokens/trainable": 15400504 | |
| }, | |
| { | |
| "epoch": 0.7487370380218027, | |
| "grad_norm": 0.06701923161745071, | |
| "learning_rate": 7.299594672249231e-06, | |
| "loss": 0.8081640005111694, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.24378, | |
| "step": 704, | |
| "tokens/total": 184549376, | |
| "tokens/train_per_sec_per_gpu": 185.98, | |
| "tokens/trainable": 15423145 | |
| }, | |
| { | |
| "epoch": 0.7498005849508109, | |
| "grad_norm": 0.06758899241685867, | |
| "learning_rate": 7.242377180145603e-06, | |
| "loss": 0.7538725733757019, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12521, | |
| "step": 705, | |
| "tokens/total": 184811520, | |
| "tokens/train_per_sec_per_gpu": 169.11, | |
| "tokens/trainable": 15446585 | |
| }, | |
| { | |
| "epoch": 0.7508641318798192, | |
| "grad_norm": 0.07745972275733948, | |
| "learning_rate": 7.1853351981707504e-06, | |
| "loss": 0.8155503273010254, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.26042, | |
| "step": 706, | |
| "tokens/total": 185073664, | |
| "tokens/train_per_sec_per_gpu": 202.79, | |
| "tokens/trainable": 15467738 | |
| }, | |
| { | |
| "epoch": 0.7519276788088275, | |
| "grad_norm": 0.06426619738340378, | |
| "learning_rate": 7.128469511066933e-06, | |
| "loss": 0.7379392385482788, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.09162, | |
| "step": 707, | |
| "tokens/total": 185335808, | |
| "tokens/train_per_sec_per_gpu": 199.47, | |
| "tokens/trainable": 15490043 | |
| }, | |
| { | |
| "epoch": 0.7529912257378357, | |
| "grad_norm": 0.07671428471803665, | |
| "learning_rate": 7.071780901151049e-06, | |
| "loss": 0.8222740292549133, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.27567, | |
| "step": 708, | |
| "tokens/total": 185597952, | |
| "tokens/train_per_sec_per_gpu": 187.24, | |
| "tokens/trainable": 15512912 | |
| }, | |
| { | |
| "epoch": 0.754054772666844, | |
| "grad_norm": 0.07356390357017517, | |
| "learning_rate": 7.015270148303919e-06, | |
| "loss": 0.7136031985282898, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.04133, | |
| "step": 709, | |
| "tokens/total": 185860096, | |
| "tokens/train_per_sec_per_gpu": 188.47, | |
| "tokens/trainable": 15532378 | |
| }, | |
| { | |
| "epoch": 0.7551183195958522, | |
| "grad_norm": 0.07700134813785553, | |
| "learning_rate": 6.958938029959508e-06, | |
| "loss": 0.7238497734069824, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.06236, | |
| "step": 710, | |
| "tokens/total": 186122240, | |
| "tokens/train_per_sec_per_gpu": 203.44, | |
| "tokens/trainable": 15553126 | |
| }, | |
| { | |
| "epoch": 0.7561818665248604, | |
| "grad_norm": 0.06828487664461136, | |
| "learning_rate": 6.902785321094301e-06, | |
| "loss": 0.7130833864212036, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.04027, | |
| "step": 711, | |
| "tokens/total": 186384384, | |
| "tokens/train_per_sec_per_gpu": 226.96, | |
| "tokens/trainable": 15575744 | |
| }, | |
| { | |
| "epoch": 0.7572454134538686, | |
| "grad_norm": 0.07853987067937851, | |
| "learning_rate": 6.846812794216546e-06, | |
| "loss": 0.7818912267684937, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.1856, | |
| "step": 712, | |
| "tokens/total": 186646528, | |
| "tokens/train_per_sec_per_gpu": 165.77, | |
| "tokens/trainable": 15595629 | |
| }, | |
| { | |
| "epoch": 0.7583089603828769, | |
| "grad_norm": 0.08049149066209793, | |
| "learning_rate": 6.791021219355722e-06, | |
| "loss": 0.860575795173645, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.36452, | |
| "step": 713, | |
| "tokens/total": 186908672, | |
| "tokens/train_per_sec_per_gpu": 197.57, | |
| "tokens/trainable": 15617377 | |
| }, | |
| { | |
| "epoch": 0.7593725073118851, | |
| "grad_norm": 0.07803945243358612, | |
| "learning_rate": 6.735411364051865e-06, | |
| "loss": 0.8134850263595581, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.25576, | |
| "step": 714, | |
| "tokens/total": 187170816, | |
| "tokens/train_per_sec_per_gpu": 170.03, | |
| "tokens/trainable": 15638182 | |
| }, | |
| { | |
| "epoch": 0.7604360542408933, | |
| "grad_norm": 0.07364539057016373, | |
| "learning_rate": 6.679983993345067e-06, | |
| "loss": 0.7291663885116577, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.07335, | |
| "step": 715, | |
| "tokens/total": 187432960, | |
| "tokens/train_per_sec_per_gpu": 165.64, | |
| "tokens/trainable": 15658499 | |
| }, | |
| { | |
| "epoch": 0.7614996011699017, | |
| "grad_norm": 0.06459354609251022, | |
| "learning_rate": 6.624739869764931e-06, | |
| "loss": 0.6955982446670532, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.00491, | |
| "step": 716, | |
| "tokens/total": 187695104, | |
| "tokens/train_per_sec_per_gpu": 218.83, | |
| "tokens/trainable": 15681617 | |
| }, | |
| { | |
| "epoch": 0.7625631480989099, | |
| "grad_norm": 0.07153891026973724, | |
| "learning_rate": 6.5696797533200596e-06, | |
| "loss": 0.7669087648391724, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.1531, | |
| "step": 717, | |
| "tokens/total": 187957248, | |
| "tokens/train_per_sec_per_gpu": 215.72, | |
| "tokens/trainable": 15703269 | |
| }, | |
| { | |
| "epoch": 0.7636266950279181, | |
| "grad_norm": 0.0816182941198349, | |
| "learning_rate": 6.514804401487642e-06, | |
| "loss": 0.7865086197853088, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.19572, | |
| "step": 718, | |
| "tokens/total": 188219392, | |
| "tokens/train_per_sec_per_gpu": 197.11, | |
| "tokens/trainable": 15723336 | |
| }, | |
| { | |
| "epoch": 0.7646902419569264, | |
| "grad_norm": 0.07151731103658676, | |
| "learning_rate": 6.460114569202989e-06, | |
| "loss": 0.7884716987609863, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.20003, | |
| "step": 719, | |
| "tokens/total": 188481536, | |
| "tokens/train_per_sec_per_gpu": 194.18, | |
| "tokens/trainable": 15744945 | |
| }, | |
| { | |
| "epoch": 0.7657537888859346, | |
| "grad_norm": 0.08032439649105072, | |
| "learning_rate": 6.405611008849184e-06, | |
| "loss": 0.7517272233963013, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.12066, | |
| "step": 720, | |
| "tokens/total": 188743680, | |
| "tokens/train_per_sec_per_gpu": 182.56, | |
| "tokens/trainable": 15766209 | |
| }, | |
| { | |
| "epoch": 0.7668173358149428, | |
| "grad_norm": 0.0685308575630188, | |
| "learning_rate": 6.351294470246694e-06, | |
| "loss": 0.710903525352478, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.03583, | |
| "step": 721, | |
| "tokens/total": 189005824, | |
| "tokens/train_per_sec_per_gpu": 177.14, | |
| "tokens/trainable": 15790422 | |
| }, | |
| { | |
| "epoch": 0.767880882743951, | |
| "grad_norm": 0.06389962136745453, | |
| "learning_rate": 6.2971657006431175e-06, | |
| "loss": 0.6497384905815125, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 1.91504, | |
| "step": 722, | |
| "tokens/total": 189267968, | |
| "tokens/train_per_sec_per_gpu": 211.94, | |
| "tokens/trainable": 15812341 | |
| }, | |
| { | |
| "epoch": 0.7689444296729593, | |
| "grad_norm": 0.08121001720428467, | |
| "learning_rate": 6.243225444702823e-06, | |
| "loss": 0.7910786867141724, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.20577, | |
| "step": 723, | |
| "tokens/total": 189530112, | |
| "tokens/train_per_sec_per_gpu": 167.38, | |
| "tokens/trainable": 15832072 | |
| }, | |
| { | |
| "epoch": 0.7700079766019675, | |
| "grad_norm": 0.07116419076919556, | |
| "learning_rate": 6.1894744444967525e-06, | |
| "loss": 0.820841372013092, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.27241, | |
| "step": 724, | |
| "tokens/total": 189792256, | |
| "tokens/train_per_sec_per_gpu": 226.98, | |
| "tokens/trainable": 15855273 | |
| }, | |
| { | |
| "epoch": 0.7710715235309759, | |
| "grad_norm": 0.07092837989330292, | |
| "learning_rate": 6.135913439492227e-06, | |
| "loss": 0.7487311363220215, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11432, | |
| "step": 725, | |
| "tokens/total": 190054400, | |
| "tokens/train_per_sec_per_gpu": 171.4, | |
| "tokens/trainable": 15877353 | |
| }, | |
| { | |
| "epoch": 0.7721350704599841, | |
| "grad_norm": 0.07904180139303207, | |
| "learning_rate": 6.0825431665427185e-06, | |
| "loss": 0.779388427734375, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.18014, | |
| "step": 726, | |
| "tokens/total": 190316544, | |
| "tokens/train_per_sec_per_gpu": 166.96, | |
| "tokens/trainable": 15899580 | |
| }, | |
| { | |
| "epoch": 0.7731986173889923, | |
| "grad_norm": 0.08124149590730667, | |
| "learning_rate": 6.029364359877772e-06, | |
| "loss": 0.7941169738769531, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.21249, | |
| "step": 727, | |
| "tokens/total": 190578688, | |
| "tokens/train_per_sec_per_gpu": 164.05, | |
| "tokens/trainable": 15919312 | |
| }, | |
| { | |
| "epoch": 0.7742621643180005, | |
| "grad_norm": 0.07375580817461014, | |
| "learning_rate": 5.976377751092867e-06, | |
| "loss": 0.7667893171310425, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15284, | |
| "step": 728, | |
| "tokens/total": 190840832, | |
| "tokens/train_per_sec_per_gpu": 183.39, | |
| "tokens/trainable": 15940854 | |
| }, | |
| { | |
| "epoch": 0.7753257112470088, | |
| "grad_norm": 0.07292015105485916, | |
| "learning_rate": 5.923584069139372e-06, | |
| "loss": 0.7329660654067993, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.08124, | |
| "step": 729, | |
| "tokens/total": 191102976, | |
| "tokens/train_per_sec_per_gpu": 179.98, | |
| "tokens/trainable": 15960364 | |
| }, | |
| { | |
| "epoch": 0.776389258176017, | |
| "grad_norm": 0.07190407067537308, | |
| "learning_rate": 5.870984040314491e-06, | |
| "loss": 0.7781997919082642, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.17755, | |
| "step": 730, | |
| "tokens/total": 191365120, | |
| "tokens/train_per_sec_per_gpu": 181.25, | |
| "tokens/trainable": 15981713 | |
| }, | |
| { | |
| "epoch": 0.7774528051050252, | |
| "grad_norm": 0.08158424496650696, | |
| "learning_rate": 5.81857838825131e-06, | |
| "loss": 0.7884747385978699, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.20004, | |
| "step": 731, | |
| "tokens/total": 191627264, | |
| "tokens/train_per_sec_per_gpu": 192.3, | |
| "tokens/trainable": 16002933 | |
| }, | |
| { | |
| "epoch": 0.7785163520340335, | |
| "grad_norm": 0.06751953810453415, | |
| "learning_rate": 5.7663678339087995e-06, | |
| "loss": 0.7473230361938477, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.11134, | |
| "step": 732, | |
| "tokens/total": 191889408, | |
| "tokens/train_per_sec_per_gpu": 204.72, | |
| "tokens/trainable": 16023922 | |
| }, | |
| { | |
| "epoch": 0.7795798989630417, | |
| "grad_norm": 0.07681619375944138, | |
| "learning_rate": 5.714353095561929e-06, | |
| "loss": 0.669176459312439, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 1.95263, | |
| "step": 733, | |
| "tokens/total": 192151552, | |
| "tokens/train_per_sec_per_gpu": 214.29, | |
| "tokens/trainable": 16047794 | |
| }, | |
| { | |
| "epoch": 0.78064344589205, | |
| "grad_norm": 0.07466083765029907, | |
| "learning_rate": 5.66253488879178e-06, | |
| "loss": 0.7656868696212769, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15047, | |
| "step": 734, | |
| "tokens/total": 192413696, | |
| "tokens/train_per_sec_per_gpu": 172.48, | |
| "tokens/trainable": 16067607 | |
| }, | |
| { | |
| "epoch": 0.7817069928210583, | |
| "grad_norm": 0.07689571380615234, | |
| "learning_rate": 5.6109139264756715e-06, | |
| "loss": 0.782907247543335, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.18782, | |
| "step": 735, | |
| "tokens/total": 192675840, | |
| "tokens/train_per_sec_per_gpu": 141.32, | |
| "tokens/trainable": 16086585 | |
| }, | |
| { | |
| "epoch": 0.7827705397500665, | |
| "grad_norm": 0.06553677469491959, | |
| "learning_rate": 5.55949091877741e-06, | |
| "loss": 0.805263876914978, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.23729, | |
| "step": 736, | |
| "tokens/total": 192937984, | |
| "tokens/train_per_sec_per_gpu": 213.73, | |
| "tokens/trainable": 16110209 | |
| }, | |
| { | |
| "epoch": 0.7838340866790747, | |
| "grad_norm": 0.07252180576324463, | |
| "learning_rate": 5.508266573137449e-06, | |
| "loss": 0.7305166125297546, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.07615, | |
| "step": 737, | |
| "tokens/total": 193200128, | |
| "tokens/train_per_sec_per_gpu": 189.19, | |
| "tokens/trainable": 16132001 | |
| }, | |
| { | |
| "epoch": 0.784897633608083, | |
| "grad_norm": 0.08187612891197205, | |
| "learning_rate": 5.457241594263219e-06, | |
| "loss": 0.8013345003128052, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.22851, | |
| "step": 738, | |
| "tokens/total": 193462272, | |
| "tokens/train_per_sec_per_gpu": 196.91, | |
| "tokens/trainable": 16152126 | |
| }, | |
| { | |
| "epoch": 0.7859611805370912, | |
| "grad_norm": 0.06783902645111084, | |
| "learning_rate": 5.4064166841194e-06, | |
| "loss": 0.7070966958999634, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.02809, | |
| "step": 739, | |
| "tokens/total": 193724416, | |
| "tokens/train_per_sec_per_gpu": 187.99, | |
| "tokens/trainable": 16173209 | |
| }, | |
| { | |
| "epoch": 0.7870247274660994, | |
| "grad_norm": 0.07314567267894745, | |
| "learning_rate": 5.3557925419182764e-06, | |
| "loss": 0.7426178455352783, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.10143, | |
| "step": 740, | |
| "tokens/total": 193986560, | |
| "tokens/train_per_sec_per_gpu": 192.09, | |
| "tokens/trainable": 16193859 | |
| }, | |
| { | |
| "epoch": 0.7880882743951076, | |
| "grad_norm": 0.07094215601682663, | |
| "learning_rate": 5.305369864110095e-06, | |
| "loss": 0.7928224802017212, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.20962, | |
| "step": 741, | |
| "tokens/total": 194248704, | |
| "tokens/train_per_sec_per_gpu": 206.94, | |
| "tokens/trainable": 16217465 | |
| }, | |
| { | |
| "epoch": 0.7891518213241159, | |
| "grad_norm": 0.07578529417514801, | |
| "learning_rate": 5.255149344373525e-06, | |
| "loss": 0.7676488161087036, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.15469, | |
| "step": 742, | |
| "tokens/total": 194510848, | |
| "tokens/train_per_sec_per_gpu": 209.86, | |
| "tokens/trainable": 16237798 | |
| }, | |
| { | |
| "epoch": 0.7902153682531242, | |
| "grad_norm": 0.07046926766633987, | |
| "learning_rate": 5.205131673606072e-06, | |
| "loss": 0.7564293742179871, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13065, | |
| "step": 743, | |
| "tokens/total": 194772992, | |
| "tokens/train_per_sec_per_gpu": 188.6, | |
| "tokens/trainable": 16259923 | |
| }, | |
| { | |
| "epoch": 0.7912789151821324, | |
| "grad_norm": 0.07736487686634064, | |
| "learning_rate": 5.155317539914601e-06, | |
| "loss": 0.8347206115722656, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.30417, | |
| "step": 744, | |
| "tokens/total": 195035136, | |
| "tokens/train_per_sec_per_gpu": 185.07, | |
| "tokens/trainable": 16281319 | |
| }, | |
| { | |
| "epoch": 0.7923424621111407, | |
| "grad_norm": 0.07465264946222305, | |
| "learning_rate": 5.105707628605872e-06, | |
| "loss": 0.7569116950035095, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.13168, | |
| "step": 745, | |
| "tokens/total": 195297280, | |
| "tokens/train_per_sec_per_gpu": 208.28, | |
| "tokens/trainable": 16304177 | |
| }, | |
| { | |
| "epoch": 0.7934060090401489, | |
| "grad_norm": 0.07319594919681549, | |
| "learning_rate": 5.056302622177074e-06, | |
| "loss": 0.8522895574569702, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.34501, | |
| "step": 746, | |
| "tokens/total": 195559424, | |
| "tokens/train_per_sec_per_gpu": 183.83, | |
| "tokens/trainable": 16325635 | |
| }, | |
| { | |
| "epoch": 0.7944695559691571, | |
| "grad_norm": 0.06852512806653976, | |
| "learning_rate": 5.007103200306493e-06, | |
| "loss": 0.6919762492179871, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 1.99766, | |
| "step": 747, | |
| "tokens/total": 195821568, | |
| "tokens/train_per_sec_per_gpu": 169.51, | |
| "tokens/trainable": 16347890 | |
| }, | |
| { | |
| "epoch": 0.7955331028981654, | |
| "grad_norm": 0.06914964318275452, | |
| "learning_rate": 4.958110039844109e-06, | |
| "loss": 0.7257647514343262, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.06631, | |
| "step": 748, | |
| "tokens/total": 196083712, | |
| "tokens/train_per_sec_per_gpu": 180.18, | |
| "tokens/trainable": 16370035 | |
| }, | |
| { | |
| "epoch": 0.7965966498271736, | |
| "grad_norm": 0.08356507867574692, | |
| "learning_rate": 4.9093238148023206e-06, | |
| "loss": 0.8562688827514648, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.35436, | |
| "step": 749, | |
| "tokens/total": 196345856, | |
| "tokens/train_per_sec_per_gpu": 163.34, | |
| "tokens/trainable": 16389246 | |
| }, | |
| { | |
| "epoch": 0.7976601967561818, | |
| "grad_norm": 0.06851000338792801, | |
| "learning_rate": 4.860745196346652e-06, | |
| "loss": 0.732913076877594, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 46.14, | |
| "memory/max_allocated (GiB)": 46.14, | |
| "ppl": 2.08113, | |
| "step": 750, | |
| "tokens/total": 196608000, | |
| "tokens/train_per_sec_per_gpu": 191.77, | |
| "tokens/trainable": 16410595 | |
| }, | |
| { | |
| "epoch": 0.7976601967561818, | |
| "eval_loss": 0.7697240710258484, | |
| "eval_ppl": 2.15917, | |
| "eval_runtime": 237.415, | |
| "eval_samples_per_second": 28.162, | |
| "eval_steps_per_second": 1.761, | |
| "memory/device_reserved (GiB)": 52.46, | |
| "memory/max_active (GiB)": 38.19, | |
| "memory/max_allocated (GiB)": 38.19, | |
| "step": 750 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 941, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 150, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 1000, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 8.388464118595584e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |