Instructions to use yjyjyj98/drdapo_lora32_max4096 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use yjyjyj98/drdapo_lora32_max4096 with PEFT:
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("unsloth/Qwen3-4B-Base")
model = PeftModel.from_pretrained(base_model, "yjyjyj98/drdapo_lora32_max4096")
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- Unsloth Studio
How to use yjyjyj98/drdapo_lora32_max4096 with Unsloth Studio:
Install Unsloth Studio (macOS, Linux, WSL)
curl -fsSL https://unsloth.ai/install.sh | sh
# Run unsloth studio
unsloth studio -H 0.0.0.0 -p 8888
# Then open http://localhost:8888 in your browser
# Search for yjyjyj98/drdapo_lora32_max4096 to start chatting
Install Unsloth Studio (Windows)
irm https://unsloth.ai/install.ps1 | iex
# Run unsloth studio
unsloth studio -H 0.0.0.0 -p 8888
# Then open http://localhost:8888 in your browser
# Search for yjyjyj98/drdapo_lora32_max4096 to start chatting
Using HuggingFace Spaces for Unsloth
# No setup required
# Open https://huggingface.co/spaces/unsloth/studio in your browser
# Search for yjyjyj98/drdapo_lora32_max4096 to start chatting
Load model with FastModel
pip install unsloth

from unsloth import FastModel

model, tokenizer = FastModel.from_pretrained(
    model_name="yjyjyj98/drdapo_lora32_max4096",
    max_seq_length=2048,
)
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.03142677561282212, | |
| "eval_steps": 500, | |
| "global_step": 100, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 387.5625, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 929.0, | |
| "completions/mean_length": 387.5625, | |
| "completions/mean_terminated_length": 298.6000061035156, | |
| "completions/min_length": 1.0, | |
| "completions/min_terminated_length": 1.0, | |
| "epoch": 0.00031426775612822125, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 9.58870792388916, | |
| "kl": 1.1056605577468872, | |
| "learning_rate": 0.0, | |
| "loss": 0.0442, | |
| "num_tokens": 10321.0, | |
| "reward": -8.78125, | |
| "reward_std": 1.5290063619613647, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -1.15625, | |
| "rewards/check_numbers/std": 1.1212902069091797, | |
| "rewards/format_and_language_reward_func/mean": -3.625, | |
| "rewards/format_and_language_reward_func/std": 0.9574271440505981, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 427.625, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1380.0, | |
| "completions/mean_length": 427.625, | |
| "completions/mean_terminated_length": 341.3333435058594, | |
| "completions/min_length": 1.0, | |
| "completions/min_terminated_length": 1.0, | |
| "epoch": 0.0006285355122564425, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 15.610054016113281, | |
| "kl": 0.5778560638427734, | |
| "learning_rate": 5.000000000000001e-07, | |
| "loss": 0.0231, | |
| "num_tokens": 20955.0, | |
| "reward": -9.5, | |
| "reward_std": 1.6830127239227295, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -1.75, | |
| "rewards/check_numbers/std": 0.8366600275039673, | |
| "rewards/format_and_language_reward_func/mean": -3.75, | |
| "rewards/format_and_language_reward_func/std": 1.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 979.5, | |
| "completions/clipped_ratio": 0.3125, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1507.0, | |
| "completions/mean_length": 979.5, | |
| "completions/mean_terminated_length": 642.0, | |
| "completions/min_length": 1.0, | |
| "completions/min_terminated_length": 1.0, | |
| "epoch": 0.0009428032683846638, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 4.994994163513184, | |
| "kl": 0.2919767200946808, | |
| "learning_rate": 1.0000000000000002e-06, | |
| "loss": 0.0117, | |
| "num_tokens": 40683.0, | |
| "reward": -7.53125, | |
| "reward_std": 1.3821797370910645, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.40625, | |
| "rewards/check_numbers/std": 1.3443554639816284, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 836.3125, | |
| "completions/clipped_ratio": 0.3125, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1175.0, | |
| "completions/mean_length": 836.3125, | |
| "completions/mean_terminated_length": 433.727294921875, | |
| "completions/min_length": 1.0, | |
| "completions/min_terminated_length": 1.0, | |
| "epoch": 0.001257071024512885, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 16.14759635925293, | |
| "kl": 0.5689749717712402, | |
| "learning_rate": 1.5e-06, | |
| "loss": 0.0228, | |
| "num_tokens": 58020.0, | |
| "reward": -9.03125, | |
| "reward_std": 1.9392420053482056, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -1.40625, | |
| "rewards/check_numbers/std": 1.0680004358291626, | |
| "rewards/format_and_language_reward_func/mean": -3.625, | |
| "rewards/format_and_language_reward_func/std": 0.9574271440505981, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 553.25, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1469.0, | |
| "completions/mean_length": 553.25, | |
| "completions/mean_terminated_length": 475.3333435058594, | |
| "completions/min_length": 1.0, | |
| "completions/min_terminated_length": 1.0, | |
| "epoch": 0.0015713387806411063, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 5.5043792724609375, | |
| "kl": 0.5717646479606628, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 0.0229, | |
| "num_tokens": 70976.0, | |
| "reward": -8.40625, | |
| "reward_std": 0.9375, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -1.03125, | |
| "rewards/check_numbers/std": 1.007782220840454, | |
| "rewards/format_and_language_reward_func/mean": -3.375, | |
| "rewards/format_and_language_reward_func/std": 0.8062257766723633, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 750.9375, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1676.0, | |
| "completions/mean_length": 750.9375, | |
| "completions/mean_terminated_length": 612.2142944335938, | |
| "completions/min_length": 1.0, | |
| "completions/min_terminated_length": 1.0, | |
| "epoch": 0.0018856065367693275, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 7.5527191162109375, | |
| "kl": 0.26640018820762634, | |
| "learning_rate": 2.5e-06, | |
| "loss": 0.0107, | |
| "num_tokens": 87127.0, | |
| "reward": -8.40625, | |
| "reward_std": 1.3351925611495972, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.90625, | |
| "rewards/check_numbers/std": 1.1138334274291992, | |
| "rewards/format_and_language_reward_func/mean": -3.5, | |
| "rewards/format_and_language_reward_func/std": 0.8944272398948669, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 822.625, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1699.0, | |
| "completions/max_terminated_length": 1699.0, | |
| "completions/mean_length": 822.625, | |
| "completions/mean_terminated_length": 822.625, | |
| "completions/min_length": 1.0, | |
| "completions/min_terminated_length": 1.0, | |
| "epoch": 0.0021998742928975488, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 6.19105863571167, | |
| "kl": 0.17402563989162445, | |
| "learning_rate": 3e-06, | |
| "loss": 0.007, | |
| "num_tokens": 104165.0, | |
| "reward": -7.96875, | |
| "reward_std": 1.063370943069458, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.71875, | |
| "rewards/check_numbers/std": 0.8750000596046448, | |
| "rewards/format_and_language_reward_func/mean": -3.25, | |
| "rewards/format_and_language_reward_func/std": 0.6831300854682922, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 1419.1875, | |
| "completions/clipped_ratio": 0.6875, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1149.0, | |
| "completions/mean_length": 1419.1875, | |
| "completions/mean_terminated_length": 753.0, | |
| "completions/min_length": 366.0, | |
| "completions/min_terminated_length": 366.0, | |
| "epoch": 0.00251414204902577, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.1348615139722824, | |
| "kl": 0.029216211289167404, | |
| "learning_rate": 3.5e-06, | |
| "loss": 0.0012, | |
| "num_tokens": 130604.0, | |
| "reward": -8.0625, | |
| "reward_std": 0.625, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.9375, | |
| "rewards/check_numbers/std": 0.75, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 1018.125, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1565.0, | |
| "completions/mean_length": 1018.125, | |
| "completions/mean_terminated_length": 595.7999877929688, | |
| "completions/min_length": 1.0, | |
| "completions/min_terminated_length": 1.0, | |
| "epoch": 0.0028284098051539913, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 7.937878131866455, | |
| "kl": 0.42939966917037964, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 0.0172, | |
| "num_tokens": 151170.0, | |
| "reward": -7.8125, | |
| "reward_std": 1.1593647003173828, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.6875, | |
| "rewards/check_numbers/std": 0.9639329314231873, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 1018.625, | |
| "completions/clipped_ratio": 0.3125, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1317.0, | |
| "completions/mean_length": 1018.625, | |
| "completions/mean_terminated_length": 698.9091186523438, | |
| "completions/min_length": 3.0, | |
| "completions/min_terminated_length": 3.0, | |
| "epoch": 0.0031426775612822125, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 3.0097506046295166, | |
| "kl": 0.15729668736457825, | |
| "learning_rate": 4.5e-06, | |
| "loss": 0.0063, | |
| "num_tokens": 171388.0, | |
| "reward": -7.75, | |
| "reward_std": 0.8080127239227295, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.75, | |
| "rewards/check_numbers/std": 0.7745966911315918, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 1189.8125, | |
| "completions/clipped_ratio": 0.3125, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1696.0, | |
| "completions/mean_length": 1189.8125, | |
| "completions/mean_terminated_length": 947.9091186523438, | |
| "completions/min_length": 423.0, | |
| "completions/min_terminated_length": 423.0, | |
| "epoch": 0.0034569453174104338, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.136498361825943, | |
| "kl": 0.05319710448384285, | |
| "learning_rate": 5e-06, | |
| "loss": 0.0021, | |
| "num_tokens": 194373.0, | |
| "reward": -7.75, | |
| "reward_std": 0.8080127239227295, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.75, | |
| "rewards/check_numbers/std": 0.7745966911315918, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 1122.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1665.0, | |
| "completions/mean_length": 1122.0, | |
| "completions/mean_terminated_length": 983.5385131835938, | |
| "completions/min_length": 498.0, | |
| "completions/min_terminated_length": 498.0, | |
| "epoch": 0.003771213073538655, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0965699553489685, | |
| "kl": 0.06320463120937347, | |
| "learning_rate": 4.944444444444445e-06, | |
| "loss": 0.0025, | |
| "num_tokens": 216261.0, | |
| "reward": -7.65625, | |
| "reward_std": 0.40400636196136475, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.65625, | |
| "rewards/check_numbers/std": 0.7685213088989258, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 1341.75, | |
| "completions/clipped_ratio": 0.4375, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1607.0, | |
| "completions/mean_length": 1341.75, | |
| "completions/mean_terminated_length": 1046.0, | |
| "completions/min_length": 592.0, | |
| "completions/min_terminated_length": 592.0, | |
| "epoch": 0.004085480829666876, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.07173436135053635, | |
| "kl": 0.04135803505778313, | |
| "learning_rate": 4.888888888888889e-06, | |
| "loss": 0.0017, | |
| "num_tokens": 241453.0, | |
| "reward": -7.84375, | |
| "reward_std": 0.40400636196136475, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.84375, | |
| "rewards/check_numbers/std": 0.7685213088989258, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 1283.8125, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1221.0, | |
| "completions/mean_length": 1283.8125, | |
| "completions/mean_terminated_length": 553.5, | |
| "completions/min_length": 1.0, | |
| "completions/min_terminated_length": 1.0, | |
| "epoch": 0.0043997485857950975, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 6.01732063293457, | |
| "kl": 0.17246384918689728, | |
| "learning_rate": 4.833333333333333e-06, | |
| "loss": 0.0069, | |
| "num_tokens": 266634.0, | |
| "reward": -7.84375, | |
| "reward_std": 1.1508427858352661, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.71875, | |
| "rewards/check_numbers/std": 0.8750000596046448, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 1189.6875, | |
| "completions/clipped_ratio": 0.3125, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1694.0, | |
| "completions/mean_length": 1189.6875, | |
| "completions/mean_terminated_length": 947.727294921875, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 36.0, | |
| "epoch": 0.004714016341923318, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6227664351463318, | |
| "kl": 0.08904990553855896, | |
| "learning_rate": 4.777777777777778e-06, | |
| "loss": 0.0036, | |
| "num_tokens": 289373.0, | |
| "reward": -6.90625, | |
| "reward_std": 1.0290063619613647, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": 0.09375, | |
| "rewards/check_numbers/std": 1.827737808227539, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 1361.0, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1314.0, | |
| "completions/mean_length": 1361.0, | |
| "completions/mean_terminated_length": 759.3333740234375, | |
| "completions/min_length": 148.0, | |
| "completions/min_terminated_length": 148.0, | |
| "epoch": 0.00502828409805154, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.12482193857431412, | |
| "kl": 0.05782376229763031, | |
| "learning_rate": 4.722222222222222e-06, | |
| "loss": 0.0023, | |
| "num_tokens": 314701.0, | |
| "reward": -8.09375, | |
| "reward_std": 0.9840351343154907, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.84375, | |
| "rewards/check_numbers/std": 0.7685213088989258, | |
| "rewards/format_and_language_reward_func/mean": -3.25, | |
| "rewards/format_and_language_reward_func/std": 0.6831300854682922, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 1330.625, | |
| "completions/clipped_ratio": 0.5625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1668.0, | |
| "completions/mean_length": 1330.625, | |
| "completions/mean_terminated_length": 827.4285888671875, | |
| "completions/min_length": 14.0, | |
| "completions/min_terminated_length": 14.0, | |
| "epoch": 0.005342551854179761, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.09849988669157028, | |
| "kl": 0.07715655118227005, | |
| "learning_rate": 4.666666666666667e-06, | |
| "loss": 0.0031, | |
| "num_tokens": 339795.0, | |
| "reward": -7.9375, | |
| "reward_std": 0.375, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.9375, | |
| "rewards/check_numbers/std": 0.75, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 1300.3125, | |
| "completions/clipped_ratio": 0.5, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1540.0, | |
| "completions/mean_length": 1300.3125, | |
| "completions/mean_terminated_length": 878.625, | |
| "completions/min_length": 5.0, | |
| "completions/min_terminated_length": 5.0, | |
| "epoch": 0.0056568196103079825, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 2.448221445083618, | |
| "kl": 0.3021676242351532, | |
| "learning_rate": 4.611111111111112e-06, | |
| "loss": 0.0121, | |
| "num_tokens": 364600.0, | |
| "reward": -8.3125, | |
| "reward_std": 1.2111132144927979, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.9375, | |
| "rewards/check_numbers/std": 0.75, | |
| "rewards/format_and_language_reward_func/mean": -3.375, | |
| "rewards/format_and_language_reward_func/std": 0.8062257766723633, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 1257.625, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1613.0, | |
| "completions/mean_length": 1257.625, | |
| "completions/mean_terminated_length": 979.0, | |
| "completions/min_length": 311.0, | |
| "completions/min_terminated_length": 311.0, | |
| "epoch": 0.005971087366436203, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.12901267409324646, | |
| "kl": 0.06548095494508743, | |
| "learning_rate": 4.555555555555556e-06, | |
| "loss": 0.0026, | |
| "num_tokens": 388078.0, | |
| "reward": -7.84375, | |
| "reward_std": 0.8176814913749695, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.59375, | |
| "rewards/check_numbers/std": 1.7050782442092896, | |
| "rewards/format_and_language_reward_func/mean": -3.25, | |
| "rewards/format_and_language_reward_func/std": 0.6831300854682922, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 1573.6875, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1592.0, | |
| "completions/mean_length": 1573.6875, | |
| "completions/mean_terminated_length": 1128.75, | |
| "completions/min_length": 825.0, | |
| "completions/min_terminated_length": 825.0, | |
| "epoch": 0.006285355122564425, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.10170631110668182, | |
| "kl": 0.04396039992570877, | |
| "learning_rate": 4.5e-06, | |
| "loss": 0.0018, | |
| "num_tokens": 417161.0, | |
| "reward": -8.375, | |
| "reward_std": 0.9611132144927979, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -1.125, | |
| "rewards/check_numbers/std": 0.670820415019989, | |
| "rewards/format_and_language_reward_func/mean": -3.25, | |
| "rewards/format_and_language_reward_func/std": 0.6831300854682922, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 1380.5, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1633.0, | |
| "completions/mean_length": 1380.5, | |
| "completions/mean_terminated_length": 1175.5999755859375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "epoch": 0.006599622878692646, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 2.5239837169647217, | |
| "kl": 0.16142641007900238, | |
| "learning_rate": 4.444444444444444e-06, | |
| "loss": 0.0065, | |
| "num_tokens": 443361.0, | |
| "reward": -8.03125, | |
| "reward_std": 0.6205127239227295, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -1.03125, | |
| "rewards/check_numbers/std": 0.7180703282356262, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 1384.25, | |
| "completions/clipped_ratio": 0.5625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1601.0, | |
| "completions/mean_length": 1384.25, | |
| "completions/mean_terminated_length": 950.0000610351562, | |
| "completions/min_length": 427.0, | |
| "completions/min_terminated_length": 427.0, | |
| "epoch": 0.0069138906348208675, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.09287308901548386, | |
| "kl": 0.06003550812602043, | |
| "learning_rate": 4.388888888888889e-06, | |
| "loss": 0.0024, | |
| "num_tokens": 469325.0, | |
| "reward": -7.78125, | |
| "reward_std": 0.4375, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.65625, | |
| "rewards/check_numbers/std": 0.7685213088989258, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 1367.4375, | |
| "completions/clipped_ratio": 0.4375, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1606.0, | |
| "completions/mean_length": 1367.4375, | |
| "completions/mean_terminated_length": 1091.6666259765625, | |
| "completions/min_length": 531.0, | |
| "completions/min_terminated_length": 531.0, | |
| "epoch": 0.007228158390949088, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.3170417845249176, | |
| "kl": 0.06963013857603073, | |
| "learning_rate": 4.333333333333334e-06, | |
| "loss": 0.0028, | |
| "num_tokens": 495200.0, | |
| "reward": -7.1875, | |
| "reward_std": 0.375, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.1875, | |
| "rewards/check_numbers/std": 0.5123475790023804, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 1551.6875, | |
| "completions/clipped_ratio": 0.8125, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1699.0, | |
| "completions/mean_length": 1551.6875, | |
| "completions/mean_terminated_length": 813.6666870117188, | |
| "completions/min_length": 243.0, | |
| "completions/min_terminated_length": 243.0, | |
| "epoch": 0.00754242614707731, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.20077864825725555, | |
| "kl": 0.05942363664507866, | |
| "learning_rate": 4.277777777777778e-06, | |
| "loss": 0.0024, | |
| "num_tokens": 523435.0, | |
| "reward": -8.28125, | |
| "reward_std": 0.9840351343154907, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -1.03125, | |
| "rewards/check_numbers/std": 0.7180703282356262, | |
| "rewards/format_and_language_reward_func/mean": -3.25, | |
| "rewards/format_and_language_reward_func/std": 0.6831300854682922, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 1567.25, | |
| "completions/clipped_ratio": 0.875, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 833.0, | |
| "completions/mean_length": 1567.25, | |
| "completions/mean_terminated_length": 484.0, | |
| "completions/min_length": 135.0, | |
| "completions/min_terminated_length": 135.0, | |
| "epoch": 0.00785669390320553, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.15381832420825958, | |
| "kl": 0.05258103832602501, | |
| "learning_rate": 4.222222222222223e-06, | |
| "loss": 0.0021, | |
| "num_tokens": 552719.0, | |
| "reward": -7.5, | |
| "reward_std": 0.661700427532196, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.375, | |
| "rewards/check_numbers/std": 0.670820415019989, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 1234.5625, | |
| "completions/clipped_ratio": 0.5, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1246.0, | |
| "completions/mean_length": 1234.5625, | |
| "completions/mean_terminated_length": 747.125, | |
| "completions/min_length": 43.0, | |
| "completions/min_terminated_length": 43.0, | |
| "epoch": 0.008170961659333752, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.2516714334487915, | |
| "kl": 0.07844924181699753, | |
| "learning_rate": 4.166666666666667e-06, | |
| "loss": 0.0031, | |
| "num_tokens": 576460.0, | |
| "reward": -7.75, | |
| "reward_std": 0.5915063619613647, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.75, | |
| "rewards/check_numbers/std": 0.7745966911315918, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 1580.875, | |
| "completions/clipped_ratio": 0.6875, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1688.0, | |
| "completions/mean_length": 1580.875, | |
| "completions/mean_terminated_length": 1270.4000244140625, | |
| "completions/min_length": 815.0, | |
| "completions/min_terminated_length": 815.0, | |
| "epoch": 0.008485229415461974, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.08427742123603821, | |
| "kl": 0.040127161890268326, | |
| "learning_rate": 4.111111111111111e-06, | |
| "loss": 0.0016, | |
| "num_tokens": 605630.0, | |
| "reward": -7.84375, | |
| "reward_std": 0.5625, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.84375, | |
| "rewards/check_numbers/std": 0.7685213088989258, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 1360.1875, | |
| "completions/clipped_ratio": 0.5, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1577.0, | |
| "completions/mean_length": 1360.1875, | |
| "completions/mean_terminated_length": 998.375, | |
| "completions/min_length": 415.0, | |
| "completions/min_terminated_length": 415.0, | |
| "epoch": 0.008799497171590195, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13703206181526184, | |
| "kl": 0.05611160770058632, | |
| "learning_rate": 4.055555555555556e-06, | |
| "loss": 0.0022, | |
| "num_tokens": 631529.0, | |
| "reward": -8.21875, | |
| "reward_std": 0.9407068490982056, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.84375, | |
| "rewards/check_numbers/std": 0.7685213088989258, | |
| "rewards/format_and_language_reward_func/mean": -3.375, | |
| "rewards/format_and_language_reward_func/std": 0.8062257766723633, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 1515.625, | |
| "completions/clipped_ratio": 0.6875, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1188.0, | |
| "completions/mean_length": 1515.625, | |
| "completions/mean_terminated_length": 1061.5999755859375, | |
| "completions/min_length": 871.0, | |
| "completions/min_terminated_length": 871.0, | |
| "epoch": 0.009113764927718416, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1371777057647705, | |
| "kl": 0.05326192080974579, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 0.0021, | |
| "num_tokens": 659651.0, | |
| "reward": -8.09375, | |
| "reward_std": 0.9040063619613647, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.84375, | |
| "rewards/check_numbers/std": 0.7685213088989258, | |
| "rewards/format_and_language_reward_func/mean": -3.25, | |
| "rewards/format_and_language_reward_func/std": 0.6831300854682922, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 1422.0625, | |
| "completions/clipped_ratio": 0.5625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1427.0, | |
| "completions/mean_length": 1422.0625, | |
| "completions/mean_terminated_length": 1036.4285888671875, | |
| "completions/min_length": 1.0, | |
| "completions/min_terminated_length": 1.0, | |
| "epoch": 0.009428032683846637, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 1.34744393825531, | |
| "kl": 2.590277671813965, | |
| "learning_rate": 3.944444444444445e-06, | |
| "loss": 0.1036, | |
| "num_tokens": 686368.0, | |
| "reward": -7.84375, | |
| "reward_std": 1.1540063619613647, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.71875, | |
| "rewards/check_numbers/std": 0.8750000596046448, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 1592.75, | |
| "completions/clipped_ratio": 0.8125, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1348.0, | |
| "completions/mean_length": 1592.75, | |
| "completions/mean_terminated_length": 1032.666748046875, | |
| "completions/min_length": 537.0, | |
| "completions/min_terminated_length": 537.0, | |
| "epoch": 0.00974230043997486, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.09155543893575668, | |
| "kl": 0.05414074286818504, | |
| "learning_rate": 3.88888888888889e-06, | |
| "loss": 0.0022, | |
| "num_tokens": 715988.0, | |
| "reward": -7.40625, | |
| "reward_std": 0.4375, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.28125, | |
| "rewards/check_numbers/std": 0.6046693325042725, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 1555.5625, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1635.0, | |
| "completions/mean_length": 1555.5625, | |
| "completions/mean_terminated_length": 1056.25, | |
| "completions/min_length": 619.0, | |
| "completions/min_terminated_length": 619.0, | |
| "epoch": 0.01005656819610308, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.15684477984905243, | |
| "kl": 0.05063142254948616, | |
| "learning_rate": 3.833333333333334e-06, | |
| "loss": 0.002, | |
| "num_tokens": 744669.0, | |
| "reward": -8.40625, | |
| "reward_std": 1.0148502588272095, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -1.03125, | |
| "rewards/check_numbers/std": 0.7180703282356262, | |
| "rewards/format_and_language_reward_func/mean": -3.375, | |
| "rewards/format_and_language_reward_func/std": 0.8062257766723633, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 1546.9375, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1335.0, | |
| "completions/mean_length": 1546.9375, | |
| "completions/mean_terminated_length": 1021.75, | |
| "completions/min_length": 747.0, | |
| "completions/min_terminated_length": 747.0, | |
| "epoch": 0.010370835952231301, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.07121387869119644, | |
| "kl": 0.032333556562662125, | |
| "learning_rate": 3.777777777777778e-06, | |
| "loss": 0.0013, | |
| "num_tokens": 773388.0, | |
| "reward": -7.46875, | |
| "reward_std": 0.5625, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.46875, | |
| "rewards/check_numbers/std": 0.7180703282356262, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 1376.4375, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1313.0, | |
| "completions/mean_length": 1376.4375, | |
| "completions/mean_terminated_length": 800.5, | |
| "completions/min_length": 5.0, | |
| "completions/min_terminated_length": 5.0, | |
| "epoch": 0.010685103708359522, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 1.1596808433532715, | |
| "kl": 0.12857064604759216, | |
| "learning_rate": 3.7222222222222225e-06, | |
| "loss": 0.0051, | |
| "num_tokens": 799511.0, | |
| "reward": -7.5, | |
| "reward_std": 0.661700427532196, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.375, | |
| "rewards/check_numbers/std": 0.670820415019989, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 1699.6875, | |
| "completions/clipped_ratio": 0.875, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1711.0, | |
| "completions/mean_length": 1699.6875, | |
| "completions/mean_terminated_length": 1543.5, | |
| "completions/min_length": 1376.0, | |
| "completions/min_terminated_length": 1376.0, | |
| "epoch": 0.010999371464487744, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.11480734497308731, | |
| "kl": 0.04761524498462677, | |
| "learning_rate": 3.6666666666666666e-06, | |
| "loss": 0.0019, | |
| "num_tokens": 830966.0, | |
| "reward": -7.96875, | |
| "reward_std": 1.2129219770431519, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.46875, | |
| "rewards/check_numbers/std": 0.7180703282356262, | |
| "rewards/format_and_language_reward_func/mean": -3.5, | |
| "rewards/format_and_language_reward_func/std": 0.8944272398948669, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 1583.75, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1432.0, | |
| "completions/mean_length": 1583.75, | |
| "completions/mean_terminated_length": 1169.0, | |
| "completions/min_length": 686.0, | |
| "completions/min_terminated_length": 686.0, | |
| "epoch": 0.011313639220615965, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.34630483388900757, | |
| "kl": 0.07976571470499039, | |
| "learning_rate": 3.6111111111111115e-06, | |
| "loss": 0.0032, | |
| "num_tokens": 859930.0, | |
| "reward": -8.09375, | |
| "reward_std": 0.9840351343154907, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.84375, | |
| "rewards/check_numbers/std": 0.7685213088989258, | |
| "rewards/format_and_language_reward_func/mean": -3.25, | |
| "rewards/format_and_language_reward_func/std": 0.6831300854682922, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 1338.875, | |
| "completions/clipped_ratio": 0.6875, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 622.0, | |
| "completions/mean_length": 1338.875, | |
| "completions/mean_terminated_length": 496.0, | |
| "completions/min_length": 286.0, | |
| "completions/min_terminated_length": 286.0, | |
| "epoch": 0.011627906976744186, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1614941507577896, | |
| "kl": 0.06494183838367462, | |
| "learning_rate": 3.555555555555556e-06, | |
| "loss": 0.0026, | |
| "num_tokens": 885120.0, | |
| "reward": -7.6875, | |
| "reward_std": 0.8080127239227295, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.5625, | |
| "rewards/check_numbers/std": 0.75, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 1480.0625, | |
| "completions/clipped_ratio": 0.5625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1611.0, | |
| "completions/mean_length": 1480.0625, | |
| "completions/mean_terminated_length": 1169.0, | |
| "completions/min_length": 525.0, | |
| "completions/min_terminated_length": 525.0, | |
| "epoch": 0.011942174732872407, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.07541613280773163, | |
| "kl": 0.052320241928100586, | |
| "learning_rate": 3.5e-06, | |
| "loss": 0.0021, | |
| "num_tokens": 912537.0, | |
| "reward": -8.0, | |
| "reward_std": 0.28867512941360474, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.75, | |
| "rewards/check_numbers/std": 0.7745966911315918, | |
| "rewards/format_and_language_reward_func/mean": -3.25, | |
| "rewards/format_and_language_reward_func/std": 0.6831300854682922, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 1373.75, | |
| "completions/clipped_ratio": 0.5625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1538.0, | |
| "completions/mean_length": 1373.75, | |
| "completions/mean_terminated_length": 926.0000610351562, | |
| "completions/min_length": 489.0, | |
| "completions/min_terminated_length": 489.0, | |
| "epoch": 0.01225644248900063, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.16380992531776428, | |
| "kl": 0.06148176267743111, | |
| "learning_rate": 3.444444444444445e-06, | |
| "loss": 0.0025, | |
| "num_tokens": 938505.0, | |
| "reward": -8.0, | |
| "reward_std": 1.0497419834136963, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.75, | |
| "rewards/check_numbers/std": 0.7745966911315918, | |
| "rewards/format_and_language_reward_func/mean": -3.25, | |
| "rewards/format_and_language_reward_func/std": 0.6831300854682922, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 1519.1875, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1336.0, | |
| "completions/mean_length": 1519.1875, | |
| "completions/mean_terminated_length": 910.75, | |
| "completions/min_length": 601.0, | |
| "completions/min_terminated_length": 601.0, | |
| "epoch": 0.01257071024512885, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.10772477090358734, | |
| "kl": 0.04433707520365715, | |
| "learning_rate": 3.3888888888888893e-06, | |
| "loss": 0.0018, | |
| "num_tokens": 966940.0, | |
| "reward": -7.59375, | |
| "reward_std": 0.7340351343154907, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.46875, | |
| "rewards/check_numbers/std": 0.7180703282356262, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 1453.9375, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1432.0, | |
| "completions/mean_length": 1453.9375, | |
| "completions/mean_terminated_length": 1007.1666870117188, | |
| "completions/min_length": 350.0, | |
| "completions/min_terminated_length": 350.0, | |
| "epoch": 0.012884978001257071, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.07585709542036057, | |
| "kl": 0.04436139389872551, | |
| "learning_rate": 3.3333333333333333e-06, | |
| "loss": 0.0018, | |
| "num_tokens": 993859.0, | |
| "reward": -8.125, | |
| "reward_std": 0.5773502588272095, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.75, | |
| "rewards/check_numbers/std": 0.7745966911315918, | |
| "rewards/format_and_language_reward_func/mean": -3.375, | |
| "rewards/format_and_language_reward_func/std": 0.8062257766723633, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 1421.5, | |
| "completions/clipped_ratio": 0.5625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1470.0, | |
| "completions/mean_length": 1421.5, | |
| "completions/mean_terminated_length": 1035.1429443359375, | |
| "completions/min_length": 118.0, | |
| "completions/min_terminated_length": 118.0, | |
| "epoch": 0.013199245757385292, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5277220010757446, | |
| "kl": 0.06066755950450897, | |
| "learning_rate": 3.277777777777778e-06, | |
| "loss": 0.0024, | |
| "num_tokens": 1020571.0, | |
| "reward": -8.125, | |
| "reward_std": 1.0517165660858154, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.75, | |
| "rewards/check_numbers/std": 0.7745966911315918, | |
| "rewards/format_and_language_reward_func/mean": -3.375, | |
| "rewards/format_and_language_reward_func/std": 0.8062257766723633, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 1514.6875, | |
| "completions/clipped_ratio": 0.6875, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1326.0, | |
| "completions/mean_length": 1514.6875, | |
| "completions/mean_terminated_length": 1058.5999755859375, | |
| "completions/min_length": 833.0, | |
| "completions/min_terminated_length": 833.0, | |
| "epoch": 0.013513513513513514, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.10104304552078247, | |
| "kl": 0.05354408547282219, | |
| "learning_rate": 3.2222222222222227e-06, | |
| "loss": 0.0021, | |
| "num_tokens": 1049086.0, | |
| "reward": -7.46875, | |
| "reward_std": 0.5625, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.46875, | |
| "rewards/check_numbers/std": 0.7180703282356262, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 1431.4375, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1554.0, | |
| "completions/mean_length": 1431.4375, | |
| "completions/mean_terminated_length": 947.1666870117188, | |
| "completions/min_length": 320.0, | |
| "completions/min_terminated_length": 320.0, | |
| "epoch": 0.013827781269641735, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.11680426448583603, | |
| "kl": 0.05350031703710556, | |
| "learning_rate": 3.1666666666666667e-06, | |
| "loss": 0.0021, | |
| "num_tokens": 1075785.0, | |
| "reward": -7.46875, | |
| "reward_std": 0.6205127239227295, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.46875, | |
| "rewards/check_numbers/std": 0.7180703282356262, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 1583.875, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1720.0, | |
| "completions/mean_length": 1583.875, | |
| "completions/mean_terminated_length": 1353.666748046875, | |
| "completions/min_length": 573.0, | |
| "completions/min_terminated_length": 573.0, | |
| "epoch": 0.014142049025769956, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.10828514397144318, | |
| "kl": 0.05207566171884537, | |
| "learning_rate": 3.1111111111111116e-06, | |
| "loss": 0.0021, | |
| "num_tokens": 1105015.0, | |
| "reward": -7.25, | |
| "reward_std": 1.1668819189071655, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.125, | |
| "rewards/check_numbers/std": 1.5864006280899048, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 1366.1875, | |
| "completions/clipped_ratio": 0.5625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1669.0, | |
| "completions/mean_length": 1366.1875, | |
| "completions/mean_terminated_length": 908.71435546875, | |
| "completions/min_length": 283.0, | |
| "completions/min_terminated_length": 283.0, | |
| "epoch": 0.014456316781898177, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.09319330006837845, | |
| "kl": 0.044255051761865616, | |
| "learning_rate": 3.055555555555556e-06, | |
| "loss": 0.0018, | |
| "num_tokens": 1130758.0, | |
| "reward": -7.375, | |
| "reward_std": 0.375, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.375, | |
| "rewards/check_numbers/std": 0.670820415019989, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 1549.25, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1445.0, | |
| "completions/mean_length": 1549.25, | |
| "completions/mean_terminated_length": 1031.0, | |
| "completions/min_length": 798.0, | |
| "completions/min_terminated_length": 798.0, | |
| "epoch": 0.0147705845380264, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.1097634956240654, | |
| "kl": 0.04721890389919281, | |
| "learning_rate": 3e-06, | |
| "loss": 0.0019, | |
| "num_tokens": 1159726.0, | |
| "reward": -7.75, | |
| "reward_std": 0.375, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.75, | |
| "rewards/check_numbers/std": 0.7745966911315918, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 1559.8125, | |
| "completions/clipped_ratio": 0.8125, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1060.0, | |
| "completions/mean_length": 1559.8125, | |
| "completions/mean_terminated_length": 857.0, | |
| "completions/min_length": 737.0, | |
| "completions/min_terminated_length": 737.0, | |
| "epoch": 0.01508485229415462, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13540787994861603, | |
| "kl": 0.052234258502721786, | |
| "learning_rate": 2.944444444444445e-06, | |
| "loss": 0.0021, | |
| "num_tokens": 1188527.0, | |
| "reward": -8.1875, | |
| "reward_std": 1.1715351343154907, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.9375, | |
| "rewards/check_numbers/std": 0.75, | |
| "rewards/format_and_language_reward_func/mean": -3.25, | |
| "rewards/format_and_language_reward_func/std": 0.6831300854682922, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 1568.125, | |
| "completions/clipped_ratio": 0.6875, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1591.0, | |
| "completions/mean_length": 1568.125, | |
| "completions/mean_terminated_length": 1229.5999755859375, | |
| "completions/min_length": 958.0, | |
| "completions/min_terminated_length": 958.0, | |
| "epoch": 0.015399120050282841, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.11936990916728973, | |
| "kl": 0.056975651532411575, | |
| "learning_rate": 2.888888888888889e-06, | |
| "loss": 0.0023, | |
| "num_tokens": 1217941.0, | |
| "reward": -8.1875, | |
| "reward_std": 0.8571338653564453, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.9375, | |
| "rewards/check_numbers/std": 0.75, | |
| "rewards/format_and_language_reward_func/mean": -3.25, | |
| "rewards/format_and_language_reward_func/std": 0.6831300854682922, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 1495.4375, | |
| "completions/clipped_ratio": 0.6875, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1325.0, | |
| "completions/mean_length": 1495.4375, | |
| "completions/mean_terminated_length": 997.0, | |
| "completions/min_length": 445.0, | |
| "completions/min_terminated_length": 445.0, | |
| "epoch": 0.01571338780641106, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.1257868856191635, | |
| "kl": 0.051786769181489944, | |
| "learning_rate": 2.8333333333333335e-06, | |
| "loss": 0.0021, | |
| "num_tokens": 1245944.0, | |
| "reward": -7.6875, | |
| "reward_std": 0.7895780801773071, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.5625, | |
| "rewards/check_numbers/std": 0.75, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 1586.0, | |
| "completions/clipped_ratio": 0.6875, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1700.0, | |
| "completions/mean_length": 1586.0, | |
| "completions/mean_terminated_length": 1286.800048828125, | |
| "completions/min_length": 820.0, | |
| "completions/min_terminated_length": 820.0, | |
| "epoch": 0.016027655562539284, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.11521671712398529, | |
| "kl": 0.047557681798934937, | |
| "learning_rate": 2.7777777777777783e-06, | |
| "loss": 0.0019, | |
| "num_tokens": 1275304.0, | |
| "reward": -8.0, | |
| "reward_std": 0.6636751294136047, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.75, | |
| "rewards/check_numbers/std": 0.7745966911315918, | |
| "rewards/format_and_language_reward_func/mean": -3.25, | |
| "rewards/format_and_language_reward_func/std": 0.6831300854682922, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 1589.6875, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1535.0, | |
| "completions/mean_length": 1589.6875, | |
| "completions/mean_terminated_length": 1192.75, | |
| "completions/min_length": 863.0, | |
| "completions/min_terminated_length": 863.0, | |
| "epoch": 0.016341923318667503, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.24990025162696838, | |
| "kl": 0.0706791803240776, | |
| "learning_rate": 2.7222222222222224e-06, | |
| "loss": 0.0028, | |
| "num_tokens": 1304927.0, | |
| "reward": -7.5625, | |
| "reward_std": 0.21650634706020355, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.5625, | |
| "rewards/check_numbers/std": 0.75, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 1505.0625, | |
| "completions/clipped_ratio": 0.6875, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1408.0, | |
| "completions/mean_length": 1505.0625, | |
| "completions/mean_terminated_length": 1027.800048828125, | |
| "completions/min_length": 826.0, | |
| "completions/min_terminated_length": 826.0, | |
| "epoch": 0.016656191074795726, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12817677855491638, | |
| "kl": 0.05584558844566345, | |
| "learning_rate": 2.666666666666667e-06, | |
| "loss": 0.0022, | |
| "num_tokens": 1332692.0, | |
| "reward": -7.625, | |
| "reward_std": 0.875, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.375, | |
| "rewards/check_numbers/std": 0.670820415019989, | |
| "rewards/format_and_language_reward_func/mean": -3.25, | |
| "rewards/format_and_language_reward_func/std": 0.6831300854682922, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 1617.8125, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1689.0, | |
| "completions/mean_length": 1617.8125, | |
| "completions/mean_terminated_length": 1305.25, | |
| "completions/min_length": 897.0, | |
| "completions/min_terminated_length": 897.0, | |
| "epoch": 0.01697045883092395, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.11597556620836258, | |
| "kl": 0.05314599350094795, | |
| "learning_rate": 2.6111111111111113e-06, | |
| "loss": 0.0021, | |
| "num_tokens": 1362965.0, | |
| "reward": -7.8125, | |
| "reward_std": 0.661700427532196, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.5625, | |
| "rewards/check_numbers/std": 0.75, | |
| "rewards/format_and_language_reward_func/mean": -3.25, | |
| "rewards/format_and_language_reward_func/std": 0.6831300854682922, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 1612.125, | |
| "completions/clipped_ratio": 0.8125, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1567.0, | |
| "completions/mean_length": 1612.125, | |
| "completions/mean_terminated_length": 1136.0, | |
| "completions/min_length": 707.0, | |
| "completions/min_terminated_length": 707.0, | |
| "epoch": 0.017284726587052168, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.07392556965351105, | |
| "kl": 0.04997054487466812, | |
| "learning_rate": 2.5555555555555557e-06, | |
| "loss": 0.002, | |
| "num_tokens": 1393151.0, | |
| "reward": -7.46875, | |
| "reward_std": 0.1875, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.46875, | |
| "rewards/check_numbers/std": 0.7180703282356262, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 1571.625, | |
| "completions/clipped_ratio": 0.8125, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1313.0, | |
| "completions/mean_length": 1571.625, | |
| "completions/mean_terminated_length": 920.0, | |
| "completions/min_length": 403.0, | |
| "completions/min_terminated_length": 403.0, | |
| "epoch": 0.01759899434318039, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.10698790848255157, | |
| "kl": 0.040371235460042953, | |
| "learning_rate": 2.5e-06, | |
| "loss": 0.0016, | |
| "num_tokens": 1422181.0, | |
| "reward": -7.9375, | |
| "reward_std": 0.5915063619613647, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.9375, | |
| "rewards/check_numbers/std": 0.75, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 1594.9375, | |
| "completions/clipped_ratio": 0.875, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 707.0, | |
| "completions/mean_length": 1594.9375, | |
| "completions/mean_terminated_length": 705.5, | |
| "completions/min_length": 704.0, | |
| "completions/min_terminated_length": 704.0, | |
| "epoch": 0.017913262099308613, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.6847302317619324, | |
| "kl": 0.04597615823149681, | |
| "learning_rate": 2.4444444444444447e-06, | |
| "loss": 0.0018, | |
| "num_tokens": 1451644.0, | |
| "reward": -7.59375, | |
| "reward_std": 0.6205127239227295, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.46875, | |
| "rewards/check_numbers/std": 0.7180703282356262, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 1503.375, | |
| "completions/clipped_ratio": 0.8125, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 640.0, | |
| "completions/mean_length": 1503.375, | |
| "completions/mean_terminated_length": 556.0, | |
| "completions/min_length": 475.0, | |
| "completions/min_terminated_length": 475.0, | |
| "epoch": 0.018227529855436832, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.09949938207864761, | |
| "kl": 0.056429892778396606, | |
| "learning_rate": 2.388888888888889e-06, | |
| "loss": 0.0023, | |
| "num_tokens": 1479182.0, | |
| "reward": -7.96875, | |
| "reward_std": 0.6540063619613647, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.84375, | |
| "rewards/check_numbers/std": 0.7685213088989258, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 1563.0, | |
| "completions/clipped_ratio": 0.6875, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1411.0, | |
| "completions/mean_length": 1563.0, | |
| "completions/mean_terminated_length": 1213.2000732421875, | |
| "completions/min_length": 990.0, | |
| "completions/min_terminated_length": 990.0, | |
| "epoch": 0.018541797611565054, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.11543877422809601, | |
| "kl": 0.035633672028779984, | |
| "learning_rate": 2.3333333333333336e-06, | |
| "loss": 0.0014, | |
| "num_tokens": 1508062.0, | |
| "reward": -7.65625, | |
| "reward_std": 0.7790063619613647, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.65625, | |
| "rewards/check_numbers/std": 0.7685213088989258, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 1707.75, | |
| "completions/clipped_ratio": 0.8125, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1691.0, | |
| "completions/mean_length": 1707.75, | |
| "completions/mean_terminated_length": 1646.0, | |
| "completions/min_length": 1605.0, | |
| "completions/min_terminated_length": 1605.0, | |
| "epoch": 0.018856065367693273, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0639866441488266, | |
| "kl": 0.05725252255797386, | |
| "learning_rate": 2.277777777777778e-06, | |
| "loss": 0.0023, | |
| "num_tokens": 1539650.0, | |
| "reward": -7.25, | |
| "reward_std": 0.28867512941360474, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": 0.0, | |
| "rewards/check_numbers/std": 0.0, | |
| "rewards/format_and_language_reward_func/mean": -3.25, | |
| "rewards/format_and_language_reward_func/std": 0.6831300854682922, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 1535.1875, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1665.0, | |
| "completions/mean_length": 1535.1875, | |
| "completions/mean_terminated_length": 974.75, | |
| "completions/min_length": 374.0, | |
| "completions/min_terminated_length": 374.0, | |
| "epoch": 0.019170333123821496, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2154313325881958, | |
| "kl": 0.06772614270448685, | |
| "learning_rate": 2.222222222222222e-06, | |
| "loss": 0.0027, | |
| "num_tokens": 1567677.0, | |
| "reward": -8.09375, | |
| "reward_std": 0.9407067894935608, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.84375, | |
| "rewards/check_numbers/std": 0.7685213088989258, | |
| "rewards/format_and_language_reward_func/mean": -3.25, | |
| "rewards/format_and_language_reward_func/std": 0.6831300854682922, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 1480.75, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1193.0, | |
| "completions/mean_length": 1480.75, | |
| "completions/mean_terminated_length": 757.0, | |
| "completions/min_length": 323.0, | |
| "completions/min_terminated_length": 323.0, | |
| "epoch": 0.01948460087994972, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.18533650040626526, | |
| "kl": 0.04637778922915459, | |
| "learning_rate": 2.166666666666667e-06, | |
| "loss": 0.0019, | |
| "num_tokens": 1595629.0, | |
| "reward": -7.5625, | |
| "reward_std": 0.8080127239227295, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.5625, | |
| "rewards/check_numbers/std": 0.75, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 1604.8125, | |
| "completions/clipped_ratio": 0.8125, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1672.0, | |
| "completions/mean_length": 1604.8125, | |
| "completions/mean_terminated_length": 1097.0, | |
| "completions/min_length": 560.0, | |
| "completions/min_terminated_length": 560.0, | |
| "epoch": 0.019798868636077938, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.17045044898986816, | |
| "kl": 0.05853426456451416, | |
| "learning_rate": 2.1111111111111114e-06, | |
| "loss": 0.0023, | |
| "num_tokens": 1624594.0, | |
| "reward": -8.40625, | |
| "reward_std": 0.7628755569458008, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -1.03125, | |
| "rewards/check_numbers/std": 0.7180703282356262, | |
| "rewards/format_and_language_reward_func/mean": -3.375, | |
| "rewards/format_and_language_reward_func/std": 0.8062257766723633, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 1406.9375, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1612.0, | |
| "completions/mean_length": 1406.9375, | |
| "completions/mean_terminated_length": 881.8333740234375, | |
| "completions/min_length": 494.0, | |
| "completions/min_terminated_length": 494.0, | |
| "epoch": 0.02011313639220616, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.13805745542049408, | |
| "kl": 0.045212242752313614, | |
| "learning_rate": 2.0555555555555555e-06, | |
| "loss": 0.0018, | |
| "num_tokens": 1650865.0, | |
| "reward": -8.15625, | |
| "reward_std": 0.4955126941204071, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -1.03125, | |
| "rewards/check_numbers/std": 0.7180703282356262, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 1511.0625, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1656.0, | |
| "completions/mean_length": 1511.0625, | |
| "completions/mean_terminated_length": 1159.5, | |
| "completions/min_length": 661.0, | |
| "completions/min_terminated_length": 661.0, | |
| "epoch": 0.02042740414833438, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.10470445454120636, | |
| "kl": 0.040612928569316864, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 0.0016, | |
| "num_tokens": 1679182.0, | |
| "reward": -8.0, | |
| "reward_std": 0.9917292594909668, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.75, | |
| "rewards/check_numbers/std": 0.7745966911315918, | |
| "rewards/format_and_language_reward_func/mean": -3.25, | |
| "rewards/format_and_language_reward_func/std": 0.6831300854682922, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 1688.3125, | |
| "completions/clipped_ratio": 0.875, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1570.0, | |
| "completions/mean_length": 1688.3125, | |
| "completions/mean_terminated_length": 1452.5, | |
| "completions/min_length": 1335.0, | |
| "completions/min_terminated_length": 1335.0, | |
| "epoch": 0.020741671904462602, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0923866257071495, | |
| "kl": 0.04577171802520752, | |
| "learning_rate": 1.944444444444445e-06, | |
| "loss": 0.0018, | |
| "num_tokens": 1710331.0, | |
| "reward": -6.875, | |
| "reward_std": 0.9716878533363342, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": 0.25, | |
| "rewards/check_numbers/std": 1.3662601709365845, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 1652.3125, | |
| "completions/clipped_ratio": 0.9375, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 607.0, | |
| "completions/mean_length": 1652.3125, | |
| "completions/mean_terminated_length": 607.0, | |
| "completions/min_length": 607.0, | |
| "completions/min_terminated_length": 607.0, | |
| "epoch": 0.021055939660590824, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.13661737740039825, | |
| "kl": 0.0424213632941246, | |
| "learning_rate": 1.888888888888889e-06, | |
| "loss": 0.0017, | |
| "num_tokens": 1741012.0, | |
| "reward": -7.71875, | |
| "reward_std": 0.8185844421386719, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.46875, | |
| "rewards/check_numbers/std": 0.7180703282356262, | |
| "rewards/format_and_language_reward_func/mean": -3.25, | |
| "rewards/format_and_language_reward_func/std": 0.6831300854682922, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 1409.125, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1383.0, | |
| "completions/mean_length": 1409.125, | |
| "completions/mean_terminated_length": 887.6666870117188, | |
| "completions/min_length": 636.0, | |
| "completions/min_terminated_length": 636.0, | |
| "epoch": 0.021370207416719043, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.15749689936637878, | |
| "kl": 0.0645652636885643, | |
| "learning_rate": 1.8333333333333333e-06, | |
| "loss": 0.0026, | |
| "num_tokens": 1767746.0, | |
| "reward": -8.125, | |
| "reward_std": 1.5154354572296143, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.375, | |
| "rewards/check_numbers/std": 0.670820415019989, | |
| "rewards/format_and_language_reward_func/mean": -3.75, | |
| "rewards/format_and_language_reward_func/std": 1.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 1533.5, | |
| "completions/clipped_ratio": 0.6875, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1719.0, | |
| "completions/mean_length": 1533.5, | |
| "completions/mean_terminated_length": 1118.800048828125, | |
| "completions/min_length": 381.0, | |
| "completions/min_terminated_length": 381.0, | |
| "epoch": 0.021684475172847266, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.10066410154104233, | |
| "kl": 0.0493154339492321, | |
| "learning_rate": 1.777777777777778e-06, | |
| "loss": 0.002, | |
| "num_tokens": 1795630.0, | |
| "reward": -8.09375, | |
| "reward_std": 0.7028881907463074, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.84375, | |
| "rewards/check_numbers/std": 0.7685213088989258, | |
| "rewards/format_and_language_reward_func/mean": -3.25, | |
| "rewards/format_and_language_reward_func/std": 0.6831300854682922, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 1512.1875, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1365.0, | |
| "completions/mean_length": 1512.1875, | |
| "completions/mean_terminated_length": 882.75, | |
| "completions/min_length": 438.0, | |
| "completions/min_terminated_length": 438.0, | |
| "epoch": 0.02199874292897549, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.08896202594041824, | |
| "kl": 0.054354239255189896, | |
| "learning_rate": 1.7222222222222224e-06, | |
| "loss": 0.0022, | |
| "num_tokens": 1823761.0, | |
| "reward": -7.6875, | |
| "reward_std": 0.46650636196136475, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.5625, | |
| "rewards/check_numbers/std": 0.75, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 1664.4375, | |
| "completions/clipped_ratio": 0.875, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1470.0, | |
| "completions/mean_length": 1664.4375, | |
| "completions/mean_terminated_length": 1261.5, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "epoch": 0.022313010685103708, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0697084367275238, | |
| "kl": 0.031285081058740616, | |
| "learning_rate": 1.6666666666666667e-06, | |
| "loss": 0.0013, | |
| "num_tokens": 1854820.0, | |
| "reward": -7.5625, | |
| "reward_std": 0.375, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.5625, | |
| "rewards/check_numbers/std": 0.75, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 1451.5, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1602.0, | |
| "completions/mean_length": 1451.5, | |
| "completions/mean_terminated_length": 1000.6666870117188, | |
| "completions/min_length": 462.0, | |
| "completions/min_terminated_length": 462.0, | |
| "epoch": 0.02262727844123193, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1362772285938263, | |
| "kl": 0.057278823107481, | |
| "learning_rate": 1.6111111111111113e-06, | |
| "loss": 0.0023, | |
| "num_tokens": 1882008.0, | |
| "reward": -7.875, | |
| "reward_std": 1.118110179901123, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.375, | |
| "rewards/check_numbers/std": 0.670820415019989, | |
| "rewards/format_and_language_reward_func/mean": -3.5, | |
| "rewards/format_and_language_reward_func/std": 0.8944272398948669, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 1348.5, | |
| "completions/clipped_ratio": 0.5, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1501.0, | |
| "completions/mean_length": 1348.5, | |
| "completions/mean_terminated_length": 975.0, | |
| "completions/min_length": 235.0, | |
| "completions/min_terminated_length": 235.0, | |
| "epoch": 0.02294154619736015, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.11800263077020645, | |
| "kl": 0.05172204598784447, | |
| "learning_rate": 1.5555555555555558e-06, | |
| "loss": 0.0021, | |
| "num_tokens": 1907992.0, | |
| "reward": -7.78125, | |
| "reward_std": 0.4375, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.65625, | |
| "rewards/check_numbers/std": 0.7685213088989258, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 1543.5625, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1672.0, | |
| "completions/mean_length": 1543.5625, | |
| "completions/mean_terminated_length": 1246.166748046875, | |
| "completions/min_length": 603.0, | |
| "completions/min_terminated_length": 603.0, | |
| "epoch": 0.023255813953488372, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.12982743978500366, | |
| "kl": 0.064728744328022, | |
| "learning_rate": 1.5e-06, | |
| "loss": 0.0026, | |
| "num_tokens": 1936533.0, | |
| "reward": -7.53125, | |
| "reward_std": 0.6926814913749695, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.28125, | |
| "rewards/check_numbers/std": 0.6046693325042725, | |
| "rewards/format_and_language_reward_func/mean": -3.25, | |
| "rewards/format_and_language_reward_func/std": 0.6831300854682922, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 1450.4375, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1525.0, | |
| "completions/mean_length": 1450.4375, | |
| "completions/mean_terminated_length": 635.75, | |
| "completions/min_length": 71.0, | |
| "completions/min_terminated_length": 71.0, | |
| "epoch": 0.023570081709616594, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.081117182970047, | |
| "kl": 0.05186208337545395, | |
| "learning_rate": 1.4444444444444445e-06, | |
| "loss": 0.0021, | |
| "num_tokens": 1963812.0, | |
| "reward": -7.75, | |
| "reward_std": 0.5386751294136047, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.375, | |
| "rewards/check_numbers/std": 0.670820415019989, | |
| "rewards/format_and_language_reward_func/mean": -3.375, | |
| "rewards/format_and_language_reward_func/std": 0.8062257766723633, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 1392.25, | |
| "completions/clipped_ratio": 0.6875, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1499.0, | |
| "completions/mean_length": 1392.25, | |
| "completions/mean_terminated_length": 666.7999877929688, | |
| "completions/min_length": 31.0, | |
| "completions/min_terminated_length": 31.0, | |
| "epoch": 0.023884349465744813, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.37952759861946106, | |
| "kl": 0.06186853349208832, | |
| "learning_rate": 1.3888888888888892e-06, | |
| "loss": 0.0025, | |
| "num_tokens": 1990520.0, | |
| "reward": -7.65625, | |
| "reward_std": 0.6205127239227295, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.65625, | |
| "rewards/check_numbers/std": 0.7685213088989258, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 1473.9375, | |
| "completions/clipped_ratio": 0.6875, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1490.0, | |
| "completions/mean_length": 1473.9375, | |
| "completions/mean_terminated_length": 928.2000122070312, | |
| "completions/min_length": 420.0, | |
| "completions/min_terminated_length": 420.0, | |
| "epoch": 0.024198617221873036, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1677270531654358, | |
| "kl": 0.053330324590206146, | |
| "learning_rate": 1.3333333333333334e-06, | |
| "loss": 0.0021, | |
| "num_tokens": 2018063.0, | |
| "reward": -7.75, | |
| "reward_std": 0.75, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.75, | |
| "rewards/check_numbers/std": 0.7745966911315918, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 1513.625, | |
| "completions/clipped_ratio": 0.6875, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1547.0, | |
| "completions/mean_length": 1513.625, | |
| "completions/mean_terminated_length": 1055.2000732421875, | |
| "completions/min_length": 599.0, | |
| "completions/min_terminated_length": 599.0, | |
| "epoch": 0.02451288497800126, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.1181650385260582, | |
| "kl": 0.05255034193396568, | |
| "learning_rate": 1.2777777777777779e-06, | |
| "loss": 0.0021, | |
| "num_tokens": 2046037.0, | |
| "reward": -7.65625, | |
| "reward_std": 0.6926814913749695, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.28125, | |
| "rewards/check_numbers/std": 0.6046693325042725, | |
| "rewards/format_and_language_reward_func/mean": -3.375, | |
| "rewards/format_and_language_reward_func/std": 0.8062257766723633, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 1411.25, | |
| "completions/clipped_ratio": 0.5625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1426.0, | |
| "completions/mean_length": 1411.25, | |
| "completions/mean_terminated_length": 1011.71435546875, | |
| "completions/min_length": 793.0, | |
| "completions/min_terminated_length": 793.0, | |
| "epoch": 0.024827152734129478, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.1206207275390625, | |
| "kl": 0.05586665868759155, | |
| "learning_rate": 1.2222222222222223e-06, | |
| "loss": 0.0022, | |
| "num_tokens": 2072805.0, | |
| "reward": -7.78125, | |
| "reward_std": 0.8977102637290955, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.28125, | |
| "rewards/check_numbers/std": 0.6046693325042725, | |
| "rewards/format_and_language_reward_func/mean": -3.5, | |
| "rewards/format_and_language_reward_func/std": 0.8944272398948669, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 1497.875, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1400.0, | |
| "completions/mean_length": 1497.875, | |
| "completions/mean_terminated_length": 825.5, | |
| "completions/min_length": 494.0, | |
| "completions/min_terminated_length": 494.0, | |
| "epoch": 0.0251414204902577, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.15405312180519104, | |
| "kl": 0.05234729126095772, | |
| "learning_rate": 1.1666666666666668e-06, | |
| "loss": 0.0021, | |
| "num_tokens": 2100631.0, | |
| "reward": -7.5, | |
| "reward_std": 1.9904643297195435, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.125, | |
| "rewards/check_numbers/std": 1.5864006280899048, | |
| "rewards/format_and_language_reward_func/mean": -3.375, | |
| "rewards/format_and_language_reward_func/std": 0.8062257766723633, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 1425.0, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1576.0, | |
| "completions/mean_length": 1425.0, | |
| "completions/mean_terminated_length": 930.0, | |
| "completions/min_length": 79.0, | |
| "completions/min_terminated_length": 79.0, | |
| "epoch": 0.02545568824638592, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.11791858822107315, | |
| "kl": 0.04198841378092766, | |
| "learning_rate": 1.111111111111111e-06, | |
| "loss": 0.0017, | |
| "num_tokens": 2127275.0, | |
| "reward": -7.46875, | |
| "reward_std": 0.6205127239227295, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.46875, | |
| "rewards/check_numbers/std": 0.7180703282356262, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 1682.5, | |
| "completions/clipped_ratio": 0.8125, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1662.0, | |
| "completions/mean_length": 1682.5, | |
| "completions/mean_terminated_length": 1511.3333740234375, | |
| "completions/min_length": 1430.0, | |
| "completions/min_terminated_length": 1430.0, | |
| "epoch": 0.025769956002514142, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1258944272994995, | |
| "kl": 0.03991963341832161, | |
| "learning_rate": 1.0555555555555557e-06, | |
| "loss": 0.0016, | |
| "num_tokens": 2157831.0, | |
| "reward": -7.96875, | |
| "reward_std": 0.8125, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.84375, | |
| "rewards/check_numbers/std": 0.7685213088989258, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 1560.0, | |
| "completions/clipped_ratio": 0.8125, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1502.0, | |
| "completions/mean_length": 1560.0, | |
| "completions/mean_terminated_length": 858.0, | |
| "completions/min_length": 199.0, | |
| "completions/min_terminated_length": 199.0, | |
| "epoch": 0.026084223758642364, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.25008276104927063, | |
| "kl": 0.04483529552817345, | |
| "learning_rate": 1.0000000000000002e-06, | |
| "loss": 0.0018, | |
| "num_tokens": 2186663.0, | |
| "reward": -7.5625, | |
| "reward_std": 0.5915063619613647, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.5625, | |
| "rewards/check_numbers/std": 0.75, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 1363.8125, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 890.0, | |
| "completions/mean_length": 1363.8125, | |
| "completions/mean_terminated_length": 766.8333740234375, | |
| "completions/min_length": 623.0, | |
| "completions/min_terminated_length": 623.0, | |
| "epoch": 0.026398491514770583, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.11882040649652481, | |
| "kl": 0.04816675931215286, | |
| "learning_rate": 9.444444444444445e-07, | |
| "loss": 0.0019, | |
| "num_tokens": 2212692.0, | |
| "reward": -7.78125, | |
| "reward_std": 0.6540063619613647, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.65625, | |
| "rewards/check_numbers/std": 0.7685213088989258, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 1652.3125, | |
| "completions/clipped_ratio": 0.8125, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1646.0, | |
| "completions/mean_length": 1652.3125, | |
| "completions/mean_terminated_length": 1350.3333740234375, | |
| "completions/min_length": 1045.0, | |
| "completions/min_terminated_length": 1045.0, | |
| "epoch": 0.026712759270898806, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.11614203453063965, | |
| "kl": 0.0454854890704155, | |
| "learning_rate": 8.88888888888889e-07, | |
| "loss": 0.0018, | |
| "num_tokens": 2243069.0, | |
| "reward": -8.0, | |
| "reward_std": 0.9117004871368408, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.75, | |
| "rewards/check_numbers/std": 0.7745966911315918, | |
| "rewards/format_and_language_reward_func/mean": -3.25, | |
| "rewards/format_and_language_reward_func/std": 0.6831300854682922, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 1576.625, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1560.0, | |
| "completions/mean_length": 1576.625, | |
| "completions/mean_terminated_length": 1140.5, | |
| "completions/min_length": 801.0, | |
| "completions/min_terminated_length": 801.0, | |
| "epoch": 0.02702702702702703, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.09951747953891754, | |
| "kl": 0.04343428835272789, | |
| "learning_rate": 8.333333333333333e-07, | |
| "loss": 0.0017, | |
| "num_tokens": 2272123.0, | |
| "reward": -7.4375, | |
| "reward_std": 0.9503755569458008, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.3125, | |
| "rewards/check_numbers/std": 1.6520190238952637, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 1552.3125, | |
| "completions/clipped_ratio": 0.8125, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1073.0, | |
| "completions/mean_length": 1552.3125, | |
| "completions/mean_terminated_length": 817.0, | |
| "completions/min_length": 665.0, | |
| "completions/min_terminated_length": 665.0, | |
| "epoch": 0.027341294783155248, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1279648393392563, | |
| "kl": 0.04489857330918312, | |
| "learning_rate": 7.777777777777779e-07, | |
| "loss": 0.0018, | |
| "num_tokens": 2300748.0, | |
| "reward": -7.96875, | |
| "reward_std": 0.9505414962768555, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.84375, | |
| "rewards/check_numbers/std": 0.7685213088989258, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 1594.375, | |
| "completions/clipped_ratio": 0.8125, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1438.0, | |
| "completions/mean_length": 1594.375, | |
| "completions/mean_terminated_length": 1041.3333740234375, | |
| "completions/min_length": 780.0, | |
| "completions/min_terminated_length": 780.0, | |
| "epoch": 0.02765556253928347, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.18847358226776123, | |
| "kl": 0.03859600052237511, | |
| "learning_rate": 7.222222222222222e-07, | |
| "loss": 0.0015, | |
| "num_tokens": 2330330.0, | |
| "reward": -7.96875, | |
| "reward_std": 0.8125, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.84375, | |
| "rewards/check_numbers/std": 0.7685213088989258, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 1683.1875, | |
| "completions/clipped_ratio": 0.875, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1547.0, | |
| "completions/mean_length": 1683.1875, | |
| "completions/mean_terminated_length": 1411.5, | |
| "completions/min_length": 1276.0, | |
| "completions/min_terminated_length": 1276.0, | |
| "epoch": 0.02796983029541169, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.10217194259166718, | |
| "kl": 0.044257864356040955, | |
| "learning_rate": 6.666666666666667e-07, | |
| "loss": 0.0018, | |
| "num_tokens": 2361789.0, | |
| "reward": -7.6875, | |
| "reward_std": 1.0060844421386719, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.5625, | |
| "rewards/check_numbers/std": 0.75, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 1638.3125, | |
| "completions/clipped_ratio": 0.8125, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1547.0, | |
| "completions/mean_length": 1638.3125, | |
| "completions/mean_terminated_length": 1275.666748046875, | |
| "completions/min_length": 1068.0, | |
| "completions/min_terminated_length": 1068.0, | |
| "epoch": 0.028284098051539912, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.6536535620689392, | |
| "kl": 0.07140226662158966, | |
| "learning_rate": 6.111111111111112e-07, | |
| "loss": 0.0029, | |
| "num_tokens": 2392042.0, | |
| "reward": -7.53125, | |
| "reward_std": 0.9040063619613647, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.28125, | |
| "rewards/check_numbers/std": 0.6046693325042725, | |
| "rewards/format_and_language_reward_func/mean": -3.25, | |
| "rewards/format_and_language_reward_func/std": 0.6831300854682922, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 1547.75, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1282.0, | |
| "completions/mean_length": 1547.75, | |
| "completions/mean_terminated_length": 1025.0, | |
| "completions/min_length": 848.0, | |
| "completions/min_terminated_length": 848.0, | |
| "epoch": 0.028598365807668134, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12291199713945389, | |
| "kl": 0.05366697907447815, | |
| "learning_rate": 5.555555555555555e-07, | |
| "loss": 0.0021, | |
| "num_tokens": 2420674.0, | |
| "reward": -7.78125, | |
| "reward_std": 0.8705127239227295, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.65625, | |
| "rewards/check_numbers/std": 0.7685213088989258, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 1515.5625, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1680.0, | |
| "completions/mean_length": 1515.5625, | |
| "completions/mean_terminated_length": 1171.5, | |
| "completions/min_length": 73.0, | |
| "completions/min_terminated_length": 73.0, | |
| "epoch": 0.028912633563796353, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.11394577473402023, | |
| "kl": 0.04896366223692894, | |
| "learning_rate": 5.000000000000001e-07, | |
| "loss": 0.002, | |
| "num_tokens": 2448871.0, | |
| "reward": -6.71875, | |
| "reward_std": 0.8125, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": 0.28125, | |
| "rewards/check_numbers/std": 1.7220990657806396, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 1545.1875, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1656.0, | |
| "completions/mean_length": 1545.1875, | |
| "completions/mean_terminated_length": 1250.5, | |
| "completions/min_length": 537.0, | |
| "completions/min_terminated_length": 537.0, | |
| "epoch": 0.029226901319924576, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.07623685151338577, | |
| "kl": 0.046294040977954865, | |
| "learning_rate": 4.444444444444445e-07, | |
| "loss": 0.0019, | |
| "num_tokens": 2477226.0, | |
| "reward": -7.5625, | |
| "reward_std": 0.375, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.5625, | |
| "rewards/check_numbers/std": 0.75, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 1588.75, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1556.0, | |
| "completions/mean_length": 1588.75, | |
| "completions/mean_terminated_length": 1189.0, | |
| "completions/min_length": 405.0, | |
| "completions/min_terminated_length": 405.0, | |
| "epoch": 0.0295411690760528, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.11318648606538773, | |
| "kl": 0.04230440780520439, | |
| "learning_rate": 3.8888888888888895e-07, | |
| "loss": 0.0017, | |
| "num_tokens": 2506806.0, | |
| "reward": -7.75, | |
| "reward_std": 0.5915063619613647, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.75, | |
| "rewards/check_numbers/std": 0.7745966911315918, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 1620.875, | |
| "completions/clipped_ratio": 0.875, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 987.0, | |
| "completions/mean_length": 1620.875, | |
| "completions/mean_terminated_length": 913.0, | |
| "completions/min_length": 839.0, | |
| "completions/min_terminated_length": 839.0, | |
| "epoch": 0.029855436832181018, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.11543525010347366, | |
| "kl": 0.0602000392973423, | |
| "learning_rate": 3.3333333333333335e-07, | |
| "loss": 0.0024, | |
| "num_tokens": 2536500.0, | |
| "reward": -7.96875, | |
| "reward_std": 0.6540063619613647, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.84375, | |
| "rewards/check_numbers/std": 0.7685213088989258, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 1623.375, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1687.0, | |
| "completions/mean_length": 1623.375, | |
| "completions/mean_terminated_length": 1327.5, | |
| "completions/min_length": 743.0, | |
| "completions/min_terminated_length": 743.0, | |
| "epoch": 0.03016970458830924, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.11837535351514816, | |
| "kl": 0.04759529232978821, | |
| "learning_rate": 2.7777777777777776e-07, | |
| "loss": 0.0019, | |
| "num_tokens": 2566598.0, | |
| "reward": -8.1875, | |
| "reward_std": 0.7165063619613647, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.9375, | |
| "rewards/check_numbers/std": 0.75, | |
| "rewards/format_and_language_reward_func/mean": -3.25, | |
| "rewards/format_and_language_reward_func/std": 0.6831300854682922, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 1617.0625, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1545.0, | |
| "completions/mean_length": 1617.0625, | |
| "completions/mean_terminated_length": 1302.25, | |
| "completions/min_length": 868.0, | |
| "completions/min_terminated_length": 868.0, | |
| "epoch": 0.03048397234443746, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.11196030676364899, | |
| "kl": 0.04893610253930092, | |
| "learning_rate": 2.2222222222222224e-07, | |
| "loss": 0.002, | |
| "num_tokens": 2596579.0, | |
| "reward": -7.4375, | |
| "reward_std": 0.7165063619613647, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.1875, | |
| "rewards/check_numbers/std": 0.5123475790023804, | |
| "rewards/format_and_language_reward_func/mean": -3.25, | |
| "rewards/format_and_language_reward_func/std": 0.6831300854682922, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 1362.0625, | |
| "completions/clipped_ratio": 0.5625, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1402.0, | |
| "completions/mean_length": 1362.0625, | |
| "completions/mean_terminated_length": 899.2857666015625, | |
| "completions/min_length": 364.0, | |
| "completions/min_terminated_length": 364.0, | |
| "epoch": 0.030798240100565682, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.2802422046661377, | |
| "kl": 0.07202958315610886, | |
| "learning_rate": 1.6666666666666668e-07, | |
| "loss": 0.0029, | |
| "num_tokens": 2622492.0, | |
| "reward": -7.375, | |
| "reward_std": 0.4330126941204071, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.375, | |
| "rewards/check_numbers/std": 0.670820415019989, | |
| "rewards/format_and_language_reward_func/mean": -3.0, | |
| "rewards/format_and_language_reward_func/std": 0.0, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 1529.75, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1474.0, | |
| "completions/mean_length": 1529.75, | |
| "completions/mean_terminated_length": 953.0, | |
| "completions/min_length": 371.0, | |
| "completions/min_terminated_length": 371.0, | |
| "epoch": 0.031112507856693904, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.27167099714279175, | |
| "kl": 0.0705300122499466, | |
| "learning_rate": 1.1111111111111112e-07, | |
| "loss": 0.0028, | |
| "num_tokens": 2650764.0, | |
| "reward": -7.40625, | |
| "reward_std": 0.6540063619613647, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.28125, | |
| "rewards/check_numbers/std": 0.6046693325042725, | |
| "rewards/format_and_language_reward_func/mean": -3.125, | |
| "rewards/format_and_language_reward_func/std": 0.5, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 1459.75, | |
| "completions/clipped_ratio": 0.6875, | |
| "completions/max_length": 1722.0, | |
| "completions/max_terminated_length": 1133.0, | |
| "completions/mean_length": 1459.75, | |
| "completions/mean_terminated_length": 882.7999877929688, | |
| "completions/min_length": 436.0, | |
| "completions/min_terminated_length": 436.0, | |
| "epoch": 0.03142677561282212, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.11008132994174957, | |
| "kl": 0.04374002292752266, | |
| "learning_rate": 5.555555555555556e-08, | |
| "loss": 0.0017, | |
| "num_tokens": 2678396.0, | |
| "reward": -7.90625, | |
| "reward_std": 1.233162522315979, | |
| "rewards/check_answer/mean": -2.0, | |
| "rewards/check_answer/std": 0.0, | |
| "rewards/check_numbers/mean": -0.65625, | |
| "rewards/check_numbers/std": 0.7685213088989258, | |
| "rewards/format_and_language_reward_func/mean": -3.25, | |
| "rewards/format_and_language_reward_func/std": 0.6831300854682922, | |
| "rewards/match_format_approximately/mean": -2.0, | |
| "rewards/match_format_approximately/std": 0.0, | |
| "rewards/match_format_exactly/mean": 0.0, | |
| "rewards/match_format_exactly/std": 0.0, | |
| "step": 100 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 100, | |
| "num_input_tokens_seen": 2678396, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |