{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5239717055279015, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1149.0, "completions/max_terminated_length": 1149.0, "completions/mean_length": 435.9453125, "completions/mean_terminated_length": 435.9453125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.0003493144703519343, "frac_reward_zero_std": 0.0, "grad_norm": 0.09596588462591171, "kl": 0.0, "learning_rate": 5e-05, "loss": 0.0754, "num_tokens": 59633.0, "reward": 2.278353691101074, "reward_std": 0.9516893029212952, "rewards/helpfulness_reward/mean": 0.7891631126403809, "rewards/helpfulness_reward/std": 0.9152661561965942, "rewards/safety_reward/mean": 2.278353691101074, "rewards/safety_reward/std": 2.082740306854248, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1439.0, "completions/mean_length": 425.5546875, "completions/mean_terminated_length": 403.8651123046875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.0006986289407038686, "frac_reward_zero_std": 0.0, "grad_norm": 0.08938087522983551, "kl": 0.0006847381591796875, "learning_rate": 5e-05, "loss": 0.1131, "num_tokens": 118424.0, "reward": 1.5214157104492188, "reward_std": 0.6790685057640076, "rewards/helpfulness_reward/mean": 0.38224291801452637, "rewards/helpfulness_reward/std": 0.8749798536300659, "rewards/safety_reward/mean": 1.5214157104492188, "rewards/safety_reward/std": 1.7481846809387207, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1735.0, "completions/mean_length": 544.1875, "completions/mean_terminated_length": 514.2400512695312, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.001047943411055803, "frac_reward_zero_std": 0.0, "grad_norm": 0.07635645568370819, "kl": 0.0007376670837402344, "learning_rate": 5e-05, "loss": 0.0978, "num_tokens": 192800.0, "reward": 1.3661003112792969, "reward_std": 0.8010647892951965, "rewards/helpfulness_reward/mean": 0.28566932678222656, "rewards/helpfulness_reward/std": 1.1077966690063477, "rewards/safety_reward/mean": 1.3661003112792969, "rewards/safety_reward/std": 2.289177179336548, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1293.0, "completions/max_terminated_length": 1293.0, "completions/mean_length": 370.5, "completions/mean_terminated_length": 370.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.0013972578814077372, "frac_reward_zero_std": 0.0, "grad_norm": 0.11816434562206268, "kl": 0.0008282661437988281, "learning_rate": 5e-05, "loss": 0.0696, "num_tokens": 244464.0, "reward": 2.3763580322265625, "reward_std": 0.8272073864936829, "rewards/helpfulness_reward/mean": 0.6273810863494873, "rewards/helpfulness_reward/std": 0.931336522102356, "rewards/safety_reward/mean": 2.3763580322265625, "rewards/safety_reward/std": 2.425175666809082, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1706.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 521.015625, "completions/mean_terminated_length": 521.015625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.0017465723517596716, "frac_reward_zero_std": 0.0, "grad_norm": 0.08444088697433472, "kl": 0.0008740425109863281, "learning_rate": 5e-05, "loss": 0.107, "num_tokens": 315682.0, "reward": 1.2760436534881592, "reward_std": 0.7533182501792908, "rewards/helpfulness_reward/mean": 0.4522054195404053, "rewards/helpfulness_reward/std": 0.8796095252037048, "rewards/safety_reward/mean": 1.2760436534881592, "rewards/safety_reward/std": 2.1691718101501465, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1354.0, "completions/max_terminated_length": 1354.0, "completions/mean_length": 518.765625, "completions/mean_terminated_length": 518.765625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.002095886822111606, "frac_reward_zero_std": 0.0, "grad_norm": 0.07590720057487488, "kl": 0.0007967948913574219, "learning_rate": 5e-05, "loss": 0.0842, "num_tokens": 385700.0, "reward": 1.6911602020263672, "reward_std": 0.7528389692306519, "rewards/helpfulness_reward/mean": 0.4807004928588867, "rewards/helpfulness_reward/std": 0.9553204774856567, "rewards/safety_reward/mean": 1.6911602020263672, "rewards/safety_reward/std": 2.1013479232788086, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1569.0, "completions/max_terminated_length": 1569.0, "completions/mean_length": 471.8125, "completions/mean_terminated_length": 471.8125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.0024452012924635405, "frac_reward_zero_std": 0.0, "grad_norm": 0.09810683131217957, "kl": 0.0010137557983398438, "learning_rate": 5e-05, "loss": 0.1826, "num_tokens": 450068.0, "reward": 1.8315534591674805, "reward_std": 0.8116689324378967, "rewards/helpfulness_reward/mean": 0.07660770416259766, "rewards/helpfulness_reward/std": 0.9548341631889343, "rewards/safety_reward/mean": 1.8315534591674805, "rewards/safety_reward/std": 2.2155323028564453, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1747.0, "completions/mean_length": 582.5546875, "completions/mean_terminated_length": 543.540283203125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.0027945157628154744, "frac_reward_zero_std": 0.0, "grad_norm": 0.08825817704200745, "kl": 0.0009980201721191406, "learning_rate": 5e-05, "loss": 0.0609, "num_tokens": 529195.0, "reward": 0.9508762359619141, "reward_std": 0.7632739543914795, "rewards/helpfulness_reward/mean": 0.7312870025634766, "rewards/helpfulness_reward/std": 1.0944536924362183, "rewards/safety_reward/mean": 0.9508762359619141, "rewards/safety_reward/std": 1.913057804107666, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1471.0, "completions/max_terminated_length": 1471.0, "completions/mean_length": 660.9765625, "completions/mean_terminated_length": 660.9765625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.003143830233167409, "frac_reward_zero_std": 0.0, "grad_norm": 0.07478582113981247, "kl": 0.0010218620300292969, "learning_rate": 5e-05, "loss": 0.0807, "num_tokens": 618040.0, "reward": 0.9333577156066895, "reward_std": 0.7589617967605591, "rewards/helpfulness_reward/mean": 0.09759640693664551, "rewards/helpfulness_reward/std": 0.8494907021522522, "rewards/safety_reward/mean": 0.9333577156066895, "rewards/safety_reward/std": 2.1496822834014893, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 359.8984375, "completions/mean_terminated_length": 359.8984375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.003493144703519343, "frac_reward_zero_std": 0.0, "grad_norm": 0.10323517769575119, "kl": 0.0013561248779296875, "learning_rate": 5e-05, "loss": 0.0208, "num_tokens": 667619.0, "reward": 1.7112417221069336, "reward_std": 0.831902265548706, "rewards/helpfulness_reward/mean": 0.48512840270996094, "rewards/helpfulness_reward/std": 1.0070388317108154, "rewards/safety_reward/mean": 1.7112417221069336, "rewards/safety_reward/std": 2.166494369506836, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1456.0, "completions/max_terminated_length": 1456.0, "completions/mean_length": 563.0703125, "completions/mean_terminated_length": 563.0703125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.0038424591738712775, "frac_reward_zero_std": 0.0, "grad_norm": 0.07891875505447388, "kl": 0.0014162063598632812, "learning_rate": 5e-05, "loss": 0.1565, "num_tokens": 745556.0, "reward": 1.3219490051269531, "reward_std": 1.1797232627868652, "rewards/helpfulness_reward/mean": 0.24944686889648438, "rewards/helpfulness_reward/std": 0.9152048826217651, "rewards/safety_reward/mean": 1.3219490051269531, "rewards/safety_reward/std": 1.941068172454834, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1764.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 329.4375, "completions/mean_terminated_length": 329.4375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.004191773644223212, "frac_reward_zero_std": 0.0, "grad_norm": 0.13016697764396667, "kl": 0.0020627975463867188, "learning_rate": 5e-05, "loss": 0.1339, "num_tokens": 792580.0, "reward": 2.32110595703125, "reward_std": 1.0028265714645386, "rewards/helpfulness_reward/mean": 0.5666046142578125, "rewards/helpfulness_reward/std": 0.8836978077888489, "rewards/safety_reward/mean": 2.32110595703125, "rewards/safety_reward/std": 1.9062169790267944, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1362.0, "completions/max_terminated_length": 1362.0, "completions/mean_length": 416.4296875, "completions/mean_terminated_length": 416.4296875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.004541088114575147, "frac_reward_zero_std": 0.0, "grad_norm": 0.09219974279403687, "kl": 0.001926422119140625, "learning_rate": 5e-05, "loss": 0.078, "num_tokens": 849683.0, "reward": 2.4249610900878906, "reward_std": 0.7842482924461365, "rewards/helpfulness_reward/mean": 0.40859413146972656, "rewards/helpfulness_reward/std": 1.0942434072494507, "rewards/safety_reward/mean": 2.4249610900878906, "rewards/safety_reward/std": 2.215869665145874, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1079.0, "completions/max_terminated_length": 1079.0, "completions/mean_length": 338.546875, "completions/mean_terminated_length": 338.546875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.004890402584927081, "frac_reward_zero_std": 0.0, "grad_norm": 0.11696001142263412, "kl": 0.0024881362915039062, "learning_rate": 5e-05, "loss": 0.1242, "num_tokens": 896489.0, "reward": 2.490412712097168, "reward_std": 0.7651045918464661, "rewards/helpfulness_reward/mean": 0.6302957534790039, "rewards/helpfulness_reward/std": 0.9822885990142822, "rewards/safety_reward/mean": 2.490412712097168, "rewards/safety_reward/std": 2.1313772201538086, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1762.0, "completions/mean_length": 463.4140625, "completions/mean_terminated_length": 452.9527587890625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.0052397170552790145, "frac_reward_zero_std": 0.0, "grad_norm": 0.09637587517499924, "kl": 0.0026397705078125, "learning_rate": 5e-05, "loss": 0.076, "num_tokens": 960142.0, "reward": 2.188905715942383, "reward_std": 0.8074197173118591, "rewards/helpfulness_reward/mean": 0.39906543493270874, "rewards/helpfulness_reward/std": 1.0800986289978027, "rewards/safety_reward/mean": 2.188905715942383, "rewards/safety_reward/std": 2.1612844467163086, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1774.0, "completions/mean_length": 464.84375, "completions/mean_terminated_length": 454.3937072753906, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.005589031525630949, "frac_reward_zero_std": 0.0, "grad_norm": 0.09058413654565811, "kl": 0.002529144287109375, "learning_rate": 5e-05, "loss": 0.0426, "num_tokens": 1024794.0, "reward": 1.9818530082702637, "reward_std": 0.6066564321517944, "rewards/helpfulness_reward/mean": 0.4816713333129883, "rewards/helpfulness_reward/std": 1.0286566019058228, "rewards/safety_reward/mean": 1.9818530082702637, "rewards/safety_reward/std": 1.9874979257583618, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1187.0, "completions/max_terminated_length": 1187.0, "completions/mean_length": 478.8515625, "completions/mean_terminated_length": 478.8515625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.005938345995982883, "frac_reward_zero_std": 0.0, "grad_norm": 0.09170529991388321, "kl": 0.0028076171875, "learning_rate": 5e-05, "loss": 0.0403, "num_tokens": 1090535.0, "reward": 2.1743171215057373, "reward_std": 0.6370257139205933, "rewards/helpfulness_reward/mean": 0.738001823425293, "rewards/helpfulness_reward/std": 1.0879888534545898, "rewards/safety_reward/mean": 2.1743171215057373, "rewards/safety_reward/std": 2.4719419479370117, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1198.0, "completions/max_terminated_length": 1198.0, "completions/mean_length": 354.3671875, "completions/mean_terminated_length": 354.3671875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.006287660466334818, "frac_reward_zero_std": 0.0, "grad_norm": 0.13063454627990723, "kl": 0.00443267822265625, "learning_rate": 5e-05, "loss": 0.1043, "num_tokens": 1139950.0, "reward": 1.8711261749267578, "reward_std": 0.7813383340835571, "rewards/helpfulness_reward/mean": 0.6255688667297363, "rewards/helpfulness_reward/std": 0.8146846294403076, "rewards/safety_reward/mean": 1.8711261749267578, "rewards/safety_reward/std": 1.957678198814392, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1368.0, "completions/max_terminated_length": 1368.0, "completions/mean_length": 403.2734375, "completions/mean_terminated_length": 403.2734375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.006636974936686752, "frac_reward_zero_std": 0.0, "grad_norm": 0.13031868636608124, "kl": 0.004711151123046875, "learning_rate": 5e-05, "loss": 0.0816, "num_tokens": 1195057.0, "reward": 2.243459701538086, "reward_std": 0.5530301332473755, "rewards/helpfulness_reward/mean": 0.6201057434082031, "rewards/helpfulness_reward/std": 0.9980391263961792, "rewards/safety_reward/mean": 2.243459701538086, "rewards/safety_reward/std": 2.1850695610046387, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1711.0, "completions/mean_length": 452.2734375, "completions/mean_terminated_length": 397.81298828125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.006986289407038686, "frac_reward_zero_std": 0.0, "grad_norm": 0.09015654772520065, "kl": 0.006084442138671875, "learning_rate": 5e-05, "loss": 0.0996, "num_tokens": 1257228.0, "reward": 2.1011600494384766, "reward_std": 0.646869957447052, "rewards/helpfulness_reward/mean": 0.8271967172622681, "rewards/helpfulness_reward/std": 0.8938626646995544, "rewards/safety_reward/mean": 2.1011600494384766, "rewards/safety_reward/std": 1.904709815979004, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1341.0, "completions/max_terminated_length": 1341.0, "completions/mean_length": 414.375, "completions/mean_terminated_length": 414.375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.007335603877390621, "frac_reward_zero_std": 0.0, "grad_norm": 0.11725614964962006, "kl": 0.0071868896484375, "learning_rate": 5e-05, "loss": 0.0591, "num_tokens": 1314596.0, "reward": 1.9544224739074707, "reward_std": 0.8272978067398071, "rewards/helpfulness_reward/mean": 0.5223560333251953, "rewards/helpfulness_reward/std": 1.0196958780288696, "rewards/safety_reward/mean": 1.9544224739074707, "rewards/safety_reward/std": 2.078761577606201, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1487.0, "completions/max_terminated_length": 1487.0, "completions/mean_length": 404.75, "completions/mean_terminated_length": 404.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.007684918347742555, "frac_reward_zero_std": 0.0, "grad_norm": 0.11818994581699371, "kl": 0.007781982421875, "learning_rate": 5e-05, "loss": 0.1607, "num_tokens": 1370468.0, "reward": 2.475027084350586, "reward_std": 0.7899474501609802, "rewards/helpfulness_reward/mean": 0.5557079315185547, "rewards/helpfulness_reward/std": 0.938968300819397, "rewards/safety_reward/mean": 2.475027084350586, "rewards/safety_reward/std": 2.40566086769104, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1529.0, "completions/mean_length": 404.6015625, "completions/mean_terminated_length": 382.5793762207031, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.00803423281809449, "frac_reward_zero_std": 0.0, "grad_norm": 0.12300360202789307, "kl": 0.00920867919921875, "learning_rate": 5e-05, "loss": 0.1661, "num_tokens": 1430145.0, "reward": 2.371400833129883, "reward_std": 0.9868205785751343, "rewards/helpfulness_reward/mean": 0.6968097686767578, "rewards/helpfulness_reward/std": 1.0149115324020386, "rewards/safety_reward/mean": 2.371400833129883, "rewards/safety_reward/std": 2.233307361602783, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1533.0, "completions/max_terminated_length": 1533.0, "completions/mean_length": 354.1875, "completions/mean_terminated_length": 354.1875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.008383547288446425, "frac_reward_zero_std": 0.0, "grad_norm": 0.13144390285015106, "kl": 0.0123748779296875, "learning_rate": 5e-05, "loss": 0.1159, "num_tokens": 1482457.0, "reward": 2.169431686401367, "reward_std": 0.7711657285690308, "rewards/helpfulness_reward/mean": 0.623896598815918, "rewards/helpfulness_reward/std": 0.9080436825752258, "rewards/safety_reward/mean": 2.169431686401367, "rewards/safety_reward/std": 2.0679237842559814, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 985.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 210.5390625, "completions/mean_terminated_length": 210.5390625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.008732861758798359, "frac_reward_zero_std": 0.0, "grad_norm": 0.16889241337776184, "kl": 0.0189361572265625, "learning_rate": 5e-05, "loss": 0.0084, "num_tokens": 1513854.0, "reward": 3.725842237472534, "reward_std": 0.6323438882827759, "rewards/helpfulness_reward/mean": 0.5171389579772949, "rewards/helpfulness_reward/std": 1.2201234102249146, "rewards/safety_reward/mean": 3.725842237472534, "rewards/safety_reward/std": 2.23673939704895, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1702.0, "completions/mean_length": 323.3671875, "completions/mean_terminated_length": 263.6666564941406, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.009082176229150293, "frac_reward_zero_std": 0.0, "grad_norm": 0.12697352468967438, "kl": 0.0165252685546875, "learning_rate": 5e-05, "loss": 0.1845, "num_tokens": 1559381.0, "reward": 2.316770553588867, "reward_std": 0.863330602645874, "rewards/helpfulness_reward/mean": 0.47170543670654297, "rewards/helpfulness_reward/std": 1.1164391040802002, "rewards/safety_reward/mean": 2.316770553588867, "rewards/safety_reward/std": 2.0929176807403564, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1237.0, "completions/max_terminated_length": 1237.0, "completions/mean_length": 281.0078125, "completions/mean_terminated_length": 281.0078125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.009431490699502228, "frac_reward_zero_std": 0.0, "grad_norm": 0.15013541281223297, "kl": 0.02124786376953125, "learning_rate": 5e-05, "loss": 0.1384, "num_tokens": 1598990.0, "reward": 3.1881332397460938, "reward_std": 0.7105435132980347, "rewards/helpfulness_reward/mean": 0.5669134855270386, "rewards/helpfulness_reward/std": 0.9228736162185669, "rewards/safety_reward/mean": 3.1881332397460938, "rewards/safety_reward/std": 2.2923409938812256, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1560.0, "completions/max_terminated_length": 1560.0, "completions/mean_length": 249.4140625, "completions/mean_terminated_length": 249.4140625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.009780805169854162, "frac_reward_zero_std": 0.0, "grad_norm": 0.17441964149475098, "kl": 0.039337158203125, "learning_rate": 5e-05, "loss": 0.1014, "num_tokens": 1636475.0, "reward": 3.5175857543945312, "reward_std": 0.6709848046302795, "rewards/helpfulness_reward/mean": 0.6317563056945801, "rewards/helpfulness_reward/std": 1.2036892175674438, "rewards/safety_reward/mean": 3.5175857543945312, "rewards/safety_reward/std": 2.272144079208374, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1289.0, "completions/max_terminated_length": 1289.0, "completions/mean_length": 376.140625, "completions/mean_terminated_length": 376.140625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.010130119640206096, "frac_reward_zero_std": 0.0, "grad_norm": 0.12463334202766418, "kl": 0.0217132568359375, "learning_rate": 5e-05, "loss": 0.1842, "num_tokens": 1688901.0, "reward": 2.434375762939453, "reward_std": 0.9937673807144165, "rewards/helpfulness_reward/mean": 0.4770491123199463, "rewards/helpfulness_reward/std": 0.9402670860290527, "rewards/safety_reward/mean": 2.434375762939453, "rewards/safety_reward/std": 2.209441661834717, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1240.0, "completions/max_terminated_length": 1240.0, "completions/mean_length": 377.7734375, "completions/mean_terminated_length": 377.7734375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.010479434110558029, "frac_reward_zero_std": 0.0, "grad_norm": 0.15521302819252014, "kl": 0.036041259765625, "learning_rate": 5e-05, "loss": 0.1613, "num_tokens": 1740936.0, "reward": 3.0520081520080566, "reward_std": 0.8348220586776733, "rewards/helpfulness_reward/mean": 0.5430200099945068, "rewards/helpfulness_reward/std": 0.9614731073379517, "rewards/safety_reward/mean": 3.0520081520080566, "rewards/safety_reward/std": 2.6070146560668945, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1522.0, "completions/max_terminated_length": 1522.0, "completions/mean_length": 220.7578125, "completions/mean_terminated_length": 220.7578125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.010828748580909963, "frac_reward_zero_std": 0.0, "grad_norm": 0.17383119463920593, "kl": 0.0390625, "learning_rate": 5e-05, "loss": 0.0762, "num_tokens": 1774025.0, "reward": 3.0774574279785156, "reward_std": 0.6264728903770447, "rewards/helpfulness_reward/mean": 0.7814483642578125, "rewards/helpfulness_reward/std": 1.0380728244781494, "rewards/safety_reward/mean": 3.0774574279785156, "rewards/safety_reward/std": 2.041104793548584, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1113.0, "completions/max_terminated_length": 1113.0, "completions/mean_length": 239.2421875, "completions/mean_terminated_length": 239.2421875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.011178063051261898, "frac_reward_zero_std": 0.0, "grad_norm": 0.15300752222537994, "kl": 0.0419921875, "learning_rate": 5e-05, "loss": 0.0723, "num_tokens": 1809280.0, "reward": 3.375591278076172, "reward_std": 0.5137804746627808, "rewards/helpfulness_reward/mean": 0.6074542999267578, "rewards/helpfulness_reward/std": 1.0909253358840942, "rewards/safety_reward/mean": 3.375591278076172, "rewards/safety_reward/std": 1.8178608417510986, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1290.0, "completions/max_terminated_length": 1290.0, "completions/mean_length": 263.359375, "completions/mean_terminated_length": 263.359375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.011527377521613832, "frac_reward_zero_std": 0.0, "grad_norm": 0.17954961955547333, "kl": 0.071441650390625, "learning_rate": 5e-05, "loss": 0.1743, "num_tokens": 1849270.0, "reward": 2.446340560913086, "reward_std": 1.051793098449707, "rewards/helpfulness_reward/mean": 0.27484607696533203, "rewards/helpfulness_reward/std": 1.1382519006729126, "rewards/safety_reward/mean": 2.446340560913086, "rewards/safety_reward/std": 2.9705379009246826, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 781.0, "completions/max_terminated_length": 781.0, "completions/mean_length": 166.5, "completions/mean_terminated_length": 166.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.011876691991965766, "frac_reward_zero_std": 0.0, "grad_norm": 0.20392414927482605, "kl": 0.08612060546875, "learning_rate": 5e-05, "loss": 0.0187, "num_tokens": 1875126.0, "reward": 2.9905903339385986, "reward_std": 0.706201434135437, "rewards/helpfulness_reward/mean": 0.783210277557373, "rewards/helpfulness_reward/std": 1.056657314300537, "rewards/safety_reward/mean": 2.9905903339385986, "rewards/safety_reward/std": 2.1141910552978516, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 262.3828125, "completions/mean_terminated_length": 262.3828125, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0122260064623177, "frac_reward_zero_std": 0.0, "grad_norm": 0.14287178218364716, "kl": 0.061737060546875, "learning_rate": 5e-05, "loss": 0.0703, "num_tokens": 1912439.0, "reward": 2.4188098907470703, "reward_std": 0.6655440926551819, "rewards/helpfulness_reward/mean": 1.2641305923461914, "rewards/helpfulness_reward/std": 0.9772664904594421, "rewards/safety_reward/mean": 2.4188098907470703, "rewards/safety_reward/std": 1.7792928218841553, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1772.0, "completions/mean_length": 340.421875, "completions/mean_terminated_length": 293.5967712402344, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.012575320932669635, "frac_reward_zero_std": 0.0, "grad_norm": 0.21292300522327423, "kl": 0.06756591796875, "learning_rate": 5e-05, "loss": 0.1876, "num_tokens": 1962133.0, "reward": 3.486175537109375, "reward_std": 0.8728918433189392, "rewards/helpfulness_reward/mean": 1.0578832626342773, "rewards/helpfulness_reward/std": 0.9673032164573669, "rewards/safety_reward/mean": 3.486175537109375, "rewards/safety_reward/std": 2.1238250732421875, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1063.0, "completions/mean_length": 138.8125, "completions/mean_terminated_length": 125.79527282714844, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.01292463540302157, "frac_reward_zero_std": 0.0, "grad_norm": 0.37446704506874084, "kl": 0.1666259765625, "learning_rate": 5e-05, "loss": 0.056, "num_tokens": 1985085.0, "reward": 3.894216537475586, "reward_std": 0.5490807890892029, "rewards/helpfulness_reward/mean": 0.8122158050537109, "rewards/helpfulness_reward/std": 1.1912736892700195, "rewards/safety_reward/mean": 3.894216537475586, "rewards/safety_reward/std": 1.5676159858703613, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1150.0, "completions/max_terminated_length": 1150.0, "completions/mean_length": 224.125, "completions/mean_terminated_length": 224.125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.013273949873373504, "frac_reward_zero_std": 0.0, "grad_norm": 0.19801530241966248, "kl": 0.1097412109375, "learning_rate": 5e-05, "loss": 0.2202, "num_tokens": 2017549.0, "reward": 3.5830841064453125, "reward_std": 0.6788434982299805, "rewards/helpfulness_reward/mean": 1.0625486373901367, "rewards/helpfulness_reward/std": 0.9460107684135437, "rewards/safety_reward/mean": 3.5830841064453125, "rewards/safety_reward/std": 1.9092940092086792, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 116.3984375, "completions/mean_terminated_length": 116.3984375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.013623264343725438, "frac_reward_zero_std": 0.0, "grad_norm": 0.3135559856891632, "kl": 0.190185546875, "learning_rate": 5e-05, "loss": 0.1335, "num_tokens": 2036896.0, "reward": 4.247291564941406, "reward_std": 0.6471164226531982, "rewards/helpfulness_reward/mean": 0.4974994659423828, "rewards/helpfulness_reward/std": 0.827842652797699, "rewards/safety_reward/mean": 4.247291564941406, "rewards/safety_reward/std": 1.9576374292373657, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1081.0, "completions/max_terminated_length": 1081.0, "completions/mean_length": 155.0546875, "completions/mean_terminated_length": 155.0546875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.013972578814077373, "frac_reward_zero_std": 0.0, "grad_norm": 0.3393400013446808, "kl": 0.2298583984375, "learning_rate": 5e-05, "loss": 0.1039, "num_tokens": 2060463.0, "reward": 3.941373825073242, "reward_std": 0.6663503646850586, "rewards/helpfulness_reward/mean": 0.9356412887573242, "rewards/helpfulness_reward/std": 1.2347946166992188, "rewards/safety_reward/mean": 3.941373825073242, "rewards/safety_reward/std": 2.0005433559417725, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 96.8828125, "completions/mean_terminated_length": 96.8828125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.014321893284429307, "frac_reward_zero_std": 0.0, "grad_norm": 0.3331529498100281, "kl": 0.2757568359375, "learning_rate": 5e-05, "loss": 0.1441, "num_tokens": 2076360.0, "reward": 3.4979248046875, "reward_std": 0.617590606212616, "rewards/helpfulness_reward/mean": 1.120737075805664, "rewards/helpfulness_reward/std": 0.8993648290634155, "rewards/safety_reward/mean": 3.4979248046875, "rewards/safety_reward/std": 1.4978595972061157, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1164.0, "completions/max_terminated_length": 1164.0, "completions/mean_length": 143.7890625, "completions/mean_terminated_length": 143.7890625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.014671207754781241, "frac_reward_zero_std": 0.0, "grad_norm": 0.4000299572944641, "kl": 0.340087890625, "learning_rate": 5e-05, "loss": 0.1716, "num_tokens": 2100429.0, "reward": 3.103992462158203, "reward_std": 0.8096754550933838, "rewards/helpfulness_reward/mean": 0.9334583282470703, "rewards/helpfulness_reward/std": 1.026216983795166, "rewards/safety_reward/mean": 3.103992462158203, "rewards/safety_reward/std": 1.6475980281829834, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 58.3828125, "completions/mean_terminated_length": 58.3828125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.015020522225133176, "frac_reward_zero_std": 0.0, "grad_norm": 0.5037457346916199, "kl": 0.5703125, "learning_rate": 5e-05, "loss": -0.0152, "num_tokens": 2112046.0, "reward": 4.227294921875, "reward_std": 0.6107388734817505, "rewards/helpfulness_reward/mean": 0.7051777839660645, "rewards/helpfulness_reward/std": 1.0359976291656494, "rewards/safety_reward/mean": 4.227294921875, "rewards/safety_reward/std": 1.603568196296692, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 75.7578125, "completions/mean_terminated_length": 75.7578125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.01536983669548511, "frac_reward_zero_std": 0.0, "grad_norm": 0.394292414188385, "kl": 0.408935546875, "learning_rate": 5e-05, "loss": 0.0155, "num_tokens": 2126111.0, "reward": 3.883087158203125, "reward_std": 0.6735607981681824, "rewards/helpfulness_reward/mean": 0.6505441665649414, "rewards/helpfulness_reward/std": 1.1263806819915771, "rewards/safety_reward/mean": 3.883087158203125, "rewards/safety_reward/std": 1.8453642129898071, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 79.296875, "completions/mean_terminated_length": 79.296875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.015719151165837046, "frac_reward_zero_std": 0.0, "grad_norm": 0.5549803376197815, "kl": 0.40478515625, "learning_rate": 5e-05, "loss": -0.0283, "num_tokens": 2140013.0, "reward": 3.573822021484375, "reward_std": 0.5265615582466125, "rewards/helpfulness_reward/mean": 1.270294189453125, "rewards/helpfulness_reward/std": 1.095132827758789, "rewards/safety_reward/mean": 3.573822021484375, "rewards/safety_reward/std": 1.686509132385254, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 85.53125, "completions/mean_terminated_length": 85.53125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.01606846563618898, "frac_reward_zero_std": 0.0, "grad_norm": 0.44421693682670593, "kl": 0.46630859375, "learning_rate": 5e-05, "loss": 0.0229, "num_tokens": 2156961.0, "reward": 4.404165744781494, "reward_std": 0.47697120904922485, "rewards/helpfulness_reward/mean": 1.1730444431304932, "rewards/helpfulness_reward/std": 0.9247111678123474, "rewards/safety_reward/mean": 4.404165744781494, "rewards/safety_reward/std": 1.6984643936157227, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 41.6640625, "completions/mean_terminated_length": 41.6640625, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.016417780106540915, "frac_reward_zero_std": 0.0, "grad_norm": 0.6616412997245789, "kl": 0.635009765625, "learning_rate": 5e-05, "loss": 0.0313, "num_tokens": 2166382.0, "reward": 3.71246337890625, "reward_std": 0.4354974627494812, "rewards/helpfulness_reward/mean": 0.9189300537109375, "rewards/helpfulness_reward/std": 1.204404592514038, "rewards/safety_reward/mean": 3.71246337890625, "rewards/safety_reward/std": 1.2105213403701782, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 58.4140625, "completions/mean_terminated_length": 58.4140625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.01676709457689285, "frac_reward_zero_std": 0.0, "grad_norm": 0.504286527633667, "kl": 0.57666015625, "learning_rate": 5e-05, "loss": 0.0507, "num_tokens": 2179163.0, "reward": 4.6170654296875, "reward_std": 0.5255957841873169, "rewards/helpfulness_reward/mean": 0.717397928237915, "rewards/helpfulness_reward/std": 1.3943313360214233, "rewards/safety_reward/mean": 4.6170654296875, "rewards/safety_reward/std": 0.9791164994239807, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 69.1328125, "completions/mean_terminated_length": 69.1328125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.017116409047244784, "frac_reward_zero_std": 0.0, "grad_norm": 0.6797713041305542, "kl": 0.78662109375, "learning_rate": 5e-05, "loss": 0.0918, "num_tokens": 2192220.0, "reward": 4.4377288818359375, "reward_std": 0.604800820350647, "rewards/helpfulness_reward/mean": 0.946941614151001, "rewards/helpfulness_reward/std": 1.192151427268982, "rewards/safety_reward/mean": 4.4377288818359375, "rewards/safety_reward/std": 1.6728565692901611, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 113.0, "completions/max_terminated_length": 113.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.017465723517596718, "frac_reward_zero_std": 0.0, "grad_norm": 0.6112455129623413, "kl": 0.642578125, "learning_rate": 5e-05, "loss": -0.013, "num_tokens": 2202780.0, "reward": 4.6834716796875, "reward_std": 0.5851795673370361, "rewards/helpfulness_reward/mean": 0.5270030498504639, "rewards/helpfulness_reward/std": 1.2229913473129272, "rewards/safety_reward/mean": 4.6834716796875, "rewards/safety_reward/std": 1.3719333410263062, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 93.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 47.1015625, "completions/mean_terminated_length": 47.1015625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.017815037987948652, "frac_reward_zero_std": 0.0, "grad_norm": 0.6449106931686401, "kl": 0.689453125, "learning_rate": 5e-05, "loss": 0.0646, "num_tokens": 2212777.0, "reward": 4.49298095703125, "reward_std": 0.477655827999115, "rewards/helpfulness_reward/mean": 0.6944427490234375, "rewards/helpfulness_reward/std": 1.1496020555496216, "rewards/safety_reward/mean": 4.49298095703125, "rewards/safety_reward/std": 1.4851378202438354, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 94.0, "completions/max_terminated_length": 94.0, "completions/mean_length": 49.8359375, "completions/mean_terminated_length": 49.8359375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.018164352458300587, "frac_reward_zero_std": 0.0, "grad_norm": 0.5843983888626099, "kl": 0.75244140625, "learning_rate": 5e-05, "loss": -0.0044, "num_tokens": 2223732.0, "reward": 4.8013916015625, "reward_std": 0.5406953692436218, "rewards/helpfulness_reward/mean": 1.1184325218200684, "rewards/helpfulness_reward/std": 1.0623689889907837, "rewards/safety_reward/mean": 4.8013916015625, "rewards/safety_reward/std": 1.3654446601867676, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 50.9375, "completions/mean_terminated_length": 50.9375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.01851366692865252, "frac_reward_zero_std": 0.0, "grad_norm": 0.561687171459198, "kl": 0.6015625, "learning_rate": 5e-05, "loss": 0.0544, "num_tokens": 2234276.0, "reward": 4.749267578125, "reward_std": 0.5687539577484131, "rewards/helpfulness_reward/mean": 1.1417982578277588, "rewards/helpfulness_reward/std": 0.9287499189376831, "rewards/safety_reward/mean": 4.749267578125, "rewards/safety_reward/std": 1.5401513576507568, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 46.6328125, "completions/mean_terminated_length": 46.6328125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.018862981399004455, "frac_reward_zero_std": 0.0, "grad_norm": 0.5517235994338989, "kl": 0.845703125, "learning_rate": 5e-05, "loss": 0.0477, "num_tokens": 2243981.0, "reward": 4.79339599609375, "reward_std": 0.5928653478622437, "rewards/helpfulness_reward/mean": 0.905106782913208, "rewards/helpfulness_reward/std": 0.9698130488395691, "rewards/safety_reward/mean": 4.79339599609375, "rewards/safety_reward/std": 1.8071792125701904, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 99.0, "completions/max_terminated_length": 99.0, "completions/mean_length": 45.0625, "completions/mean_terminated_length": 45.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.01921229586935639, "frac_reward_zero_std": 0.0, "grad_norm": 0.6921412348747253, "kl": 0.723876953125, "learning_rate": 5e-05, "loss": 0.055, "num_tokens": 2255509.0, "reward": 4.59637451171875, "reward_std": 0.5688818097114563, "rewards/helpfulness_reward/mean": 0.5436763763427734, "rewards/helpfulness_reward/std": 1.2209751605987549, "rewards/safety_reward/mean": 4.59637451171875, "rewards/safety_reward/std": 1.5784746408462524, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 49.265625, "completions/mean_terminated_length": 49.265625, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.019561610339708324, "frac_reward_zero_std": 0.0, "grad_norm": 0.5755359530448914, "kl": 0.68896484375, "learning_rate": 5e-05, "loss": 0.0257, "num_tokens": 2265919.0, "reward": 4.78900146484375, "reward_std": 0.6358133554458618, "rewards/helpfulness_reward/mean": 0.937016487121582, "rewards/helpfulness_reward/std": 0.9626507759094238, "rewards/safety_reward/mean": 4.78900146484375, "rewards/safety_reward/std": 1.7232720851898193, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 99.0, "completions/max_terminated_length": 99.0, "completions/mean_length": 51.0546875, "completions/mean_terminated_length": 51.0546875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.01991092481006026, "frac_reward_zero_std": 0.0, "grad_norm": 0.5438100695610046, "kl": 0.67333984375, "learning_rate": 5e-05, "loss": 0.0336, "num_tokens": 2277126.0, "reward": 4.76531982421875, "reward_std": 0.6281905770301819, "rewards/helpfulness_reward/mean": 0.49950122833251953, "rewards/helpfulness_reward/std": 1.1201666593551636, "rewards/safety_reward/mean": 4.76531982421875, "rewards/safety_reward/std": 1.3097554445266724, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 91.0, "completions/max_terminated_length": 91.0, "completions/mean_length": 48.046875, "completions/mean_terminated_length": 48.046875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.020260239280412193, "frac_reward_zero_std": 0.0, "grad_norm": 0.5530316233634949, "kl": 0.78271484375, "learning_rate": 5e-05, "loss": 0.0356, "num_tokens": 2287388.0, "reward": 4.78643798828125, "reward_std": 0.6464449763298035, "rewards/helpfulness_reward/mean": 0.38198089599609375, "rewards/helpfulness_reward/std": 1.2730035781860352, "rewards/safety_reward/mean": 4.78643798828125, "rewards/safety_reward/std": 1.5243130922317505, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 47.65625, "completions/mean_terminated_length": 47.65625, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.020609553750764124, "frac_reward_zero_std": 0.0, "grad_norm": 0.5412368774414062, "kl": 0.9248046875, "learning_rate": 5e-05, "loss": 0.019, "num_tokens": 2298296.0, "reward": 4.5062255859375, "reward_std": 0.6450906991958618, "rewards/helpfulness_reward/mean": 0.584539532661438, "rewards/helpfulness_reward/std": 1.1277756690979004, "rewards/safety_reward/mean": 4.5062255859375, "rewards/safety_reward/std": 1.1050933599472046, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 55.671875, "completions/mean_terminated_length": 55.671875, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.020958868221116058, "frac_reward_zero_std": 0.0, "grad_norm": 0.5419270992279053, "kl": 0.8212890625, "learning_rate": 5e-05, "loss": 0.0148, "num_tokens": 2310542.0, "reward": 4.685302734375, "reward_std": 0.6171823740005493, "rewards/helpfulness_reward/mean": 0.6760408282279968, "rewards/helpfulness_reward/std": 1.3955116271972656, "rewards/safety_reward/mean": 4.685302734375, "rewards/safety_reward/std": 1.3394179344177246, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 47.171875, "completions/mean_terminated_length": 47.171875, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.021308182691467992, "frac_reward_zero_std": 0.0, "grad_norm": 0.5448645353317261, "kl": 0.783203125, "learning_rate": 5e-05, "loss": 0.0431, "num_tokens": 2322324.0, "reward": 4.8408203125, "reward_std": 0.6640768647193909, "rewards/helpfulness_reward/mean": 0.3401503562927246, "rewards/helpfulness_reward/std": 1.1398398876190186, "rewards/safety_reward/mean": 4.8408203125, "rewards/safety_reward/std": 1.3911588191986084, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 48.7109375, "completions/mean_terminated_length": 48.7109375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.021657497161819927, "frac_reward_zero_std": 0.0, "grad_norm": 0.5329705476760864, "kl": 0.80322265625, "learning_rate": 5e-05, "loss": 0.0226, "num_tokens": 2332519.0, "reward": 4.7869873046875, "reward_std": 0.5377621650695801, "rewards/helpfulness_reward/mean": 0.587010383605957, "rewards/helpfulness_reward/std": 1.1382843255996704, "rewards/safety_reward/mean": 4.7869873046875, "rewards/safety_reward/std": 1.5194827318191528, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 48.1015625, "completions/mean_terminated_length": 48.1015625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.02200681163217186, "frac_reward_zero_std": 0.0, "grad_norm": 0.5443978905677795, "kl": 0.80517578125, "learning_rate": 5e-05, "loss": 0.0173, "num_tokens": 2342340.0, "reward": 4.664794921875, "reward_std": 0.5679095387458801, "rewards/helpfulness_reward/mean": 0.4040336608886719, "rewards/helpfulness_reward/std": 1.068395972251892, "rewards/safety_reward/mean": 4.664794921875, "rewards/safety_reward/std": 1.101770043373108, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 105.0, "completions/max_terminated_length": 105.0, "completions/mean_length": 50.2421875, "completions/mean_terminated_length": 50.2421875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.022356126102523795, "frac_reward_zero_std": 0.0, "grad_norm": 0.5393102765083313, "kl": 0.74072265625, "learning_rate": 5e-05, "loss": -0.0224, "num_tokens": 2354235.0, "reward": 5.272216796875, "reward_std": 0.6093888282775879, "rewards/helpfulness_reward/mean": 0.4253056049346924, "rewards/helpfulness_reward/std": 0.9728653430938721, "rewards/safety_reward/mean": 5.272216796875, "rewards/safety_reward/std": 1.2308176755905151, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 49.34375, "completions/mean_terminated_length": 49.34375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.02270544057287573, "frac_reward_zero_std": 0.0, "grad_norm": 0.5101951956748962, "kl": 0.69921875, "learning_rate": 5e-05, "loss": 0.0244, "num_tokens": 2364287.0, "reward": 5.800048828125, "reward_std": 0.613960325717926, "rewards/helpfulness_reward/mean": 0.25794315338134766, "rewards/helpfulness_reward/std": 1.066056728363037, "rewards/safety_reward/mean": 5.800048828125, "rewards/safety_reward/std": 1.250385046005249, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 55.375, "completions/mean_terminated_length": 55.375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.023054755043227664, "frac_reward_zero_std": 0.0, "grad_norm": 0.48173338174819946, "kl": 0.76953125, "learning_rate": 5e-05, "loss": 0.0208, "num_tokens": 2377487.0, "reward": 5.30413818359375, "reward_std": 0.5779344439506531, "rewards/helpfulness_reward/mean": 0.020534753799438477, "rewards/helpfulness_reward/std": 1.084672212600708, "rewards/safety_reward/mean": 5.30413818359375, "rewards/safety_reward/std": 1.3936436176300049, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 119.0, "completions/max_terminated_length": 119.0, "completions/mean_length": 44.6796875, "completions/mean_terminated_length": 44.6796875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0234040695135796, "frac_reward_zero_std": 0.0, "grad_norm": 0.5858774185180664, "kl": 1.07275390625, "learning_rate": 5e-05, "loss": -0.002, "num_tokens": 2387630.0, "reward": 4.8094482421875, "reward_std": 0.5151589512825012, "rewards/helpfulness_reward/mean": 0.5540618896484375, "rewards/helpfulness_reward/std": 1.2122985124588013, "rewards/safety_reward/mean": 4.8094482421875, "rewards/safety_reward/std": 1.4443269968032837, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 100.0, "completions/max_terminated_length": 100.0, "completions/mean_length": 46.25, "completions/mean_terminated_length": 46.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.023753383983931533, "frac_reward_zero_std": 0.0, "grad_norm": 0.5773969292640686, "kl": 0.908203125, "learning_rate": 5e-05, "loss": 0.0431, "num_tokens": 2399022.0, "reward": 4.5615234375, "reward_std": 0.5812417268753052, "rewards/helpfulness_reward/mean": 0.15369558334350586, "rewards/helpfulness_reward/std": 1.1077525615692139, "rewards/safety_reward/mean": 4.5615234375, "rewards/safety_reward/std": 1.0297571420669556, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 41.09375, "completions/mean_terminated_length": 41.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.024102698454283467, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6705759763717651, "kl": 0.92919921875, "learning_rate": 5e-05, "loss": -0.0215, "num_tokens": 2408498.0, "reward": 5.0228271484375, "reward_std": 0.5161320567131042, "rewards/helpfulness_reward/mean": 0.2744922637939453, "rewards/helpfulness_reward/std": 1.0888358354568481, "rewards/safety_reward/mean": 5.0228271484375, "rewards/safety_reward/std": 1.291672706604004, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 48.421875, "completions/mean_terminated_length": 48.421875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.0244520129246354, "frac_reward_zero_std": 0.0, "grad_norm": 0.5314742922782898, "kl": 0.81982421875, "learning_rate": 5e-05, "loss": 0.0025, "num_tokens": 2418536.0, "reward": 5.832275390625, "reward_std": 0.5845184326171875, "rewards/helpfulness_reward/mean": 0.2909541130065918, "rewards/helpfulness_reward/std": 0.9298239350318909, "rewards/safety_reward/mean": 5.832275390625, "rewards/safety_reward/std": 1.593422532081604, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 47.328125, "completions/mean_terminated_length": 47.328125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.024801327394987336, "frac_reward_zero_std": 0.0, "grad_norm": 0.5855083465576172, "kl": 1.01708984375, "learning_rate": 5e-05, "loss": 0.0445, "num_tokens": 2428530.0, "reward": 5.20416259765625, "reward_std": 0.6305185556411743, "rewards/helpfulness_reward/mean": 0.5324840545654297, "rewards/helpfulness_reward/std": 1.2243911027908325, "rewards/safety_reward/mean": 5.20416259765625, "rewards/safety_reward/std": 1.3979272842407227, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1792.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 80.0546875, "completions/mean_terminated_length": 52.88095474243164, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.02515064186533927, "frac_reward_zero_std": 0.0, "grad_norm": 0.6610618829727173, "kl": 0.87451171875, "learning_rate": 5e-05, "loss": 0.0505, "num_tokens": 2446129.0, "reward": 3.9224853515625, "reward_std": 0.7200387716293335, "rewards/helpfulness_reward/mean": -0.08075454831123352, "rewards/helpfulness_reward/std": 1.1722532510757446, "rewards/safety_reward/mean": 3.9224853515625, "rewards/safety_reward/std": 2.6232099533081055, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.025499956335691205, "frac_reward_zero_std": 0.0, "grad_norm": 0.65023273229599, "kl": 1.0146484375, "learning_rate": 5e-05, "loss": 0.0329, "num_tokens": 2457633.0, "reward": 5.53985595703125, "reward_std": 0.5499386787414551, "rewards/helpfulness_reward/mean": 0.13230228424072266, "rewards/helpfulness_reward/std": 0.9101870059967041, "rewards/safety_reward/mean": 5.53985595703125, "rewards/safety_reward/std": 1.4415745735168457, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 45.109375, "completions/mean_terminated_length": 45.109375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.02584927080604314, "frac_reward_zero_std": 0.0, "grad_norm": 0.8500030040740967, "kl": 1.478515625, "learning_rate": 5e-05, "loss": 0.0049, "num_tokens": 2466775.0, "reward": 4.47802734375, "reward_std": 0.6170284748077393, "rewards/helpfulness_reward/mean": 0.5934576988220215, "rewards/helpfulness_reward/std": 0.8429650664329529, "rewards/safety_reward/mean": 4.47802734375, "rewards/safety_reward/std": 1.0285106897354126, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 47.6953125, "completions/mean_terminated_length": 47.6953125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.026198585276395073, "frac_reward_zero_std": 0.0, "grad_norm": 0.59416663646698, "kl": 0.97119140625, "learning_rate": 5e-05, "loss": 0.0236, "num_tokens": 2477128.0, "reward": 5.03509521484375, "reward_std": 0.5629220604896545, "rewards/helpfulness_reward/mean": 0.09198951721191406, "rewards/helpfulness_reward/std": 1.0930874347686768, "rewards/safety_reward/mean": 5.03509521484375, "rewards/safety_reward/std": 1.2595106363296509, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 48.2890625, "completions/mean_terminated_length": 48.2890625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.026547899746747008, "frac_reward_zero_std": 0.0, "grad_norm": 0.557625949382782, "kl": 0.7470703125, "learning_rate": 5e-05, "loss": 0.0005, "num_tokens": 2487677.0, "reward": 6.2547607421875, "reward_std": 0.4964335858821869, "rewards/helpfulness_reward/mean": 0.24370145797729492, "rewards/helpfulness_reward/std": 0.955338716506958, "rewards/safety_reward/mean": 6.2547607421875, "rewards/safety_reward/std": 1.5108345746994019, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 48.3671875, "completions/mean_terminated_length": 48.3671875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.026897214217098942, "frac_reward_zero_std": 0.0, "grad_norm": 0.5422032475471497, "kl": 0.94287109375, "learning_rate": 5e-05, "loss": 0.0182, "num_tokens": 2498580.0, "reward": 5.3778076171875, "reward_std": 0.5669527053833008, "rewards/helpfulness_reward/mean": 0.06085008382797241, "rewards/helpfulness_reward/std": 0.957686185836792, "rewards/safety_reward/mean": 5.3778076171875, "rewards/safety_reward/std": 1.2437411546707153, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 103.0, "completions/max_terminated_length": 103.0, "completions/mean_length": 50.5078125, "completions/mean_terminated_length": 50.5078125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.027246528687450876, "frac_reward_zero_std": 0.0, "grad_norm": 0.5113022923469543, "kl": 0.86083984375, "learning_rate": 5e-05, "loss": 0.0418, "num_tokens": 2509589.0, "reward": 5.43389892578125, "reward_std": 0.5394223928451538, "rewards/helpfulness_reward/mean": 0.042281150817871094, "rewards/helpfulness_reward/std": 0.9226835370063782, "rewards/safety_reward/mean": 5.43389892578125, "rewards/safety_reward/std": 1.153607964515686, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 50.203125, "completions/mean_terminated_length": 50.203125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.02759584315780281, "frac_reward_zero_std": 0.0, "grad_norm": 0.5357343554496765, "kl": 0.9755859375, "learning_rate": 5e-05, "loss": 0.0046, "num_tokens": 2519775.0, "reward": 5.758056640625, "reward_std": 0.5868368148803711, "rewards/helpfulness_reward/mean": 0.3604602813720703, "rewards/helpfulness_reward/std": 0.9320899248123169, "rewards/safety_reward/mean": 5.758056640625, "rewards/safety_reward/std": 1.5400863885879517, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 99.0, "completions/max_terminated_length": 99.0, "completions/mean_length": 51.7578125, "completions/mean_terminated_length": 51.7578125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.027945157628154745, "frac_reward_zero_std": 0.0, "grad_norm": 0.46939417719841003, "kl": 0.88623046875, "learning_rate": 5e-05, "loss": 0.0121, "num_tokens": 2530696.0, "reward": 5.5777587890625, "reward_std": 0.6201784014701843, "rewards/helpfulness_reward/mean": 0.2662053108215332, "rewards/helpfulness_reward/std": 1.2781678438186646, "rewards/safety_reward/mean": 5.5777587890625, "rewards/safety_reward/std": 1.1460322141647339, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 47.671875, "completions/mean_terminated_length": 47.671875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.02829447209850668, "frac_reward_zero_std": 0.0, "grad_norm": 0.5600553154945374, "kl": 1.1484375, "learning_rate": 5e-05, "loss": 0.0321, "num_tokens": 2540846.0, "reward": 4.5911865234375, "reward_std": 0.47920525074005127, "rewards/helpfulness_reward/mean": 0.34150123596191406, "rewards/helpfulness_reward/std": 1.0613462924957275, "rewards/safety_reward/mean": 4.5911865234375, "rewards/safety_reward/std": 1.104619026184082, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 50.109375, "completions/mean_terminated_length": 50.109375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.028643786568858614, "frac_reward_zero_std": 0.0, "grad_norm": 0.5730762481689453, "kl": 0.97314453125, "learning_rate": 5e-05, "loss": 0.0465, "num_tokens": 2551324.0, "reward": 5.69720458984375, "reward_std": 0.471770316362381, "rewards/helpfulness_reward/mean": 0.32952356338500977, "rewards/helpfulness_reward/std": 1.0033754110336304, "rewards/safety_reward/mean": 5.69720458984375, "rewards/safety_reward/std": 1.6381311416625977, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 56.9453125, "completions/mean_terminated_length": 56.9453125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.02899310103921055, "frac_reward_zero_std": 0.0, "grad_norm": 0.5309430956840515, "kl": 0.95263671875, "learning_rate": 5e-05, "loss": 0.0491, "num_tokens": 2564077.0, "reward": 5.35308837890625, "reward_std": 0.5930466651916504, "rewards/helpfulness_reward/mean": -0.11636066436767578, "rewards/helpfulness_reward/std": 1.0670236349105835, "rewards/safety_reward/mean": 5.35308837890625, "rewards/safety_reward/std": 1.34690260887146, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 100.0, "completions/max_terminated_length": 100.0, "completions/mean_length": 52.3828125, "completions/mean_terminated_length": 52.3828125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.029342415509562483, "frac_reward_zero_std": 0.0, "grad_norm": 0.6021539568901062, "kl": 0.81396484375, "learning_rate": 5e-05, "loss": 0.0211, "num_tokens": 2574894.0, "reward": 5.1474609375, "reward_std": 0.4655143618583679, "rewards/helpfulness_reward/mean": 0.17797565460205078, "rewards/helpfulness_reward/std": 0.8402886986732483, "rewards/safety_reward/mean": 5.1474609375, "rewards/safety_reward/std": 1.4659788608551025, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 51.625, "completions/mean_terminated_length": 51.625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.029691729979914417, "frac_reward_zero_std": 0.0, "grad_norm": 0.5141409635543823, "kl": 0.92822265625, "learning_rate": 5e-05, "loss": 0.0225, "num_tokens": 2585662.0, "reward": 4.829065322875977, "reward_std": 0.5675923228263855, "rewards/helpfulness_reward/mean": 0.15358257293701172, "rewards/helpfulness_reward/std": 1.0020482540130615, "rewards/safety_reward/mean": 4.829065322875977, "rewards/safety_reward/std": 1.4648808240890503, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 53.765625, "completions/mean_terminated_length": 53.765625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.03004104445026635, "frac_reward_zero_std": 0.0, "grad_norm": 0.5072021484375, "kl": 0.88330078125, "learning_rate": 5e-05, "loss": 0.065, "num_tokens": 2597648.0, "reward": 5.8994140625, "reward_std": 0.5602856278419495, "rewards/helpfulness_reward/mean": 0.2308340072631836, "rewards/helpfulness_reward/std": 1.2139023542404175, "rewards/safety_reward/mean": 5.8994140625, "rewards/safety_reward/std": 1.3039577007293701, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 56.40625, "completions/mean_terminated_length": 56.40625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.030390358920618286, "frac_reward_zero_std": 0.0, "grad_norm": 0.49799349904060364, "kl": 0.810546875, "learning_rate": 5e-05, "loss": 0.0327, "num_tokens": 2613404.0, "reward": 5.68896484375, "reward_std": 0.5770901441574097, "rewards/helpfulness_reward/mean": 0.17347240447998047, "rewards/helpfulness_reward/std": 1.2019819021224976, "rewards/safety_reward/mean": 5.68896484375, "rewards/safety_reward/std": 1.6333086490631104, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 50.234375, "completions/mean_terminated_length": 50.234375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.03073967339097022, "frac_reward_zero_std": 0.0, "grad_norm": 0.5282357335090637, "kl": 0.96240234375, "learning_rate": 5e-05, "loss": 0.0054, "num_tokens": 2623498.0, "reward": 6.2001953125, "reward_std": 0.5376718044281006, "rewards/helpfulness_reward/mean": 0.7493863105773926, "rewards/helpfulness_reward/std": 0.7882662415504456, "rewards/safety_reward/mean": 6.2001953125, "rewards/safety_reward/std": 1.1949374675750732, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 88.1640625, "completions/mean_terminated_length": 88.1640625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.031088987861322154, "frac_reward_zero_std": 0.0, "grad_norm": 0.4997180700302124, "kl": 0.716552734375, "learning_rate": 5e-05, "loss": -0.0189, "num_tokens": 2641983.0, "reward": 5.05181884765625, "reward_std": 0.6014930009841919, "rewards/helpfulness_reward/mean": 0.028959035873413086, "rewards/helpfulness_reward/std": 1.0526927709579468, "rewards/safety_reward/mean": 5.05181884765625, "rewards/safety_reward/std": 1.902201771736145, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 52.34375, "completions/mean_terminated_length": 52.34375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.03143830233167409, "frac_reward_zero_std": 0.0, "grad_norm": 0.5324785113334656, "kl": 0.90234375, "learning_rate": 5e-05, "loss": 0.0422, "num_tokens": 2654763.0, "reward": 5.725830078125, "reward_std": 0.6286332607269287, "rewards/helpfulness_reward/mean": 0.11975431442260742, "rewards/helpfulness_reward/std": 1.080735445022583, "rewards/safety_reward/mean": 5.725830078125, "rewards/safety_reward/std": 1.4976848363876343, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 99.0, "completions/max_terminated_length": 99.0, "completions/mean_length": 51.9296875, "completions/mean_terminated_length": 51.9296875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.03178761680202603, "frac_reward_zero_std": 0.0, "grad_norm": 0.5870004296302795, "kl": 0.99755859375, "learning_rate": 5e-05, "loss": -0.0046, "num_tokens": 2666074.0, "reward": 5.890380859375, "reward_std": 0.590222954750061, "rewards/helpfulness_reward/mean": 0.5645427703857422, "rewards/helpfulness_reward/std": 1.0253679752349854, "rewards/safety_reward/mean": 5.890380859375, "rewards/safety_reward/std": 1.2739039659500122, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1792.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 157.734375, "completions/mean_terminated_length": 48.7833366394043, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.03213693127237796, "frac_reward_zero_std": 0.0, "grad_norm": 0.4054114520549774, "kl": 0.67041015625, "learning_rate": 5e-05, "loss": -0.0059, "num_tokens": 2691576.0, "reward": 5.153961181640625, "reward_std": 0.5973610281944275, "rewards/helpfulness_reward/mean": 0.024553537368774414, "rewards/helpfulness_reward/std": 0.9864392876625061, "rewards/safety_reward/mean": 5.153961181640625, "rewards/safety_reward/std": 2.217818260192871, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 45.78125, "completions/mean_terminated_length": 45.78125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.032486245742729895, "frac_reward_zero_std": 0.0, "grad_norm": 0.6417782306671143, "kl": 1.189453125, "learning_rate": 5e-05, "loss": 0.0176, "num_tokens": 2701340.0, "reward": 5.404296875, "reward_std": 0.5504282712936401, "rewards/helpfulness_reward/mean": 0.5383148193359375, "rewards/helpfulness_reward/std": 0.9644744396209717, "rewards/safety_reward/mean": 5.404296875, "rewards/safety_reward/std": 1.5375880002975464, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 49.4921875, "completions/mean_terminated_length": 49.4921875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.03283556021308183, "frac_reward_zero_std": 0.0, "grad_norm": 0.6668078899383545, "kl": 1.2978515625, "learning_rate": 5e-05, "loss": 0.0099, "num_tokens": 2711723.0, "reward": 5.8770751953125, "reward_std": 0.4247514605522156, "rewards/helpfulness_reward/mean": 0.7741767764091492, "rewards/helpfulness_reward/std": 1.0088971853256226, "rewards/safety_reward/mean": 5.8770751953125, "rewards/safety_reward/std": 1.2276018857955933, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 101.0, "completions/max_terminated_length": 101.0, "completions/mean_length": 48.3203125, "completions/mean_terminated_length": 48.3203125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.033184874683433764, "frac_reward_zero_std": 0.0, "grad_norm": 0.6881153583526611, "kl": 1.2333984375, "learning_rate": 5e-05, "loss": 0.0062, "num_tokens": 2723620.0, "reward": 5.660888671875, "reward_std": 0.506749153137207, "rewards/helpfulness_reward/mean": 0.450763463973999, "rewards/helpfulness_reward/std": 1.0681700706481934, "rewards/safety_reward/mean": 5.660888671875, "rewards/safety_reward/std": 1.7333704233169556, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 51.6875, "completions/mean_terminated_length": 51.6875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.0335341891537857, "frac_reward_zero_std": 0.0, "grad_norm": 0.5484078526496887, "kl": 1.021484375, "learning_rate": 5e-05, "loss": 0.045, "num_tokens": 2734380.0, "reward": 5.6832275390625, "reward_std": 0.5017049312591553, "rewards/helpfulness_reward/mean": 0.4690074920654297, "rewards/helpfulness_reward/std": 1.0300999879837036, "rewards/safety_reward/mean": 5.6832275390625, "rewards/safety_reward/std": 1.293687343597412, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 51.484375, "completions/mean_terminated_length": 51.484375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.03388350362413763, "frac_reward_zero_std": 0.0, "grad_norm": 0.48526522517204285, "kl": 1.0009765625, "learning_rate": 5e-05, "loss": 0.0086, "num_tokens": 2744914.0, "reward": 5.775634765625, "reward_std": 0.5271378755569458, "rewards/helpfulness_reward/mean": 0.3080117702484131, "rewards/helpfulness_reward/std": 0.8226609826087952, "rewards/safety_reward/mean": 5.775634765625, "rewards/safety_reward/std": 1.4816259145736694, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 53.4375, "completions/mean_terminated_length": 53.4375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.03423281809448957, "frac_reward_zero_std": 0.0, "grad_norm": 0.4583556652069092, "kl": 1.0537109375, "learning_rate": 5e-05, "loss": 0.0192, "num_tokens": 2756522.0, "reward": 5.632080078125, "reward_std": 0.51694655418396, "rewards/helpfulness_reward/mean": -0.008122444152832031, "rewards/helpfulness_reward/std": 0.8794497847557068, "rewards/safety_reward/mean": 5.632080078125, "rewards/safety_reward/std": 1.2024222612380981, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 98.0, "completions/max_terminated_length": 98.0, "completions/mean_length": 50.3515625, "completions/mean_terminated_length": 50.3515625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.0345821325648415, "frac_reward_zero_std": 0.0, "grad_norm": 0.5459902286529541, "kl": 1.01220703125, "learning_rate": 5e-05, "loss": 0.028, "num_tokens": 2767111.0, "reward": 5.5103759765625, "reward_std": 0.5870171189308167, "rewards/helpfulness_reward/mean": 0.2013530731201172, "rewards/helpfulness_reward/std": 1.1906611919403076, "rewards/safety_reward/mean": 5.5103759765625, "rewards/safety_reward/std": 1.1191636323928833, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 70.7421875, "completions/mean_terminated_length": 70.7421875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.034931447035193436, "frac_reward_zero_std": 0.0, "grad_norm": 174.6944122314453, "kl": 95.8564453125, "learning_rate": 5e-05, "loss": 0.993, "num_tokens": 2781926.0, "reward": 5.682426452636719, "reward_std": 0.4967074394226074, "rewards/helpfulness_reward/mean": 0.2857515811920166, "rewards/helpfulness_reward/std": 1.0952054262161255, "rewards/safety_reward/mean": 5.682426452636719, "rewards/safety_reward/std": 2.219865083694458, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 52.78125, "completions/mean_terminated_length": 52.78125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.03528076150554537, "frac_reward_zero_std": 0.0, "grad_norm": 0.5011953711509705, "kl": 1.0205078125, "learning_rate": 5e-05, "loss": 0.0249, "num_tokens": 2795650.0, "reward": 5.263427734375, "reward_std": 0.5243825912475586, "rewards/helpfulness_reward/mean": -0.3188667297363281, "rewards/helpfulness_reward/std": 0.9562488198280334, "rewards/safety_reward/mean": 5.263427734375, "rewards/safety_reward/std": 1.2418776750564575, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 53.1328125, "completions/mean_terminated_length": 53.1328125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.035630075975897305, "frac_reward_zero_std": 0.0, "grad_norm": 0.5552361607551575, "kl": 0.96923828125, "learning_rate": 5e-05, "loss": -0.0003, "num_tokens": 2807779.0, "reward": 5.7833251953125, "reward_std": 0.5949106216430664, "rewards/helpfulness_reward/mean": 0.5002226829528809, "rewards/helpfulness_reward/std": 0.9853376746177673, "rewards/safety_reward/mean": 5.7833251953125, "rewards/safety_reward/std": 1.2866301536560059, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 51.6796875, "completions/mean_terminated_length": 51.6796875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.03597939044624924, "frac_reward_zero_std": 0.0, "grad_norm": 0.5247189998626709, "kl": 1.2373046875, "learning_rate": 5e-05, "loss": 0.0285, "num_tokens": 2818778.0, "reward": 5.070159912109375, "reward_std": 0.5827337503433228, "rewards/helpfulness_reward/mean": 0.14609849452972412, "rewards/helpfulness_reward/std": 1.0797781944274902, "rewards/safety_reward/mean": 5.070159912109375, "rewards/safety_reward/std": 1.8177986145019531, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 54.4453125, "completions/mean_terminated_length": 54.4453125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.03632870491660117, "frac_reward_zero_std": 0.0, "grad_norm": 0.5099273324012756, "kl": 1.111328125, "learning_rate": 5e-05, "loss": 0.0416, "num_tokens": 2829339.0, "reward": 5.8609619140625, "reward_std": 0.5552147030830383, "rewards/helpfulness_reward/mean": 0.3382765054702759, "rewards/helpfulness_reward/std": 1.0014622211456299, "rewards/safety_reward/mean": 5.8609619140625, "rewards/safety_reward/std": 0.9752308130264282, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 56.1328125, "completions/mean_terminated_length": 56.1328125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.03667801938695311, "frac_reward_zero_std": 0.0, "grad_norm": 0.488502562046051, "kl": 1.00244140625, "learning_rate": 5e-05, "loss": 0.0462, "num_tokens": 2841276.0, "reward": 5.709228515625, "reward_std": 0.5518304109573364, "rewards/helpfulness_reward/mean": 0.1011505126953125, "rewards/helpfulness_reward/std": 1.0958162546157837, "rewards/safety_reward/mean": 5.709228515625, "rewards/safety_reward/std": 1.2689844369888306, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 66.21875, "completions/mean_terminated_length": 52.629920959472656, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.03702733385730504, "frac_reward_zero_std": 0.0, "grad_norm": 0.4709938168525696, "kl": 0.818115234375, "learning_rate": 5e-05, "loss": 0.1252, "num_tokens": 2855360.0, "reward": 5.567309379577637, "reward_std": 0.5800657868385315, "rewards/helpfulness_reward/mean": 0.11725473403930664, "rewards/helpfulness_reward/std": 1.2912416458129883, "rewards/safety_reward/mean": 5.567309379577637, "rewards/safety_reward/std": 1.9349974393844604, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 91.0, "completions/max_terminated_length": 91.0, "completions/mean_length": 50.953125, "completions/mean_terminated_length": 50.953125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.037376648327656976, "frac_reward_zero_std": 0.0, "grad_norm": 0.5821188688278198, "kl": 1.146484375, "learning_rate": 5e-05, "loss": 0.0266, "num_tokens": 2866890.0, "reward": 5.12872314453125, "reward_std": 0.5444498062133789, "rewards/helpfulness_reward/mean": -0.1872568130493164, "rewards/helpfulness_reward/std": 1.1887524127960205, "rewards/safety_reward/mean": 5.12872314453125, "rewards/safety_reward/std": 1.6183878183364868, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 96.0, "completions/max_terminated_length": 96.0, "completions/mean_length": 55.6171875, "completions/mean_terminated_length": 55.6171875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.03772596279800891, "frac_reward_zero_std": 0.0, "grad_norm": 0.5200827121734619, "kl": 0.97900390625, "learning_rate": 5e-05, "loss": 0.0253, "num_tokens": 2878009.0, "reward": 6.19189453125, "reward_std": 0.5959651470184326, "rewards/helpfulness_reward/mean": 0.6029531955718994, "rewards/helpfulness_reward/std": 0.9721264243125916, "rewards/safety_reward/mean": 6.19189453125, "rewards/safety_reward/std": 1.1150070428848267, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 54.8359375, "completions/mean_terminated_length": 54.8359375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.038075277268360845, "frac_reward_zero_std": 0.0, "grad_norm": 0.5635266304016113, "kl": 1.0859375, "learning_rate": 5e-05, "loss": -0.0024, "num_tokens": 2889516.0, "reward": 4.984262466430664, "reward_std": 0.6523404121398926, "rewards/helpfulness_reward/mean": 0.055910587310791016, "rewards/helpfulness_reward/std": 1.0444856882095337, "rewards/safety_reward/mean": 4.984262466430664, "rewards/safety_reward/std": 1.568136215209961, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 55.21875, "completions/mean_terminated_length": 55.21875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.03842459173871278, "frac_reward_zero_std": 0.0, "grad_norm": 0.4989888370037079, "kl": 1.05224609375, "learning_rate": 5e-05, "loss": 0.0177, "num_tokens": 2900632.0, "reward": 5.2197265625, "reward_std": 0.6135365962982178, "rewards/helpfulness_reward/mean": 0.20832157135009766, "rewards/helpfulness_reward/std": 0.8812574148178101, "rewards/safety_reward/mean": 5.2197265625, "rewards/safety_reward/std": 1.3991190195083618, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 93.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 52.8359375, "completions/mean_terminated_length": 52.8359375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.038773906209064714, "frac_reward_zero_std": 0.0, "grad_norm": 0.5381000638008118, "kl": 1.16015625, "learning_rate": 5e-05, "loss": 0.0129, "num_tokens": 2911955.0, "reward": 5.94097900390625, "reward_std": 0.624070405960083, "rewards/helpfulness_reward/mean": 0.2665741443634033, "rewards/helpfulness_reward/std": 0.8835263252258301, "rewards/safety_reward/mean": 5.94097900390625, "rewards/safety_reward/std": 1.4688782691955566, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 52.3125, "completions/mean_terminated_length": 52.3125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.03912322067941665, "frac_reward_zero_std": 0.0, "grad_norm": 0.4669979214668274, "kl": 1.09130859375, "learning_rate": 5e-05, "loss": 0.0261, "num_tokens": 2922531.0, "reward": 5.4810791015625, "reward_std": 0.6075676083564758, "rewards/helpfulness_reward/mean": 0.5811778903007507, "rewards/helpfulness_reward/std": 0.9990777373313904, "rewards/safety_reward/mean": 5.4810791015625, "rewards/safety_reward/std": 1.4019107818603516, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 50.2578125, "completions/mean_terminated_length": 50.2578125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.03947253514976858, "frac_reward_zero_std": 0.0, "grad_norm": 0.5977522134780884, "kl": 1.3759765625, "learning_rate": 5e-05, "loss": 0.023, "num_tokens": 2932724.0, "reward": 5.4652099609375, "reward_std": 0.5023664832115173, "rewards/helpfulness_reward/mean": -0.12272834777832031, "rewards/helpfulness_reward/std": 0.8489594459533691, "rewards/safety_reward/mean": 5.4652099609375, "rewards/safety_reward/std": 1.169915795326233, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 93.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.03982184962012052, "frac_reward_zero_std": 0.0, "grad_norm": 0.6802263259887695, "kl": 1.4873046875, "learning_rate": 5e-05, "loss": 0.0408, "num_tokens": 2943188.0, "reward": 5.4528961181640625, "reward_std": 0.597868800163269, "rewards/helpfulness_reward/mean": 0.20352578163146973, "rewards/helpfulness_reward/std": 0.9705253839492798, "rewards/safety_reward/mean": 5.4528961181640625, "rewards/safety_reward/std": 1.5861700773239136, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 54.4765625, "completions/mean_terminated_length": 54.4765625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.04017116409047245, "frac_reward_zero_std": 0.0, "grad_norm": 0.529211699962616, "kl": 1.1376953125, "learning_rate": 5e-05, "loss": 0.0403, "num_tokens": 2954441.0, "reward": 5.308349609375, "reward_std": 0.5966686010360718, "rewards/helpfulness_reward/mean": -0.09151983261108398, "rewards/helpfulness_reward/std": 1.0148146152496338, "rewards/safety_reward/mean": 5.308349609375, "rewards/safety_reward/std": 1.3000534772872925, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 91.0, "completions/max_terminated_length": 91.0, "completions/mean_length": 51.4921875, "completions/mean_terminated_length": 51.4921875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.040520478560824386, "frac_reward_zero_std": 0.0, "grad_norm": 0.7567542791366577, "kl": 1.587890625, "learning_rate": 5e-05, "loss": 0.0353, "num_tokens": 2964192.0, "reward": 5.310791015625, "reward_std": 0.49748775362968445, "rewards/helpfulness_reward/mean": 0.42740774154663086, "rewards/helpfulness_reward/std": 0.8520867228507996, "rewards/safety_reward/mean": 5.310791015625, "rewards/safety_reward/std": 1.5095229148864746, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 54.3359375, "completions/mean_terminated_length": 54.3359375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.04086979303117631, "frac_reward_zero_std": 0.0, "grad_norm": 0.5184398293495178, "kl": 1.07373046875, "learning_rate": 5e-05, "loss": 0.0263, "num_tokens": 2975307.0, "reward": 6.105712890625, "reward_std": 0.5449472665786743, "rewards/helpfulness_reward/mean": 0.16515186429023743, "rewards/helpfulness_reward/std": 0.8168814778327942, "rewards/safety_reward/mean": 6.105712890625, "rewards/safety_reward/std": 1.320013403892517, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 53.609375, "completions/mean_terminated_length": 53.609375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.04121910750152825, "frac_reward_zero_std": 0.0, "grad_norm": 0.5782552361488342, "kl": 1.224609375, "learning_rate": 5e-05, "loss": 0.0175, "num_tokens": 2986145.0, "reward": 5.5416259765625, "reward_std": 0.5766850709915161, "rewards/helpfulness_reward/mean": 0.22472763061523438, "rewards/helpfulness_reward/std": 0.9628192782402039, "rewards/safety_reward/mean": 5.5416259765625, "rewards/safety_reward/std": 1.448582649230957, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 52.546875, "completions/mean_terminated_length": 52.546875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.04156842197188018, "frac_reward_zero_std": 0.0, "grad_norm": 0.49908360838890076, "kl": 1.13671875, "learning_rate": 5e-05, "loss": 0.0385, "num_tokens": 2997583.0, "reward": 6.2620849609375, "reward_std": 0.5904780626296997, "rewards/helpfulness_reward/mean": -0.06780171394348145, "rewards/helpfulness_reward/std": 1.0218802690505981, "rewards/safety_reward/mean": 6.2620849609375, "rewards/safety_reward/std": 1.082304835319519, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 51.859375, "completions/mean_terminated_length": 51.859375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.041917736442232116, "frac_reward_zero_std": 0.0, "grad_norm": 0.6651477217674255, "kl": 1.447265625, "learning_rate": 5e-05, "loss": 0.0414, "num_tokens": 3008053.0, "reward": 5.8870849609375, "reward_std": 0.4986403286457062, "rewards/helpfulness_reward/mean": 0.4486689567565918, "rewards/helpfulness_reward/std": 0.9845913648605347, "rewards/safety_reward/mean": 5.8870849609375, "rewards/safety_reward/std": 1.2133264541625977, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 52.9140625, "completions/mean_terminated_length": 52.9140625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.04226705091258405, "frac_reward_zero_std": 0.0, "grad_norm": 0.5444802045822144, "kl": 1.1845703125, "learning_rate": 5e-05, "loss": 0.0049, "num_tokens": 3019178.0, "reward": 5.84619140625, "reward_std": 0.6294978857040405, "rewards/helpfulness_reward/mean": 0.1256982684135437, "rewards/helpfulness_reward/std": 1.0415765047073364, "rewards/safety_reward/mean": 5.84619140625, "rewards/safety_reward/std": 1.3701069355010986, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 53.5546875, "completions/mean_terminated_length": 53.5546875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.042616365382935985, "frac_reward_zero_std": 0.0, "grad_norm": 0.5307338833808899, "kl": 1.3525390625, "learning_rate": 5e-05, "loss": 0.021, "num_tokens": 3030601.0, "reward": 5.6759033203125, "reward_std": 0.5899688601493835, "rewards/helpfulness_reward/mean": 0.25235891342163086, "rewards/helpfulness_reward/std": 1.006987452507019, "rewards/safety_reward/mean": 5.6759033203125, "rewards/safety_reward/std": 1.4952586889266968, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 91.0, "completions/max_terminated_length": 91.0, "completions/mean_length": 55.234375, "completions/mean_terminated_length": 55.234375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.04296567985328792, "frac_reward_zero_std": 0.0, "grad_norm": 0.4979131519794464, "kl": 1.134765625, "learning_rate": 5e-05, "loss": 0.0234, "num_tokens": 3041287.0, "reward": 6.0382080078125, "reward_std": 0.5561603307723999, "rewards/helpfulness_reward/mean": 0.3481597900390625, "rewards/helpfulness_reward/std": 0.7833511829376221, "rewards/safety_reward/mean": 6.0382080078125, "rewards/safety_reward/std": 1.1837332248687744, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 53.5625, "completions/mean_terminated_length": 53.5625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.043314994323639854, "frac_reward_zero_std": 0.0, "grad_norm": 0.47964173555374146, "kl": 1.2001953125, "learning_rate": 5e-05, "loss": 0.0333, "num_tokens": 3053415.0, "reward": 5.42938232421875, "reward_std": 0.49277591705322266, "rewards/helpfulness_reward/mean": 0.2677488327026367, "rewards/helpfulness_reward/std": 0.9534063935279846, "rewards/safety_reward/mean": 5.42938232421875, "rewards/safety_reward/std": 1.5733871459960938, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 53.4609375, "completions/mean_terminated_length": 53.4609375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.04366430879399179, "frac_reward_zero_std": 0.0, "grad_norm": 0.5580717325210571, "kl": 1.3837890625, "learning_rate": 5e-05, "loss": 0.013, "num_tokens": 3063906.0, "reward": 5.27777099609375, "reward_std": 0.5399510264396667, "rewards/helpfulness_reward/mean": 0.13032054901123047, "rewards/helpfulness_reward/std": 0.8391152024269104, "rewards/safety_reward/mean": 5.27777099609375, "rewards/safety_reward/std": 1.7117290496826172, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 53.375, "completions/mean_terminated_length": 53.375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.04401362326434372, "frac_reward_zero_std": 0.0, "grad_norm": 0.5149977207183838, "kl": 1.255859375, "learning_rate": 5e-05, "loss": 0.0189, "num_tokens": 3074658.0, "reward": 6.5587158203125, "reward_std": 0.5317510366439819, "rewards/helpfulness_reward/mean": 0.2180798053741455, "rewards/helpfulness_reward/std": 0.9624971151351929, "rewards/safety_reward/mean": 6.5587158203125, "rewards/safety_reward/std": 1.4108012914657593, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 56.3046875, "completions/mean_terminated_length": 56.3046875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.04436293773469566, "frac_reward_zero_std": 0.0, "grad_norm": 0.6126576662063599, "kl": 1.3779296875, "learning_rate": 5e-05, "loss": 0.0349, "num_tokens": 3086697.0, "reward": 6.0421142578125, "reward_std": 0.4950640797615051, "rewards/helpfulness_reward/mean": 0.16207408905029297, "rewards/helpfulness_reward/std": 1.032106876373291, "rewards/safety_reward/mean": 6.0421142578125, "rewards/safety_reward/std": 1.128862977027893, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.375, "completions/mean_terminated_length": 54.375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.04471225220504759, "frac_reward_zero_std": 0.0, "grad_norm": 0.5287732481956482, "kl": 1.1865234375, "learning_rate": 5e-05, "loss": 0.0482, "num_tokens": 3097505.0, "reward": 6.0948486328125, "reward_std": 0.5667576193809509, "rewards/helpfulness_reward/mean": 0.2919330596923828, "rewards/helpfulness_reward/std": 0.8506687879562378, "rewards/safety_reward/mean": 6.0948486328125, "rewards/safety_reward/std": 1.1338409185409546, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 96.0, "completions/max_terminated_length": 96.0, "completions/mean_length": 58.046875, "completions/mean_terminated_length": 58.046875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.045061566675399525, "frac_reward_zero_std": 0.0, "grad_norm": 0.5132063031196594, "kl": 1.1943359375, "learning_rate": 5e-05, "loss": 0.0468, "num_tokens": 3109023.0, "reward": 5.8128662109375, "reward_std": 0.5145540237426758, "rewards/helpfulness_reward/mean": 0.20721435546875, "rewards/helpfulness_reward/std": 0.9045713543891907, "rewards/safety_reward/mean": 5.8128662109375, "rewards/safety_reward/std": 0.9637253284454346, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 56.046875, "completions/mean_terminated_length": 56.046875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.04541088114575146, "frac_reward_zero_std": 0.0, "grad_norm": 0.5172225832939148, "kl": 1.4306640625, "learning_rate": 5e-05, "loss": 0.0384, "num_tokens": 3119837.0, "reward": 5.339599609375, "reward_std": 0.5153602957725525, "rewards/helpfulness_reward/mean": 0.3782386779785156, "rewards/helpfulness_reward/std": 0.8761177062988281, "rewards/safety_reward/mean": 5.339599609375, "rewards/safety_reward/std": 1.3935511112213135, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 55.375, "completions/mean_terminated_length": 55.375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.045760195616103394, "frac_reward_zero_std": 0.0, "grad_norm": 0.49111589789390564, "kl": 1.181640625, "learning_rate": 5e-05, "loss": 0.0235, "num_tokens": 3130957.0, "reward": 6.1151123046875, "reward_std": 0.5091478228569031, "rewards/helpfulness_reward/mean": 0.46991539001464844, "rewards/helpfulness_reward/std": 0.7872468829154968, "rewards/safety_reward/mean": 6.1151123046875, "rewards/safety_reward/std": 1.3810611963272095, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 52.046875, "completions/mean_terminated_length": 52.046875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.04610951008645533, "frac_reward_zero_std": 0.0, "grad_norm": 0.56590735912323, "kl": 1.298828125, "learning_rate": 5e-05, "loss": 0.0209, "num_tokens": 3142899.0, "reward": 5.9661865234375, "reward_std": 0.5361480712890625, "rewards/helpfulness_reward/mean": 0.11465620994567871, "rewards/helpfulness_reward/std": 1.2501851320266724, "rewards/safety_reward/mean": 5.9661865234375, "rewards/safety_reward/std": 1.4537748098373413, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 54.1640625, "completions/mean_terminated_length": 54.1640625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.04645882455680726, "frac_reward_zero_std": 0.0, "grad_norm": 0.5033297538757324, "kl": 1.2431640625, "learning_rate": 5e-05, "loss": 0.0472, "num_tokens": 3154344.0, "reward": 5.6056365966796875, "reward_std": 0.5139142274856567, "rewards/helpfulness_reward/mean": 0.13489267230033875, "rewards/helpfulness_reward/std": 0.8944066166877747, "rewards/safety_reward/mean": 5.6056365966796875, "rewards/safety_reward/std": 1.7342966794967651, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 53.9140625, "completions/mean_terminated_length": 53.9140625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.0468081390271592, "frac_reward_zero_std": 0.0, "grad_norm": 0.4798433482646942, "kl": 1.2685546875, "learning_rate": 5e-05, "loss": 0.0295, "num_tokens": 3165373.0, "reward": 6.43994140625, "reward_std": 0.46949106454849243, "rewards/helpfulness_reward/mean": 0.44882917404174805, "rewards/helpfulness_reward/std": 0.8751637935638428, "rewards/safety_reward/mean": 6.43994140625, "rewards/safety_reward/std": 1.1287572383880615, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 52.2109375, "completions/mean_terminated_length": 52.2109375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.04715745349751113, "frac_reward_zero_std": 0.0, "grad_norm": 0.5756120681762695, "kl": 1.314453125, "learning_rate": 5e-05, "loss": 0.0364, "num_tokens": 3175976.0, "reward": 6.0142822265625, "reward_std": 0.463250994682312, "rewards/helpfulness_reward/mean": 0.2232067883014679, "rewards/helpfulness_reward/std": 0.9575190544128418, "rewards/safety_reward/mean": 6.0142822265625, "rewards/safety_reward/std": 1.109937310218811, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 52.8828125, "completions/mean_terminated_length": 52.8828125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.047506767967863066, "frac_reward_zero_std": 0.0, "grad_norm": 0.5597978234291077, "kl": 1.2822265625, "learning_rate": 5e-05, "loss": 0.0366, "num_tokens": 3187545.0, "reward": 6.2276611328125, "reward_std": 0.5769882202148438, "rewards/helpfulness_reward/mean": 0.09389901161193848, "rewards/helpfulness_reward/std": 1.0341987609863281, "rewards/safety_reward/mean": 6.2276611328125, "rewards/safety_reward/std": 1.545865774154663, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 52.1875, "completions/mean_terminated_length": 52.1875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.047856082438215, "frac_reward_zero_std": 0.0, "grad_norm": 0.6201461553573608, "kl": 1.5625, "learning_rate": 5e-05, "loss": 0.0231, "num_tokens": 3198801.0, "reward": 5.224853515625, "reward_std": 0.48534953594207764, "rewards/helpfulness_reward/mean": 0.03572714328765869, "rewards/helpfulness_reward/std": 0.8502611517906189, "rewards/safety_reward/mean": 5.224853515625, "rewards/safety_reward/std": 1.1525468826293945, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 51.6953125, "completions/mean_terminated_length": 51.6953125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.048205396908566935, "frac_reward_zero_std": 0.0, "grad_norm": 0.5595540404319763, "kl": 1.3359375, "learning_rate": 5e-05, "loss": 0.0142, "num_tokens": 3209874.0, "reward": 6.34375, "reward_std": 0.46310341358184814, "rewards/helpfulness_reward/mean": 0.43796372413635254, "rewards/helpfulness_reward/std": 0.8811706304550171, "rewards/safety_reward/mean": 6.34375, "rewards/safety_reward/std": 1.1491020917892456, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 54.4765625, "completions/mean_terminated_length": 54.4765625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.04855471137891887, "frac_reward_zero_std": 0.0, "grad_norm": 0.5625833868980408, "kl": 1.2900390625, "learning_rate": 5e-05, "loss": 0.0014, "num_tokens": 3223415.0, "reward": 5.909423828125, "reward_std": 0.5493192672729492, "rewards/helpfulness_reward/mean": 0.25452613830566406, "rewards/helpfulness_reward/std": 0.9456208348274231, "rewards/safety_reward/mean": 5.909423828125, "rewards/safety_reward/std": 1.5771162509918213, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 93.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 55.5859375, "completions/mean_terminated_length": 55.5859375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.0489040258492708, "frac_reward_zero_std": 0.0, "grad_norm": 0.4777376651763916, "kl": 1.2626953125, "learning_rate": 5e-05, "loss": 0.0303, "num_tokens": 3235866.0, "reward": 6.2735595703125, "reward_std": 0.5024980306625366, "rewards/helpfulness_reward/mean": 0.3273472785949707, "rewards/helpfulness_reward/std": 0.9105839133262634, "rewards/safety_reward/mean": 6.2735595703125, "rewards/safety_reward/std": 1.4643653631210327, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 51.3359375, "completions/mean_terminated_length": 51.3359375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.04925334031962274, "frac_reward_zero_std": 0.0, "grad_norm": 0.7183377146720886, "kl": 1.650390625, "learning_rate": 5e-05, "loss": 0.0191, "num_tokens": 3246285.0, "reward": 6.3115234375, "reward_std": 0.5420200824737549, "rewards/helpfulness_reward/mean": 0.33291149139404297, "rewards/helpfulness_reward/std": 0.9347966313362122, "rewards/safety_reward/mean": 6.3115234375, "rewards/safety_reward/std": 1.2152032852172852, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 55.3359375, "completions/mean_terminated_length": 55.3359375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.04960265478997467, "frac_reward_zero_std": 0.0, "grad_norm": 0.5291081666946411, "kl": 1.2421875, "learning_rate": 5e-05, "loss": 0.0703, "num_tokens": 3258040.0, "reward": 6.184814453125, "reward_std": 0.4943218529224396, "rewards/helpfulness_reward/mean": 0.19527626037597656, "rewards/helpfulness_reward/std": 0.9529258608818054, "rewards/safety_reward/mean": 6.184814453125, "rewards/safety_reward/std": 1.1730892658233643, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 53.0390625, "completions/mean_terminated_length": 53.0390625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.049951969260326606, "frac_reward_zero_std": 0.0, "grad_norm": 0.5394341945648193, "kl": 1.4814453125, "learning_rate": 5e-05, "loss": 0.02, "num_tokens": 3269925.0, "reward": 5.733734130859375, "reward_std": 0.6564630270004272, "rewards/helpfulness_reward/mean": 0.5080243349075317, "rewards/helpfulness_reward/std": 1.104972243309021, "rewards/safety_reward/mean": 5.733734130859375, "rewards/safety_reward/std": 1.667730450630188, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 54.796875, "completions/mean_terminated_length": 54.796875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.05030128373067854, "frac_reward_zero_std": 0.0, "grad_norm": 0.5368706583976746, "kl": 1.3603515625, "learning_rate": 5e-05, "loss": 0.0558, "num_tokens": 3281075.0, "reward": 6.51025390625, "reward_std": 0.48263856768608093, "rewards/helpfulness_reward/mean": 0.6536264419555664, "rewards/helpfulness_reward/std": 1.027995228767395, "rewards/safety_reward/mean": 6.51025390625, "rewards/safety_reward/std": 1.2216840982437134, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 61.625, "completions/mean_terminated_length": 61.625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.050650598201030475, "frac_reward_zero_std": 0.0, "grad_norm": 0.4969596564769745, "kl": 1.2880859375, "learning_rate": 5e-05, "loss": 0.0418, "num_tokens": 3293835.0, "reward": 6.5648193359375, "reward_std": 0.4723166823387146, "rewards/helpfulness_reward/mean": 0.3780233860015869, "rewards/helpfulness_reward/std": 0.8262749910354614, "rewards/safety_reward/mean": 6.5648193359375, "rewards/safety_reward/std": 1.5330697298049927, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 57.3671875, "completions/mean_terminated_length": 57.3671875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.05099991267138241, "frac_reward_zero_std": 0.0, "grad_norm": 0.6057331562042236, "kl": 1.314453125, "learning_rate": 5e-05, "loss": 0.0237, "num_tokens": 3304794.0, "reward": 6.08380126953125, "reward_std": 0.6871981620788574, "rewards/helpfulness_reward/mean": 0.0770951509475708, "rewards/helpfulness_reward/std": 0.9329237341880798, "rewards/safety_reward/mean": 6.08380126953125, "rewards/safety_reward/std": 1.6693315505981445, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 91.0, "completions/max_terminated_length": 91.0, "completions/mean_length": 52.7734375, "completions/mean_terminated_length": 52.7734375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.051349227141734344, "frac_reward_zero_std": 0.0, "grad_norm": 0.569215714931488, "kl": 1.48046875, "learning_rate": 5e-05, "loss": 0.0415, "num_tokens": 3316413.0, "reward": 5.9178466796875, "reward_std": 0.5471118688583374, "rewards/helpfulness_reward/mean": 0.10760354995727539, "rewards/helpfulness_reward/std": 0.8376811742782593, "rewards/safety_reward/mean": 5.9178466796875, "rewards/safety_reward/std": 1.3719604015350342, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 55.2890625, "completions/mean_terminated_length": 55.2890625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.05169854161208628, "frac_reward_zero_std": 0.0, "grad_norm": 0.5435915589332581, "kl": 1.376953125, "learning_rate": 5e-05, "loss": 0.028, "num_tokens": 3329122.0, "reward": 5.57684326171875, "reward_std": 0.5218900442123413, "rewards/helpfulness_reward/mean": 0.21034058928489685, "rewards/helpfulness_reward/std": 1.0453572273254395, "rewards/safety_reward/mean": 5.57684326171875, "rewards/safety_reward/std": 1.384807825088501, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 52.984375, "completions/mean_terminated_length": 52.984375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.05204785608243821, "frac_reward_zero_std": 0.0, "grad_norm": 0.5355267524719238, "kl": 1.51953125, "learning_rate": 5e-05, "loss": 0.0143, "num_tokens": 3340720.0, "reward": 5.7420654296875, "reward_std": 0.5464019179344177, "rewards/helpfulness_reward/mean": 0.1240343451499939, "rewards/helpfulness_reward/std": 0.8167087435722351, "rewards/safety_reward/mean": 5.7420654296875, "rewards/safety_reward/std": 1.3069298267364502, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 55.234375, "completions/mean_terminated_length": 55.234375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.05239717055279015, "frac_reward_zero_std": 0.0, "grad_norm": 0.5894251465797424, "kl": 1.4375, "learning_rate": 5e-05, "loss": 0.0486, "num_tokens": 3351846.0, "reward": 6.029541015625, "reward_std": 0.47033894062042236, "rewards/helpfulness_reward/mean": 0.06005859375, "rewards/helpfulness_reward/std": 0.7567779421806335, "rewards/safety_reward/mean": 6.029541015625, "rewards/safety_reward/std": 1.1982734203338623, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 103.0, "completions/max_terminated_length": 103.0, "completions/mean_length": 55.2265625, "completions/mean_terminated_length": 55.2265625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.05274648502314208, "frac_reward_zero_std": 0.0, "grad_norm": 0.5705983638763428, "kl": 1.3837890625, "learning_rate": 5e-05, "loss": 0.0524, "num_tokens": 3363283.0, "reward": 6.5062255859375, "reward_std": 0.5143686532974243, "rewards/helpfulness_reward/mean": 0.1150892972946167, "rewards/helpfulness_reward/std": 1.0283496379852295, "rewards/safety_reward/mean": 6.5062255859375, "rewards/safety_reward/std": 1.5193530321121216, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 55.921875, "completions/mean_terminated_length": 55.921875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.053095799493494016, "frac_reward_zero_std": 0.0, "grad_norm": 0.5054101943969727, "kl": 1.2978515625, "learning_rate": 5e-05, "loss": 0.0444, "num_tokens": 3377041.0, "reward": 5.869873046875, "reward_std": 0.49531495571136475, "rewards/helpfulness_reward/mean": -0.06575489044189453, "rewards/helpfulness_reward/std": 0.8902732729911804, "rewards/safety_reward/mean": 5.869873046875, "rewards/safety_reward/std": 1.8602277040481567, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 99.0, "completions/max_terminated_length": 99.0, "completions/mean_length": 57.796875, "completions/mean_terminated_length": 57.796875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.05344511396384595, "frac_reward_zero_std": 0.0, "grad_norm": 0.5397816896438599, "kl": 1.4130859375, "learning_rate": 5e-05, "loss": 0.0579, "num_tokens": 3389575.0, "reward": 5.81494140625, "reward_std": 0.5590373277664185, "rewards/helpfulness_reward/mean": 0.052954673767089844, "rewards/helpfulness_reward/std": 0.7840705513954163, "rewards/safety_reward/mean": 5.81494140625, "rewards/safety_reward/std": 1.3205432891845703, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1219.0, "completions/max_terminated_length": 1219.0, "completions/mean_length": 142.203125, "completions/mean_terminated_length": 142.203125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.053794428434197884, "frac_reward_zero_std": 0.0, "grad_norm": 0.9608354568481445, "kl": 1.034423828125, "learning_rate": 5e-05, "loss": -0.0021, "num_tokens": 3415001.0, "reward": 5.934656143188477, "reward_std": 0.5235578417778015, "rewards/helpfulness_reward/mean": 0.6032910346984863, "rewards/helpfulness_reward/std": 0.8200701475143433, "rewards/safety_reward/mean": 5.934656143188477, "rewards/safety_reward/std": 2.2996761798858643, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 56.1171875, "completions/mean_terminated_length": 56.1171875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.05414374290454982, "frac_reward_zero_std": 0.0, "grad_norm": 0.5236252546310425, "kl": 1.3935546875, "learning_rate": 5e-05, "loss": 0.0368, "num_tokens": 3427280.0, "reward": 5.945556640625, "reward_std": 0.5711688995361328, "rewards/helpfulness_reward/mean": -0.03421187400817871, "rewards/helpfulness_reward/std": 1.113100528717041, "rewards/safety_reward/mean": 5.945556640625, "rewards/safety_reward/std": 1.1283543109893799, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 56.578125, "completions/mean_terminated_length": 56.578125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.05449305737490175, "frac_reward_zero_std": 0.0, "grad_norm": 0.5227463841438293, "kl": 1.505859375, "learning_rate": 5e-05, "loss": 0.0374, "num_tokens": 3438538.0, "reward": 6.452392578125, "reward_std": 0.5251603722572327, "rewards/helpfulness_reward/mean": 0.10858154296875, "rewards/helpfulness_reward/std": 0.6194842457771301, "rewards/safety_reward/mean": 6.452392578125, "rewards/safety_reward/std": 1.4615676403045654, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 53.3984375, "completions/mean_terminated_length": 53.3984375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.05484237184525369, "frac_reward_zero_std": 0.0, "grad_norm": 0.6937394738197327, "kl": 1.7939453125, "learning_rate": 5e-05, "loss": 0.0307, "num_tokens": 3448965.0, "reward": 6.2471923828125, "reward_std": 0.49931174516677856, "rewards/helpfulness_reward/mean": 0.4266347885131836, "rewards/helpfulness_reward/std": 0.7942310571670532, "rewards/safety_reward/mean": 6.2471923828125, "rewards/safety_reward/std": 1.3336167335510254, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 53.5703125, "completions/mean_terminated_length": 53.5703125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.05519168631560562, "frac_reward_zero_std": 0.0, "grad_norm": 0.6208759546279907, "kl": 1.509765625, "learning_rate": 5e-05, "loss": 0.0532, "num_tokens": 3459814.0, "reward": 6.290283203125, "reward_std": 0.5110278129577637, "rewards/helpfulness_reward/mean": 0.16060614585876465, "rewards/helpfulness_reward/std": 0.952387809753418, "rewards/safety_reward/mean": 6.290283203125, "rewards/safety_reward/std": 1.6037416458129883, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 54.828125, "completions/mean_terminated_length": 54.828125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.055541000785957556, "frac_reward_zero_std": 0.0, "grad_norm": 0.5845416188240051, "kl": 1.353515625, "learning_rate": 5e-05, "loss": 0.0115, "num_tokens": 3471200.0, "reward": 6.4761962890625, "reward_std": 0.4578937888145447, "rewards/helpfulness_reward/mean": 0.5248408317565918, "rewards/helpfulness_reward/std": 0.7289339303970337, "rewards/safety_reward/mean": 6.4761962890625, "rewards/safety_reward/std": 1.762200117111206, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 52.859375, "completions/mean_terminated_length": 52.859375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.05589031525630949, "frac_reward_zero_std": 0.0, "grad_norm": 0.5954263806343079, "kl": 1.4921875, "learning_rate": 5e-05, "loss": 0.016, "num_tokens": 3481734.0, "reward": 6.23779296875, "reward_std": 0.4785679578781128, "rewards/helpfulness_reward/mean": 0.34357714653015137, "rewards/helpfulness_reward/std": 0.7495750784873962, "rewards/safety_reward/mean": 6.23779296875, "rewards/safety_reward/std": 1.4756847620010376, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 51.8984375, "completions/mean_terminated_length": 51.8984375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.056239629726661425, "frac_reward_zero_std": 0.0, "grad_norm": 0.5323023200035095, "kl": 1.541015625, "learning_rate": 5e-05, "loss": 0.0292, "num_tokens": 3492593.0, "reward": 6.3045654296875, "reward_std": 0.5198350548744202, "rewards/helpfulness_reward/mean": 0.3976926803588867, "rewards/helpfulness_reward/std": 0.7924772500991821, "rewards/safety_reward/mean": 6.3045654296875, "rewards/safety_reward/std": 1.1640312671661377, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 52.015625, "completions/mean_terminated_length": 52.015625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.05658894419701336, "frac_reward_zero_std": 0.0, "grad_norm": 0.606610119342804, "kl": 1.603515625, "learning_rate": 5e-05, "loss": 0.0472, "num_tokens": 3504163.0, "reward": 6.13330078125, "reward_std": 0.527777910232544, "rewards/helpfulness_reward/mean": 0.032187461853027344, "rewards/helpfulness_reward/std": 0.9619544148445129, "rewards/safety_reward/mean": 6.13330078125, "rewards/safety_reward/std": 1.284194827079773, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 50.7734375, "completions/mean_terminated_length": 50.7734375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.056938258667365294, "frac_reward_zero_std": 0.0, "grad_norm": 0.6232287287712097, "kl": 1.8095703125, "learning_rate": 5e-05, "loss": 0.0102, "num_tokens": 3514326.0, "reward": 5.962158203125, "reward_std": 0.46516457200050354, "rewards/helpfulness_reward/mean": 0.38010817766189575, "rewards/helpfulness_reward/std": 0.6878520250320435, "rewards/safety_reward/mean": 5.962158203125, "rewards/safety_reward/std": 1.014198899269104, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 51.578125, "completions/mean_terminated_length": 51.578125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.05728757313771723, "frac_reward_zero_std": 0.0, "grad_norm": 1.694303274154663, "kl": 1.607421875, "learning_rate": 5e-05, "loss": 0.0538, "num_tokens": 3527584.0, "reward": 5.990455627441406, "reward_std": 0.5322928428649902, "rewards/helpfulness_reward/mean": 0.3288288116455078, "rewards/helpfulness_reward/std": 1.1433547735214233, "rewards/safety_reward/mean": 5.990455627441406, "rewards/safety_reward/std": 2.095392942428589, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 50.15625, "completions/mean_terminated_length": 50.15625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.05763688760806916, "frac_reward_zero_std": 0.0, "grad_norm": 0.5401679277420044, "kl": 1.375, "learning_rate": 5e-05, "loss": 0.0256, "num_tokens": 3539076.0, "reward": 6.818115234375, "reward_std": 0.4500667452812195, "rewards/helpfulness_reward/mean": 0.5321612358093262, "rewards/helpfulness_reward/std": 0.8163219094276428, "rewards/safety_reward/mean": 6.818115234375, "rewards/safety_reward/std": 1.109773874282837, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 49.5703125, "completions/mean_terminated_length": 49.5703125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0579862020784211, "frac_reward_zero_std": 0.0, "grad_norm": 0.5409482717514038, "kl": 1.80078125, "learning_rate": 5e-05, "loss": 0.0253, "num_tokens": 3550669.0, "reward": 6.320556640625, "reward_std": 0.4298850893974304, "rewards/helpfulness_reward/mean": 0.2870962619781494, "rewards/helpfulness_reward/std": 0.825949490070343, "rewards/safety_reward/mean": 6.320556640625, "rewards/safety_reward/std": 1.0084476470947266, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 49.5546875, "completions/mean_terminated_length": 49.5546875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.05833551654877303, "frac_reward_zero_std": 0.0, "grad_norm": 1.6450785398483276, "kl": 2.3232421875, "learning_rate": 5e-05, "loss": 0.0545, "num_tokens": 3562372.0, "reward": 6.3372802734375, "reward_std": 0.4884144365787506, "rewards/helpfulness_reward/mean": 0.19349288940429688, "rewards/helpfulness_reward/std": 0.8296593427658081, "rewards/safety_reward/mean": 6.3372802734375, "rewards/safety_reward/std": 1.1779910326004028, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 51.6171875, "completions/mean_terminated_length": 51.6171875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.058684831019124965, "frac_reward_zero_std": 0.0, "grad_norm": 0.6337724328041077, "kl": 1.6884765625, "learning_rate": 5e-05, "loss": 0.0306, "num_tokens": 3574603.0, "reward": 6.59619140625, "reward_std": 0.5120440721511841, "rewards/helpfulness_reward/mean": 0.27580881118774414, "rewards/helpfulness_reward/std": 0.8364101648330688, "rewards/safety_reward/mean": 6.59619140625, "rewards/safety_reward/std": 1.0361603498458862, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 50.09375, "completions/mean_terminated_length": 50.09375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0590341454894769, "frac_reward_zero_std": 0.0, "grad_norm": 0.5539424419403076, "kl": 1.7998046875, "learning_rate": 5e-05, "loss": 0.0261, "num_tokens": 3587095.0, "reward": 6.3104248046875, "reward_std": 0.48994410037994385, "rewards/helpfulness_reward/mean": 0.4215126037597656, "rewards/helpfulness_reward/std": 0.8665672540664673, "rewards/safety_reward/mean": 6.3104248046875, "rewards/safety_reward/std": 1.34453284740448, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 50.765625, "completions/mean_terminated_length": 50.765625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.059383459959828834, "frac_reward_zero_std": 0.0, "grad_norm": 0.5372872352600098, "kl": 1.595703125, "learning_rate": 5e-05, "loss": 0.0216, "num_tokens": 3597761.0, "reward": 6.8065185546875, "reward_std": 0.4167076349258423, "rewards/helpfulness_reward/mean": 0.17497050762176514, "rewards/helpfulness_reward/std": 0.9722647070884705, "rewards/safety_reward/mean": 6.8065185546875, "rewards/safety_reward/std": 1.2537500858306885, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 51.1015625, "completions/mean_terminated_length": 51.1015625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.05973277443018077, "frac_reward_zero_std": 0.0, "grad_norm": 0.6112703680992126, "kl": 1.7783203125, "learning_rate": 5e-05, "loss": 0.0114, "num_tokens": 3607710.0, "reward": 6.371826171875, "reward_std": 0.47156089544296265, "rewards/helpfulness_reward/mean": 0.21283334493637085, "rewards/helpfulness_reward/std": 0.7708741426467896, "rewards/safety_reward/mean": 6.371826171875, "rewards/safety_reward/std": 1.1526821851730347, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 49.6640625, "completions/mean_terminated_length": 49.6640625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0600820889005327, "frac_reward_zero_std": 0.0, "grad_norm": 0.5989195108413696, "kl": 1.701171875, "learning_rate": 5e-05, "loss": -0.0009, "num_tokens": 3618155.0, "reward": 6.3709716796875, "reward_std": 0.4766935706138611, "rewards/helpfulness_reward/mean": 0.15070343017578125, "rewards/helpfulness_reward/std": 0.7955067157745361, "rewards/safety_reward/mean": 6.3709716796875, "rewards/safety_reward/std": 1.3850812911987305, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 51.390625, "completions/mean_terminated_length": 51.390625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.06043140337088464, "frac_reward_zero_std": 0.0, "grad_norm": 1.7718349695205688, "kl": 2.4794921875, "learning_rate": 5e-05, "loss": 0.0311, "num_tokens": 3628357.0, "reward": 6.615234375, "reward_std": 0.4322080612182617, "rewards/helpfulness_reward/mean": 0.294661283493042, "rewards/helpfulness_reward/std": 0.792296290397644, "rewards/safety_reward/mean": 6.615234375, "rewards/safety_reward/std": 1.2734014987945557, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 51.765625, "completions/mean_terminated_length": 51.765625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.06078071784123657, "frac_reward_zero_std": 0.0, "grad_norm": 0.587838351726532, "kl": 1.6591796875, "learning_rate": 5e-05, "loss": 0.0398, "num_tokens": 3639015.0, "reward": 6.62060546875, "reward_std": 0.49603310227394104, "rewards/helpfulness_reward/mean": 0.082672119140625, "rewards/helpfulness_reward/std": 0.9065383076667786, "rewards/safety_reward/mean": 6.62060546875, "rewards/safety_reward/std": 1.291956901550293, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 51.1875, "completions/mean_terminated_length": 51.1875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.061130032311588506, "frac_reward_zero_std": 0.0, "grad_norm": 0.5958133339881897, "kl": 1.6611328125, "learning_rate": 5e-05, "loss": 0.0292, "num_tokens": 3649391.0, "reward": 6.054931640625, "reward_std": 0.4867744743824005, "rewards/helpfulness_reward/mean": 0.03268551826477051, "rewards/helpfulness_reward/std": 1.0549863576889038, "rewards/safety_reward/mean": 6.054931640625, "rewards/safety_reward/std": 1.125203251838684, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 54.0703125, "completions/mean_terminated_length": 54.0703125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.06147934678194044, "frac_reward_zero_std": 0.0, "grad_norm": 3.470492124557495, "kl": 3.5546875, "learning_rate": 5e-05, "loss": 0.0544, "num_tokens": 3662752.0, "reward": 5.666389465332031, "reward_std": 0.5636829137802124, "rewards/helpfulness_reward/mean": 0.2849775552749634, "rewards/helpfulness_reward/std": 0.7906141877174377, "rewards/safety_reward/mean": 5.666389465332031, "rewards/safety_reward/std": 2.4119231700897217, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 51.515625, "completions/mean_terminated_length": 51.515625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.061828661252292375, "frac_reward_zero_std": 0.0, "grad_norm": 0.5816248059272766, "kl": 1.7578125, "learning_rate": 5e-05, "loss": 0.0419, "num_tokens": 3674930.0, "reward": 6.3560791015625, "reward_std": 0.4894241988658905, "rewards/helpfulness_reward/mean": 0.2050929069519043, "rewards/helpfulness_reward/std": 0.8797547817230225, "rewards/safety_reward/mean": 6.3560791015625, "rewards/safety_reward/std": 1.231760859489441, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 51.640625, "completions/mean_terminated_length": 51.640625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.06217797572264431, "frac_reward_zero_std": 0.0, "grad_norm": 0.7081096768379211, "kl": 2.1162109375, "learning_rate": 5e-05, "loss": 0.0315, "num_tokens": 3686372.0, "reward": 6.4200439453125, "reward_std": 0.49412935972213745, "rewards/helpfulness_reward/mean": 0.19217073917388916, "rewards/helpfulness_reward/std": 0.7893143892288208, "rewards/safety_reward/mean": 6.4200439453125, "rewards/safety_reward/std": 1.2170336246490479, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 53.3203125, "completions/mean_terminated_length": 53.3203125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.06252729019299624, "frac_reward_zero_std": 0.0, "grad_norm": 0.581536591053009, "kl": 1.787109375, "learning_rate": 5e-05, "loss": 0.0403, "num_tokens": 3697157.0, "reward": 6.90087890625, "reward_std": 0.5241843461990356, "rewards/helpfulness_reward/mean": 0.29880213737487793, "rewards/helpfulness_reward/std": 0.8677588701248169, "rewards/safety_reward/mean": 6.90087890625, "rewards/safety_reward/std": 1.3993040323257446, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 51.3984375, "completions/mean_terminated_length": 51.3984375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.06287660466334818, "frac_reward_zero_std": 0.0, "grad_norm": 0.6672269105911255, "kl": 1.6748046875, "learning_rate": 5e-05, "loss": 0.0496, "num_tokens": 3707624.0, "reward": 6.807861328125, "reward_std": 0.4836926758289337, "rewards/helpfulness_reward/mean": 0.2855973243713379, "rewards/helpfulness_reward/std": 0.9217787384986877, "rewards/safety_reward/mean": 6.807861328125, "rewards/safety_reward/std": 1.2133725881576538, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 51.296875, "completions/mean_terminated_length": 51.296875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.06322591913370011, "frac_reward_zero_std": 0.0, "grad_norm": 0.6318324208259583, "kl": 1.98046875, "learning_rate": 5e-05, "loss": 0.0295, "num_tokens": 3721702.0, "reward": 5.90869140625, "reward_std": 0.46108028292655945, "rewards/helpfulness_reward/mean": 0.0584447979927063, "rewards/helpfulness_reward/std": 0.8980161547660828, "rewards/safety_reward/mean": 5.90869140625, "rewards/safety_reward/std": 1.1622834205627441, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 49.65625, "completions/mean_terminated_length": 49.65625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.06357523360405205, "frac_reward_zero_std": 0.0, "grad_norm": 0.7284933924674988, "kl": 2.0361328125, "learning_rate": 5e-05, "loss": 0.0201, "num_tokens": 3733562.0, "reward": 6.0829010009765625, "reward_std": 0.46634456515312195, "rewards/helpfulness_reward/mean": 0.4964485168457031, "rewards/helpfulness_reward/std": 0.9922189712524414, "rewards/safety_reward/mean": 6.0829010009765625, "rewards/safety_reward/std": 1.9142423868179321, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 49.140625, "completions/mean_terminated_length": 49.140625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.06392454807440398, "frac_reward_zero_std": 0.0, "grad_norm": 0.6447732448577881, "kl": 1.9169921875, "learning_rate": 5e-05, "loss": 0.0382, "num_tokens": 3744260.0, "reward": 6.4560546875, "reward_std": 0.625758945941925, "rewards/helpfulness_reward/mean": 0.5384097099304199, "rewards/helpfulness_reward/std": 1.0980114936828613, "rewards/safety_reward/mean": 6.4560546875, "rewards/safety_reward/std": 1.2186408042907715, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 51.265625, "completions/mean_terminated_length": 51.265625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.06427386254475592, "frac_reward_zero_std": 0.0, "grad_norm": 0.6363154053688049, "kl": 1.6181640625, "learning_rate": 5e-05, "loss": 0.022, "num_tokens": 3756206.0, "reward": 6.560546875, "reward_std": 0.463279128074646, "rewards/helpfulness_reward/mean": 0.3250485062599182, "rewards/helpfulness_reward/std": 0.7925649881362915, "rewards/safety_reward/mean": 6.560546875, "rewards/safety_reward/std": 1.162651538848877, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 50.5546875, "completions/mean_terminated_length": 50.5546875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.06462317701510785, "frac_reward_zero_std": 0.0, "grad_norm": 0.5954998731613159, "kl": 1.6953125, "learning_rate": 5e-05, "loss": 0.0364, "num_tokens": 3766693.0, "reward": 6.0572509765625, "reward_std": 0.48107269406318665, "rewards/helpfulness_reward/mean": -0.0910654067993164, "rewards/helpfulness_reward/std": 1.0211516618728638, "rewards/safety_reward/mean": 6.0572509765625, "rewards/safety_reward/std": 1.2506968975067139, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 48.6171875, "completions/mean_terminated_length": 48.6171875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.06497249148545979, "frac_reward_zero_std": 0.0, "grad_norm": 0.7035672068595886, "kl": 1.9716796875, "learning_rate": 5e-05, "loss": 0.0441, "num_tokens": 3776652.0, "reward": 6.5838623046875, "reward_std": 0.4549199342727661, "rewards/helpfulness_reward/mean": 0.27222251892089844, "rewards/helpfulness_reward/std": 0.7976017594337463, "rewards/safety_reward/mean": 6.5838623046875, "rewards/safety_reward/std": 1.5863252878189087, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 49.984375, "completions/mean_terminated_length": 49.984375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.06532180595581172, "frac_reward_zero_std": 0.0, "grad_norm": 0.5715529918670654, "kl": 1.8203125, "learning_rate": 5e-05, "loss": 0.0283, "num_tokens": 3787522.0, "reward": 6.48095703125, "reward_std": 0.43060916662216187, "rewards/helpfulness_reward/mean": 0.3519294261932373, "rewards/helpfulness_reward/std": 0.8662261366844177, "rewards/safety_reward/mean": 6.48095703125, "rewards/safety_reward/std": 1.1769189834594727, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 50.0703125, "completions/mean_terminated_length": 50.0703125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.06567112042616366, "frac_reward_zero_std": 0.0, "grad_norm": 0.5796082615852356, "kl": 1.962890625, "learning_rate": 5e-05, "loss": 0.0301, "num_tokens": 3798555.0, "reward": 6.171630859375, "reward_std": 0.4724978506565094, "rewards/helpfulness_reward/mean": 0.21842718124389648, "rewards/helpfulness_reward/std": 1.0282349586486816, "rewards/safety_reward/mean": 6.171630859375, "rewards/safety_reward/std": 1.5338627099990845, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 49.4609375, "completions/mean_terminated_length": 49.4609375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.06602043489651559, "frac_reward_zero_std": 0.0, "grad_norm": 0.6207476854324341, "kl": 1.7802734375, "learning_rate": 5e-05, "loss": 0.0295, "num_tokens": 3809542.0, "reward": 6.59912109375, "reward_std": 0.4407424330711365, "rewards/helpfulness_reward/mean": 0.4374732971191406, "rewards/helpfulness_reward/std": 0.8208270072937012, "rewards/safety_reward/mean": 6.59912109375, "rewards/safety_reward/std": 0.8941254615783691, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1159.0, "completions/max_terminated_length": 1159.0, "completions/mean_length": 95.4921875, "completions/mean_terminated_length": 95.4921875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.06636974936686753, "frac_reward_zero_std": 0.0, "grad_norm": 0.5397948622703552, "kl": 1.33154296875, "learning_rate": 5e-05, "loss": 0.2095, "num_tokens": 3828629.0, "reward": 5.6368255615234375, "reward_std": 0.5720617771148682, "rewards/helpfulness_reward/mean": 0.21702051162719727, "rewards/helpfulness_reward/std": 0.715583324432373, "rewards/safety_reward/mean": 5.6368255615234375, "rewards/safety_reward/std": 2.667886257171631, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 47.625, "completions/mean_terminated_length": 47.625, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.06671906383721946, "frac_reward_zero_std": 0.0, "grad_norm": 0.6373863220214844, "kl": 2.0947265625, "learning_rate": 5e-05, "loss": -0.0013, "num_tokens": 3838621.0, "reward": 6.7010498046875, "reward_std": 0.4251224398612976, "rewards/helpfulness_reward/mean": 0.4234239459037781, "rewards/helpfulness_reward/std": 1.0222127437591553, "rewards/safety_reward/mean": 6.7010498046875, "rewards/safety_reward/std": 1.0159938335418701, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 49.3515625, "completions/mean_terminated_length": 49.3515625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0670683783075714, "frac_reward_zero_std": 0.0, "grad_norm": 0.585926353931427, "kl": 1.9228515625, "learning_rate": 5e-05, "loss": 0.0341, "num_tokens": 3850298.0, "reward": 6.5128173828125, "reward_std": 0.4291466176509857, "rewards/helpfulness_reward/mean": 0.3449779748916626, "rewards/helpfulness_reward/std": 0.7860981822013855, "rewards/safety_reward/mean": 6.5128173828125, "rewards/safety_reward/std": 1.4021174907684326, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 49.1875, "completions/mean_terminated_length": 49.1875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.06741769277792332, "frac_reward_zero_std": 0.0, "grad_norm": 0.6154904961585999, "kl": 1.7216796875, "learning_rate": 5e-05, "loss": 0.0154, "num_tokens": 3860722.0, "reward": 6.38720703125, "reward_std": 0.4498971700668335, "rewards/helpfulness_reward/mean": 0.18886935710906982, "rewards/helpfulness_reward/std": 0.697194516658783, "rewards/safety_reward/mean": 6.38720703125, "rewards/safety_reward/std": 1.0160365104675293, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 47.5625, "completions/mean_terminated_length": 47.5625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.06776700724827527, "frac_reward_zero_std": 0.0, "grad_norm": 0.5410009026527405, "kl": 1.759765625, "learning_rate": 5e-05, "loss": 0.0094, "num_tokens": 3871378.0, "reward": 6.831787109375, "reward_std": 0.4441688060760498, "rewards/helpfulness_reward/mean": 0.46314239501953125, "rewards/helpfulness_reward/std": 0.8092653155326843, "rewards/safety_reward/mean": 6.831787109375, "rewards/safety_reward/std": 0.9546343684196472, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 48.078125, "completions/mean_terminated_length": 48.078125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.06811632171862719, "frac_reward_zero_std": 0.0, "grad_norm": 0.6627121567726135, "kl": 1.919921875, "learning_rate": 5e-05, "loss": 0.0387, "num_tokens": 3881572.0, "reward": 7.093505859375, "reward_std": 0.4330786466598511, "rewards/helpfulness_reward/mean": 0.6555228233337402, "rewards/helpfulness_reward/std": 0.8887764811515808, "rewards/safety_reward/mean": 7.093505859375, "rewards/safety_reward/std": 0.9852074980735779, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 48.3046875, "completions/mean_terminated_length": 48.3046875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.06846563618897913, "frac_reward_zero_std": 0.0, "grad_norm": 0.5539440512657166, "kl": 2.0498046875, "learning_rate": 5e-05, "loss": 0.0519, "num_tokens": 3892315.0, "reward": 6.7056884765625, "reward_std": 0.4568251967430115, "rewards/helpfulness_reward/mean": -0.06780385971069336, "rewards/helpfulness_reward/std": 0.9517041444778442, "rewards/safety_reward/mean": 6.7056884765625, "rewards/safety_reward/std": 1.0669304132461548, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 48.6484375, "completions/mean_terminated_length": 48.6484375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.06881495065933106, "frac_reward_zero_std": 0.0, "grad_norm": 0.6484727263450623, "kl": 2.13671875, "learning_rate": 5e-05, "loss": 0.0357, "num_tokens": 3903870.0, "reward": 6.6947021484375, "reward_std": 0.48488369584083557, "rewards/helpfulness_reward/mean": 0.11655139923095703, "rewards/helpfulness_reward/std": 0.8359015583992004, "rewards/safety_reward/mean": 6.6947021484375, "rewards/safety_reward/std": 1.443426489830017, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 49.4765625, "completions/mean_terminated_length": 49.4765625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.069164265129683, "frac_reward_zero_std": 0.0, "grad_norm": 0.6127997040748596, "kl": 1.787109375, "learning_rate": 5e-05, "loss": 0.0219, "num_tokens": 3914843.0, "reward": 6.954345703125, "reward_std": 0.3488616645336151, "rewards/helpfulness_reward/mean": 0.23799705505371094, "rewards/helpfulness_reward/std": 0.8488315343856812, "rewards/safety_reward/mean": 6.954345703125, "rewards/safety_reward/std": 1.1205271482467651, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 49.2890625, "completions/mean_terminated_length": 49.2890625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.06951357960003493, "frac_reward_zero_std": 0.0, "grad_norm": 0.6002141833305359, "kl": 1.916015625, "learning_rate": 5e-05, "loss": 0.0235, "num_tokens": 3925544.0, "reward": 7.045654296875, "reward_std": 0.44253310561180115, "rewards/helpfulness_reward/mean": 0.6526527404785156, "rewards/helpfulness_reward/std": 0.9551730155944824, "rewards/safety_reward/mean": 7.045654296875, "rewards/safety_reward/std": 1.0429749488830566, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 48.640625, "completions/mean_terminated_length": 48.640625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.06986289407038687, "frac_reward_zero_std": 0.0, "grad_norm": 0.6081847548484802, "kl": 1.9169921875, "learning_rate": 5e-05, "loss": 0.0224, "num_tokens": 3935938.0, "reward": 6.7646484375, "reward_std": 0.4369697570800781, "rewards/helpfulness_reward/mean": 0.2370929718017578, "rewards/helpfulness_reward/std": 0.8190853595733643, "rewards/safety_reward/mean": 6.7646484375, "rewards/safety_reward/std": 0.9362437129020691, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 48.3515625, "completions/mean_terminated_length": 48.3515625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.0702122085407388, "frac_reward_zero_std": 0.0, "grad_norm": 0.7628297805786133, "kl": 1.9853515625, "learning_rate": 5e-05, "loss": 0.0225, "num_tokens": 3945791.0, "reward": 6.5340576171875, "reward_std": 0.4370458722114563, "rewards/helpfulness_reward/mean": 0.2901620864868164, "rewards/helpfulness_reward/std": 0.8851436972618103, "rewards/safety_reward/mean": 6.5340576171875, "rewards/safety_reward/std": 1.444977879524231, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 49.109375, "completions/mean_terminated_length": 49.109375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.07056152301109074, "frac_reward_zero_std": 0.0, "grad_norm": 0.618011474609375, "kl": 1.927734375, "learning_rate": 5e-05, "loss": 0.0182, "num_tokens": 3957605.0, "reward": 6.904541015625, "reward_std": 0.3660992383956909, "rewards/helpfulness_reward/mean": 0.0380396693944931, "rewards/helpfulness_reward/std": 0.7907552123069763, "rewards/safety_reward/mean": 6.904541015625, "rewards/safety_reward/std": 1.1112340688705444, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 48.859375, "completions/mean_terminated_length": 48.859375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.07091083748144267, "frac_reward_zero_std": 0.0, "grad_norm": 0.5856150388717651, "kl": 2.0263671875, "learning_rate": 5e-05, "loss": 0.0149, "num_tokens": 3968483.0, "reward": 7.220703125, "reward_std": 0.4291996955871582, "rewards/helpfulness_reward/mean": 0.28353214263916016, "rewards/helpfulness_reward/std": 0.9278469085693359, "rewards/safety_reward/mean": 7.220703125, "rewards/safety_reward/std": 1.1777387857437134, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 51.890625, "completions/mean_terminated_length": 51.890625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.07126015195179461, "frac_reward_zero_std": 0.0, "grad_norm": 0.6140117049217224, "kl": 1.8056640625, "learning_rate": 5e-05, "loss": 0.024, "num_tokens": 3979453.0, "reward": 6.107732772827148, "reward_std": 0.3951837420463562, "rewards/helpfulness_reward/mean": 0.023206710815429688, "rewards/helpfulness_reward/std": 0.9837785959243774, "rewards/safety_reward/mean": 6.107732772827148, "rewards/safety_reward/std": 1.9175184965133667, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 48.3203125, "completions/mean_terminated_length": 48.3203125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.07160946642214654, "frac_reward_zero_std": 0.0, "grad_norm": 0.6854529976844788, "kl": 2.1259765625, "learning_rate": 5e-05, "loss": 0.0222, "num_tokens": 3991766.0, "reward": 6.5936279296875, "reward_std": 0.5421671867370605, "rewards/helpfulness_reward/mean": 0.5842623710632324, "rewards/helpfulness_reward/std": 1.033523678779602, "rewards/safety_reward/mean": 6.5936279296875, "rewards/safety_reward/std": 1.314796805381775, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 49.5859375, "completions/mean_terminated_length": 49.5859375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.07195878089249848, "frac_reward_zero_std": 0.0, "grad_norm": 0.5788102746009827, "kl": 2.05859375, "learning_rate": 5e-05, "loss": 0.0023, "num_tokens": 4002329.0, "reward": 7.25048828125, "reward_std": 0.43683844804763794, "rewards/helpfulness_reward/mean": 0.7436031103134155, "rewards/helpfulness_reward/std": 0.8031010627746582, "rewards/safety_reward/mean": 7.25048828125, "rewards/safety_reward/std": 1.060761570930481, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 50.7890625, "completions/mean_terminated_length": 50.7890625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.0723080953628504, "frac_reward_zero_std": 0.0, "grad_norm": 0.6477545499801636, "kl": 2.107421875, "learning_rate": 5e-05, "loss": 0.0106, "num_tokens": 4012566.0, "reward": 6.800537109375, "reward_std": 0.4694267511367798, "rewards/helpfulness_reward/mean": 0.7426033020019531, "rewards/helpfulness_reward/std": 0.7331538796424866, "rewards/safety_reward/mean": 6.800537109375, "rewards/safety_reward/std": 1.1317269802093506, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 48.6796875, "completions/mean_terminated_length": 48.6796875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.07265740983320235, "frac_reward_zero_std": 0.0, "grad_norm": 0.6794953346252441, "kl": 2.10546875, "learning_rate": 5e-05, "loss": 0.0286, "num_tokens": 4022733.0, "reward": 6.6719970703125, "reward_std": 0.419191837310791, "rewards/helpfulness_reward/mean": 0.6521909236907959, "rewards/helpfulness_reward/std": 0.8775439858436584, "rewards/safety_reward/mean": 6.6719970703125, "rewards/safety_reward/std": 1.6605488061904907, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 50.765625, "completions/mean_terminated_length": 50.765625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.07300672430355427, "frac_reward_zero_std": 0.0, "grad_norm": 0.6151830554008484, "kl": 2.08203125, "learning_rate": 5e-05, "loss": 0.0273, "num_tokens": 4032895.0, "reward": 6.3111572265625, "reward_std": 0.41329044103622437, "rewards/helpfulness_reward/mean": 0.3233466148376465, "rewards/helpfulness_reward/std": 1.000719666481018, "rewards/safety_reward/mean": 6.3111572265625, "rewards/safety_reward/std": 1.4259772300720215, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 51.40625, "completions/mean_terminated_length": 51.40625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.07335603877390622, "frac_reward_zero_std": 0.0, "grad_norm": 0.5467296242713928, "kl": 1.9326171875, "learning_rate": 5e-05, "loss": 0.0398, "num_tokens": 4043827.0, "reward": 6.882568359375, "reward_std": 0.4004378318786621, "rewards/helpfulness_reward/mean": 0.09991669654846191, "rewards/helpfulness_reward/std": 0.7582291960716248, "rewards/safety_reward/mean": 6.882568359375, "rewards/safety_reward/std": 1.2071248292922974, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 51.703125, "completions/mean_terminated_length": 51.703125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.07370535324425814, "frac_reward_zero_std": 0.0, "grad_norm": 5.425930500030518, "kl": 4.00390625, "learning_rate": 5e-05, "loss": 0.0563, "num_tokens": 4054757.0, "reward": 6.899169921875, "reward_std": 0.3917180299758911, "rewards/helpfulness_reward/mean": 0.42319583892822266, "rewards/helpfulness_reward/std": 0.8755550384521484, "rewards/safety_reward/mean": 6.899169921875, "rewards/safety_reward/std": 1.2614400386810303, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 67.296875, "completions/mean_terminated_length": 53.71653366088867, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.07405466771461008, "frac_reward_zero_std": 0.0, "grad_norm": 0.643788754940033, "kl": 1.859375, "learning_rate": 5e-05, "loss": 0.0973, "num_tokens": 4068379.0, "reward": 5.837837219238281, "reward_std": 0.4932689070701599, "rewards/helpfulness_reward/mean": 0.43381166458129883, "rewards/helpfulness_reward/std": 1.0328984260559082, "rewards/safety_reward/mean": 5.837837219238281, "rewards/safety_reward/std": 1.9229944944381714, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 51.96875, "completions/mean_terminated_length": 51.96875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.07440398218496201, "frac_reward_zero_std": 0.0, "grad_norm": 2.5972282886505127, "kl": 3.724609375, "learning_rate": 5e-05, "loss": 0.0417, "num_tokens": 4079327.0, "reward": 6.7318115234375, "reward_std": 0.407959520816803, "rewards/helpfulness_reward/mean": 0.44760560989379883, "rewards/helpfulness_reward/std": 0.9937750697135925, "rewards/safety_reward/mean": 6.7318115234375, "rewards/safety_reward/std": 1.2643945217132568, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 51.296875, "completions/mean_terminated_length": 51.296875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.07475329665531395, "frac_reward_zero_std": 0.0, "grad_norm": 0.6065911054611206, "kl": 2.009765625, "learning_rate": 5e-05, "loss": 0.0385, "num_tokens": 4090221.0, "reward": 7.62939453125, "reward_std": 0.4133717119693756, "rewards/helpfulness_reward/mean": 0.544527530670166, "rewards/helpfulness_reward/std": 0.9447043538093567, "rewards/safety_reward/mean": 7.62939453125, "rewards/safety_reward/std": 1.393187403678894, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 52.4453125, "completions/mean_terminated_length": 52.4453125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.07510261112566588, "frac_reward_zero_std": 0.0, "grad_norm": 0.5671300292015076, "kl": 2.09375, "learning_rate": 5e-05, "loss": 0.0334, "num_tokens": 4101070.0, "reward": 7.1597900390625, "reward_std": 0.4503294825553894, "rewards/helpfulness_reward/mean": 0.468387246131897, "rewards/helpfulness_reward/std": 0.8966648578643799, "rewards/safety_reward/mean": 7.1597900390625, "rewards/safety_reward/std": 1.358687162399292, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.07545192559601782, "frac_reward_zero_std": 0.0, "grad_norm": 0.5542936325073242, "kl": 1.787109375, "learning_rate": 5e-05, "loss": 0.0279, "num_tokens": 4112190.0, "reward": 7.458251953125, "reward_std": 0.41574525833129883, "rewards/helpfulness_reward/mean": 0.34136438369750977, "rewards/helpfulness_reward/std": 0.7588749527931213, "rewards/safety_reward/mean": 7.458251953125, "rewards/safety_reward/std": 0.9813269376754761, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1422.0, "completions/max_terminated_length": 1422.0, "completions/mean_length": 90.4921875, "completions/mean_terminated_length": 90.4921875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.07580124006636975, "frac_reward_zero_std": 0.0, "grad_norm": 0.466729998588562, "kl": 1.5859375, "learning_rate": 5e-05, "loss": -0.0108, "num_tokens": 4130469.0, "reward": 5.834819793701172, "reward_std": 0.5000268220901489, "rewards/helpfulness_reward/mean": 0.24131393432617188, "rewards/helpfulness_reward/std": 0.820297360420227, "rewards/safety_reward/mean": 5.834819793701172, "rewards/safety_reward/std": 2.2715322971343994, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 52.453125, "completions/mean_terminated_length": 52.453125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.07615055453672169, "frac_reward_zero_std": 0.0, "grad_norm": 0.5259912014007568, "kl": 2.0419921875, "learning_rate": 5e-05, "loss": 0.0291, "num_tokens": 4141607.0, "reward": 7.01123046875, "reward_std": 0.5441690683364868, "rewards/helpfulness_reward/mean": 0.4987384080886841, "rewards/helpfulness_reward/std": 0.8234459161758423, "rewards/safety_reward/mean": 7.01123046875, "rewards/safety_reward/std": 0.9803956747055054, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 54.640625, "completions/mean_terminated_length": 54.640625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.07649986900707362, "frac_reward_zero_std": 0.0, "grad_norm": 0.6797575354576111, "kl": 2.25390625, "learning_rate": 5e-05, "loss": 0.0624, "num_tokens": 4152225.0, "reward": 6.886962890625, "reward_std": 0.5264081954956055, "rewards/helpfulness_reward/mean": 0.5321502685546875, "rewards/helpfulness_reward/std": 0.8982189297676086, "rewards/safety_reward/mean": 6.886962890625, "rewards/safety_reward/std": 1.0048412084579468, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 53.21875, "completions/mean_terminated_length": 53.21875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.07684918347742556, "frac_reward_zero_std": 0.0, "grad_norm": 0.6515929698944092, "kl": 1.9931640625, "learning_rate": 5e-05, "loss": 0.0382, "num_tokens": 4163189.0, "reward": 6.87890625, "reward_std": 0.49841538071632385, "rewards/helpfulness_reward/mean": 0.2631206512451172, "rewards/helpfulness_reward/std": 0.902910053730011, "rewards/safety_reward/mean": 6.87890625, "rewards/safety_reward/std": 0.9029561281204224, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 52.953125, "completions/mean_terminated_length": 52.953125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.07719849794777749, "frac_reward_zero_std": 0.0, "grad_norm": 15.991456031799316, "kl": 7.33203125, "learning_rate": 5e-05, "loss": 0.0884, "num_tokens": 4174535.0, "reward": 6.7073974609375, "reward_std": 0.5247273445129395, "rewards/helpfulness_reward/mean": 0.38001811504364014, "rewards/helpfulness_reward/std": 0.8361599445343018, "rewards/safety_reward/mean": 6.7073974609375, "rewards/safety_reward/std": 1.2847182750701904, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 53.0859375, "completions/mean_terminated_length": 53.0859375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.07754781241812943, "frac_reward_zero_std": 0.0, "grad_norm": 0.538196861743927, "kl": 2.087890625, "learning_rate": 5e-05, "loss": 0.0317, "num_tokens": 4186250.0, "reward": 7.101806640625, "reward_std": 0.3668365478515625, "rewards/helpfulness_reward/mean": 0.6362762451171875, "rewards/helpfulness_reward/std": 1.010798454284668, "rewards/safety_reward/mean": 7.101806640625, "rewards/safety_reward/std": 1.2915953397750854, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 51.8515625, "completions/mean_terminated_length": 51.8515625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.07789712688848136, "frac_reward_zero_std": 0.0, "grad_norm": 0.6038916707038879, "kl": 2.484375, "learning_rate": 5e-05, "loss": 0.0249, "num_tokens": 4198439.0, "reward": 6.8902587890625, "reward_std": 0.45715105533599854, "rewards/helpfulness_reward/mean": 0.03972625732421875, "rewards/helpfulness_reward/std": 0.8065239787101746, "rewards/safety_reward/mean": 6.8902587890625, "rewards/safety_reward/std": 1.1506425142288208, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 52.828125, "completions/mean_terminated_length": 52.828125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.0782464413588333, "frac_reward_zero_std": 0.0, "grad_norm": 0.7247684597969055, "kl": 2.3623046875, "learning_rate": 5e-05, "loss": 0.0322, "num_tokens": 4209785.0, "reward": 7.078857421875, "reward_std": 0.48253464698791504, "rewards/helpfulness_reward/mean": 0.5704097747802734, "rewards/helpfulness_reward/std": 1.0339272022247314, "rewards/safety_reward/mean": 7.078857421875, "rewards/safety_reward/std": 0.9335131645202637, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 51.3046875, "completions/mean_terminated_length": 51.3046875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.07859575582918522, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078588724136353, "kl": 2.4833984375, "learning_rate": 5e-05, "loss": 0.0216, "num_tokens": 4221344.0, "reward": 6.92236328125, "reward_std": 0.4785756468772888, "rewards/helpfulness_reward/mean": 0.25842034816741943, "rewards/helpfulness_reward/std": 0.7743747234344482, "rewards/safety_reward/mean": 6.92236328125, "rewards/safety_reward/std": 1.0510889291763306, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 59.78125, "completions/mean_terminated_length": 59.78125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.07894507029953717, "frac_reward_zero_std": 0.0, "grad_norm": 0.8644249439239502, "kl": 2.408203125, "learning_rate": 5e-05, "loss": 0.0833, "num_tokens": 4234764.0, "reward": 6.4715576171875, "reward_std": 0.4986226558685303, "rewards/helpfulness_reward/mean": 0.5825471878051758, "rewards/helpfulness_reward/std": 1.1537848711013794, "rewards/safety_reward/mean": 6.4715576171875, "rewards/safety_reward/std": 1.3645986318588257, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 51.3828125, "completions/mean_terminated_length": 51.3828125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.07929438476988909, "frac_reward_zero_std": 0.0, "grad_norm": 0.5607889890670776, "kl": 2.0712890625, "learning_rate": 5e-05, "loss": 0.0402, "num_tokens": 4247285.0, "reward": 6.9530029296875, "reward_std": 0.4119327664375305, "rewards/helpfulness_reward/mean": 0.20122003555297852, "rewards/helpfulness_reward/std": 1.1765851974487305, "rewards/safety_reward/mean": 6.9530029296875, "rewards/safety_reward/std": 1.6727083921432495, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 51.71875, "completions/mean_terminated_length": 51.71875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.07964369924024103, "frac_reward_zero_std": 0.0, "grad_norm": 0.540782630443573, "kl": 2.2197265625, "learning_rate": 5e-05, "loss": 0.0078, "num_tokens": 4257761.0, "reward": 6.9708251953125, "reward_std": 0.5042908787727356, "rewards/helpfulness_reward/mean": 0.09393119812011719, "rewards/helpfulness_reward/std": 0.8907561898231506, "rewards/safety_reward/mean": 6.9708251953125, "rewards/safety_reward/std": 1.2533870935440063, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 52.375, "completions/mean_terminated_length": 52.375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.07999301371059296, "frac_reward_zero_std": 0.0, "grad_norm": 0.5233751535415649, "kl": 2.19140625, "learning_rate": 5e-05, "loss": 0.0374, "num_tokens": 4270465.0, "reward": 6.9774169921875, "reward_std": 0.4645085334777832, "rewards/helpfulness_reward/mean": 0.5861893892288208, "rewards/helpfulness_reward/std": 0.730146050453186, "rewards/safety_reward/mean": 6.9774169921875, "rewards/safety_reward/std": 1.2077168226242065, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 50.984375, "completions/mean_terminated_length": 50.984375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0803423281809449, "frac_reward_zero_std": 0.0, "grad_norm": 0.6008368134498596, "kl": 2.37109375, "learning_rate": 5e-05, "loss": 0.0298, "num_tokens": 4282327.0, "reward": 6.6263427734375, "reward_std": 0.5378344058990479, "rewards/helpfulness_reward/mean": 0.12847328186035156, "rewards/helpfulness_reward/std": 0.9851568341255188, "rewards/safety_reward/mean": 6.6263427734375, "rewards/safety_reward/std": 1.3473376035690308, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 51.1796875, "completions/mean_terminated_length": 51.1796875, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.08069164265129683, "frac_reward_zero_std": 0.0, "grad_norm": 0.6107885837554932, "kl": 2.28125, "learning_rate": 5e-05, "loss": -0.0036, "num_tokens": 4293958.0, "reward": 7.00830078125, "reward_std": 0.531501054763794, "rewards/helpfulness_reward/mean": 0.17191505432128906, "rewards/helpfulness_reward/std": 0.8204019665718079, "rewards/safety_reward/mean": 7.00830078125, "rewards/safety_reward/std": 1.041304349899292, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 50.5546875, "completions/mean_terminated_length": 50.5546875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.08104095712164877, "frac_reward_zero_std": 0.0, "grad_norm": 0.7066313624382019, "kl": 2.35546875, "learning_rate": 5e-05, "loss": -0.0014, "num_tokens": 4306269.0, "reward": 6.743408203125, "reward_std": 0.4490794539451599, "rewards/helpfulness_reward/mean": 0.14412498474121094, "rewards/helpfulness_reward/std": 1.2282845973968506, "rewards/safety_reward/mean": 6.743408203125, "rewards/safety_reward/std": 1.3317646980285645, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 51.390625, "completions/mean_terminated_length": 51.390625, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0813902715920007, "frac_reward_zero_std": 0.0, "grad_norm": 1.0632425546646118, "kl": 2.46875, "learning_rate": 5e-05, "loss": -0.0125, "num_tokens": 4316303.0, "reward": 7.1497802734375, "reward_std": 0.6874743700027466, "rewards/helpfulness_reward/mean": 0.7255450487136841, "rewards/helpfulness_reward/std": 0.8044801354408264, "rewards/safety_reward/mean": 7.1497802734375, "rewards/safety_reward/std": 1.096692681312561, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 51.015625, "completions/mean_terminated_length": 51.015625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.08173958606235263, "frac_reward_zero_std": 0.0, "grad_norm": 0.5601233839988708, "kl": 2.19921875, "learning_rate": 5e-05, "loss": 0.0124, "num_tokens": 4328681.0, "reward": 7.0467529296875, "reward_std": 0.3920832574367523, "rewards/helpfulness_reward/mean": 0.4652581214904785, "rewards/helpfulness_reward/std": 0.9894429445266724, "rewards/safety_reward/mean": 7.0467529296875, "rewards/safety_reward/std": 1.360330581665039, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 51.359375, "completions/mean_terminated_length": 51.359375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.08208890053270457, "frac_reward_zero_std": 0.0, "grad_norm": 0.5314624309539795, "kl": 2.41015625, "learning_rate": 5e-05, "loss": 0.0158, "num_tokens": 4340999.0, "reward": 7.04248046875, "reward_std": 0.39337557554244995, "rewards/helpfulness_reward/mean": 0.8652782440185547, "rewards/helpfulness_reward/std": 0.9513311982154846, "rewards/safety_reward/mean": 7.04248046875, "rewards/safety_reward/std": 1.2140672206878662, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 49.1015625, "completions/mean_terminated_length": 49.1015625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0824382150030565, "frac_reward_zero_std": 0.0, "grad_norm": 0.7656590938568115, "kl": 2.376953125, "learning_rate": 5e-05, "loss": 0.0211, "num_tokens": 4352724.0, "reward": 6.94384765625, "reward_std": 0.41656577587127686, "rewards/helpfulness_reward/mean": 0.7087640762329102, "rewards/helpfulness_reward/std": 0.6737750768661499, "rewards/safety_reward/mean": 6.94384765625, "rewards/safety_reward/std": 1.8472691774368286, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 51.0703125, "completions/mean_terminated_length": 51.0703125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.08278752947340844, "frac_reward_zero_std": 0.0, "grad_norm": 0.5947611927986145, "kl": 2.466796875, "learning_rate": 5e-05, "loss": 0.027, "num_tokens": 4363149.0, "reward": 7.125244140625, "reward_std": 0.4148294925689697, "rewards/helpfulness_reward/mean": 0.7232074737548828, "rewards/helpfulness_reward/std": 0.9390519857406616, "rewards/safety_reward/mean": 7.125244140625, "rewards/safety_reward/std": 1.149593710899353, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 51.0859375, "completions/mean_terminated_length": 51.0859375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.08313684394376036, "frac_reward_zero_std": 0.0, "grad_norm": 0.6832050085067749, "kl": 2.501953125, "learning_rate": 5e-05, "loss": 0.0143, "num_tokens": 4373864.0, "reward": 7.9765625, "reward_std": 0.33583879470825195, "rewards/helpfulness_reward/mean": 0.9619364738464355, "rewards/helpfulness_reward/std": 0.8019294142723083, "rewards/safety_reward/mean": 7.9765625, "rewards/safety_reward/std": 0.8520976305007935, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 52.140625, "completions/mean_terminated_length": 52.140625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.0834861584141123, "frac_reward_zero_std": 0.0, "grad_norm": 0.6200147271156311, "kl": 2.6640625, "learning_rate": 5e-05, "loss": 0.0263, "num_tokens": 4384938.0, "reward": 7.1693115234375, "reward_std": 0.5671199560165405, "rewards/helpfulness_reward/mean": 0.5305979251861572, "rewards/helpfulness_reward/std": 0.9945369362831116, "rewards/safety_reward/mean": 7.1693115234375, "rewards/safety_reward/std": 1.3892755508422852, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 51.6640625, "completions/mean_terminated_length": 51.6640625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.08383547288446423, "frac_reward_zero_std": 0.0, "grad_norm": 0.5136114358901978, "kl": 2.4609375, "learning_rate": 5e-05, "loss": 0.0202, "num_tokens": 4396495.0, "reward": 7.624755859375, "reward_std": 0.37155625224113464, "rewards/helpfulness_reward/mean": 0.6659414768218994, "rewards/helpfulness_reward/std": 0.7767598032951355, "rewards/safety_reward/mean": 7.624755859375, "rewards/safety_reward/std": 0.6941173672676086, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 51.6484375, "completions/mean_terminated_length": 51.6484375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.08418478735481617, "frac_reward_zero_std": 0.0, "grad_norm": 0.6381754875183105, "kl": 2.576171875, "learning_rate": 5e-05, "loss": 0.011, "num_tokens": 4407714.0, "reward": 6.89501953125, "reward_std": 0.41945910453796387, "rewards/helpfulness_reward/mean": 0.4316136837005615, "rewards/helpfulness_reward/std": 0.811225175857544, "rewards/safety_reward/mean": 6.89501953125, "rewards/safety_reward/std": 0.6495987772941589, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 52.0546875, "completions/mean_terminated_length": 52.0546875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0845341018251681, "frac_reward_zero_std": 0.0, "grad_norm": 0.6245242357254028, "kl": 2.517578125, "learning_rate": 5e-05, "loss": 0.0194, "num_tokens": 4418265.0, "reward": 7.3870849609375, "reward_std": 0.34225332736968994, "rewards/helpfulness_reward/mean": 0.694699764251709, "rewards/helpfulness_reward/std": 0.8758864402770996, "rewards/safety_reward/mean": 7.3870849609375, "rewards/safety_reward/std": 1.0448434352874756, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 51.765625, "completions/mean_terminated_length": 51.765625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.08488341629552004, "frac_reward_zero_std": 0.0, "grad_norm": 0.5909316539764404, "kl": 2.32421875, "learning_rate": 5e-05, "loss": 0.0245, "num_tokens": 4429515.0, "reward": 6.7879638671875, "reward_std": 0.3351929187774658, "rewards/helpfulness_reward/mean": 0.6828763484954834, "rewards/helpfulness_reward/std": 0.9496625065803528, "rewards/safety_reward/mean": 6.7879638671875, "rewards/safety_reward/std": 1.5151941776275635, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 51.96875, "completions/mean_terminated_length": 51.96875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.08523273076587197, "frac_reward_zero_std": 0.0, "grad_norm": 0.7432966828346252, "kl": 2.6171875, "learning_rate": 5e-05, "loss": 0.0284, "num_tokens": 4441431.0, "reward": 7.9921875, "reward_std": 0.39441755414009094, "rewards/helpfulness_reward/mean": 0.9815711975097656, "rewards/helpfulness_reward/std": 0.7170512080192566, "rewards/safety_reward/mean": 7.9921875, "rewards/safety_reward/std": 1.2377785444259644, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 52.921875, "completions/mean_terminated_length": 52.921875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.08558204523622391, "frac_reward_zero_std": 0.0, "grad_norm": 3.1068801879882812, "kl": 3.904296875, "learning_rate": 5e-05, "loss": 0.0408, "num_tokens": 4452197.0, "reward": 7.43896484375, "reward_std": 0.3398527503013611, "rewards/helpfulness_reward/mean": 0.8023242950439453, "rewards/helpfulness_reward/std": 0.8767033219337463, "rewards/safety_reward/mean": 7.43896484375, "rewards/safety_reward/std": 0.9276455640792847, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 51.6796875, "completions/mean_terminated_length": 51.6796875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.08593135970657584, "frac_reward_zero_std": 0.0, "grad_norm": 3.078186273574829, "kl": 3.27734375, "learning_rate": 5e-05, "loss": 0.0395, "num_tokens": 4464292.0, "reward": 7.412086486816406, "reward_std": 0.4256852865219116, "rewards/helpfulness_reward/mean": 0.7961635589599609, "rewards/helpfulness_reward/std": 0.9241849184036255, "rewards/safety_reward/mean": 7.412086486816406, "rewards/safety_reward/std": 1.9256904125213623, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.08628067417692778, "frac_reward_zero_std": 0.0, "grad_norm": 0.7327623963356018, "kl": 2.814453125, "learning_rate": 5e-05, "loss": 0.0332, "num_tokens": 4475116.0, "reward": 7.605712890625, "reward_std": 0.3810673952102661, "rewards/helpfulness_reward/mean": 1.1986312866210938, "rewards/helpfulness_reward/std": 0.5521837472915649, "rewards/safety_reward/mean": 7.605712890625, "rewards/safety_reward/std": 0.9347903728485107, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 51.9921875, "completions/mean_terminated_length": 51.9921875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.08662998864727971, "frac_reward_zero_std": 0.0, "grad_norm": 0.634578287601471, "kl": 2.552734375, "learning_rate": 5e-05, "loss": 0.0188, "num_tokens": 4485707.0, "reward": 7.2908935546875, "reward_std": 0.4097391963005066, "rewards/helpfulness_reward/mean": 0.6093753576278687, "rewards/helpfulness_reward/std": 0.7050984501838684, "rewards/safety_reward/mean": 7.2908935546875, "rewards/safety_reward/std": 1.0752050876617432, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 52.0625, "completions/mean_terminated_length": 52.0625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.08697930311763165, "frac_reward_zero_std": 0.0, "grad_norm": 1.2294237613677979, "kl": 2.9296875, "learning_rate": 5e-05, "loss": 0.0358, "num_tokens": 4500267.0, "reward": 7.117919921875, "reward_std": 0.4038703739643097, "rewards/helpfulness_reward/mean": 0.3442840576171875, "rewards/helpfulness_reward/std": 1.4533631801605225, "rewards/safety_reward/mean": 7.117919921875, "rewards/safety_reward/std": 2.167090654373169, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 51.859375, "completions/mean_terminated_length": 51.859375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.08732861758798358, "frac_reward_zero_std": 0.0, "grad_norm": 0.6161786913871765, "kl": 2.806640625, "learning_rate": 5e-05, "loss": 0.0399, "num_tokens": 4512529.0, "reward": 7.34619140625, "reward_std": 0.36666157841682434, "rewards/helpfulness_reward/mean": 0.6649646759033203, "rewards/helpfulness_reward/std": 0.9879084229469299, "rewards/safety_reward/mean": 7.34619140625, "rewards/safety_reward/std": 1.4240026473999023, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 51.9765625, "completions/mean_terminated_length": 51.9765625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.08767793205833552, "frac_reward_zero_std": 0.0, "grad_norm": 0.6458096504211426, "kl": 3.021484375, "learning_rate": 5e-05, "loss": 0.0306, "num_tokens": 4523542.0, "reward": 7.78271484375, "reward_std": 0.4209515452384949, "rewards/helpfulness_reward/mean": 0.9921978116035461, "rewards/helpfulness_reward/std": 0.9904576539993286, "rewards/safety_reward/mean": 7.78271484375, "rewards/safety_reward/std": 1.3554093837738037, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 51.7578125, "completions/mean_terminated_length": 51.7578125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.08802724652868744, "frac_reward_zero_std": 0.0, "grad_norm": 0.6493793725967407, "kl": 2.681640625, "learning_rate": 5e-05, "loss": 0.0267, "num_tokens": 4533967.0, "reward": 7.400634765625, "reward_std": 0.3049907684326172, "rewards/helpfulness_reward/mean": 0.918264627456665, "rewards/helpfulness_reward/std": 0.6235273480415344, "rewards/safety_reward/mean": 7.400634765625, "rewards/safety_reward/std": 1.0805796384811401, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 51.234375, "completions/mean_terminated_length": 51.234375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.08837656099903939, "frac_reward_zero_std": 0.0, "grad_norm": 0.6152685284614563, "kl": 2.7265625, "learning_rate": 5e-05, "loss": 0.0247, "num_tokens": 4544245.0, "reward": 7.31298828125, "reward_std": 0.34557902812957764, "rewards/helpfulness_reward/mean": 0.9235613346099854, "rewards/helpfulness_reward/std": 0.7135672569274902, "rewards/safety_reward/mean": 7.31298828125, "rewards/safety_reward/std": 1.381392478942871, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 50.765625, "completions/mean_terminated_length": 50.765625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.08872587546939131, "frac_reward_zero_std": 0.0, "grad_norm": 0.5317577123641968, "kl": 2.58984375, "learning_rate": 5e-05, "loss": 0.0278, "num_tokens": 4554935.0, "reward": 7.48583984375, "reward_std": 0.3476046025753021, "rewards/helpfulness_reward/mean": 1.0612367391586304, "rewards/helpfulness_reward/std": 0.8918705582618713, "rewards/safety_reward/mean": 7.48583984375, "rewards/safety_reward/std": 1.0297677516937256, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 51.4453125, "completions/mean_terminated_length": 51.4453125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.08907518993974325, "frac_reward_zero_std": 0.0, "grad_norm": 0.6642631888389587, "kl": 2.7734375, "learning_rate": 5e-05, "loss": 0.0204, "num_tokens": 4565640.0, "reward": 7.83349609375, "reward_std": 0.316476047039032, "rewards/helpfulness_reward/mean": 1.1107655763626099, "rewards/helpfulness_reward/std": 0.7306391596794128, "rewards/safety_reward/mean": 7.83349609375, "rewards/safety_reward/std": 1.1360872983932495, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 51.0390625, "completions/mean_terminated_length": 51.0390625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.08942450441009518, "frac_reward_zero_std": 0.0, "grad_norm": 0.4734959304332733, "kl": 2.830078125, "learning_rate": 5e-05, "loss": 0.0157, "num_tokens": 4576637.0, "reward": 7.1815185546875, "reward_std": 0.429790735244751, "rewards/helpfulness_reward/mean": 0.8092708587646484, "rewards/helpfulness_reward/std": 0.8916334509849548, "rewards/safety_reward/mean": 7.1815185546875, "rewards/safety_reward/std": 1.0426902770996094, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 50.4296875, "completions/mean_terminated_length": 50.4296875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.08977381888044712, "frac_reward_zero_std": 0.0, "grad_norm": 0.656073808670044, "kl": 2.775390625, "learning_rate": 5e-05, "loss": -0.0029, "num_tokens": 4587044.0, "reward": 7.740478515625, "reward_std": 0.5901978015899658, "rewards/helpfulness_reward/mean": 1.2657170295715332, "rewards/helpfulness_reward/std": 0.6612967252731323, "rewards/safety_reward/mean": 7.740478515625, "rewards/safety_reward/std": 1.2292369604110718, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 50.6953125, "completions/mean_terminated_length": 50.6953125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.09012313335079905, "frac_reward_zero_std": 0.0, "grad_norm": 0.8501467108726501, "kl": 2.73828125, "learning_rate": 5e-05, "loss": 0.0344, "num_tokens": 4599325.0, "reward": 7.2786865234375, "reward_std": 0.42672842741012573, "rewards/helpfulness_reward/mean": 0.5365457534790039, "rewards/helpfulness_reward/std": 0.8059032559394836, "rewards/safety_reward/mean": 7.2786865234375, "rewards/safety_reward/std": 1.392653226852417, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 50.578125, "completions/mean_terminated_length": 50.578125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.09047244782115099, "frac_reward_zero_std": 0.0, "grad_norm": 0.6736500859260559, "kl": 2.828125, "learning_rate": 5e-05, "loss": 0.0227, "num_tokens": 4610255.0, "reward": 7.739013671875, "reward_std": 0.3295970559120178, "rewards/helpfulness_reward/mean": 0.6908489465713501, "rewards/helpfulness_reward/std": 0.7089573740959167, "rewards/safety_reward/mean": 7.739013671875, "rewards/safety_reward/std": 0.8760941624641418, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 51.2265625, "completions/mean_terminated_length": 51.2265625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.09082176229150292, "frac_reward_zero_std": 0.0, "grad_norm": 0.6463622450828552, "kl": 2.7265625, "learning_rate": 5e-05, "loss": 0.0257, "num_tokens": 4620364.0, "reward": 7.51416015625, "reward_std": 0.3786676526069641, "rewards/helpfulness_reward/mean": 0.8147573471069336, "rewards/helpfulness_reward/std": 0.6364395022392273, "rewards/safety_reward/mean": 7.51416015625, "rewards/safety_reward/std": 1.1178748607635498, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 73.546875, "completions/mean_terminated_length": 73.546875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.09117107676185486, "frac_reward_zero_std": 0.0, "grad_norm": 0.6062162518501282, "kl": 2.35546875, "learning_rate": 5e-05, "loss": 0.0608, "num_tokens": 4638146.0, "reward": 6.971534729003906, "reward_std": 0.3437599241733551, "rewards/helpfulness_reward/mean": 0.7982988357543945, "rewards/helpfulness_reward/std": 0.748300313949585, "rewards/safety_reward/mean": 6.971534729003906, "rewards/safety_reward/std": 2.4689671993255615, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 50.984375, "completions/mean_terminated_length": 50.984375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.09152039123220679, "frac_reward_zero_std": 0.0, "grad_norm": 2.539105176925659, "kl": 4.359375, "learning_rate": 5e-05, "loss": 0.0417, "num_tokens": 4648272.0, "reward": 7.721923828125, "reward_std": 0.28090500831604004, "rewards/helpfulness_reward/mean": 1.1433639526367188, "rewards/helpfulness_reward/std": 0.8813363909721375, "rewards/safety_reward/mean": 7.721923828125, "rewards/safety_reward/std": 1.0141927003860474, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 51.3203125, "completions/mean_terminated_length": 51.3203125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.09186970570255873, "frac_reward_zero_std": 0.0, "grad_norm": 0.7476218938827515, "kl": 2.650390625, "learning_rate": 5e-05, "loss": 0.0292, "num_tokens": 4659081.0, "reward": 8.0244140625, "reward_std": 0.32715609669685364, "rewards/helpfulness_reward/mean": 1.0993175506591797, "rewards/helpfulness_reward/std": 0.99833083152771, "rewards/safety_reward/mean": 8.0244140625, "rewards/safety_reward/std": 1.2420635223388672, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 51.359375, "completions/mean_terminated_length": 51.359375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.09221902017291066, "frac_reward_zero_std": 0.0, "grad_norm": 0.683851420879364, "kl": 2.859375, "learning_rate": 5e-05, "loss": 0.026, "num_tokens": 4668991.0, "reward": 7.562744140625, "reward_std": 0.22404858469963074, "rewards/helpfulness_reward/mean": 0.8782990574836731, "rewards/helpfulness_reward/std": 0.7544283270835876, "rewards/safety_reward/mean": 7.562744140625, "rewards/safety_reward/std": 1.0057084560394287, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 103.0, "completions/max_terminated_length": 103.0, "completions/mean_length": 52.0703125, "completions/mean_terminated_length": 52.0703125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.0925683346432626, "frac_reward_zero_std": 0.0, "grad_norm": 0.5951545834541321, "kl": 2.8671875, "learning_rate": 5e-05, "loss": 0.0213, "num_tokens": 4679432.0, "reward": 7.456787109375, "reward_std": 0.30867719650268555, "rewards/helpfulness_reward/mean": 1.21254301071167, "rewards/helpfulness_reward/std": 0.8700965642929077, "rewards/safety_reward/mean": 7.456787109375, "rewards/safety_reward/std": 1.5923256874084473, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 51.046875, "completions/mean_terminated_length": 51.046875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.09291764911361453, "frac_reward_zero_std": 0.0, "grad_norm": 0.5204070806503296, "kl": 2.732421875, "learning_rate": 5e-05, "loss": 0.0321, "num_tokens": 4689526.0, "reward": 7.781005859375, "reward_std": 0.3156229853630066, "rewards/helpfulness_reward/mean": 1.1358070373535156, "rewards/helpfulness_reward/std": 0.6886677742004395, "rewards/safety_reward/mean": 7.781005859375, "rewards/safety_reward/std": 0.8582165241241455, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 50.46875, "completions/mean_terminated_length": 50.46875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.09326696358396647, "frac_reward_zero_std": 0.0, "grad_norm": 1.6445039510726929, "kl": 3.189453125, "learning_rate": 5e-05, "loss": 0.0271, "num_tokens": 4702530.0, "reward": 7.1563720703125, "reward_std": 0.41955405473709106, "rewards/helpfulness_reward/mean": 0.7203445434570312, "rewards/helpfulness_reward/std": 1.2981153726577759, "rewards/safety_reward/mean": 7.1563720703125, "rewards/safety_reward/std": 1.319413185119629, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 51.140625, "completions/mean_terminated_length": 51.140625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.0936162780543184, "frac_reward_zero_std": 0.0, "grad_norm": 0.6405123472213745, "kl": 3.060546875, "learning_rate": 5e-05, "loss": 0.0254, "num_tokens": 4712644.0, "reward": 7.676513671875, "reward_std": 0.22237437963485718, "rewards/helpfulness_reward/mean": 1.0448007583618164, "rewards/helpfulness_reward/std": 0.7499186396598816, "rewards/safety_reward/mean": 7.676513671875, "rewards/safety_reward/std": 0.9857544302940369, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 50.65625, "completions/mean_terminated_length": 50.65625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.09396559252467034, "frac_reward_zero_std": 0.0, "grad_norm": 0.7482747435569763, "kl": 2.818359375, "learning_rate": 5e-05, "loss": 0.0265, "num_tokens": 4723120.0, "reward": 7.4205322265625, "reward_std": 0.41459202766418457, "rewards/helpfulness_reward/mean": 0.8500032424926758, "rewards/helpfulness_reward/std": 0.9326725006103516, "rewards/safety_reward/mean": 7.4205322265625, "rewards/safety_reward/std": 1.1311230659484863, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 51.171875, "completions/mean_terminated_length": 51.171875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.09431490699502226, "frac_reward_zero_std": 0.0, "grad_norm": 0.7867903709411621, "kl": 3.21484375, "learning_rate": 5e-05, "loss": 0.0287, "num_tokens": 4736054.0, "reward": 8.0322265625, "reward_std": 0.27828848361968994, "rewards/helpfulness_reward/mean": 0.9301223754882812, "rewards/helpfulness_reward/std": 0.7219776511192322, "rewards/safety_reward/mean": 8.0322265625, "rewards/safety_reward/std": 0.9114063382148743, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 50.5078125, "completions/mean_terminated_length": 50.5078125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.0946642214653742, "frac_reward_zero_std": 0.0, "grad_norm": 1.0414550304412842, "kl": 3.150390625, "learning_rate": 5e-05, "loss": 0.026, "num_tokens": 4747127.0, "reward": 7.614013671875, "reward_std": 0.4092431962490082, "rewards/helpfulness_reward/mean": 0.943084716796875, "rewards/helpfulness_reward/std": 0.7528203129768372, "rewards/safety_reward/mean": 7.614013671875, "rewards/safety_reward/std": 1.0607949495315552, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 51.4609375, "completions/mean_terminated_length": 51.4609375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.09501353593572613, "frac_reward_zero_std": 0.0, "grad_norm": 0.44053518772125244, "kl": 2.748046875, "learning_rate": 5e-05, "loss": 0.0297, "num_tokens": 4758706.0, "reward": 7.3494873046875, "reward_std": 0.37226665019989014, "rewards/helpfulness_reward/mean": 1.2064943313598633, "rewards/helpfulness_reward/std": 0.6547616124153137, "rewards/safety_reward/mean": 7.3494873046875, "rewards/safety_reward/std": 1.7200177907943726, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 50.5703125, "completions/mean_terminated_length": 50.5703125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.09536285040607807, "frac_reward_zero_std": 0.0, "grad_norm": 0.46845415234565735, "kl": 2.767578125, "learning_rate": 5e-05, "loss": 0.0225, "num_tokens": 4768403.0, "reward": 7.52099609375, "reward_std": 0.2412329912185669, "rewards/helpfulness_reward/mean": 1.0560989379882812, "rewards/helpfulness_reward/std": 0.6657376885414124, "rewards/safety_reward/mean": 7.52099609375, "rewards/safety_reward/std": 1.0067601203918457, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 51.6640625, "completions/mean_terminated_length": 51.6640625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.09571216487643, "frac_reward_zero_std": 0.0, "grad_norm": 0.6467104554176331, "kl": 3.00390625, "learning_rate": 5e-05, "loss": 0.0303, "num_tokens": 4780072.0, "reward": 8.03076171875, "reward_std": 0.3328056335449219, "rewards/helpfulness_reward/mean": 1.1097126007080078, "rewards/helpfulness_reward/std": 0.8093951940536499, "rewards/safety_reward/mean": 8.03076171875, "rewards/safety_reward/std": 1.2067313194274902, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 50.90625, "completions/mean_terminated_length": 50.90625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.09606147934678194, "frac_reward_zero_std": 0.0, "grad_norm": 0.5411428809165955, "kl": 2.923828125, "learning_rate": 5e-05, "loss": 0.0209, "num_tokens": 4790604.0, "reward": 7.41162109375, "reward_std": 0.2861209213733673, "rewards/helpfulness_reward/mean": 1.1339693069458008, "rewards/helpfulness_reward/std": 0.5651072263717651, "rewards/safety_reward/mean": 7.41162109375, "rewards/safety_reward/std": 1.1252057552337646, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 50.40625, "completions/mean_terminated_length": 50.40625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.09641079381713387, "frac_reward_zero_std": 0.0, "grad_norm": 0.5285611152648926, "kl": 2.798828125, "learning_rate": 5e-05, "loss": 0.0178, "num_tokens": 4802024.0, "reward": 7.7459716796875, "reward_std": 0.45552995800971985, "rewards/helpfulness_reward/mean": 0.6318340301513672, "rewards/helpfulness_reward/std": 1.2051146030426025, "rewards/safety_reward/mean": 7.7459716796875, "rewards/safety_reward/std": 1.3627263307571411, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 50.78125, "completions/mean_terminated_length": 50.78125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.09676010828748581, "frac_reward_zero_std": 0.0, "grad_norm": 0.5657576322555542, "kl": 2.89453125, "learning_rate": 5e-05, "loss": 0.0132, "num_tokens": 4811964.0, "reward": 7.6279296875, "reward_std": 0.40276217460632324, "rewards/helpfulness_reward/mean": 1.129643440246582, "rewards/helpfulness_reward/std": 0.5917001366615295, "rewards/safety_reward/mean": 7.6279296875, "rewards/safety_reward/std": 1.0771657228469849, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 50.8671875, "completions/mean_terminated_length": 50.8671875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.09710942275783774, "frac_reward_zero_std": 0.0, "grad_norm": 2.267714023590088, "kl": 3.900390625, "learning_rate": 5e-05, "loss": 0.0484, "num_tokens": 4823019.0, "reward": 7.787353515625, "reward_std": 0.3293289244174957, "rewards/helpfulness_reward/mean": 0.7321476340293884, "rewards/helpfulness_reward/std": 0.8513231873512268, "rewards/safety_reward/mean": 7.787353515625, "rewards/safety_reward/std": 0.9780129194259644, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1088.0, "completions/max_terminated_length": 1088.0, "completions/mean_length": 86.8203125, "completions/mean_terminated_length": 86.8203125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.09745873722818968, "frac_reward_zero_std": 0.0, "grad_norm": 0.5550286769866943, "kl": 2.40234375, "learning_rate": 5e-05, "loss": 0.1299, "num_tokens": 4840044.0, "reward": 7.491096496582031, "reward_std": 0.3477376401424408, "rewards/helpfulness_reward/mean": 0.8979253768920898, "rewards/helpfulness_reward/std": 0.9646686911582947, "rewards/safety_reward/mean": 7.491096496582031, "rewards/safety_reward/std": 1.9282143115997314, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 51.265625, "completions/mean_terminated_length": 51.265625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.0978080516985416, "frac_reward_zero_std": 0.0, "grad_norm": 0.5763794183731079, "kl": 2.728515625, "learning_rate": 5e-05, "loss": 0.0482, "num_tokens": 4851374.0, "reward": 7.832275390625, "reward_std": 0.3801168203353882, "rewards/helpfulness_reward/mean": 0.8061704635620117, "rewards/helpfulness_reward/std": 0.8026238083839417, "rewards/safety_reward/mean": 7.832275390625, "rewards/safety_reward/std": 1.542359709739685, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 51.3515625, "completions/mean_terminated_length": 51.3515625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.09815736616889355, "frac_reward_zero_std": 0.0, "grad_norm": 0.40362313389778137, "kl": 2.837890625, "learning_rate": 5e-05, "loss": 0.0335, "num_tokens": 4862299.0, "reward": 8.096923828125, "reward_std": 0.32297998666763306, "rewards/helpfulness_reward/mean": 1.2580413818359375, "rewards/helpfulness_reward/std": 1.0939637422561646, "rewards/safety_reward/mean": 8.096923828125, "rewards/safety_reward/std": 1.1328998804092407, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 51.140625, "completions/mean_terminated_length": 51.140625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.09850668063924548, "frac_reward_zero_std": 0.0, "grad_norm": 0.6529896855354309, "kl": 3.197265625, "learning_rate": 5e-05, "loss": 0.0557, "num_tokens": 4873637.0, "reward": 7.830810546875, "reward_std": 0.37302064895629883, "rewards/helpfulness_reward/mean": 0.7477531433105469, "rewards/helpfulness_reward/std": 0.8861419558525085, "rewards/safety_reward/mean": 7.830810546875, "rewards/safety_reward/std": 0.9543879628181458, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 50.4765625, "completions/mean_terminated_length": 50.4765625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.09885599510959742, "frac_reward_zero_std": 0.0, "grad_norm": 0.7061375975608826, "kl": 3.09375, "learning_rate": 5e-05, "loss": 0.0287, "num_tokens": 4884242.0, "reward": 7.48193359375, "reward_std": 0.3345058858394623, "rewards/helpfulness_reward/mean": 0.9624118804931641, "rewards/helpfulness_reward/std": 0.6660060882568359, "rewards/safety_reward/mean": 7.48193359375, "rewards/safety_reward/std": 1.090349793434143, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 51.3984375, "completions/mean_terminated_length": 51.3984375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.09920530957994934, "frac_reward_zero_std": 0.0, "grad_norm": 0.69968181848526, "kl": 2.83984375, "learning_rate": 5e-05, "loss": 0.0549, "num_tokens": 4895245.0, "reward": 7.76806640625, "reward_std": 0.3606148958206177, "rewards/helpfulness_reward/mean": 1.2713532447814941, "rewards/helpfulness_reward/std": 0.7347195148468018, "rewards/safety_reward/mean": 7.76806640625, "rewards/safety_reward/std": 1.0305346250534058, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 50.8203125, "completions/mean_terminated_length": 50.8203125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.09955462405030129, "frac_reward_zero_std": 0.0, "grad_norm": 0.540401041507721, "kl": 3.01953125, "learning_rate": 5e-05, "loss": 0.0393, "num_tokens": 4907238.0, "reward": 7.552734375, "reward_std": 0.30909663438796997, "rewards/helpfulness_reward/mean": 1.0640113353729248, "rewards/helpfulness_reward/std": 0.5759408473968506, "rewards/safety_reward/mean": 7.552734375, "rewards/safety_reward/std": 0.8635156750679016, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 50.296875, "completions/mean_terminated_length": 50.296875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.09990393852065321, "frac_reward_zero_std": 0.0, "grad_norm": 1.4126076698303223, "kl": 3.015625, "learning_rate": 5e-05, "loss": 0.0383, "num_tokens": 4919788.0, "reward": 6.8912353515625, "reward_std": 0.3815104365348816, "rewards/helpfulness_reward/mean": 0.8975794315338135, "rewards/helpfulness_reward/std": 0.88361656665802, "rewards/safety_reward/mean": 6.8912353515625, "rewards/safety_reward/std": 1.836322546005249, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 50.3359375, "completions/mean_terminated_length": 50.3359375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.10025325299100515, "frac_reward_zero_std": 0.0, "grad_norm": 0.5973515510559082, "kl": 2.763671875, "learning_rate": 5e-05, "loss": 0.0302, "num_tokens": 4930655.0, "reward": 8.2506103515625, "reward_std": 0.38389086723327637, "rewards/helpfulness_reward/mean": 1.2900161743164062, "rewards/helpfulness_reward/std": 0.65638667345047, "rewards/safety_reward/mean": 8.2506103515625, "rewards/safety_reward/std": 1.122866153717041, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 50.671875, "completions/mean_terminated_length": 50.671875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.10060256746135708, "frac_reward_zero_std": 0.0, "grad_norm": 0.6394835114479065, "kl": 2.71484375, "learning_rate": 5e-05, "loss": 0.0256, "num_tokens": 4941205.0, "reward": 7.752197265625, "reward_std": 0.3741064667701721, "rewards/helpfulness_reward/mean": 0.9714020490646362, "rewards/helpfulness_reward/std": 0.6021812558174133, "rewards/safety_reward/mean": 7.752197265625, "rewards/safety_reward/std": 1.0893590450286865, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 51.1015625, "completions/mean_terminated_length": 51.1015625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.10095188193170902, "frac_reward_zero_std": 0.0, "grad_norm": 0.5424870252609253, "kl": 2.806640625, "learning_rate": 5e-05, "loss": 0.0416, "num_tokens": 4952082.0, "reward": 7.588134765625, "reward_std": 0.3194371461868286, "rewards/helpfulness_reward/mean": 0.9443454742431641, "rewards/helpfulness_reward/std": 0.8756372928619385, "rewards/safety_reward/mean": 7.588134765625, "rewards/safety_reward/std": 1.245398998260498, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 50.90625, "completions/mean_terminated_length": 50.90625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.10130119640206095, "frac_reward_zero_std": 0.0, "grad_norm": 0.6275315284729004, "kl": 2.970703125, "learning_rate": 5e-05, "loss": 0.0328, "num_tokens": 4962310.0, "reward": 7.305419921875, "reward_std": 0.3106255531311035, "rewards/helpfulness_reward/mean": 0.9678001403808594, "rewards/helpfulness_reward/std": 0.9214643836021423, "rewards/safety_reward/mean": 7.305419921875, "rewards/safety_reward/std": 1.455384612083435, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 50.8203125, "completions/mean_terminated_length": 50.8203125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.10165051087241289, "frac_reward_zero_std": 0.0, "grad_norm": 0.4680807888507843, "kl": 2.798828125, "learning_rate": 5e-05, "loss": 0.0363, "num_tokens": 4973567.0, "reward": 8.44970703125, "reward_std": 0.30223241448402405, "rewards/helpfulness_reward/mean": 0.9971208572387695, "rewards/helpfulness_reward/std": 0.6864738464355469, "rewards/safety_reward/mean": 8.44970703125, "rewards/safety_reward/std": 1.1191767454147339, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 51.0625, "completions/mean_terminated_length": 51.0625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.10199982534276482, "frac_reward_zero_std": 0.0, "grad_norm": 0.44101768732070923, "kl": 2.802734375, "learning_rate": 5e-05, "loss": 0.0279, "num_tokens": 4984655.0, "reward": 7.8427734375, "reward_std": 0.2560751736164093, "rewards/helpfulness_reward/mean": 0.9801411628723145, "rewards/helpfulness_reward/std": 0.6793422102928162, "rewards/safety_reward/mean": 7.8427734375, "rewards/safety_reward/std": 1.104464054107666, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 51.359375, "completions/mean_terminated_length": 51.359375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.10234913981311676, "frac_reward_zero_std": 0.0, "grad_norm": 0.5770735740661621, "kl": 2.884765625, "learning_rate": 5e-05, "loss": 0.0275, "num_tokens": 4996093.0, "reward": 8.26513671875, "reward_std": 0.20187577605247498, "rewards/helpfulness_reward/mean": 1.0436859130859375, "rewards/helpfulness_reward/std": 0.5371475219726562, "rewards/safety_reward/mean": 8.26513671875, "rewards/safety_reward/std": 0.8039658665657043, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 50.984375, "completions/mean_terminated_length": 50.984375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.10269845428346869, "frac_reward_zero_std": 0.0, "grad_norm": 0.5636701583862305, "kl": 2.84765625, "learning_rate": 5e-05, "loss": 0.0368, "num_tokens": 5008003.0, "reward": 7.3060302734375, "reward_std": 0.26703453063964844, "rewards/helpfulness_reward/mean": 0.8618984222412109, "rewards/helpfulness_reward/std": 0.9478545188903809, "rewards/safety_reward/mean": 7.3060302734375, "rewards/safety_reward/std": 1.6542751789093018, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 51.375, "completions/mean_terminated_length": 51.375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.10304776875382063, "frac_reward_zero_std": 0.0, "grad_norm": 0.6300445199012756, "kl": 2.6875, "learning_rate": 5e-05, "loss": 0.0209, "num_tokens": 5018763.0, "reward": 7.465576171875, "reward_std": 0.3047492802143097, "rewards/helpfulness_reward/mean": 1.0124664306640625, "rewards/helpfulness_reward/std": 0.9445160627365112, "rewards/safety_reward/mean": 7.465576171875, "rewards/safety_reward/std": 1.2782297134399414, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 51.4140625, "completions/mean_terminated_length": 51.4140625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.10339708322417256, "frac_reward_zero_std": 0.0, "grad_norm": 0.7416171431541443, "kl": 3.154296875, "learning_rate": 5e-05, "loss": 0.0377, "num_tokens": 5029360.0, "reward": 7.71044921875, "reward_std": 0.24539822340011597, "rewards/helpfulness_reward/mean": 0.9841461181640625, "rewards/helpfulness_reward/std": 0.7182735204696655, "rewards/safety_reward/mean": 7.71044921875, "rewards/safety_reward/std": 0.9034114480018616, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 51.96875, "completions/mean_terminated_length": 51.96875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.1037463976945245, "frac_reward_zero_std": 0.0, "grad_norm": 0.5929480791091919, "kl": 3.076171875, "learning_rate": 5e-05, "loss": 0.0315, "num_tokens": 5040596.0, "reward": 7.591796875, "reward_std": 0.23995347321033478, "rewards/helpfulness_reward/mean": 1.0037333965301514, "rewards/helpfulness_reward/std": 0.786497175693512, "rewards/safety_reward/mean": 7.591796875, "rewards/safety_reward/std": 1.354302167892456, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 60.515625, "completions/mean_terminated_length": 60.515625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.10409571216487642, "frac_reward_zero_std": 0.0, "grad_norm": 0.5208987593650818, "kl": 2.595703125, "learning_rate": 5e-05, "loss": 0.0624, "num_tokens": 5054318.0, "reward": 7.216705322265625, "reward_std": 0.3237225115299225, "rewards/helpfulness_reward/mean": 0.5916385650634766, "rewards/helpfulness_reward/std": 0.6872313618659973, "rewards/safety_reward/mean": 7.216705322265625, "rewards/safety_reward/std": 1.748558759689331, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 55.1015625, "completions/mean_terminated_length": 55.1015625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.10444502663522837, "frac_reward_zero_std": 0.0, "grad_norm": 0.5979918837547302, "kl": 2.646484375, "learning_rate": 5e-05, "loss": 0.0554, "num_tokens": 5065539.0, "reward": 7.18939208984375, "reward_std": 0.4574136734008789, "rewards/helpfulness_reward/mean": 0.8681039810180664, "rewards/helpfulness_reward/std": 0.8235934376716614, "rewards/safety_reward/mean": 7.18939208984375, "rewards/safety_reward/std": 1.4106847047805786, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 51.7734375, "completions/mean_terminated_length": 51.7734375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.1047943411055803, "frac_reward_zero_std": 0.0, "grad_norm": 0.6582717895507812, "kl": 3.1171875, "learning_rate": 5e-05, "loss": 0.0351, "num_tokens": 5075798.0, "reward": 7.7244873046875, "reward_std": 0.32227054238319397, "rewards/helpfulness_reward/mean": 0.9513269662857056, "rewards/helpfulness_reward/std": 0.6497437357902527, "rewards/safety_reward/mean": 7.7244873046875, "rewards/safety_reward/std": 1.2831178903579712, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 59.8125, "completions/mean_terminated_length": 59.8125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.10514365557593223, "frac_reward_zero_std": 0.0, "grad_norm": 0.6461156010627747, "kl": 2.677734375, "learning_rate": 5e-05, "loss": 0.1324, "num_tokens": 5086910.0, "reward": 7.35205078125, "reward_std": 0.49643927812576294, "rewards/helpfulness_reward/mean": 0.6511080265045166, "rewards/helpfulness_reward/std": 0.6904481649398804, "rewards/safety_reward/mean": 7.35205078125, "rewards/safety_reward/std": 1.3796908855438232, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 51.4140625, "completions/mean_terminated_length": 51.4140625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.10549297004628416, "frac_reward_zero_std": 0.0, "grad_norm": 0.5852591395378113, "kl": 2.666015625, "learning_rate": 5e-05, "loss": 0.0223, "num_tokens": 5098115.0, "reward": 7.7125244140625, "reward_std": 0.35976600646972656, "rewards/helpfulness_reward/mean": 0.952690601348877, "rewards/helpfulness_reward/std": 0.8030321002006531, "rewards/safety_reward/mean": 7.7125244140625, "rewards/safety_reward/std": 1.3236403465270996, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 50.4453125, "completions/mean_terminated_length": 50.4453125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.1058422845166361, "frac_reward_zero_std": 0.0, "grad_norm": 0.7389498949050903, "kl": 3.13671875, "learning_rate": 5e-05, "loss": 0.0151, "num_tokens": 5109740.0, "reward": 7.4641876220703125, "reward_std": 0.4237574338912964, "rewards/helpfulness_reward/mean": 0.5919783115386963, "rewards/helpfulness_reward/std": 0.6678202152252197, "rewards/safety_reward/mean": 7.4641876220703125, "rewards/safety_reward/std": 1.7589983940124512, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 50.9140625, "completions/mean_terminated_length": 50.9140625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.10619159898698803, "frac_reward_zero_std": 0.0, "grad_norm": 0.4530092179775238, "kl": 2.857421875, "learning_rate": 5e-05, "loss": 0.0192, "num_tokens": 5121273.0, "reward": 7.67333984375, "reward_std": 0.2966079115867615, "rewards/helpfulness_reward/mean": 0.6907625198364258, "rewards/helpfulness_reward/std": 0.7628931403160095, "rewards/safety_reward/mean": 7.67333984375, "rewards/safety_reward/std": 1.0277795791625977, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 51.609375, "completions/mean_terminated_length": 51.609375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.10654091345733997, "frac_reward_zero_std": 0.0, "grad_norm": 0.6301886439323425, "kl": 2.759765625, "learning_rate": 5e-05, "loss": 0.0356, "num_tokens": 5132231.0, "reward": 7.805908203125, "reward_std": 0.24283544719219208, "rewards/helpfulness_reward/mean": 0.718846321105957, "rewards/helpfulness_reward/std": 0.8612838387489319, "rewards/safety_reward/mean": 7.805908203125, "rewards/safety_reward/std": 1.1970365047454834, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 51.140625, "completions/mean_terminated_length": 51.140625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.1068902279276919, "frac_reward_zero_std": 0.0, "grad_norm": 0.5629796385765076, "kl": 2.927734375, "learning_rate": 5e-05, "loss": 0.0285, "num_tokens": 5144513.0, "reward": 7.72607421875, "reward_std": 0.4256669282913208, "rewards/helpfulness_reward/mean": 0.9421825408935547, "rewards/helpfulness_reward/std": 0.6484464406967163, "rewards/safety_reward/mean": 7.72607421875, "rewards/safety_reward/std": 1.2181756496429443, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1112.0, "completions/max_terminated_length": 1112.0, "completions/mean_length": 77.0546875, "completions/mean_terminated_length": 77.0546875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.10723954239804384, "frac_reward_zero_std": 0.0, "grad_norm": 2.7616820335388184, "kl": 3.1328125, "learning_rate": 5e-05, "loss": 0.0739, "num_tokens": 5159872.0, "reward": 7.50775146484375, "reward_std": 0.2842238247394562, "rewards/helpfulness_reward/mean": 1.2978401184082031, "rewards/helpfulness_reward/std": 0.6223834156990051, "rewards/safety_reward/mean": 7.50775146484375, "rewards/safety_reward/std": 1.653106689453125, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 50.6484375, "completions/mean_terminated_length": 50.6484375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.10758885686839577, "frac_reward_zero_std": 0.0, "grad_norm": 0.6890342831611633, "kl": 2.837890625, "learning_rate": 5e-05, "loss": 0.0173, "num_tokens": 5172091.0, "reward": 7.638671875, "reward_std": 0.444500207901001, "rewards/helpfulness_reward/mean": 1.2932281494140625, "rewards/helpfulness_reward/std": 0.8822053670883179, "rewards/safety_reward/mean": 7.638671875, "rewards/safety_reward/std": 1.1516671180725098, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 50.84375, "completions/mean_terminated_length": 50.84375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.10793817133874771, "frac_reward_zero_std": 0.0, "grad_norm": 0.5016152262687683, "kl": 3.025390625, "learning_rate": 5e-05, "loss": 0.0226, "num_tokens": 5182759.0, "reward": 7.894287109375, "reward_std": 0.35509034991264343, "rewards/helpfulness_reward/mean": 1.0423400402069092, "rewards/helpfulness_reward/std": 0.8572023510932922, "rewards/safety_reward/mean": 7.894287109375, "rewards/safety_reward/std": 1.0439566373825073, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 49.8828125, "completions/mean_terminated_length": 49.8828125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.10828748580909964, "frac_reward_zero_std": 0.0, "grad_norm": 0.6135720610618591, "kl": 2.814453125, "learning_rate": 5e-05, "loss": 0.0414, "num_tokens": 5196168.0, "reward": 7.3541259765625, "reward_std": 0.40483298897743225, "rewards/helpfulness_reward/mean": 0.9680252075195312, "rewards/helpfulness_reward/std": 0.7024363279342651, "rewards/safety_reward/mean": 7.3541259765625, "rewards/safety_reward/std": 1.483892798423767, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 51.328125, "completions/mean_terminated_length": 51.328125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.10863680027945158, "frac_reward_zero_std": 0.0, "grad_norm": 0.5571587085723877, "kl": 3.05859375, "learning_rate": 5e-05, "loss": 0.0328, "num_tokens": 5207586.0, "reward": 7.529541015625, "reward_std": 0.45452389121055603, "rewards/helpfulness_reward/mean": 0.9562568664550781, "rewards/helpfulness_reward/std": 0.76007080078125, "rewards/safety_reward/mean": 7.529541015625, "rewards/safety_reward/std": 1.1568270921707153, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 51.2265625, "completions/mean_terminated_length": 51.2265625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.1089861147498035, "frac_reward_zero_std": 0.0, "grad_norm": 1.1172174215316772, "kl": 3.625, "learning_rate": 5e-05, "loss": 0.0226, "num_tokens": 5217671.0, "reward": 7.0968017578125, "reward_std": 0.40008845925331116, "rewards/helpfulness_reward/mean": 1.1665363311767578, "rewards/helpfulness_reward/std": 0.5637357831001282, "rewards/safety_reward/mean": 7.0968017578125, "rewards/safety_reward/std": 0.967971920967102, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 51.3984375, "completions/mean_terminated_length": 51.3984375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.10933542922015545, "frac_reward_zero_std": 0.0, "grad_norm": 0.5255357623100281, "kl": 3.015625, "learning_rate": 5e-05, "loss": 0.0299, "num_tokens": 5228026.0, "reward": 7.6708984375, "reward_std": 0.2981686592102051, "rewards/helpfulness_reward/mean": 1.1529958248138428, "rewards/helpfulness_reward/std": 0.8283553719520569, "rewards/safety_reward/mean": 7.6708984375, "rewards/safety_reward/std": 1.073601245880127, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 51.59375, "completions/mean_terminated_length": 51.59375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.10968474369050737, "frac_reward_zero_std": 0.0, "grad_norm": 0.5391107201576233, "kl": 3.001953125, "learning_rate": 5e-05, "loss": 0.0431, "num_tokens": 5238454.0, "reward": 7.7880859375, "reward_std": 0.3795875012874603, "rewards/helpfulness_reward/mean": 1.1347026824951172, "rewards/helpfulness_reward/std": 0.9700644612312317, "rewards/safety_reward/mean": 7.7880859375, "rewards/safety_reward/std": 1.0569632053375244, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 51.5234375, "completions/mean_terminated_length": 51.5234375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.11003405816085932, "frac_reward_zero_std": 0.0, "grad_norm": 0.48926928639411926, "kl": 2.759765625, "learning_rate": 5e-05, "loss": 0.0252, "num_tokens": 5249337.0, "reward": 8.1416015625, "reward_std": 0.2926212549209595, "rewards/helpfulness_reward/mean": 0.8457412719726562, "rewards/helpfulness_reward/std": 0.708624005317688, "rewards/safety_reward/mean": 8.1416015625, "rewards/safety_reward/std": 1.359103798866272, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 51.9921875, "completions/mean_terminated_length": 51.9921875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.11038337263121124, "frac_reward_zero_std": 0.0, "grad_norm": 1.6794112920761108, "kl": 4.013671875, "learning_rate": 5e-05, "loss": 0.0531, "num_tokens": 5261056.0, "reward": 7.5274658203125, "reward_std": 0.5520209670066833, "rewards/helpfulness_reward/mean": 0.6396504044532776, "rewards/helpfulness_reward/std": 0.8514429926872253, "rewards/safety_reward/mean": 7.5274658203125, "rewards/safety_reward/std": 1.0255351066589355, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 51.578125, "completions/mean_terminated_length": 51.578125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.11073268710156318, "frac_reward_zero_std": 0.0, "grad_norm": 0.5035623908042908, "kl": 2.9609375, "learning_rate": 5e-05, "loss": 0.0372, "num_tokens": 5271778.0, "reward": 7.707275390625, "reward_std": 0.3051682114601135, "rewards/helpfulness_reward/mean": 0.9332847595214844, "rewards/helpfulness_reward/std": 0.8653374314308167, "rewards/safety_reward/mean": 7.707275390625, "rewards/safety_reward/std": 1.2906163930892944, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 51.4453125, "completions/mean_terminated_length": 51.4453125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.11108200157191511, "frac_reward_zero_std": 0.0, "grad_norm": 0.5983542799949646, "kl": 3.10546875, "learning_rate": 5e-05, "loss": 0.0181, "num_tokens": 5282091.0, "reward": 8.086181640625, "reward_std": 0.3683314323425293, "rewards/helpfulness_reward/mean": 0.995112419128418, "rewards/helpfulness_reward/std": 0.701879620552063, "rewards/safety_reward/mean": 8.086181640625, "rewards/safety_reward/std": 0.9311277866363525, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 52.2734375, "completions/mean_terminated_length": 52.2734375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.11143131604226705, "frac_reward_zero_std": 0.0, "grad_norm": 0.697809100151062, "kl": 2.958984375, "learning_rate": 5e-05, "loss": 0.0501, "num_tokens": 5292574.0, "reward": 7.32177734375, "reward_std": 0.28493791818618774, "rewards/helpfulness_reward/mean": 0.5397453308105469, "rewards/helpfulness_reward/std": 0.563825786113739, "rewards/safety_reward/mean": 7.32177734375, "rewards/safety_reward/std": 1.0339521169662476, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 52.09375, "completions/mean_terminated_length": 52.09375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.11178063051261898, "frac_reward_zero_std": 0.0, "grad_norm": 0.901831865310669, "kl": 3.6484375, "learning_rate": 5e-05, "loss": 0.0394, "num_tokens": 5302938.0, "reward": 7.7371826171875, "reward_std": 0.38078629970550537, "rewards/helpfulness_reward/mean": 1.038090705871582, "rewards/helpfulness_reward/std": 0.5413657426834106, "rewards/safety_reward/mean": 7.7371826171875, "rewards/safety_reward/std": 0.8650068640708923, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 52.0078125, "completions/mean_terminated_length": 52.0078125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.11212994498297092, "frac_reward_zero_std": 0.0, "grad_norm": 0.37395423650741577, "kl": 2.9765625, "learning_rate": 5e-05, "loss": 0.0254, "num_tokens": 5313971.0, "reward": 8.147705078125, "reward_std": 0.28225475549697876, "rewards/helpfulness_reward/mean": 1.2442626953125, "rewards/helpfulness_reward/std": 0.6181461811065674, "rewards/safety_reward/mean": 8.147705078125, "rewards/safety_reward/std": 1.1363282203674316, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 51.78125, "completions/mean_terminated_length": 51.78125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.11247925945332285, "frac_reward_zero_std": 0.0, "grad_norm": 0.5053548216819763, "kl": 3.296875, "learning_rate": 5e-05, "loss": 0.0301, "num_tokens": 5325775.0, "reward": 7.75341796875, "reward_std": 0.26883572340011597, "rewards/helpfulness_reward/mean": 1.0648746490478516, "rewards/helpfulness_reward/std": 0.8624622821807861, "rewards/safety_reward/mean": 7.75341796875, "rewards/safety_reward/std": 1.2112613916397095, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 51.5703125, "completions/mean_terminated_length": 51.5703125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.11282857392367479, "frac_reward_zero_std": 0.0, "grad_norm": 0.4525130093097687, "kl": 3.189453125, "learning_rate": 5e-05, "loss": 0.0277, "num_tokens": 5336208.0, "reward": 8.167236328125, "reward_std": 0.2692614793777466, "rewards/helpfulness_reward/mean": 1.0823554992675781, "rewards/helpfulness_reward/std": 0.7254301905632019, "rewards/safety_reward/mean": 8.167236328125, "rewards/safety_reward/std": 1.1329524517059326, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 51.8828125, "completions/mean_terminated_length": 51.8828125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.11317788839402672, "frac_reward_zero_std": 0.0, "grad_norm": 0.3916407823562622, "kl": 2.943359375, "learning_rate": 5e-05, "loss": 0.02, "num_tokens": 5346441.0, "reward": 7.93212890625, "reward_std": 0.25282129645347595, "rewards/helpfulness_reward/mean": 1.0149612426757812, "rewards/helpfulness_reward/std": 0.5908747911453247, "rewards/safety_reward/mean": 7.93212890625, "rewards/safety_reward/std": 1.0877267122268677, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 52.0546875, "completions/mean_terminated_length": 52.0546875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.11352720286437866, "frac_reward_zero_std": 0.0, "grad_norm": 0.4919320046901703, "kl": 3.189453125, "learning_rate": 5e-05, "loss": 0.0432, "num_tokens": 5357920.0, "reward": 7.7066650390625, "reward_std": 0.29470962285995483, "rewards/helpfulness_reward/mean": 0.6881847381591797, "rewards/helpfulness_reward/std": 0.7848690748214722, "rewards/safety_reward/mean": 7.7066650390625, "rewards/safety_reward/std": 1.3697909116744995, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 51.59375, "completions/mean_terminated_length": 51.59375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.11387651733473059, "frac_reward_zero_std": 0.0, "grad_norm": 0.5427919030189514, "kl": 3.01171875, "learning_rate": 5e-05, "loss": 0.0299, "num_tokens": 5368508.0, "reward": 7.5736083984375, "reward_std": 0.2571260929107666, "rewards/helpfulness_reward/mean": 0.9742927551269531, "rewards/helpfulness_reward/std": 0.8538867831230164, "rewards/safety_reward/mean": 7.5736083984375, "rewards/safety_reward/std": 1.6532076597213745, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 52.2578125, "completions/mean_terminated_length": 52.2578125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.11422583180508253, "frac_reward_zero_std": 0.0, "grad_norm": 0.4412013590335846, "kl": 3.20703125, "learning_rate": 5e-05, "loss": 0.0486, "num_tokens": 5379869.0, "reward": 7.4967041015625, "reward_std": 0.2991712689399719, "rewards/helpfulness_reward/mean": 0.9151554107666016, "rewards/helpfulness_reward/std": 0.6799939274787903, "rewards/safety_reward/mean": 7.4967041015625, "rewards/safety_reward/std": 1.3772403001785278, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 52.5234375, "completions/mean_terminated_length": 52.5234375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.11457514627543446, "frac_reward_zero_std": 0.0, "grad_norm": 0.6341754198074341, "kl": 3.23828125, "learning_rate": 5e-05, "loss": 0.0404, "num_tokens": 5390184.0, "reward": 7.70703125, "reward_std": 0.3715673089027405, "rewards/helpfulness_reward/mean": 1.1408634185791016, "rewards/helpfulness_reward/std": 0.5411721467971802, "rewards/safety_reward/mean": 7.70703125, "rewards/safety_reward/std": 1.0277057886123657, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 52.4609375, "completions/mean_terminated_length": 52.4609375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.1149244607457864, "frac_reward_zero_std": 0.0, "grad_norm": 0.5175265669822693, "kl": 3.1328125, "learning_rate": 5e-05, "loss": 0.0514, "num_tokens": 5401251.0, "reward": 8.270263671875, "reward_std": 0.3425493836402893, "rewards/helpfulness_reward/mean": 1.0025124549865723, "rewards/helpfulness_reward/std": 0.6140477061271667, "rewards/safety_reward/mean": 8.270263671875, "rewards/safety_reward/std": 0.9501005411148071, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 52.1015625, "completions/mean_terminated_length": 52.1015625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.11527377521613832, "frac_reward_zero_std": 0.0, "grad_norm": 0.3890992999076843, "kl": 3.205078125, "learning_rate": 5e-05, "loss": 0.0386, "num_tokens": 5413808.0, "reward": 7.693115234375, "reward_std": 0.24855345487594604, "rewards/helpfulness_reward/mean": 0.9345567226409912, "rewards/helpfulness_reward/std": 0.5963249802589417, "rewards/safety_reward/mean": 7.693115234375, "rewards/safety_reward/std": 0.743506908416748, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 51.921875, "completions/mean_terminated_length": 51.921875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.11562308968649027, "frac_reward_zero_std": 0.0, "grad_norm": 0.5506869554519653, "kl": 2.86328125, "learning_rate": 5e-05, "loss": 0.0398, "num_tokens": 5425406.0, "reward": 6.9793701171875, "reward_std": 0.21416336297988892, "rewards/helpfulness_reward/mean": 0.5916576385498047, "rewards/helpfulness_reward/std": 1.124584436416626, "rewards/safety_reward/mean": 6.9793701171875, "rewards/safety_reward/std": 1.5948543548583984, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 52.2265625, "completions/mean_terminated_length": 52.2265625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.1159724041568422, "frac_reward_zero_std": 0.0, "grad_norm": 0.5265506505966187, "kl": 3.177734375, "learning_rate": 5e-05, "loss": 0.0564, "num_tokens": 5437115.0, "reward": 7.867431640625, "reward_std": 0.39180299639701843, "rewards/helpfulness_reward/mean": 1.1092214584350586, "rewards/helpfulness_reward/std": 0.7824857234954834, "rewards/safety_reward/mean": 7.867431640625, "rewards/safety_reward/std": 1.172641634941101, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 51.8125, "completions/mean_terminated_length": 51.8125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.11632171862719413, "frac_reward_zero_std": 0.0, "grad_norm": 0.6032008528709412, "kl": 3.103515625, "learning_rate": 5e-05, "loss": 0.0349, "num_tokens": 5447403.0, "reward": 8.56787109375, "reward_std": 0.25705134868621826, "rewards/helpfulness_reward/mean": 1.196279525756836, "rewards/helpfulness_reward/std": 0.6534370183944702, "rewards/safety_reward/mean": 8.56787109375, "rewards/safety_reward/std": 0.9357275366783142, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 51.984375, "completions/mean_terminated_length": 51.984375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.11667103309754606, "frac_reward_zero_std": 0.0, "grad_norm": 0.5274612903594971, "kl": 3.08203125, "learning_rate": 5e-05, "loss": 0.0443, "num_tokens": 5458385.0, "reward": 7.761962890625, "reward_std": 0.3873803913593292, "rewards/helpfulness_reward/mean": 0.8494625091552734, "rewards/helpfulness_reward/std": 0.7547725439071655, "rewards/safety_reward/mean": 7.761962890625, "rewards/safety_reward/std": 1.3913228511810303, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 62.625, "completions/mean_terminated_length": 62.625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.117020347567898, "frac_reward_zero_std": 0.0, "grad_norm": 0.5303625464439392, "kl": 2.90625, "learning_rate": 5e-05, "loss": 0.0725, "num_tokens": 5471353.0, "reward": 7.43096923828125, "reward_std": 0.26506757736206055, "rewards/helpfulness_reward/mean": 1.0186195373535156, "rewards/helpfulness_reward/std": 0.8808140158653259, "rewards/safety_reward/mean": 7.43096923828125, "rewards/safety_reward/std": 1.5876123905181885, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 51.421875, "completions/mean_terminated_length": 51.421875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.11736966203824993, "frac_reward_zero_std": 0.0, "grad_norm": 0.503108024597168, "kl": 3.134765625, "learning_rate": 5e-05, "loss": 0.0337, "num_tokens": 5482015.0, "reward": 7.887939453125, "reward_std": 0.2793133556842804, "rewards/helpfulness_reward/mean": 1.1343059539794922, "rewards/helpfulness_reward/std": 0.7739255428314209, "rewards/safety_reward/mean": 7.887939453125, "rewards/safety_reward/std": 1.0745576620101929, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 52.9375, "completions/mean_terminated_length": 52.9375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.11771897650860187, "frac_reward_zero_std": 0.0, "grad_norm": 0.8438085913658142, "kl": 3.0625, "learning_rate": 5e-05, "loss": 0.047, "num_tokens": 5492535.0, "reward": 7.692138671875, "reward_std": 0.3812427222728729, "rewards/helpfulness_reward/mean": 0.9357404708862305, "rewards/helpfulness_reward/std": 0.5504415035247803, "rewards/safety_reward/mean": 7.692138671875, "rewards/safety_reward/std": 1.6914112567901611, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 51.9375, "completions/mean_terminated_length": 51.9375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.1180682909789538, "frac_reward_zero_std": 0.0, "grad_norm": 0.5511302351951599, "kl": 3.203125, "learning_rate": 5e-05, "loss": 0.0324, "num_tokens": 5503855.0, "reward": 8.038818359375, "reward_std": 0.446083664894104, "rewards/helpfulness_reward/mean": 1.1539058685302734, "rewards/helpfulness_reward/std": 0.6601349115371704, "rewards/safety_reward/mean": 8.038818359375, "rewards/safety_reward/std": 1.1336930990219116, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 52.5703125, "completions/mean_terminated_length": 52.5703125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.11841760544930574, "frac_reward_zero_std": 0.0, "grad_norm": 0.40585508942604065, "kl": 3.013671875, "learning_rate": 5e-05, "loss": 0.0558, "num_tokens": 5516200.0, "reward": 7.42816162109375, "reward_std": 0.2840356230735779, "rewards/helpfulness_reward/mean": 0.916778564453125, "rewards/helpfulness_reward/std": 1.2232921123504639, "rewards/safety_reward/mean": 7.42816162109375, "rewards/safety_reward/std": 1.6893788576126099, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 51.6640625, "completions/mean_terminated_length": 51.6640625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.11876691991965767, "frac_reward_zero_std": 0.0, "grad_norm": 0.5656968355178833, "kl": 2.875, "learning_rate": 5e-05, "loss": 0.0224, "num_tokens": 5528053.0, "reward": 7.4603271484375, "reward_std": 0.3610985279083252, "rewards/helpfulness_reward/mean": 0.8088068962097168, "rewards/helpfulness_reward/std": 0.6551220417022705, "rewards/safety_reward/mean": 7.4603271484375, "rewards/safety_reward/std": 1.2562602758407593, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 52.734375, "completions/mean_terminated_length": 52.734375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.11911623439000961, "frac_reward_zero_std": 0.0, "grad_norm": 0.4727332890033722, "kl": 3.087890625, "learning_rate": 5e-05, "loss": 0.0471, "num_tokens": 5538379.0, "reward": 8.14599609375, "reward_std": 0.36146968603134155, "rewards/helpfulness_reward/mean": 1.0590400695800781, "rewards/helpfulness_reward/std": 0.8965274095535278, "rewards/safety_reward/mean": 8.14599609375, "rewards/safety_reward/std": 1.481903076171875, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 52.703125, "completions/mean_terminated_length": 52.703125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.11946554886036154, "frac_reward_zero_std": 0.0, "grad_norm": 0.5868703126907349, "kl": 2.96875, "learning_rate": 5e-05, "loss": 0.0637, "num_tokens": 5549205.0, "reward": 7.944091796875, "reward_std": 0.47296571731567383, "rewards/helpfulness_reward/mean": 0.9524726867675781, "rewards/helpfulness_reward/std": 0.8264049291610718, "rewards/safety_reward/mean": 7.944091796875, "rewards/safety_reward/std": 0.9654833674430847, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 51.921875, "completions/mean_terminated_length": 51.921875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.11981486333071348, "frac_reward_zero_std": 0.0, "grad_norm": 0.39554843306541443, "kl": 2.8359375, "learning_rate": 5e-05, "loss": 0.0366, "num_tokens": 5560715.0, "reward": 7.53515625, "reward_std": 0.27375394105911255, "rewards/helpfulness_reward/mean": 0.9209709167480469, "rewards/helpfulness_reward/std": 0.7810530662536621, "rewards/safety_reward/mean": 7.53515625, "rewards/safety_reward/std": 1.3640583753585815, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 52.0546875, "completions/mean_terminated_length": 52.0546875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.1201641778010654, "frac_reward_zero_std": 0.0, "grad_norm": 0.6437555551528931, "kl": 3.060546875, "learning_rate": 5e-05, "loss": 0.0287, "num_tokens": 5571626.0, "reward": 8.00146484375, "reward_std": 0.34526827931404114, "rewards/helpfulness_reward/mean": 0.871424674987793, "rewards/helpfulness_reward/std": 0.6864902973175049, "rewards/safety_reward/mean": 8.00146484375, "rewards/safety_reward/std": 1.0933728218078613, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 51.890625, "completions/mean_terminated_length": 51.890625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.12051349227141735, "frac_reward_zero_std": 0.0, "grad_norm": 0.5851004719734192, "kl": 3.201171875, "learning_rate": 5e-05, "loss": 0.0324, "num_tokens": 5583108.0, "reward": 7.8212890625, "reward_std": 0.3023041784763336, "rewards/helpfulness_reward/mean": 0.7629876136779785, "rewards/helpfulness_reward/std": 1.1552739143371582, "rewards/safety_reward/mean": 7.8212890625, "rewards/safety_reward/std": 1.1732066869735718, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 52.2890625, "completions/mean_terminated_length": 52.2890625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.12086280674176927, "frac_reward_zero_std": 0.0, "grad_norm": 0.48807868361473083, "kl": 2.8203125, "learning_rate": 5e-05, "loss": 0.0369, "num_tokens": 5594761.0, "reward": 7.8612060546875, "reward_std": 0.38173025846481323, "rewards/helpfulness_reward/mean": 0.6592650413513184, "rewards/helpfulness_reward/std": 0.954247772693634, "rewards/safety_reward/mean": 7.8612060546875, "rewards/safety_reward/std": 1.3083465099334717, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 52.90625, "completions/mean_terminated_length": 52.90625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.12121212121212122, "frac_reward_zero_std": 0.0, "grad_norm": 0.54925137758255, "kl": 2.875, "learning_rate": 5e-05, "loss": 0.0376, "num_tokens": 5606197.0, "reward": 8.03662109375, "reward_std": 0.3394629657268524, "rewards/helpfulness_reward/mean": 0.9388339519500732, "rewards/helpfulness_reward/std": 0.6241660714149475, "rewards/safety_reward/mean": 8.03662109375, "rewards/safety_reward/std": 0.9059920907020569, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 52.609375, "completions/mean_terminated_length": 52.609375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.12156143568247314, "frac_reward_zero_std": 0.0, "grad_norm": 0.6612775921821594, "kl": 3.392578125, "learning_rate": 5e-05, "loss": 0.0516, "num_tokens": 5617419.0, "reward": 8.2822265625, "reward_std": 0.2859109044075012, "rewards/helpfulness_reward/mean": 0.8676853179931641, "rewards/helpfulness_reward/std": 0.8613160252571106, "rewards/safety_reward/mean": 8.2822265625, "rewards/safety_reward/std": 0.9029214978218079, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 52.1875, "completions/mean_terminated_length": 52.1875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.12191075015282508, "frac_reward_zero_std": 0.0, "grad_norm": 0.40191566944122314, "kl": 3.091796875, "learning_rate": 5e-05, "loss": 0.0257, "num_tokens": 5628363.0, "reward": 8.0634765625, "reward_std": 0.2862740457057953, "rewards/helpfulness_reward/mean": 0.6048237085342407, "rewards/helpfulness_reward/std": 0.7748916745185852, "rewards/safety_reward/mean": 8.0634765625, "rewards/safety_reward/std": 1.0567594766616821, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 52.546875, "completions/mean_terminated_length": 52.546875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.12226006462317701, "frac_reward_zero_std": 0.0, "grad_norm": 0.46219581365585327, "kl": 3.05078125, "learning_rate": 5e-05, "loss": 0.0333, "num_tokens": 5639217.0, "reward": 7.88818359375, "reward_std": 0.1687239110469818, "rewards/helpfulness_reward/mean": 0.8499716520309448, "rewards/helpfulness_reward/std": 0.5849139094352722, "rewards/safety_reward/mean": 7.88818359375, "rewards/safety_reward/std": 1.0781373977661133, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 52.1640625, "completions/mean_terminated_length": 52.1640625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.12260937909352895, "frac_reward_zero_std": 0.0, "grad_norm": 0.47235456109046936, "kl": 2.876953125, "learning_rate": 5e-05, "loss": 0.0337, "num_tokens": 5651438.0, "reward": 7.7532958984375, "reward_std": 0.20605874061584473, "rewards/helpfulness_reward/mean": 0.8357734680175781, "rewards/helpfulness_reward/std": 0.7822549939155579, "rewards/safety_reward/mean": 7.7532958984375, "rewards/safety_reward/std": 1.4155501127243042, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 52.3515625, "completions/mean_terminated_length": 52.3515625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.12295869356388088, "frac_reward_zero_std": 0.0, "grad_norm": 0.418874055147171, "kl": 3.396484375, "learning_rate": 5e-05, "loss": 0.0304, "num_tokens": 5661939.0, "reward": 7.570556640625, "reward_std": 0.25256091356277466, "rewards/helpfulness_reward/mean": 0.7443408966064453, "rewards/helpfulness_reward/std": 0.7717351913452148, "rewards/safety_reward/mean": 7.570556640625, "rewards/safety_reward/std": 1.195876955986023, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 52.6796875, "completions/mean_terminated_length": 52.6796875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.12330800803423282, "frac_reward_zero_std": 0.0, "grad_norm": 0.417724072933197, "kl": 3.17578125, "learning_rate": 5e-05, "loss": 0.0363, "num_tokens": 5674162.0, "reward": 7.4306640625, "reward_std": 0.20728306472301483, "rewards/helpfulness_reward/mean": 0.6283899545669556, "rewards/helpfulness_reward/std": 0.8284946084022522, "rewards/safety_reward/mean": 7.4306640625, "rewards/safety_reward/std": 1.3820836544036865, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 51.8515625, "completions/mean_terminated_length": 51.8515625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.12365732250458475, "frac_reward_zero_std": 0.0, "grad_norm": 0.460274875164032, "kl": 2.978515625, "learning_rate": 5e-05, "loss": 0.0241, "num_tokens": 5685391.0, "reward": 7.950927734375, "reward_std": 0.24030902981758118, "rewards/helpfulness_reward/mean": 0.8275848031044006, "rewards/helpfulness_reward/std": 0.7209804058074951, "rewards/safety_reward/mean": 7.950927734375, "rewards/safety_reward/std": 1.0017261505126953, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 52.3515625, "completions/mean_terminated_length": 52.3515625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.12400663697493669, "frac_reward_zero_std": 0.0, "grad_norm": 0.5186414122581482, "kl": 2.892578125, "learning_rate": 5e-05, "loss": 0.0355, "num_tokens": 5696092.0, "reward": 8.105224609375, "reward_std": 0.20423835515975952, "rewards/helpfulness_reward/mean": 0.9129862785339355, "rewards/helpfulness_reward/std": 0.6282588839530945, "rewards/safety_reward/mean": 8.105224609375, "rewards/safety_reward/std": 0.9687132239341736, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 51.7421875, "completions/mean_terminated_length": 51.7421875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.12435595144528862, "frac_reward_zero_std": 0.0, "grad_norm": 2.377795934677124, "kl": 4.029296875, "learning_rate": 5e-05, "loss": 0.0526, "num_tokens": 5710267.0, "reward": 7.84521484375, "reward_std": 0.3456413447856903, "rewards/helpfulness_reward/mean": 0.7978267669677734, "rewards/helpfulness_reward/std": 0.9640175700187683, "rewards/safety_reward/mean": 7.84521484375, "rewards/safety_reward/std": 1.9464144706726074, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 52.96875, "completions/mean_terminated_length": 52.96875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.12470526591564056, "frac_reward_zero_std": 0.0, "grad_norm": 0.6228832602500916, "kl": 3.11328125, "learning_rate": 5e-05, "loss": 0.0493, "num_tokens": 5721631.0, "reward": 7.91162109375, "reward_std": 0.34924057126045227, "rewards/helpfulness_reward/mean": 0.9047164916992188, "rewards/helpfulness_reward/std": 0.8101896047592163, "rewards/safety_reward/mean": 7.91162109375, "rewards/safety_reward/std": 1.1509548425674438, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 52.359375, "completions/mean_terminated_length": 52.359375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.1250545803859925, "frac_reward_zero_std": 0.0, "grad_norm": 0.6570159792900085, "kl": 3.0625, "learning_rate": 5e-05, "loss": 0.0296, "num_tokens": 5732973.0, "reward": 8.1103515625, "reward_std": 0.2990683317184448, "rewards/helpfulness_reward/mean": 0.9485225677490234, "rewards/helpfulness_reward/std": 0.8864178657531738, "rewards/safety_reward/mean": 8.1103515625, "rewards/safety_reward/std": 0.9660668969154358, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 51.90625, "completions/mean_terminated_length": 51.90625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.12540389485634443, "frac_reward_zero_std": 0.0, "grad_norm": 0.6880946755409241, "kl": 3.26953125, "learning_rate": 5e-05, "loss": 0.0228, "num_tokens": 5743401.0, "reward": 7.8677978515625, "reward_std": 0.44801852107048035, "rewards/helpfulness_reward/mean": 0.7872767448425293, "rewards/helpfulness_reward/std": 0.8988798260688782, "rewards/safety_reward/mean": 7.8677978515625, "rewards/safety_reward/std": 1.2960556745529175, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 52.421875, "completions/mean_terminated_length": 52.421875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.12575320932669637, "frac_reward_zero_std": 0.0, "grad_norm": 1.31740140914917, "kl": 3.6484375, "learning_rate": 5e-05, "loss": 0.0481, "num_tokens": 5755503.0, "reward": 7.280487060546875, "reward_std": 0.3051735758781433, "rewards/helpfulness_reward/mean": 0.6218070983886719, "rewards/helpfulness_reward/std": 1.066353678703308, "rewards/safety_reward/mean": 7.280487060546875, "rewards/safety_reward/std": 2.0798301696777344, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.12610252379704828, "frac_reward_zero_std": 0.0, "grad_norm": 0.4847746789455414, "kl": 2.951171875, "learning_rate": 5e-05, "loss": 0.0286, "num_tokens": 5765975.0, "reward": 7.677734375, "reward_std": 0.22535257041454315, "rewards/helpfulness_reward/mean": 0.7783498764038086, "rewards/helpfulness_reward/std": 0.7477624416351318, "rewards/safety_reward/mean": 7.677734375, "rewards/safety_reward/std": 1.020947813987732, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 52.3046875, "completions/mean_terminated_length": 52.3046875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.12645183826740022, "frac_reward_zero_std": 0.0, "grad_norm": 0.4478297531604767, "kl": 2.994140625, "learning_rate": 5e-05, "loss": 0.0396, "num_tokens": 5777022.0, "reward": 7.919677734375, "reward_std": 0.3455789089202881, "rewards/helpfulness_reward/mean": 0.5602807998657227, "rewards/helpfulness_reward/std": 0.7155314683914185, "rewards/safety_reward/mean": 7.919677734375, "rewards/safety_reward/std": 1.1247859001159668, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 52.0390625, "completions/mean_terminated_length": 52.0390625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.12680115273775217, "frac_reward_zero_std": 0.0, "grad_norm": 0.69840407371521, "kl": 3.53515625, "learning_rate": 5e-05, "loss": 0.0302, "num_tokens": 5787883.0, "reward": 7.432861328125, "reward_std": 0.301943838596344, "rewards/helpfulness_reward/mean": 1.0567216873168945, "rewards/helpfulness_reward/std": 0.8828763365745544, "rewards/safety_reward/mean": 7.432861328125, "rewards/safety_reward/std": 0.9930405616760254, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 52.1796875, "completions/mean_terminated_length": 52.1796875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.1271504672081041, "frac_reward_zero_std": 0.0, "grad_norm": 0.798984169960022, "kl": 3.431640625, "learning_rate": 5e-05, "loss": 0.0472, "num_tokens": 5798978.0, "reward": 8.088623046875, "reward_std": 0.3115752041339874, "rewards/helpfulness_reward/mean": 0.5150089263916016, "rewards/helpfulness_reward/std": 0.7371346354484558, "rewards/safety_reward/mean": 8.088623046875, "rewards/safety_reward/std": 1.0499327182769775, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 52.1796875, "completions/mean_terminated_length": 52.1796875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.12749978167845602, "frac_reward_zero_std": 0.0, "grad_norm": 0.5087329745292664, "kl": 3.06640625, "learning_rate": 5e-05, "loss": 0.0347, "num_tokens": 5810505.0, "reward": 8.08154296875, "reward_std": 0.34850266575813293, "rewards/helpfulness_reward/mean": 0.7772026062011719, "rewards/helpfulness_reward/std": 0.7043159604072571, "rewards/safety_reward/mean": 8.08154296875, "rewards/safety_reward/std": 1.1014962196350098, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 52.5546875, "completions/mean_terminated_length": 52.5546875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.12784909614880796, "frac_reward_zero_std": 0.0, "grad_norm": 0.36164870858192444, "kl": 3.306640625, "learning_rate": 5e-05, "loss": 0.0468, "num_tokens": 5821448.0, "reward": 8.366455078125, "reward_std": 0.21309977769851685, "rewards/helpfulness_reward/mean": 1.0300936698913574, "rewards/helpfulness_reward/std": 0.6017800569534302, "rewards/safety_reward/mean": 8.366455078125, "rewards/safety_reward/std": 0.8640049695968628, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 51.859375, "completions/mean_terminated_length": 51.859375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.1281984106191599, "frac_reward_zero_std": 0.0, "grad_norm": 0.4905009865760803, "kl": 2.9609375, "learning_rate": 5e-05, "loss": 0.0294, "num_tokens": 5832310.0, "reward": 7.9095458984375, "reward_std": 0.24796149134635925, "rewards/helpfulness_reward/mean": 0.9261689186096191, "rewards/helpfulness_reward/std": 0.6750937104225159, "rewards/safety_reward/mean": 7.9095458984375, "rewards/safety_reward/std": 1.603771686553955, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 52.203125, "completions/mean_terminated_length": 52.203125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.12854772508951184, "frac_reward_zero_std": 0.0, "grad_norm": 0.4992920160293579, "kl": 3.04296875, "learning_rate": 5e-05, "loss": 0.0392, "num_tokens": 5843056.0, "reward": 8.40966796875, "reward_std": 0.4571441411972046, "rewards/helpfulness_reward/mean": 1.1340234279632568, "rewards/helpfulness_reward/std": 0.7509950399398804, "rewards/safety_reward/mean": 8.40966796875, "rewards/safety_reward/std": 0.8842856287956238, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 52.6640625, "completions/mean_terminated_length": 52.6640625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.12889703955986376, "frac_reward_zero_std": 0.125, "grad_norm": 0.44700509309768677, "kl": 3.1015625, "learning_rate": 5e-05, "loss": 0.0564, "num_tokens": 5853325.0, "reward": 7.850830078125, "reward_std": 0.3213415741920471, "rewards/helpfulness_reward/mean": 0.8952579498291016, "rewards/helpfulness_reward/std": 0.6253082156181335, "rewards/safety_reward/mean": 7.850830078125, "rewards/safety_reward/std": 1.009423017501831, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 52.6875, "completions/mean_terminated_length": 52.6875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.1292463540302157, "frac_reward_zero_std": 0.0625, "grad_norm": 0.526551365852356, "kl": 3.203125, "learning_rate": 5e-05, "loss": 0.0538, "num_tokens": 5867293.0, "reward": 7.6917724609375, "reward_std": 0.39029547572135925, "rewards/helpfulness_reward/mean": 0.834294319152832, "rewards/helpfulness_reward/std": 0.7285851836204529, "rewards/safety_reward/mean": 7.6917724609375, "rewards/safety_reward/std": 1.5089606046676636, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 51.96875, "completions/mean_terminated_length": 51.96875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.12959566850056764, "frac_reward_zero_std": 0.0, "grad_norm": 2.5338854789733887, "kl": 4.080078125, "learning_rate": 5e-05, "loss": 0.045, "num_tokens": 5878593.0, "reward": 8.06689453125, "reward_std": 0.26359304785728455, "rewards/helpfulness_reward/mean": 0.9183769226074219, "rewards/helpfulness_reward/std": 0.700234055519104, "rewards/safety_reward/mean": 8.06689453125, "rewards/safety_reward/std": 0.9888786673545837, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 52.71875, "completions/mean_terminated_length": 52.71875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.12994498297091958, "frac_reward_zero_std": 0.0, "grad_norm": 0.423992395401001, "kl": 3.095703125, "learning_rate": 5e-05, "loss": 0.0469, "num_tokens": 5890669.0, "reward": 8.3056640625, "reward_std": 0.3482555150985718, "rewards/helpfulness_reward/mean": 1.0793895721435547, "rewards/helpfulness_reward/std": 0.5805417895317078, "rewards/safety_reward/mean": 8.3056640625, "rewards/safety_reward/std": 0.9740965366363525, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 52.328125, "completions/mean_terminated_length": 52.328125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.1302942974412715, "frac_reward_zero_std": 0.0625, "grad_norm": 0.4177128076553345, "kl": 3.3125, "learning_rate": 5e-05, "loss": 0.0459, "num_tokens": 5901351.0, "reward": 8.3369140625, "reward_std": 0.27853578329086304, "rewards/helpfulness_reward/mean": 1.2273750305175781, "rewards/helpfulness_reward/std": 0.7272596955299377, "rewards/safety_reward/mean": 8.3369140625, "rewards/safety_reward/std": 0.8917816877365112, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 52.6953125, "completions/mean_terminated_length": 52.6953125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.13064361191162344, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5439137816429138, "kl": 2.9921875, "learning_rate": 5e-05, "loss": 0.0492, "num_tokens": 5912096.0, "reward": 7.77392578125, "reward_std": 0.2832643985748291, "rewards/helpfulness_reward/mean": 0.7718939781188965, "rewards/helpfulness_reward/std": 0.7943907976150513, "rewards/safety_reward/mean": 7.77392578125, "rewards/safety_reward/std": 1.4787520170211792, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 52.2890625, "completions/mean_terminated_length": 52.2890625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.13099292638197538, "frac_reward_zero_std": 0.0, "grad_norm": 0.3726462423801422, "kl": 3.064453125, "learning_rate": 5e-05, "loss": 0.0398, "num_tokens": 5923941.0, "reward": 8.0189208984375, "reward_std": 0.2442430555820465, "rewards/helpfulness_reward/mean": 0.8827018737792969, "rewards/helpfulness_reward/std": 0.7749543190002441, "rewards/safety_reward/mean": 8.0189208984375, "rewards/safety_reward/std": 1.2062163352966309, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 66.8046875, "completions/mean_terminated_length": 66.8046875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.13134224085232732, "frac_reward_zero_std": 0.0, "grad_norm": 15.045563697814941, "kl": 8.48046875, "learning_rate": 5e-05, "loss": 0.1451, "num_tokens": 5940068.0, "reward": 7.048553466796875, "reward_std": 0.36525315046310425, "rewards/helpfulness_reward/mean": 0.6081229448318481, "rewards/helpfulness_reward/std": 1.0289113521575928, "rewards/safety_reward/mean": 7.048553466796875, "rewards/safety_reward/std": 2.5162224769592285, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 52.1640625, "completions/mean_terminated_length": 52.1640625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.13169155532267923, "frac_reward_zero_std": 0.0, "grad_norm": 0.5597753524780273, "kl": 3.22265625, "learning_rate": 5e-05, "loss": 0.0504, "num_tokens": 5950753.0, "reward": 8.135009765625, "reward_std": 0.3576928973197937, "rewards/helpfulness_reward/mean": 1.0849990844726562, "rewards/helpfulness_reward/std": 0.8829225897789001, "rewards/safety_reward/mean": 8.135009765625, "rewards/safety_reward/std": 0.8592949509620667, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 52.15625, "completions/mean_terminated_length": 52.15625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.13204086979303117, "frac_reward_zero_std": 0.0, "grad_norm": 0.581575870513916, "kl": 3.333984375, "learning_rate": 5e-05, "loss": 0.0457, "num_tokens": 5963117.0, "reward": 8.193359375, "reward_std": 0.4023078680038452, "rewards/helpfulness_reward/mean": 1.1351776123046875, "rewards/helpfulness_reward/std": 0.6759616732597351, "rewards/safety_reward/mean": 8.193359375, "rewards/safety_reward/std": 1.2385532855987549, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 52.53125, "completions/mean_terminated_length": 52.53125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.13239018426338311, "frac_reward_zero_std": 0.0, "grad_norm": 0.44966015219688416, "kl": 3.150390625, "learning_rate": 5e-05, "loss": 0.0375, "num_tokens": 5977025.0, "reward": 7.98675537109375, "reward_std": 0.30386534333229065, "rewards/helpfulness_reward/mean": 0.7973594665527344, "rewards/helpfulness_reward/std": 0.9609074592590332, "rewards/safety_reward/mean": 7.98675537109375, "rewards/safety_reward/std": 1.7788515090942383, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 51.6328125, "completions/mean_terminated_length": 51.6328125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.13273949873373506, "frac_reward_zero_std": 0.0, "grad_norm": 1.385069727897644, "kl": 3.712890625, "learning_rate": 5e-05, "loss": 0.0233, "num_tokens": 5988546.0, "reward": 7.469970703125, "reward_std": 0.33397841453552246, "rewards/helpfulness_reward/mean": 0.8789873123168945, "rewards/helpfulness_reward/std": 1.028361439704895, "rewards/safety_reward/mean": 7.469970703125, "rewards/safety_reward/std": 1.4443116188049316, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 51.9453125, "completions/mean_terminated_length": 51.9453125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.13308881320408697, "frac_reward_zero_std": 0.0, "grad_norm": 0.6332398056983948, "kl": 3.494140625, "learning_rate": 5e-05, "loss": 0.0349, "num_tokens": 5999347.0, "reward": 7.709716796875, "reward_std": 0.3970968723297119, "rewards/helpfulness_reward/mean": 1.1333446502685547, "rewards/helpfulness_reward/std": 0.6484174728393555, "rewards/safety_reward/mean": 7.709716796875, "rewards/safety_reward/std": 1.5297623872756958, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 52.125, "completions/mean_terminated_length": 52.125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.1334381276744389, "frac_reward_zero_std": 0.0, "grad_norm": 0.4118211269378662, "kl": 3.041015625, "learning_rate": 5e-05, "loss": 0.0395, "num_tokens": 6010331.0, "reward": 8.149169921875, "reward_std": 0.2753380537033081, "rewards/helpfulness_reward/mean": 1.0928850173950195, "rewards/helpfulness_reward/std": 0.7844544649124146, "rewards/safety_reward/mean": 8.149169921875, "rewards/safety_reward/std": 0.9775894284248352, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 51.953125, "completions/mean_terminated_length": 51.953125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.13378744214479085, "frac_reward_zero_std": 0.0, "grad_norm": 0.4862751066684723, "kl": 3.220703125, "learning_rate": 5e-05, "loss": 0.0343, "num_tokens": 6022629.0, "reward": 7.945556640625, "reward_std": 0.2606816291809082, "rewards/helpfulness_reward/mean": 1.1252975463867188, "rewards/helpfulness_reward/std": 0.7721318602561951, "rewards/safety_reward/mean": 7.945556640625, "rewards/safety_reward/std": 1.0965497493743896, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 52.3046875, "completions/mean_terminated_length": 52.3046875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.1341367566151428, "frac_reward_zero_std": 0.0, "grad_norm": 0.5951616168022156, "kl": 3.2265625, "learning_rate": 5e-05, "loss": 0.0387, "num_tokens": 6033444.0, "reward": 8.19873046875, "reward_std": 0.2944105863571167, "rewards/helpfulness_reward/mean": 1.0298576354980469, "rewards/helpfulness_reward/std": 0.8613808155059814, "rewards/safety_reward/mean": 8.19873046875, "rewards/safety_reward/std": 0.966144859790802, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 52.1875, "completions/mean_terminated_length": 52.1875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.1344860710854947, "frac_reward_zero_std": 0.0, "grad_norm": 0.49059686064720154, "kl": 3.17578125, "learning_rate": 5e-05, "loss": 0.0408, "num_tokens": 6044396.0, "reward": 8.134765625, "reward_std": 0.34284570813179016, "rewards/helpfulness_reward/mean": 1.0995922088623047, "rewards/helpfulness_reward/std": 0.7559216022491455, "rewards/safety_reward/mean": 8.134765625, "rewards/safety_reward/std": 0.9070578217506409, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 52.3046875, "completions/mean_terminated_length": 52.3046875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.13483538555584665, "frac_reward_zero_std": 0.0, "grad_norm": 0.6786602139472961, "kl": 3.439453125, "learning_rate": 5e-05, "loss": 0.0374, "num_tokens": 6054875.0, "reward": 8.1787109375, "reward_std": 0.3524566888809204, "rewards/helpfulness_reward/mean": 0.7165360450744629, "rewards/helpfulness_reward/std": 0.6787005066871643, "rewards/safety_reward/mean": 8.1787109375, "rewards/safety_reward/std": 0.9254220128059387, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 52.53125, "completions/mean_terminated_length": 52.53125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.1351847000261986, "frac_reward_zero_std": 0.0, "grad_norm": 0.6849316954612732, "kl": 3.671875, "learning_rate": 5e-05, "loss": 0.0547, "num_tokens": 6065335.0, "reward": 7.9892578125, "reward_std": 0.42491549253463745, "rewards/helpfulness_reward/mean": 0.8934340476989746, "rewards/helpfulness_reward/std": 0.6972825527191162, "rewards/safety_reward/mean": 7.9892578125, "rewards/safety_reward/std": 1.1889783143997192, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 52.1015625, "completions/mean_terminated_length": 52.1015625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.13553401449655053, "frac_reward_zero_std": 0.0, "grad_norm": 0.387481689453125, "kl": 2.966796875, "learning_rate": 5e-05, "loss": 0.0252, "num_tokens": 6078156.0, "reward": 7.80322265625, "reward_std": 0.20955094695091248, "rewards/helpfulness_reward/mean": 0.6689577102661133, "rewards/helpfulness_reward/std": 0.6148703694343567, "rewards/safety_reward/mean": 7.80322265625, "rewards/safety_reward/std": 0.9181095361709595, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 52.1875, "completions/mean_terminated_length": 52.1875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.13588332896690244, "frac_reward_zero_std": 0.0, "grad_norm": 0.4455218017101288, "kl": 3.244140625, "learning_rate": 5e-05, "loss": 0.0406, "num_tokens": 6088444.0, "reward": 8.03125, "reward_std": 0.3460593819618225, "rewards/helpfulness_reward/mean": 1.0394477844238281, "rewards/helpfulness_reward/std": 0.5454636216163635, "rewards/safety_reward/mean": 8.03125, "rewards/safety_reward/std": 1.0130409002304077, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 52.53125, "completions/mean_terminated_length": 52.53125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.13623264343725439, "frac_reward_zero_std": 0.0, "grad_norm": 0.4738694131374359, "kl": 3.201171875, "learning_rate": 5e-05, "loss": 0.0433, "num_tokens": 6101480.0, "reward": 7.953369140625, "reward_std": 0.3371990919113159, "rewards/helpfulness_reward/mean": 0.8174505233764648, "rewards/helpfulness_reward/std": 0.7477365136146545, "rewards/safety_reward/mean": 7.953369140625, "rewards/safety_reward/std": 1.0205503702163696, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 52.28125, "completions/mean_terminated_length": 52.28125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.13658195790760633, "frac_reward_zero_std": 0.0, "grad_norm": 0.5031687021255493, "kl": 3.458984375, "learning_rate": 5e-05, "loss": 0.0275, "num_tokens": 6112436.0, "reward": 7.79736328125, "reward_std": 0.31849968433380127, "rewards/helpfulness_reward/mean": 0.7040083408355713, "rewards/helpfulness_reward/std": 0.6577455401420593, "rewards/safety_reward/mean": 7.79736328125, "rewards/safety_reward/std": 1.223167896270752, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 51.609375, "completions/mean_terminated_length": 51.609375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.13693127237795827, "frac_reward_zero_std": 0.0, "grad_norm": 0.7122562527656555, "kl": 3.427734375, "learning_rate": 5e-05, "loss": 0.0044, "num_tokens": 6124866.0, "reward": 7.9169921875, "reward_std": 0.4090579152107239, "rewards/helpfulness_reward/mean": 0.8891239166259766, "rewards/helpfulness_reward/std": 0.8263025283813477, "rewards/safety_reward/mean": 7.9169921875, "rewards/safety_reward/std": 1.0749647617340088, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 52.140625, "completions/mean_terminated_length": 52.140625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.13728058684831018, "frac_reward_zero_std": 0.0, "grad_norm": 0.7953011393547058, "kl": 3.7265625, "learning_rate": 5e-05, "loss": 0.0327, "num_tokens": 6135124.0, "reward": 7.558349609375, "reward_std": 0.27341392636299133, "rewards/helpfulness_reward/mean": 0.5948424935340881, "rewards/helpfulness_reward/std": 0.9042920470237732, "rewards/safety_reward/mean": 7.558349609375, "rewards/safety_reward/std": 1.2737855911254883, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 52.6171875, "completions/mean_terminated_length": 52.6171875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.13762990131866212, "frac_reward_zero_std": 0.0, "grad_norm": 0.42502713203430176, "kl": 3.162109375, "learning_rate": 5e-05, "loss": 0.0387, "num_tokens": 6145675.0, "reward": 8.385009765625, "reward_std": 0.27614182233810425, "rewards/helpfulness_reward/mean": 0.9072946310043335, "rewards/helpfulness_reward/std": 0.7510352730751038, "rewards/safety_reward/mean": 8.385009765625, "rewards/safety_reward/std": 1.1416966915130615, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 52.578125, "completions/mean_terminated_length": 52.578125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.13797921578901406, "frac_reward_zero_std": 0.0, "grad_norm": 0.43222352862358093, "kl": 3.103515625, "learning_rate": 5e-05, "loss": 0.0368, "num_tokens": 6156309.0, "reward": 8.21728515625, "reward_std": 0.3773415684700012, "rewards/helpfulness_reward/mean": 1.1715388298034668, "rewards/helpfulness_reward/std": 0.8348271250724792, "rewards/safety_reward/mean": 8.21728515625, "rewards/safety_reward/std": 0.8654557466506958, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 52.1875, "completions/mean_terminated_length": 52.1875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.138328530259366, "frac_reward_zero_std": 0.0, "grad_norm": 0.4504859149456024, "kl": 3.251953125, "learning_rate": 5e-05, "loss": 0.0295, "num_tokens": 6166789.0, "reward": 8.3876953125, "reward_std": 0.3548297882080078, "rewards/helpfulness_reward/mean": 1.1927299499511719, "rewards/helpfulness_reward/std": 0.6856167912483215, "rewards/safety_reward/mean": 8.3876953125, "rewards/safety_reward/std": 1.0269755125045776, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 51.7578125, "completions/mean_terminated_length": 51.7578125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.13867784472971792, "frac_reward_zero_std": 0.0, "grad_norm": 0.6227671504020691, "kl": 3.5703125, "learning_rate": 5e-05, "loss": 0.0182, "num_tokens": 6178334.0, "reward": 7.7445068359375, "reward_std": 0.33872339129447937, "rewards/helpfulness_reward/mean": 0.7492642402648926, "rewards/helpfulness_reward/std": 0.9546976685523987, "rewards/safety_reward/mean": 7.7445068359375, "rewards/safety_reward/std": 1.6437729597091675, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 52.265625, "completions/mean_terminated_length": 52.265625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.13902715920006986, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3430500328540802, "kl": 3.248046875, "learning_rate": 5e-05, "loss": 0.0358, "num_tokens": 6190304.0, "reward": 8.00341796875, "reward_std": 0.15879738330841064, "rewards/helpfulness_reward/mean": 0.8080272674560547, "rewards/helpfulness_reward/std": 0.7775546908378601, "rewards/safety_reward/mean": 8.00341796875, "rewards/safety_reward/std": 1.069124698638916, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 52.375, "completions/mean_terminated_length": 52.375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.1393764736704218, "frac_reward_zero_std": 0.0625, "grad_norm": 0.32865267992019653, "kl": 3.142578125, "learning_rate": 5e-05, "loss": 0.0353, "num_tokens": 6200368.0, "reward": 7.874755859375, "reward_std": 0.2475697547197342, "rewards/helpfulness_reward/mean": 0.7613364458084106, "rewards/helpfulness_reward/std": 0.7831569910049438, "rewards/safety_reward/mean": 7.874755859375, "rewards/safety_reward/std": 1.141087532043457, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 52.0859375, "completions/mean_terminated_length": 52.0859375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.13972578814077374, "frac_reward_zero_std": 0.0, "grad_norm": 0.4153243899345398, "kl": 3.392578125, "learning_rate": 5e-05, "loss": 0.0369, "num_tokens": 6211323.0, "reward": 8.504638671875, "reward_std": 0.18115226924419403, "rewards/helpfulness_reward/mean": 1.0240638256072998, "rewards/helpfulness_reward/std": 0.8162088990211487, "rewards/safety_reward/mean": 8.504638671875, "rewards/safety_reward/std": 1.0883979797363281, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 51.8359375, "completions/mean_terminated_length": 51.8359375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.14007510261112566, "frac_reward_zero_std": 0.0625, "grad_norm": 0.30340129137039185, "kl": 3.369140625, "learning_rate": 5e-05, "loss": 0.0247, "num_tokens": 6221478.0, "reward": 7.935302734375, "reward_std": 0.24809683859348297, "rewards/helpfulness_reward/mean": 1.1146392822265625, "rewards/helpfulness_reward/std": 0.858105480670929, "rewards/safety_reward/mean": 7.935302734375, "rewards/safety_reward/std": 1.03101646900177, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 52.3125, "completions/mean_terminated_length": 52.3125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.1404244170814776, "frac_reward_zero_std": 0.0, "grad_norm": 0.4477056860923767, "kl": 3.607421875, "learning_rate": 5e-05, "loss": 0.0401, "num_tokens": 6232470.0, "reward": 7.959228515625, "reward_std": 0.36982262134552, "rewards/helpfulness_reward/mean": 0.9470291137695312, "rewards/helpfulness_reward/std": 0.7795370221138, "rewards/safety_reward/mean": 7.959228515625, "rewards/safety_reward/std": 1.0927867889404297, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 52.28125, "completions/mean_terminated_length": 52.28125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.14077373155182954, "frac_reward_zero_std": 0.0625, "grad_norm": 0.635299563407898, "kl": 3.205078125, "learning_rate": 5e-05, "loss": 0.0354, "num_tokens": 6243002.0, "reward": 8.26171875, "reward_std": 0.2319325953722, "rewards/helpfulness_reward/mean": 1.0134344100952148, "rewards/helpfulness_reward/std": 0.6596150994300842, "rewards/safety_reward/mean": 8.26171875, "rewards/safety_reward/std": 0.7642490863800049, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 51.8515625, "completions/mean_terminated_length": 51.8515625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.14112304602218148, "frac_reward_zero_std": 0.0, "grad_norm": 0.5009210109710693, "kl": 3.46484375, "learning_rate": 5e-05, "loss": 0.0324, "num_tokens": 6253679.0, "reward": 7.985107421875, "reward_std": 0.2980261445045471, "rewards/helpfulness_reward/mean": 1.1896719932556152, "rewards/helpfulness_reward/std": 0.8948034644126892, "rewards/safety_reward/mean": 7.985107421875, "rewards/safety_reward/std": 0.925945520401001, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 52.09375, "completions/mean_terminated_length": 52.09375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.1414723604925334, "frac_reward_zero_std": 0.0, "grad_norm": 0.5905711054801941, "kl": 3.6171875, "learning_rate": 5e-05, "loss": 0.0434, "num_tokens": 6265155.0, "reward": 7.8154296875, "reward_std": 0.28297099471092224, "rewards/helpfulness_reward/mean": 0.8809356689453125, "rewards/helpfulness_reward/std": 0.806475043296814, "rewards/safety_reward/mean": 7.8154296875, "rewards/safety_reward/std": 1.4533449411392212, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 52.21875, "completions/mean_terminated_length": 52.21875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.14182167496288534, "frac_reward_zero_std": 0.0, "grad_norm": 0.4895817041397095, "kl": 3.365234375, "learning_rate": 5e-05, "loss": 0.0461, "num_tokens": 6276879.0, "reward": 7.872802734375, "reward_std": 0.16802117228507996, "rewards/helpfulness_reward/mean": 0.900629997253418, "rewards/helpfulness_reward/std": 0.791160523891449, "rewards/safety_reward/mean": 7.872802734375, "rewards/safety_reward/std": 0.989294171333313, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 52.3671875, "completions/mean_terminated_length": 52.3671875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.14217098943323728, "frac_reward_zero_std": 0.0625, "grad_norm": 0.4266988933086395, "kl": 3.30859375, "learning_rate": 5e-05, "loss": 0.0424, "num_tokens": 6287094.0, "reward": 8.2901611328125, "reward_std": 0.37707802653312683, "rewards/helpfulness_reward/mean": 0.9931144714355469, "rewards/helpfulness_reward/std": 0.6836220622062683, "rewards/safety_reward/mean": 8.2901611328125, "rewards/safety_reward/std": 1.3055857419967651, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 51.578125, "completions/mean_terminated_length": 51.578125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.14252030390358922, "frac_reward_zero_std": 0.0, "grad_norm": 0.3434336483478546, "kl": 3.1875, "learning_rate": 5e-05, "loss": 0.0141, "num_tokens": 6297912.0, "reward": 8.5511474609375, "reward_std": 0.19805222749710083, "rewards/helpfulness_reward/mean": 1.1284711360931396, "rewards/helpfulness_reward/std": 0.7598015666007996, "rewards/safety_reward/mean": 8.5511474609375, "rewards/safety_reward/std": 0.8430268168449402, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 54.3671875, "completions/mean_terminated_length": 54.3671875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.14286961837394113, "frac_reward_zero_std": 0.0, "grad_norm": 0.47168323397636414, "kl": 3.439453125, "learning_rate": 5e-05, "loss": 0.1094, "num_tokens": 6310495.0, "reward": 7.93316650390625, "reward_std": 0.47487300634384155, "rewards/helpfulness_reward/mean": 1.1333284378051758, "rewards/helpfulness_reward/std": 0.8633084297180176, "rewards/safety_reward/mean": 7.93316650390625, "rewards/safety_reward/std": 1.8410176038742065, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 52.5859375, "completions/mean_terminated_length": 52.5859375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.14321893284429307, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3583295941352844, "kl": 3.1640625, "learning_rate": 5e-05, "loss": 0.0525, "num_tokens": 6322138.0, "reward": 8.177734375, "reward_std": 0.28123733401298523, "rewards/helpfulness_reward/mean": 1.1887645721435547, "rewards/helpfulness_reward/std": 0.8696140050888062, "rewards/safety_reward/mean": 8.177734375, "rewards/safety_reward/std": 0.985740065574646, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 52.84375, "completions/mean_terminated_length": 52.84375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.14356824731464501, "frac_reward_zero_std": 0.0, "grad_norm": 1.2147618532180786, "kl": 3.818359375, "learning_rate": 5e-05, "loss": 0.0664, "num_tokens": 6333390.0, "reward": 8.51220703125, "reward_std": 0.4196009635925293, "rewards/helpfulness_reward/mean": 1.122054100036621, "rewards/helpfulness_reward/std": 0.7525345087051392, "rewards/safety_reward/mean": 8.51220703125, "rewards/safety_reward/std": 1.5131386518478394, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 52.7421875, "completions/mean_terminated_length": 52.7421875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.14391756178499696, "frac_reward_zero_std": 0.0, "grad_norm": 0.619175374507904, "kl": 3.60546875, "learning_rate": 5e-05, "loss": 0.0588, "num_tokens": 6343949.0, "reward": 8.081298828125, "reward_std": 0.40469348430633545, "rewards/helpfulness_reward/mean": 1.1854829788208008, "rewards/helpfulness_reward/std": 0.6150726079940796, "rewards/safety_reward/mean": 8.081298828125, "rewards/safety_reward/std": 0.9072747230529785, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 51.953125, "completions/mean_terminated_length": 51.953125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.14426687625534887, "frac_reward_zero_std": 0.0, "grad_norm": 0.46012261509895325, "kl": 3.18359375, "learning_rate": 5e-05, "loss": 0.042, "num_tokens": 6354263.0, "reward": 7.8426513671875, "reward_std": 0.19574663043022156, "rewards/helpfulness_reward/mean": 1.060373306274414, "rewards/helpfulness_reward/std": 0.49643564224243164, "rewards/safety_reward/mean": 7.8426513671875, "rewards/safety_reward/std": 1.2350938320159912, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 52.28125, "completions/mean_terminated_length": 52.28125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.1446161907257008, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6169764995574951, "kl": 3.75, "learning_rate": 5e-05, "loss": 0.048, "num_tokens": 6364755.0, "reward": 7.792724609375, "reward_std": 0.26199275255203247, "rewards/helpfulness_reward/mean": 1.0057202577590942, "rewards/helpfulness_reward/std": 0.5873374342918396, "rewards/safety_reward/mean": 7.792724609375, "rewards/safety_reward/std": 1.106307029724121, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 51.9375, "completions/mean_terminated_length": 51.9375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.14496550519605275, "frac_reward_zero_std": 0.1875, "grad_norm": 0.36632686853408813, "kl": 3.46875, "learning_rate": 5e-05, "loss": 0.0358, "num_tokens": 6374995.0, "reward": 8.345947265625, "reward_std": 0.21430706977844238, "rewards/helpfulness_reward/mean": 0.9075860977172852, "rewards/helpfulness_reward/std": 0.6455698013305664, "rewards/safety_reward/mean": 8.345947265625, "rewards/safety_reward/std": 0.98372882604599, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 52.8671875, "completions/mean_terminated_length": 52.8671875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.1453148196664047, "frac_reward_zero_std": 0.0, "grad_norm": 0.642648458480835, "kl": 3.474609375, "learning_rate": 5e-05, "loss": 0.0608, "num_tokens": 6387586.0, "reward": 7.892822265625, "reward_std": 0.34229958057403564, "rewards/helpfulness_reward/mean": 0.8496856689453125, "rewards/helpfulness_reward/std": 0.8204764723777771, "rewards/safety_reward/mean": 7.892822265625, "rewards/safety_reward/std": 0.9912237524986267, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 52.265625, "completions/mean_terminated_length": 52.265625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.1456641341367566, "frac_reward_zero_std": 0.0, "grad_norm": 0.7418199777603149, "kl": 3.56640625, "learning_rate": 5e-05, "loss": 0.0469, "num_tokens": 6398876.0, "reward": 7.96826171875, "reward_std": 0.41274553537368774, "rewards/helpfulness_reward/mean": 0.8803262710571289, "rewards/helpfulness_reward/std": 0.5278185606002808, "rewards/safety_reward/mean": 7.96826171875, "rewards/safety_reward/std": 1.1434133052825928, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 107.0, "completions/max_terminated_length": 107.0, "completions/mean_length": 52.3203125, "completions/mean_terminated_length": 52.3203125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.14601344860710855, "frac_reward_zero_std": 0.0, "grad_norm": 0.5110577344894409, "kl": 3.2734375, "learning_rate": 5e-05, "loss": 0.0518, "num_tokens": 6410589.0, "reward": 8.66650390625, "reward_std": 0.3519807457923889, "rewards/helpfulness_reward/mean": 1.2582507133483887, "rewards/helpfulness_reward/std": 0.5950504541397095, "rewards/safety_reward/mean": 8.66650390625, "rewards/safety_reward/std": 1.0159674882888794, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 51.453125, "completions/mean_terminated_length": 51.453125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1463627630774605, "frac_reward_zero_std": 0.0, "grad_norm": 0.4808948040008545, "kl": 3.35546875, "learning_rate": 5e-05, "loss": 0.0106, "num_tokens": 6421399.0, "reward": 8.4473876953125, "reward_std": 0.5260929465293884, "rewards/helpfulness_reward/mean": 1.2825608253479004, "rewards/helpfulness_reward/std": 0.7106883525848389, "rewards/safety_reward/mean": 8.4473876953125, "rewards/safety_reward/std": 1.0290048122406006, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 51.9296875, "completions/mean_terminated_length": 51.9296875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.14671207754781243, "frac_reward_zero_std": 0.0, "grad_norm": 0.5075982809066772, "kl": 3.23828125, "learning_rate": 5e-05, "loss": 0.0307, "num_tokens": 6432670.0, "reward": 8.087890625, "reward_std": 0.31882399320602417, "rewards/helpfulness_reward/mean": 1.0872764587402344, "rewards/helpfulness_reward/std": 0.7932888865470886, "rewards/safety_reward/mean": 8.087890625, "rewards/safety_reward/std": 1.2775009870529175, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 51.8359375, "completions/mean_terminated_length": 51.8359375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.14706139201816434, "frac_reward_zero_std": 0.0, "grad_norm": 0.5040799975395203, "kl": 3.462890625, "learning_rate": 5e-05, "loss": 0.025, "num_tokens": 6443313.0, "reward": 8.37646484375, "reward_std": 0.2855238914489746, "rewards/helpfulness_reward/mean": 1.0123748779296875, "rewards/helpfulness_reward/std": 0.6182407736778259, "rewards/safety_reward/mean": 8.37646484375, "rewards/safety_reward/std": 0.8606534004211426, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 51.71875, "completions/mean_terminated_length": 51.71875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.14741070648851629, "frac_reward_zero_std": 0.0, "grad_norm": 0.5125418305397034, "kl": 3.455078125, "learning_rate": 5e-05, "loss": 0.0296, "num_tokens": 6453869.0, "reward": 7.853759765625, "reward_std": 0.3502154052257538, "rewards/helpfulness_reward/mean": 1.0219841003417969, "rewards/helpfulness_reward/std": 0.979918360710144, "rewards/safety_reward/mean": 7.853759765625, "rewards/safety_reward/std": 1.3216191530227661, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.14776002095886823, "frac_reward_zero_std": 0.0, "grad_norm": 111.90654754638672, "kl": 30.5703125, "learning_rate": 5e-05, "loss": 0.3125, "num_tokens": 6466373.0, "reward": 8.514892578125, "reward_std": 0.3373253345489502, "rewards/helpfulness_reward/mean": 0.8979759216308594, "rewards/helpfulness_reward/std": 0.8056710958480835, "rewards/safety_reward/mean": 8.514892578125, "rewards/safety_reward/std": 1.1402480602264404, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 52.4296875, "completions/mean_terminated_length": 52.4296875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.14810933542922017, "frac_reward_zero_std": 0.0, "grad_norm": 0.5462226271629333, "kl": 3.220703125, "learning_rate": 5e-05, "loss": 0.0406, "num_tokens": 6477868.0, "reward": 8.5166015625, "reward_std": 0.31889408826828003, "rewards/helpfulness_reward/mean": 0.6239683628082275, "rewards/helpfulness_reward/std": 0.6883863806724548, "rewards/safety_reward/mean": 8.5166015625, "rewards/safety_reward/std": 1.096070647239685, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 52.265625, "completions/mean_terminated_length": 52.265625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.14845864989957208, "frac_reward_zero_std": 0.0, "grad_norm": 0.46806344389915466, "kl": 3.171875, "learning_rate": 5e-05, "loss": 0.0265, "num_tokens": 6489318.0, "reward": 8.55517578125, "reward_std": 0.35721415281295776, "rewards/helpfulness_reward/mean": 1.1084403991699219, "rewards/helpfulness_reward/std": 0.7136748433113098, "rewards/safety_reward/mean": 8.55517578125, "rewards/safety_reward/std": 1.0949093103408813, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 51.453125, "completions/mean_terminated_length": 51.453125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.14880796436992402, "frac_reward_zero_std": 0.0, "grad_norm": 0.5792899131774902, "kl": 3.44921875, "learning_rate": 5e-05, "loss": 0.0357, "num_tokens": 6501936.0, "reward": 7.42529296875, "reward_std": 0.4626159071922302, "rewards/helpfulness_reward/mean": 0.6482264399528503, "rewards/helpfulness_reward/std": 0.8811198472976685, "rewards/safety_reward/mean": 7.42529296875, "rewards/safety_reward/std": 2.1635682582855225, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 52.8515625, "completions/mean_terminated_length": 52.8515625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.14915727884027596, "frac_reward_zero_std": 0.0, "grad_norm": 0.529728889465332, "kl": 3.470703125, "learning_rate": 5e-05, "loss": 0.042, "num_tokens": 6513765.0, "reward": 7.738037109375, "reward_std": 0.4036500155925751, "rewards/helpfulness_reward/mean": 0.7840967178344727, "rewards/helpfulness_reward/std": 0.8544482588768005, "rewards/safety_reward/mean": 7.738037109375, "rewards/safety_reward/std": 1.2803231477737427, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 52.3828125, "completions/mean_terminated_length": 52.3828125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.1495065933106279, "frac_reward_zero_std": 0.0, "grad_norm": 0.46859854459762573, "kl": 3.349609375, "learning_rate": 5e-05, "loss": 0.0322, "num_tokens": 6525294.0, "reward": 7.84912109375, "reward_std": 0.4080979824066162, "rewards/helpfulness_reward/mean": 0.8919146060943604, "rewards/helpfulness_reward/std": 0.9332273602485657, "rewards/safety_reward/mean": 7.84912109375, "rewards/safety_reward/std": 1.2349494695663452, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 52.6953125, "completions/mean_terminated_length": 52.6953125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.14985590778097982, "frac_reward_zero_std": 0.0, "grad_norm": 0.46037521958351135, "kl": 3.26953125, "learning_rate": 5e-05, "loss": 0.0312, "num_tokens": 6537055.0, "reward": 7.7496337890625, "reward_std": 0.34580153226852417, "rewards/helpfulness_reward/mean": 0.8039302825927734, "rewards/helpfulness_reward/std": 0.9926039576530457, "rewards/safety_reward/mean": 7.7496337890625, "rewards/safety_reward/std": 1.5579639673233032, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 52.5234375, "completions/mean_terminated_length": 52.5234375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.15020522225133176, "frac_reward_zero_std": 0.0, "grad_norm": 0.4734418988227844, "kl": 3.494140625, "learning_rate": 5e-05, "loss": 0.0236, "num_tokens": 6547786.0, "reward": 8.23486328125, "reward_std": 0.40129268169403076, "rewards/helpfulness_reward/mean": 0.8921098709106445, "rewards/helpfulness_reward/std": 0.8419246673583984, "rewards/safety_reward/mean": 8.23486328125, "rewards/safety_reward/std": 1.0617903470993042, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 51.5234375, "completions/mean_terminated_length": 51.5234375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1505545367216837, "frac_reward_zero_std": 0.0, "grad_norm": 0.559692919254303, "kl": 3.234375, "learning_rate": 5e-05, "loss": 0.0313, "num_tokens": 6560853.0, "reward": 7.69146728515625, "reward_std": 0.3582197427749634, "rewards/helpfulness_reward/mean": 0.786738395690918, "rewards/helpfulness_reward/std": 0.7167508006095886, "rewards/safety_reward/mean": 7.69146728515625, "rewards/safety_reward/std": 1.5832796096801758, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 52.9765625, "completions/mean_terminated_length": 52.9765625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.15090385119203564, "frac_reward_zero_std": 0.0, "grad_norm": 0.7544333338737488, "kl": 3.677734375, "learning_rate": 5e-05, "loss": 0.0412, "num_tokens": 6571490.0, "reward": 7.6064453125, "reward_std": 0.28963956236839294, "rewards/helpfulness_reward/mean": 0.9264793395996094, "rewards/helpfulness_reward/std": 0.668559193611145, "rewards/safety_reward/mean": 7.6064453125, "rewards/safety_reward/std": 1.385802149772644, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 52.53125, "completions/mean_terminated_length": 52.53125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.15125316566238756, "frac_reward_zero_std": 0.0, "grad_norm": 0.5530776977539062, "kl": 3.466796875, "learning_rate": 5e-05, "loss": 0.0428, "num_tokens": 6583182.0, "reward": 8.02740478515625, "reward_std": 0.4270966053009033, "rewards/helpfulness_reward/mean": 0.9275951385498047, "rewards/helpfulness_reward/std": 1.0129446983337402, "rewards/safety_reward/mean": 8.02740478515625, "rewards/safety_reward/std": 1.9301164150238037, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 52.875, "completions/mean_terminated_length": 52.875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.1516024801327395, "frac_reward_zero_std": 0.0, "grad_norm": 0.5250292420387268, "kl": 3.322265625, "learning_rate": 5e-05, "loss": 0.04, "num_tokens": 6595278.0, "reward": 8.04541015625, "reward_std": 0.35664770007133484, "rewards/helpfulness_reward/mean": 0.924004077911377, "rewards/helpfulness_reward/std": 0.6564435958862305, "rewards/safety_reward/mean": 8.04541015625, "rewards/safety_reward/std": 1.2785245180130005, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 53.1640625, "completions/mean_terminated_length": 53.1640625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.15195179460309144, "frac_reward_zero_std": 0.0, "grad_norm": 1.8946443796157837, "kl": 4.255859375, "learning_rate": 5e-05, "loss": 0.0585, "num_tokens": 6606067.0, "reward": 8.57080078125, "reward_std": 0.26062503457069397, "rewards/helpfulness_reward/mean": 0.9623298645019531, "rewards/helpfulness_reward/std": 0.8354560732841492, "rewards/safety_reward/mean": 8.57080078125, "rewards/safety_reward/std": 1.2478172779083252, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 53.140625, "completions/mean_terminated_length": 53.140625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.15230110907344338, "frac_reward_zero_std": 0.0, "grad_norm": 0.6534906625747681, "kl": 3.765625, "learning_rate": 5e-05, "loss": 0.0473, "num_tokens": 6617317.0, "reward": 7.4931640625, "reward_std": 0.22142714262008667, "rewards/helpfulness_reward/mean": 0.5643672943115234, "rewards/helpfulness_reward/std": 0.7161004543304443, "rewards/safety_reward/mean": 7.4931640625, "rewards/safety_reward/std": 1.1815142631530762, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 53.0234375, "completions/mean_terminated_length": 53.0234375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.1526504235437953, "frac_reward_zero_std": 0.0, "grad_norm": 0.4275021255016327, "kl": 3.390625, "learning_rate": 5e-05, "loss": 0.0339, "num_tokens": 6628624.0, "reward": 7.6455078125, "reward_std": 0.3281796872615814, "rewards/helpfulness_reward/mean": 1.0116524696350098, "rewards/helpfulness_reward/std": 0.7319703102111816, "rewards/safety_reward/mean": 7.6455078125, "rewards/safety_reward/std": 1.4438858032226562, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 53.5546875, "completions/mean_terminated_length": 53.5546875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.15299973801414724, "frac_reward_zero_std": 0.0, "grad_norm": 0.5013652443885803, "kl": 3.576171875, "learning_rate": 5e-05, "loss": 0.0592, "num_tokens": 6639743.0, "reward": 8.03515625, "reward_std": 0.347358763217926, "rewards/helpfulness_reward/mean": 0.870151937007904, "rewards/helpfulness_reward/std": 0.9565660357475281, "rewards/safety_reward/mean": 8.03515625, "rewards/safety_reward/std": 1.6443333625793457, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 53.6328125, "completions/mean_terminated_length": 53.6328125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.15334905248449918, "frac_reward_zero_std": 0.0, "grad_norm": 0.5099378228187561, "kl": 3.408203125, "learning_rate": 5e-05, "loss": 0.0531, "num_tokens": 6650872.0, "reward": 8.36767578125, "reward_std": 0.33382725715637207, "rewards/helpfulness_reward/mean": 0.9546890258789062, "rewards/helpfulness_reward/std": 0.6283095479011536, "rewards/safety_reward/mean": 8.36767578125, "rewards/safety_reward/std": 0.8923036456108093, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 53.328125, "completions/mean_terminated_length": 53.328125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.15369836695485112, "frac_reward_zero_std": 0.0, "grad_norm": 1.6436715126037598, "kl": 4.251953125, "learning_rate": 5e-05, "loss": 0.0489, "num_tokens": 6661538.0, "reward": 8.348876953125, "reward_std": 0.26266705989837646, "rewards/helpfulness_reward/mean": 0.7305731773376465, "rewards/helpfulness_reward/std": 0.6479461193084717, "rewards/safety_reward/mean": 8.348876953125, "rewards/safety_reward/std": 1.11964750289917, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 53.0859375, "completions/mean_terminated_length": 53.0859375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.15404768142520303, "frac_reward_zero_std": 0.0, "grad_norm": 0.5977259874343872, "kl": 3.509765625, "learning_rate": 5e-05, "loss": 0.0516, "num_tokens": 6672749.0, "reward": 8.1090087890625, "reward_std": 0.26667124032974243, "rewards/helpfulness_reward/mean": 0.8723154067993164, "rewards/helpfulness_reward/std": 1.0034204721450806, "rewards/safety_reward/mean": 8.1090087890625, "rewards/safety_reward/std": 1.2931668758392334, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 52.7265625, "completions/mean_terminated_length": 52.7265625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.15439699589555497, "frac_reward_zero_std": 0.0, "grad_norm": 0.7378703355789185, "kl": 3.583984375, "learning_rate": 5e-05, "loss": 0.0391, "num_tokens": 6683090.0, "reward": 8.208740234375, "reward_std": 0.39959341287612915, "rewards/helpfulness_reward/mean": 1.1313495635986328, "rewards/helpfulness_reward/std": 0.7280312180519104, "rewards/safety_reward/mean": 8.208740234375, "rewards/safety_reward/std": 1.0952986478805542, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 52.765625, "completions/mean_terminated_length": 52.765625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.15474631036590691, "frac_reward_zero_std": 0.0, "grad_norm": 0.3716745674610138, "kl": 3.580078125, "learning_rate": 5e-05, "loss": 0.0457, "num_tokens": 6694740.0, "reward": 8.2275390625, "reward_std": 0.22875934839248657, "rewards/helpfulness_reward/mean": 0.9269165992736816, "rewards/helpfulness_reward/std": 0.8662680387496948, "rewards/safety_reward/mean": 8.2275390625, "rewards/safety_reward/std": 0.7899655103683472, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 52.109375, "completions/mean_terminated_length": 52.109375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.15509562483625886, "frac_reward_zero_std": 0.0, "grad_norm": 0.6043615341186523, "kl": 3.41015625, "learning_rate": 5e-05, "loss": 0.0231, "num_tokens": 6705338.0, "reward": 7.987548828125, "reward_std": 0.398752361536026, "rewards/helpfulness_reward/mean": 1.2930221557617188, "rewards/helpfulness_reward/std": 0.7956570386886597, "rewards/safety_reward/mean": 7.987548828125, "rewards/safety_reward/std": 0.996173620223999, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 52.96875, "completions/mean_terminated_length": 52.96875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.15544493930661077, "frac_reward_zero_std": 0.0, "grad_norm": 0.5079682469367981, "kl": 3.720703125, "learning_rate": 5e-05, "loss": 0.0562, "num_tokens": 6716062.0, "reward": 8.138916015625, "reward_std": 0.34554749727249146, "rewards/helpfulness_reward/mean": 1.455596923828125, "rewards/helpfulness_reward/std": 0.6237953901290894, "rewards/safety_reward/mean": 8.138916015625, "rewards/safety_reward/std": 1.544561505317688, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 52.3203125, "completions/mean_terminated_length": 52.3203125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.1557942537769627, "frac_reward_zero_std": 0.0, "grad_norm": 0.4135846495628357, "kl": 3.314453125, "learning_rate": 5e-05, "loss": 0.0284, "num_tokens": 6726463.0, "reward": 8.26123046875, "reward_std": 0.2280336618423462, "rewards/helpfulness_reward/mean": 1.5718574523925781, "rewards/helpfulness_reward/std": 0.7708855867385864, "rewards/safety_reward/mean": 8.26123046875, "rewards/safety_reward/std": 0.9645437598228455, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 52.1640625, "completions/mean_terminated_length": 52.1640625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.15614356824731465, "frac_reward_zero_std": 0.0, "grad_norm": 0.520263135433197, "kl": 3.1640625, "learning_rate": 5e-05, "loss": 0.0314, "num_tokens": 6737252.0, "reward": 8.31689453125, "reward_std": 0.25371867418289185, "rewards/helpfulness_reward/mean": 1.4851980209350586, "rewards/helpfulness_reward/std": 0.591541588306427, "rewards/safety_reward/mean": 8.31689453125, "rewards/safety_reward/std": 1.0546311140060425, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 52.4921875, "completions/mean_terminated_length": 52.4921875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.1564928827176666, "frac_reward_zero_std": 0.0, "grad_norm": 0.5169287919998169, "kl": 3.3046875, "learning_rate": 5e-05, "loss": 0.0457, "num_tokens": 6748915.0, "reward": 7.73016357421875, "reward_std": 0.3554341793060303, "rewards/helpfulness_reward/mean": 1.4111790657043457, "rewards/helpfulness_reward/std": 1.0635206699371338, "rewards/safety_reward/mean": 7.73016357421875, "rewards/safety_reward/std": 2.1485061645507812, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 52.46875, "completions/mean_terminated_length": 52.46875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.1568421971880185, "frac_reward_zero_std": 0.0, "grad_norm": 0.37769967317581177, "kl": 3.333984375, "learning_rate": 5e-05, "loss": 0.0338, "num_tokens": 6759495.0, "reward": 8.56201171875, "reward_std": 0.30329760909080505, "rewards/helpfulness_reward/mean": 1.5981216430664062, "rewards/helpfulness_reward/std": 0.6758284568786621, "rewards/safety_reward/mean": 8.56201171875, "rewards/safety_reward/std": 1.013511300086975, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 52.7109375, "completions/mean_terminated_length": 52.7109375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.15719151165837045, "frac_reward_zero_std": 0.0, "grad_norm": 0.5431464910507202, "kl": 3.66015625, "learning_rate": 5e-05, "loss": 0.0509, "num_tokens": 6770362.0, "reward": 8.143798828125, "reward_std": 0.3541565537452698, "rewards/helpfulness_reward/mean": 1.5672307014465332, "rewards/helpfulness_reward/std": 0.6163500547409058, "rewards/safety_reward/mean": 8.143798828125, "rewards/safety_reward/std": 1.0651984214782715, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 52.453125, "completions/mean_terminated_length": 52.453125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.1575408261287224, "frac_reward_zero_std": 0.0, "grad_norm": 0.4034031629562378, "kl": 3.58203125, "learning_rate": 5e-05, "loss": 0.0332, "num_tokens": 6781620.0, "reward": 8.33349609375, "reward_std": 0.23923632502555847, "rewards/helpfulness_reward/mean": 1.6154975891113281, "rewards/helpfulness_reward/std": 0.8552731275558472, "rewards/safety_reward/mean": 8.33349609375, "rewards/safety_reward/std": 1.0256694555282593, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 52.40625, "completions/mean_terminated_length": 52.40625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.15789014059907433, "frac_reward_zero_std": 0.0, "grad_norm": 0.42902612686157227, "kl": 3.296875, "learning_rate": 5e-05, "loss": 0.0408, "num_tokens": 6792664.0, "reward": 7.99853515625, "reward_std": 0.29355764389038086, "rewards/helpfulness_reward/mean": 1.5814857482910156, "rewards/helpfulness_reward/std": 0.7715524435043335, "rewards/safety_reward/mean": 7.99853515625, "rewards/safety_reward/std": 0.8682145476341248, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 52.375, "completions/mean_terminated_length": 52.375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.15823945506942624, "frac_reward_zero_std": 0.0, "grad_norm": 0.4153858423233032, "kl": 3.43359375, "learning_rate": 5e-05, "loss": 0.037, "num_tokens": 6804632.0, "reward": 8.118408203125, "reward_std": 0.23865348100662231, "rewards/helpfulness_reward/mean": 1.3323554992675781, "rewards/helpfulness_reward/std": 0.7012885808944702, "rewards/safety_reward/mean": 8.118408203125, "rewards/safety_reward/std": 0.7084609866142273, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 52.15625, "completions/mean_terminated_length": 52.15625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.15858876953977818, "frac_reward_zero_std": 0.0, "grad_norm": 0.3034946322441101, "kl": 3.361328125, "learning_rate": 5e-05, "loss": 0.0335, "num_tokens": 6816628.0, "reward": 8.0345458984375, "reward_std": 0.1702490746974945, "rewards/helpfulness_reward/mean": 1.640383243560791, "rewards/helpfulness_reward/std": 0.8495433330535889, "rewards/safety_reward/mean": 8.0345458984375, "rewards/safety_reward/std": 1.4053853750228882, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 52.5859375, "completions/mean_terminated_length": 52.5859375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.15893808401013013, "frac_reward_zero_std": 0.0, "grad_norm": 0.4300536811351776, "kl": 3.55078125, "learning_rate": 5e-05, "loss": 0.0433, "num_tokens": 6827503.0, "reward": 8.523193359375, "reward_std": 0.27926063537597656, "rewards/helpfulness_reward/mean": 1.43245530128479, "rewards/helpfulness_reward/std": 0.747684121131897, "rewards/safety_reward/mean": 8.523193359375, "rewards/safety_reward/std": 0.8329335451126099, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 52.875, "completions/mean_terminated_length": 52.875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.15928739848048207, "frac_reward_zero_std": 0.0, "grad_norm": 0.4927482604980469, "kl": 3.509765625, "learning_rate": 5e-05, "loss": 0.045, "num_tokens": 6838775.0, "reward": 8.283203125, "reward_std": 0.2643720507621765, "rewards/helpfulness_reward/mean": 1.606597900390625, "rewards/helpfulness_reward/std": 0.748440682888031, "rewards/safety_reward/mean": 8.283203125, "rewards/safety_reward/std": 0.8915903568267822, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 52.6640625, "completions/mean_terminated_length": 52.6640625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.15963671295083398, "frac_reward_zero_std": 0.0625, "grad_norm": 0.4589694142341614, "kl": 3.384765625, "learning_rate": 5e-05, "loss": 0.0443, "num_tokens": 6849772.0, "reward": 8.385009765625, "reward_std": 0.20711404085159302, "rewards/helpfulness_reward/mean": 1.3884143829345703, "rewards/helpfulness_reward/std": 0.8938432931900024, "rewards/safety_reward/mean": 8.385009765625, "rewards/safety_reward/std": 1.0745819807052612, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 53.453125, "completions/mean_terminated_length": 53.453125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.15998602742118592, "frac_reward_zero_std": 0.0, "grad_norm": 0.4278115928173065, "kl": 3.5234375, "learning_rate": 5e-05, "loss": 0.0729, "num_tokens": 6861670.0, "reward": 7.6365966796875, "reward_std": 0.36114558577537537, "rewards/helpfulness_reward/mean": 1.230604887008667, "rewards/helpfulness_reward/std": 0.8162372708320618, "rewards/safety_reward/mean": 7.6365966796875, "rewards/safety_reward/std": 1.7735633850097656, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 52.65625, "completions/mean_terminated_length": 52.65625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.16033534189153786, "frac_reward_zero_std": 0.0, "grad_norm": 0.46250590682029724, "kl": 3.30859375, "learning_rate": 5e-05, "loss": 0.0481, "num_tokens": 6872682.0, "reward": 8.060546875, "reward_std": 0.30333182215690613, "rewards/helpfulness_reward/mean": 1.547098159790039, "rewards/helpfulness_reward/std": 0.6226441264152527, "rewards/safety_reward/mean": 8.060546875, "rewards/safety_reward/std": 0.8830805420875549, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 52.5625, "completions/mean_terminated_length": 52.5625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.1606846563618898, "frac_reward_zero_std": 0.0, "grad_norm": 0.5281779766082764, "kl": 3.67578125, "learning_rate": 5e-05, "loss": 0.0524, "num_tokens": 6883074.0, "reward": 8.060791015625, "reward_std": 0.3666194677352905, "rewards/helpfulness_reward/mean": 1.4315810203552246, "rewards/helpfulness_reward/std": 1.0619112253189087, "rewards/safety_reward/mean": 8.060791015625, "rewards/safety_reward/std": 1.0438917875289917, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 52.3828125, "completions/mean_terminated_length": 52.3828125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.16103397083224172, "frac_reward_zero_std": 0.0, "grad_norm": 0.3854823410511017, "kl": 3.18359375, "learning_rate": 5e-05, "loss": 0.0355, "num_tokens": 6893771.0, "reward": 8.22412109375, "reward_std": 0.2931646406650543, "rewards/helpfulness_reward/mean": 1.3461923599243164, "rewards/helpfulness_reward/std": 0.9264986515045166, "rewards/safety_reward/mean": 8.22412109375, "rewards/safety_reward/std": 0.9469756484031677, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 52.828125, "completions/mean_terminated_length": 52.828125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.16138328530259366, "frac_reward_zero_std": 0.0, "grad_norm": 0.5897141098976135, "kl": 3.029296875, "learning_rate": 5e-05, "loss": 0.0505, "num_tokens": 6904541.0, "reward": 9.154296875, "reward_std": 0.27110400795936584, "rewards/helpfulness_reward/mean": 1.7267532348632812, "rewards/helpfulness_reward/std": 0.793089747428894, "rewards/safety_reward/mean": 9.154296875, "rewards/safety_reward/std": 0.8234960436820984, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 53.1640625, "completions/mean_terminated_length": 53.1640625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.1617325997729456, "frac_reward_zero_std": 0.0, "grad_norm": 0.4714491367340088, "kl": 3.33203125, "learning_rate": 5e-05, "loss": 0.0524, "num_tokens": 6916650.0, "reward": 8.044921875, "reward_std": 0.34826427698135376, "rewards/helpfulness_reward/mean": 1.33491849899292, "rewards/helpfulness_reward/std": 1.0484633445739746, "rewards/safety_reward/mean": 8.044921875, "rewards/safety_reward/std": 1.3492109775543213, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 52.84375, "completions/mean_terminated_length": 52.84375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.16208191424329754, "frac_reward_zero_std": 0.0, "grad_norm": 0.4574874937534332, "kl": 3.375, "learning_rate": 5e-05, "loss": 0.0494, "num_tokens": 6928262.0, "reward": 7.884765625, "reward_std": 0.21402671933174133, "rewards/helpfulness_reward/mean": 1.1767935752868652, "rewards/helpfulness_reward/std": 0.7616832852363586, "rewards/safety_reward/mean": 7.884765625, "rewards/safety_reward/std": 1.0030615329742432, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 52.3671875, "completions/mean_terminated_length": 52.3671875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.16243122871364946, "frac_reward_zero_std": 0.0, "grad_norm": 0.41839444637298584, "kl": 3.412109375, "learning_rate": 5e-05, "loss": 0.0317, "num_tokens": 6939997.0, "reward": 8.5810546875, "reward_std": 0.23040567338466644, "rewards/helpfulness_reward/mean": 1.362019419670105, "rewards/helpfulness_reward/std": 0.770763635635376, "rewards/safety_reward/mean": 8.5810546875, "rewards/safety_reward/std": 1.0404988527297974, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 52.796875, "completions/mean_terminated_length": 52.796875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.1627805431840014, "frac_reward_zero_std": 0.0, "grad_norm": 0.5255281925201416, "kl": 3.353515625, "learning_rate": 5e-05, "loss": 0.0381, "num_tokens": 6952155.0, "reward": 9.0185546875, "reward_std": 0.2525489628314972, "rewards/helpfulness_reward/mean": 1.7582931518554688, "rewards/helpfulness_reward/std": 0.7907862067222595, "rewards/safety_reward/mean": 9.0185546875, "rewards/safety_reward/std": 0.8916652798652649, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 52.6875, "completions/mean_terminated_length": 52.6875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.16312985765435334, "frac_reward_zero_std": 0.0, "grad_norm": 0.47052013874053955, "kl": 3.408203125, "learning_rate": 5e-05, "loss": 0.0253, "num_tokens": 6962411.0, "reward": 7.809326171875, "reward_std": 0.3076476454734802, "rewards/helpfulness_reward/mean": 0.8496122360229492, "rewards/helpfulness_reward/std": 0.7992796897888184, "rewards/safety_reward/mean": 7.809326171875, "rewards/safety_reward/std": 1.0410927534103394, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 52.578125, "completions/mean_terminated_length": 52.578125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.16347917212470525, "frac_reward_zero_std": 0.0, "grad_norm": 2.109360456466675, "kl": 4.30078125, "learning_rate": 5e-05, "loss": 0.0456, "num_tokens": 6973845.0, "reward": 8.289306640625, "reward_std": 0.27515411376953125, "rewards/helpfulness_reward/mean": 1.1542155742645264, "rewards/helpfulness_reward/std": 0.8202416896820068, "rewards/safety_reward/mean": 8.289306640625, "rewards/safety_reward/std": 1.235074758529663, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 52.6328125, "completions/mean_terminated_length": 52.6328125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.1638284865950572, "frac_reward_zero_std": 0.0, "grad_norm": 0.687719464302063, "kl": 3.595703125, "learning_rate": 5e-05, "loss": 0.0391, "num_tokens": 6984958.0, "reward": 8.670166015625, "reward_std": 0.20830225944519043, "rewards/helpfulness_reward/mean": 1.5118560791015625, "rewards/helpfulness_reward/std": 0.9120919108390808, "rewards/safety_reward/mean": 8.670166015625, "rewards/safety_reward/std": 0.8318764567375183, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 52.5234375, "completions/mean_terminated_length": 52.5234375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.16417780106540913, "frac_reward_zero_std": 0.0625, "grad_norm": 1.2045632600784302, "kl": 4.0, "learning_rate": 5e-05, "loss": 0.0392, "num_tokens": 6995545.0, "reward": 8.630859375, "reward_std": 0.2709299325942993, "rewards/helpfulness_reward/mean": 1.0732173919677734, "rewards/helpfulness_reward/std": 0.7327713966369629, "rewards/safety_reward/mean": 8.630859375, "rewards/safety_reward/std": 0.9659857749938965, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 52.7109375, "completions/mean_terminated_length": 52.7109375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.16452711553576108, "frac_reward_zero_std": 0.0625, "grad_norm": 0.46299970149993896, "kl": 3.4609375, "learning_rate": 5e-05, "loss": 0.0367, "num_tokens": 7006412.0, "reward": 7.802001953125, "reward_std": 0.1997363269329071, "rewards/helpfulness_reward/mean": 0.8442779183387756, "rewards/helpfulness_reward/std": 0.9196763634681702, "rewards/safety_reward/mean": 7.802001953125, "rewards/safety_reward/std": 1.322638988494873, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 52.6640625, "completions/mean_terminated_length": 52.6640625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.164876430006113, "frac_reward_zero_std": 0.0, "grad_norm": 0.6349869966506958, "kl": 3.50390625, "learning_rate": 5e-05, "loss": 0.0293, "num_tokens": 7017089.0, "reward": 8.968505859375, "reward_std": 0.29262128472328186, "rewards/helpfulness_reward/mean": 1.443939208984375, "rewards/helpfulness_reward/std": 0.7155949473381042, "rewards/safety_reward/mean": 8.968505859375, "rewards/safety_reward/std": 1.0109474658966064, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 52.2578125, "completions/mean_terminated_length": 52.2578125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.16522574447646493, "frac_reward_zero_std": 0.0625, "grad_norm": 0.39769840240478516, "kl": 3.79296875, "learning_rate": 5e-05, "loss": 0.0233, "num_tokens": 7027586.0, "reward": 8.68115234375, "reward_std": 0.27792298793792725, "rewards/helpfulness_reward/mean": 0.9959220886230469, "rewards/helpfulness_reward/std": 0.7411127686500549, "rewards/safety_reward/mean": 8.68115234375, "rewards/safety_reward/std": 1.0762295722961426, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 53.0078125, "completions/mean_terminated_length": 53.0078125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.16557505894681687, "frac_reward_zero_std": 0.0625, "grad_norm": 0.47989100217819214, "kl": 3.376953125, "learning_rate": 5e-05, "loss": 0.04, "num_tokens": 7038411.0, "reward": 8.8349609375, "reward_std": 0.24538737535476685, "rewards/helpfulness_reward/mean": 1.4084930419921875, "rewards/helpfulness_reward/std": 0.7325749397277832, "rewards/safety_reward/mean": 8.8349609375, "rewards/safety_reward/std": 1.0266947746276855, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 52.84375, "completions/mean_terminated_length": 52.84375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.1659243734171688, "frac_reward_zero_std": 0.0, "grad_norm": 0.49616196751594543, "kl": 3.517578125, "learning_rate": 5e-05, "loss": 0.0403, "num_tokens": 7050511.0, "reward": 8.220703125, "reward_std": 0.4002026915550232, "rewards/helpfulness_reward/mean": 1.0558595657348633, "rewards/helpfulness_reward/std": 0.7631142735481262, "rewards/safety_reward/mean": 8.220703125, "rewards/safety_reward/std": 1.419303297996521, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 53.0078125, "completions/mean_terminated_length": 53.0078125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.16627368788752073, "frac_reward_zero_std": 0.0, "grad_norm": 0.46334871649742126, "kl": 3.2578125, "learning_rate": 5e-05, "loss": 0.0364, "num_tokens": 7061248.0, "reward": 8.138427734375, "reward_std": 0.42471206188201904, "rewards/helpfulness_reward/mean": 0.8621635437011719, "rewards/helpfulness_reward/std": 0.8717804551124573, "rewards/safety_reward/mean": 8.138427734375, "rewards/safety_reward/std": 1.1258996725082397, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 52.796875, "completions/mean_terminated_length": 52.796875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.16662300235787267, "frac_reward_zero_std": 0.0, "grad_norm": 0.6808704137802124, "kl": 3.5234375, "learning_rate": 5e-05, "loss": 0.0294, "num_tokens": 7071382.0, "reward": 7.646728515625, "reward_std": 0.48440125584602356, "rewards/helpfulness_reward/mean": 0.7948017120361328, "rewards/helpfulness_reward/std": 1.0253205299377441, "rewards/safety_reward/mean": 7.646728515625, "rewards/safety_reward/std": 1.6114803552627563, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 52.9375, "completions/mean_terminated_length": 52.9375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.1669723168282246, "frac_reward_zero_std": 0.0, "grad_norm": 3.434795379638672, "kl": 4.9609375, "learning_rate": 5e-05, "loss": 0.0546, "num_tokens": 7082094.0, "reward": 7.988525390625, "reward_std": 0.29949501156806946, "rewards/helpfulness_reward/mean": 0.7184028625488281, "rewards/helpfulness_reward/std": 0.9967852830886841, "rewards/safety_reward/mean": 7.988525390625, "rewards/safety_reward/std": 1.2791379690170288, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 52.8984375, "completions/mean_terminated_length": 52.8984375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.16732163129857655, "frac_reward_zero_std": 0.0, "grad_norm": 0.7189974784851074, "kl": 3.3125, "learning_rate": 5e-05, "loss": 0.0425, "num_tokens": 7094817.0, "reward": 7.83404541015625, "reward_std": 0.3491813540458679, "rewards/helpfulness_reward/mean": 0.6785469055175781, "rewards/helpfulness_reward/std": 1.290791630744934, "rewards/safety_reward/mean": 7.83404541015625, "rewards/safety_reward/std": 1.942609429359436, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 52.125, "completions/mean_terminated_length": 52.125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.16767094576892846, "frac_reward_zero_std": 0.0, "grad_norm": 0.4923006594181061, "kl": 3.556640625, "learning_rate": 5e-05, "loss": 0.0088, "num_tokens": 7106161.0, "reward": 7.6417236328125, "reward_std": 0.4171220064163208, "rewards/helpfulness_reward/mean": 0.8405513763427734, "rewards/helpfulness_reward/std": 0.8184909224510193, "rewards/safety_reward/mean": 7.6417236328125, "rewards/safety_reward/std": 1.4351648092269897, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.1680202602392804, "frac_reward_zero_std": 0.0, "grad_norm": 0.4724617898464203, "kl": 3.318359375, "learning_rate": 5e-05, "loss": 0.0421, "num_tokens": 7116481.0, "reward": 8.2913818359375, "reward_std": 0.3041030764579773, "rewards/helpfulness_reward/mean": 1.0219135284423828, "rewards/helpfulness_reward/std": 0.7372429370880127, "rewards/safety_reward/mean": 8.2913818359375, "rewards/safety_reward/std": 0.9330059885978699, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 52.8203125, "completions/mean_terminated_length": 52.8203125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.16836957470963235, "frac_reward_zero_std": 0.0, "grad_norm": 1.265206217765808, "kl": 3.7578125, "learning_rate": 5e-05, "loss": 0.0386, "num_tokens": 7128970.0, "reward": 8.030517578125, "reward_std": 0.4682009816169739, "rewards/helpfulness_reward/mean": 1.186614990234375, "rewards/helpfulness_reward/std": 0.7701665759086609, "rewards/safety_reward/mean": 8.030517578125, "rewards/safety_reward/std": 1.270237684249878, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 52.546875, "completions/mean_terminated_length": 52.546875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.1687188891799843, "frac_reward_zero_std": 0.0, "grad_norm": 0.5924174189567566, "kl": 3.634765625, "learning_rate": 5e-05, "loss": 0.036, "num_tokens": 7139992.0, "reward": 8.2255859375, "reward_std": 0.26896804571151733, "rewards/helpfulness_reward/mean": 1.3817672729492188, "rewards/helpfulness_reward/std": 0.7489290237426758, "rewards/safety_reward/mean": 8.2255859375, "rewards/safety_reward/std": 1.3768048286437988, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 52.953125, "completions/mean_terminated_length": 52.953125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.1690682036503362, "frac_reward_zero_std": 0.125, "grad_norm": 0.5094701051712036, "kl": 3.271484375, "learning_rate": 5e-05, "loss": 0.0555, "num_tokens": 7152194.0, "reward": 8.33349609375, "reward_std": 0.23536624014377594, "rewards/helpfulness_reward/mean": 1.0794811248779297, "rewards/helpfulness_reward/std": 0.9645957350730896, "rewards/safety_reward/mean": 8.33349609375, "rewards/safety_reward/std": 1.304938793182373, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 52.4296875, "completions/mean_terminated_length": 52.4296875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.16941751812068814, "frac_reward_zero_std": 0.0, "grad_norm": 1.7456302642822266, "kl": 4.228515625, "learning_rate": 5e-05, "loss": 0.0306, "num_tokens": 7162465.0, "reward": 8.640380859375, "reward_std": 0.47185832262039185, "rewards/helpfulness_reward/mean": 1.2969640493392944, "rewards/helpfulness_reward/std": 0.9443899989128113, "rewards/safety_reward/mean": 8.640380859375, "rewards/safety_reward/std": 1.1002862453460693, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 52.890625, "completions/mean_terminated_length": 52.890625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.16976683259104008, "frac_reward_zero_std": 0.0, "grad_norm": 0.6825353503227234, "kl": 3.33203125, "learning_rate": 5e-05, "loss": 0.0343, "num_tokens": 7176595.0, "reward": 8.27783203125, "reward_std": 0.21764953434467316, "rewards/helpfulness_reward/mean": 0.9957351684570312, "rewards/helpfulness_reward/std": 0.8325470089912415, "rewards/safety_reward/mean": 8.27783203125, "rewards/safety_reward/std": 1.2038236856460571, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 53.078125, "completions/mean_terminated_length": 53.078125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.17011614706139203, "frac_reward_zero_std": 0.0, "grad_norm": 0.49559056758880615, "kl": 3.5703125, "learning_rate": 5e-05, "loss": 0.0484, "num_tokens": 7187285.0, "reward": 8.302490234375, "reward_std": 0.28640031814575195, "rewards/helpfulness_reward/mean": 1.0162901878356934, "rewards/helpfulness_reward/std": 0.7256311774253845, "rewards/safety_reward/mean": 8.302490234375, "rewards/safety_reward/std": 1.2299944162368774, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 52.9453125, "completions/mean_terminated_length": 52.9453125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.17046546153174394, "frac_reward_zero_std": 0.0, "grad_norm": 0.5704374313354492, "kl": 3.48828125, "learning_rate": 5e-05, "loss": 0.042, "num_tokens": 7198382.0, "reward": 7.959716796875, "reward_std": 0.291143000125885, "rewards/helpfulness_reward/mean": 0.7202887535095215, "rewards/helpfulness_reward/std": 0.8404873013496399, "rewards/safety_reward/mean": 7.959716796875, "rewards/safety_reward/std": 1.0204007625579834, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.17081477600209588, "frac_reward_zero_std": 0.0625, "grad_norm": 0.572026789188385, "kl": 3.76953125, "learning_rate": 5e-05, "loss": 0.0602, "num_tokens": 7209230.0, "reward": 7.449951171875, "reward_std": 0.40564772486686707, "rewards/helpfulness_reward/mean": 0.5939979553222656, "rewards/helpfulness_reward/std": 1.0461221933364868, "rewards/safety_reward/mean": 7.449951171875, "rewards/safety_reward/std": 1.6742463111877441, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 53.359375, "completions/mean_terminated_length": 53.359375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.17116409047244782, "frac_reward_zero_std": 0.0, "grad_norm": 0.6275709271430969, "kl": 3.490234375, "learning_rate": 5e-05, "loss": 0.0438, "num_tokens": 7225156.0, "reward": 7.563232421875, "reward_std": 0.2901284098625183, "rewards/helpfulness_reward/mean": 0.8034440279006958, "rewards/helpfulness_reward/std": 1.1133604049682617, "rewards/safety_reward/mean": 7.563232421875, "rewards/safety_reward/std": 1.7541745901107788, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 52.890625, "completions/mean_terminated_length": 52.890625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.17151340494279976, "frac_reward_zero_std": 0.0, "grad_norm": 0.37922921776771545, "kl": 3.341796875, "learning_rate": 5e-05, "loss": 0.0384, "num_tokens": 7237038.0, "reward": 8.012939453125, "reward_std": 0.28078389167785645, "rewards/helpfulness_reward/mean": 0.9364242553710938, "rewards/helpfulness_reward/std": 0.8267152309417725, "rewards/safety_reward/mean": 8.012939453125, "rewards/safety_reward/std": 0.8961477279663086, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 53.875, "completions/mean_terminated_length": 53.875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.17186271941315168, "frac_reward_zero_std": 0.0, "grad_norm": 0.48775774240493774, "kl": 3.501953125, "learning_rate": 5e-05, "loss": 0.0612, "num_tokens": 7248078.0, "reward": 8.37548828125, "reward_std": 0.3029993772506714, "rewards/helpfulness_reward/mean": 1.0709383487701416, "rewards/helpfulness_reward/std": 0.5358361601829529, "rewards/safety_reward/mean": 8.37548828125, "rewards/safety_reward/std": 0.9253808856010437, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 53.2734375, "completions/mean_terminated_length": 53.2734375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.17221203388350362, "frac_reward_zero_std": 0.0, "grad_norm": 0.4328867793083191, "kl": 3.62109375, "learning_rate": 5e-05, "loss": 0.0409, "num_tokens": 7259177.0, "reward": 8.204833984375, "reward_std": 0.45503467321395874, "rewards/helpfulness_reward/mean": 1.110177993774414, "rewards/helpfulness_reward/std": 0.6538820266723633, "rewards/safety_reward/mean": 8.204833984375, "rewards/safety_reward/std": 1.2059334516525269, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.15625, "completions/mean_terminated_length": 53.15625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.17256134835385556, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3009473979473114, "kl": 3.375, "learning_rate": 5e-05, "loss": 0.0389, "num_tokens": 7269685.0, "reward": 8.330078125, "reward_std": 0.19199706614017487, "rewards/helpfulness_reward/mean": 0.9190253019332886, "rewards/helpfulness_reward/std": 0.7525532245635986, "rewards/safety_reward/mean": 8.330078125, "rewards/safety_reward/std": 0.9599570035934448, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 53.8203125, "completions/mean_terminated_length": 53.8203125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.1729106628242075, "frac_reward_zero_std": 0.0625, "grad_norm": 0.4002794921398163, "kl": 3.69140625, "learning_rate": 5e-05, "loss": 0.0704, "num_tokens": 7279990.0, "reward": 8.354736328125, "reward_std": 0.42610639333724976, "rewards/helpfulness_reward/mean": 0.993316650390625, "rewards/helpfulness_reward/std": 0.8299631476402283, "rewards/safety_reward/mean": 8.354736328125, "rewards/safety_reward/std": 1.1810237169265747, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 53.046875, "completions/mean_terminated_length": 53.046875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.17325997729455941, "frac_reward_zero_std": 0.0, "grad_norm": 1.4776933193206787, "kl": 4.078125, "learning_rate": 5e-05, "loss": 0.0485, "num_tokens": 7291044.0, "reward": 8.323974609375, "reward_std": 0.26205193996429443, "rewards/helpfulness_reward/mean": 1.2502422332763672, "rewards/helpfulness_reward/std": 1.0133026838302612, "rewards/safety_reward/mean": 8.323974609375, "rewards/safety_reward/std": 1.2728222608566284, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 75.359375, "completions/mean_terminated_length": 75.359375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.17360929176491136, "frac_reward_zero_std": 0.0, "grad_norm": 2.990985631942749, "kl": 4.310546875, "learning_rate": 5e-05, "loss": 0.0803, "num_tokens": 7306346.0, "reward": 8.161796569824219, "reward_std": 0.4008035659790039, "rewards/helpfulness_reward/mean": 0.8589885234832764, "rewards/helpfulness_reward/std": 0.988324761390686, "rewards/safety_reward/mean": 8.161796569824219, "rewards/safety_reward/std": 2.584695339202881, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 53.234375, "completions/mean_terminated_length": 53.234375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.1739586062352633, "frac_reward_zero_std": 0.0, "grad_norm": 0.5063849687576294, "kl": 3.294921875, "learning_rate": 5e-05, "loss": 0.0261, "num_tokens": 7319896.0, "reward": 7.6494140625, "reward_std": 0.3094189763069153, "rewards/helpfulness_reward/mean": 0.7931911945343018, "rewards/helpfulness_reward/std": 0.9918444752693176, "rewards/safety_reward/mean": 7.6494140625, "rewards/safety_reward/std": 1.350637435913086, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 53.546875, "completions/mean_terminated_length": 53.546875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.17430792070561524, "frac_reward_zero_std": 0.0, "grad_norm": 0.5001463294029236, "kl": 3.28515625, "learning_rate": 5e-05, "loss": 0.0404, "num_tokens": 7331926.0, "reward": 8.6103515625, "reward_std": 0.27318865060806274, "rewards/helpfulness_reward/mean": 1.227264404296875, "rewards/helpfulness_reward/std": 0.8673130869865417, "rewards/safety_reward/mean": 8.6103515625, "rewards/safety_reward/std": 1.0389938354492188, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 53.4296875, "completions/mean_terminated_length": 53.4296875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.17465723517596715, "frac_reward_zero_std": 0.0, "grad_norm": 0.5270383358001709, "kl": 3.630859375, "learning_rate": 5e-05, "loss": 0.0484, "num_tokens": 7342405.0, "reward": 8.2080078125, "reward_std": 0.3126397430896759, "rewards/helpfulness_reward/mean": 0.7637910842895508, "rewards/helpfulness_reward/std": 0.6864234805107117, "rewards/safety_reward/mean": 8.2080078125, "rewards/safety_reward/std": 1.017769694328308, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 53.4375, "completions/mean_terminated_length": 53.4375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.1750065496463191, "frac_reward_zero_std": 0.0, "grad_norm": 0.4072262644767761, "kl": 3.51953125, "learning_rate": 5e-05, "loss": 0.053, "num_tokens": 7353421.0, "reward": 8.104736328125, "reward_std": 0.30891019105911255, "rewards/helpfulness_reward/mean": 0.4913487434387207, "rewards/helpfulness_reward/std": 0.9003255367279053, "rewards/safety_reward/mean": 8.104736328125, "rewards/safety_reward/std": 1.2060786485671997, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 53.3125, "completions/mean_terminated_length": 53.3125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.17535586411667103, "frac_reward_zero_std": 0.0, "grad_norm": 0.43616366386413574, "kl": 3.3359375, "learning_rate": 5e-05, "loss": 0.0298, "num_tokens": 7365581.0, "reward": 8.083251953125, "reward_std": 0.33916106820106506, "rewards/helpfulness_reward/mean": 0.9822578430175781, "rewards/helpfulness_reward/std": 0.8204089999198914, "rewards/safety_reward/mean": 8.083251953125, "rewards/safety_reward/std": 1.1537652015686035, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 53.2421875, "completions/mean_terminated_length": 53.2421875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.17570517858702298, "frac_reward_zero_std": 0.0, "grad_norm": 0.36050719022750854, "kl": 3.43359375, "learning_rate": 5e-05, "loss": 0.0481, "num_tokens": 7376572.0, "reward": 8.332763671875, "reward_std": 0.30149757862091064, "rewards/helpfulness_reward/mean": 0.5964555740356445, "rewards/helpfulness_reward/std": 0.9239503145217896, "rewards/safety_reward/mean": 8.332763671875, "rewards/safety_reward/std": 1.305418848991394, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 52.921875, "completions/mean_terminated_length": 52.921875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.1760544930573749, "frac_reward_zero_std": 0.0, "grad_norm": 0.41612088680267334, "kl": 3.34765625, "learning_rate": 5e-05, "loss": 0.0304, "num_tokens": 7387602.0, "reward": 8.093994140625, "reward_std": 0.29506441950798035, "rewards/helpfulness_reward/mean": 0.7700092792510986, "rewards/helpfulness_reward/std": 0.5456146001815796, "rewards/safety_reward/mean": 8.093994140625, "rewards/safety_reward/std": 1.1587942838668823, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 53.1640625, "completions/mean_terminated_length": 53.1640625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.17640380752772683, "frac_reward_zero_std": 0.0, "grad_norm": 1.248282790184021, "kl": 4.130859375, "learning_rate": 5e-05, "loss": 0.0482, "num_tokens": 7397935.0, "reward": 8.227783203125, "reward_std": 0.332588255405426, "rewards/helpfulness_reward/mean": 0.8049860000610352, "rewards/helpfulness_reward/std": 0.9350112676620483, "rewards/safety_reward/mean": 8.227783203125, "rewards/safety_reward/std": 1.3604170083999634, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 53.1171875, "completions/mean_terminated_length": 53.1171875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.17675312199807877, "frac_reward_zero_std": 0.0, "grad_norm": 0.4869705140590668, "kl": 3.580078125, "learning_rate": 5e-05, "loss": 0.0489, "num_tokens": 7409006.0, "reward": 7.8408203125, "reward_std": 0.27294448018074036, "rewards/helpfulness_reward/mean": 0.8846349716186523, "rewards/helpfulness_reward/std": 0.7420529723167419, "rewards/safety_reward/mean": 7.8408203125, "rewards/safety_reward/std": 0.9864316582679749, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 53.4765625, "completions/mean_terminated_length": 53.4765625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.1771024364684307, "frac_reward_zero_std": 0.0, "grad_norm": 0.42906856536865234, "kl": 3.66015625, "learning_rate": 5e-05, "loss": 0.045, "num_tokens": 7419691.0, "reward": 8.40380859375, "reward_std": 0.35161396861076355, "rewards/helpfulness_reward/mean": 1.0243988037109375, "rewards/helpfulness_reward/std": 0.704938530921936, "rewards/safety_reward/mean": 8.40380859375, "rewards/safety_reward/std": 0.9957433342933655, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 53.7265625, "completions/mean_terminated_length": 53.7265625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.17745175093878263, "frac_reward_zero_std": 0.0, "grad_norm": 0.5406219363212585, "kl": 3.427734375, "learning_rate": 5e-05, "loss": 0.0559, "num_tokens": 7430392.0, "reward": 7.884765625, "reward_std": 0.34884464740753174, "rewards/helpfulness_reward/mean": 0.8775930404663086, "rewards/helpfulness_reward/std": 0.7536749839782715, "rewards/safety_reward/mean": 7.884765625, "rewards/safety_reward/std": 0.7091888785362244, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 52.7265625, "completions/mean_terminated_length": 52.7265625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.17780106540913457, "frac_reward_zero_std": 0.0, "grad_norm": 0.95319002866745, "kl": 3.849609375, "learning_rate": 5e-05, "loss": 0.046, "num_tokens": 7441453.0, "reward": 8.10400390625, "reward_std": 0.3227360248565674, "rewards/helpfulness_reward/mean": 1.192610740661621, "rewards/helpfulness_reward/std": 0.7424989342689514, "rewards/safety_reward/mean": 8.10400390625, "rewards/safety_reward/std": 1.846533179283142, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 53.0234375, "completions/mean_terminated_length": 53.0234375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.1781503798794865, "frac_reward_zero_std": 0.0, "grad_norm": 0.3932349681854248, "kl": 3.59765625, "learning_rate": 5e-05, "loss": 0.0297, "num_tokens": 7452712.0, "reward": 7.931640625, "reward_std": 0.2837292551994324, "rewards/helpfulness_reward/mean": 0.9017500877380371, "rewards/helpfulness_reward/std": 0.7741639018058777, "rewards/safety_reward/mean": 7.931640625, "rewards/safety_reward/std": 0.8085129857063293, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 53.09375, "completions/mean_terminated_length": 53.09375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.17849969434983845, "frac_reward_zero_std": 0.0, "grad_norm": 0.39964228868484497, "kl": 3.3984375, "learning_rate": 5e-05, "loss": 0.056, "num_tokens": 7464580.0, "reward": 7.65277099609375, "reward_std": 0.4635525643825531, "rewards/helpfulness_reward/mean": 0.7165675163269043, "rewards/helpfulness_reward/std": 1.3232842683792114, "rewards/safety_reward/mean": 7.65277099609375, "rewards/safety_reward/std": 2.46907114982605, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 52.6796875, "completions/mean_terminated_length": 52.6796875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.17884900882019036, "frac_reward_zero_std": 0.0, "grad_norm": 0.32682284712791443, "kl": 3.572265625, "learning_rate": 5e-05, "loss": 0.0385, "num_tokens": 7475459.0, "reward": 8.0205078125, "reward_std": 0.3807162642478943, "rewards/helpfulness_reward/mean": 1.061903953552246, "rewards/helpfulness_reward/std": 0.6218333840370178, "rewards/safety_reward/mean": 8.0205078125, "rewards/safety_reward/std": 1.1775538921356201, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 53.2265625, "completions/mean_terminated_length": 53.2265625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.1791983232905423, "frac_reward_zero_std": 0.0, "grad_norm": 0.6460004448890686, "kl": 3.9765625, "learning_rate": 5e-05, "loss": 0.0603, "num_tokens": 7488184.0, "reward": 7.858154296875, "reward_std": 0.42430761456489563, "rewards/helpfulness_reward/mean": 0.9213597774505615, "rewards/helpfulness_reward/std": 0.8207114338874817, "rewards/safety_reward/mean": 7.858154296875, "rewards/safety_reward/std": 1.1322755813598633, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 52.6015625, "completions/mean_terminated_length": 52.6015625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.17954763776089425, "frac_reward_zero_std": 0.0, "grad_norm": 0.358093798160553, "kl": 3.494140625, "learning_rate": 5e-05, "loss": 0.0348, "num_tokens": 7499437.0, "reward": 8.123779296875, "reward_std": 0.3495336174964905, "rewards/helpfulness_reward/mean": 1.4005317687988281, "rewards/helpfulness_reward/std": 0.6921106576919556, "rewards/safety_reward/mean": 8.123779296875, "rewards/safety_reward/std": 0.9124305248260498, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 53.125, "completions/mean_terminated_length": 53.125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.1798969522312462, "frac_reward_zero_std": 0.0, "grad_norm": 0.8464233875274658, "kl": 4.072265625, "learning_rate": 5e-05, "loss": 0.0644, "num_tokens": 7511037.0, "reward": 7.5113525390625, "reward_std": 0.4624253511428833, "rewards/helpfulness_reward/mean": 0.8430891036987305, "rewards/helpfulness_reward/std": 1.1295791864395142, "rewards/safety_reward/mean": 7.5113525390625, "rewards/safety_reward/std": 1.8362988233566284, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 52.7734375, "completions/mean_terminated_length": 52.7734375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.1802462667015981, "frac_reward_zero_std": 0.0, "grad_norm": 0.42973849177360535, "kl": 3.369140625, "learning_rate": 5e-05, "loss": 0.0427, "num_tokens": 7522432.0, "reward": 8.322509765625, "reward_std": 0.3483266234397888, "rewards/helpfulness_reward/mean": 1.1665668487548828, "rewards/helpfulness_reward/std": 0.8177008032798767, "rewards/safety_reward/mean": 8.322509765625, "rewards/safety_reward/std": 1.2505038976669312, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 52.9609375, "completions/mean_terminated_length": 52.9609375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.18059558117195004, "frac_reward_zero_std": 0.0, "grad_norm": 0.36955854296684265, "kl": 3.3125, "learning_rate": 5e-05, "loss": 0.0492, "num_tokens": 7533027.0, "reward": 8.28857421875, "reward_std": 0.30647727847099304, "rewards/helpfulness_reward/mean": 1.381143569946289, "rewards/helpfulness_reward/std": 0.881619393825531, "rewards/safety_reward/mean": 8.28857421875, "rewards/safety_reward/std": 1.2198976278305054, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 53.140625, "completions/mean_terminated_length": 53.140625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.18094489564230198, "frac_reward_zero_std": 0.0, "grad_norm": 0.7036133408546448, "kl": 3.81640625, "learning_rate": 5e-05, "loss": 0.0465, "num_tokens": 7543613.0, "reward": 8.103515625, "reward_std": 0.389790803194046, "rewards/helpfulness_reward/mean": 0.9494071006774902, "rewards/helpfulness_reward/std": 0.7912173271179199, "rewards/safety_reward/mean": 8.103515625, "rewards/safety_reward/std": 1.0035289525985718, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 52.765625, "completions/mean_terminated_length": 52.765625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.18129421011265393, "frac_reward_zero_std": 0.0, "grad_norm": 0.4066995084285736, "kl": 3.36328125, "learning_rate": 5e-05, "loss": 0.0412, "num_tokens": 7553903.0, "reward": 8.59423828125, "reward_std": 0.3611155152320862, "rewards/helpfulness_reward/mean": 1.0787923336029053, "rewards/helpfulness_reward/std": 1.0283665657043457, "rewards/safety_reward/mean": 8.59423828125, "rewards/safety_reward/std": 1.0913323163986206, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 50.875, "completions/mean_terminated_length": 50.875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.18164352458300584, "frac_reward_zero_std": 0.0, "grad_norm": 48287.41796875, "kl": 10018.09765625, "learning_rate": 5e-05, "loss": 99.9128, "num_tokens": 7565991.0, "reward": 8.3370361328125, "reward_std": 0.3493386507034302, "rewards/helpfulness_reward/mean": 1.431447982788086, "rewards/helpfulness_reward/std": 1.1649960279464722, "rewards/safety_reward/mean": 8.3370361328125, "rewards/safety_reward/std": 1.8441636562347412, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 53.3203125, "completions/mean_terminated_length": 53.3203125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.18199283905335778, "frac_reward_zero_std": 0.0, "grad_norm": 0.331794410943985, "kl": 3.5625, "learning_rate": 5e-05, "loss": 0.058, "num_tokens": 7578184.0, "reward": 8.431884765625, "reward_std": 0.39633607864379883, "rewards/helpfulness_reward/mean": 1.1106524467468262, "rewards/helpfulness_reward/std": 0.9091062545776367, "rewards/safety_reward/mean": 8.431884765625, "rewards/safety_reward/std": 0.9848635196685791, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 53.015625, "completions/mean_terminated_length": 53.015625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.18234215352370972, "frac_reward_zero_std": 0.0, "grad_norm": 0.42577609419822693, "kl": 3.189453125, "learning_rate": 5e-05, "loss": 0.0392, "num_tokens": 7588826.0, "reward": 8.38427734375, "reward_std": 0.37619566917419434, "rewards/helpfulness_reward/mean": 1.250502586364746, "rewards/helpfulness_reward/std": 0.9893267750740051, "rewards/safety_reward/mean": 8.38427734375, "rewards/safety_reward/std": 1.1028227806091309, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 53.4140625, "completions/mean_terminated_length": 53.4140625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.18269146799406166, "frac_reward_zero_std": 0.0, "grad_norm": 0.486083447933197, "kl": 3.5625, "learning_rate": 5e-05, "loss": 0.05, "num_tokens": 7599623.0, "reward": 8.27392578125, "reward_std": 0.33261239528656006, "rewards/helpfulness_reward/mean": 1.0475921630859375, "rewards/helpfulness_reward/std": 0.8346936106681824, "rewards/safety_reward/mean": 8.27392578125, "rewards/safety_reward/std": 1.2064158916473389, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 52.8828125, "completions/mean_terminated_length": 52.8828125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.18304078246441358, "frac_reward_zero_std": 0.0, "grad_norm": 0.4317058026790619, "kl": 3.59765625, "learning_rate": 5e-05, "loss": 0.0341, "num_tokens": 7610064.0, "reward": 7.9322509765625, "reward_std": 0.5490206480026245, "rewards/helpfulness_reward/mean": 0.9607226848602295, "rewards/helpfulness_reward/std": 0.7769060730934143, "rewards/safety_reward/mean": 7.9322509765625, "rewards/safety_reward/std": 1.3764382600784302, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 53.140625, "completions/mean_terminated_length": 53.140625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.18339009693476552, "frac_reward_zero_std": 0.0, "grad_norm": 0.5167401432991028, "kl": 3.62890625, "learning_rate": 5e-05, "loss": 0.0471, "num_tokens": 7620346.0, "reward": 8.213134765625, "reward_std": 0.3080519437789917, "rewards/helpfulness_reward/mean": 1.1371231079101562, "rewards/helpfulness_reward/std": 0.832928478717804, "rewards/safety_reward/mean": 8.213134765625, "rewards/safety_reward/std": 0.9314389824867249, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 52.875, "completions/mean_terminated_length": 52.875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.18373941140511746, "frac_reward_zero_std": 0.0, "grad_norm": 2.0291361808776855, "kl": 4.48046875, "learning_rate": 5e-05, "loss": 0.0472, "num_tokens": 7631330.0, "reward": 8.1416015625, "reward_std": 0.3910890817642212, "rewards/helpfulness_reward/mean": 0.8367671966552734, "rewards/helpfulness_reward/std": 0.8904580473899841, "rewards/safety_reward/mean": 8.1416015625, "rewards/safety_reward/std": 1.023078441619873, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 52.890625, "completions/mean_terminated_length": 52.890625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.1840887258754694, "frac_reward_zero_std": 0.0, "grad_norm": 0.42593836784362793, "kl": 3.615234375, "learning_rate": 5e-05, "loss": 0.0338, "num_tokens": 7643604.0, "reward": 8.058837890625, "reward_std": 0.3774649500846863, "rewards/helpfulness_reward/mean": 0.7294607162475586, "rewards/helpfulness_reward/std": 0.9208241701126099, "rewards/safety_reward/mean": 8.058837890625, "rewards/safety_reward/std": 1.1364697217941284, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 49.6484375, "completions/mean_terminated_length": 49.6484375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.1844380403458213, "frac_reward_zero_std": 0.0625, "grad_norm": 0.44072476029396057, "kl": 3.533203125, "learning_rate": 5e-05, "loss": 0.0456, "num_tokens": 7655559.0, "reward": 8.237548828125, "reward_std": 0.19201765954494476, "rewards/helpfulness_reward/mean": 1.3153691291809082, "rewards/helpfulness_reward/std": 0.9467343688011169, "rewards/safety_reward/mean": 8.237548828125, "rewards/safety_reward/std": 1.5144548416137695, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 52.78125, "completions/mean_terminated_length": 52.78125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.18478735481617325, "frac_reward_zero_std": 0.0, "grad_norm": 0.408497154712677, "kl": 3.65234375, "learning_rate": 5e-05, "loss": 0.0327, "num_tokens": 7667651.0, "reward": 8.452392578125, "reward_std": 0.309526652097702, "rewards/helpfulness_reward/mean": 0.9780418872833252, "rewards/helpfulness_reward/std": 0.8466319441795349, "rewards/safety_reward/mean": 8.452392578125, "rewards/safety_reward/std": 1.1794039011001587, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 52.7109375, "completions/mean_terminated_length": 52.7109375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.1851366692865252, "frac_reward_zero_std": 0.0, "grad_norm": 4.4483795166015625, "kl": 5.927734375, "learning_rate": 5e-05, "loss": 0.0544, "num_tokens": 7677878.0, "reward": 8.362060546875, "reward_std": 0.3376484513282776, "rewards/helpfulness_reward/mean": 1.33807373046875, "rewards/helpfulness_reward/std": 0.7280566096305847, "rewards/safety_reward/mean": 8.362060546875, "rewards/safety_reward/std": 1.1926594972610474, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 52.671875, "completions/mean_terminated_length": 52.671875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.18548598375687714, "frac_reward_zero_std": 0.0, "grad_norm": 0.38528212904930115, "kl": 3.408203125, "learning_rate": 5e-05, "loss": 0.0371, "num_tokens": 7689004.0, "reward": 8.37109375, "reward_std": 0.36506277322769165, "rewards/helpfulness_reward/mean": 0.6594324111938477, "rewards/helpfulness_reward/std": 0.8013404011726379, "rewards/safety_reward/mean": 8.37109375, "rewards/safety_reward/std": 1.4201385974884033, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 53.15625, "completions/mean_terminated_length": 53.15625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.18583529822722905, "frac_reward_zero_std": 0.0, "grad_norm": 0.4418794512748718, "kl": 3.57421875, "learning_rate": 5e-05, "loss": 0.0426, "num_tokens": 7699672.0, "reward": 8.144775390625, "reward_std": 0.4023369550704956, "rewards/helpfulness_reward/mean": 0.8001281023025513, "rewards/helpfulness_reward/std": 0.7019540667533875, "rewards/safety_reward/mean": 8.144775390625, "rewards/safety_reward/std": 1.2270334959030151, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 52.8359375, "completions/mean_terminated_length": 52.8359375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.186184612697581, "frac_reward_zero_std": 0.0, "grad_norm": 0.4033966660499573, "kl": 3.28515625, "learning_rate": 5e-05, "loss": 0.0334, "num_tokens": 7711267.0, "reward": 8.106201171875, "reward_std": 0.3033636212348938, "rewards/helpfulness_reward/mean": 1.0175037384033203, "rewards/helpfulness_reward/std": 1.1220422983169556, "rewards/safety_reward/mean": 8.106201171875, "rewards/safety_reward/std": 0.9315042495727539, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 53.015625, "completions/mean_terminated_length": 53.015625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.18653392716793293, "frac_reward_zero_std": 0.0, "grad_norm": 0.49259576201438904, "kl": 3.642578125, "learning_rate": 5e-05, "loss": 0.0437, "num_tokens": 7722069.0, "reward": 8.630126953125, "reward_std": 0.2883702516555786, "rewards/helpfulness_reward/mean": 1.13818359375, "rewards/helpfulness_reward/std": 0.6438074111938477, "rewards/safety_reward/mean": 8.630126953125, "rewards/safety_reward/std": 1.2048168182373047, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 53.0078125, "completions/mean_terminated_length": 53.0078125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.18688324163828487, "frac_reward_zero_std": 0.0, "grad_norm": 0.39825204014778137, "kl": 3.392578125, "learning_rate": 5e-05, "loss": 0.0353, "num_tokens": 7734454.0, "reward": 8.484130859375, "reward_std": 0.3356088399887085, "rewards/helpfulness_reward/mean": 0.9386649131774902, "rewards/helpfulness_reward/std": 0.8316611051559448, "rewards/safety_reward/mean": 8.484130859375, "rewards/safety_reward/std": 1.3540480136871338, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 52.6015625, "completions/mean_terminated_length": 52.6015625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.1872325561086368, "frac_reward_zero_std": 0.0, "grad_norm": 0.3349386155605316, "kl": 3.48046875, "learning_rate": 5e-05, "loss": 0.0338, "num_tokens": 7745163.0, "reward": 8.2138671875, "reward_std": 0.28301310539245605, "rewards/helpfulness_reward/mean": 0.8920595645904541, "rewards/helpfulness_reward/std": 0.8639522194862366, "rewards/safety_reward/mean": 8.2138671875, "rewards/safety_reward/std": 1.0746285915374756, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 52.484375, "completions/mean_terminated_length": 52.484375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.18758187057898873, "frac_reward_zero_std": 0.0, "grad_norm": 0.3604457378387451, "kl": 3.41796875, "learning_rate": 5e-05, "loss": 0.0381, "num_tokens": 7757441.0, "reward": 7.778564453125, "reward_std": 0.3247672915458679, "rewards/helpfulness_reward/mean": 1.0445330142974854, "rewards/helpfulness_reward/std": 0.5591793656349182, "rewards/safety_reward/mean": 7.778564453125, "rewards/safety_reward/std": 0.7774071097373962, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 55.0546875, "completions/mean_terminated_length": 55.0546875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.18793118504934067, "frac_reward_zero_std": 0.0, "grad_norm": 1.2271952629089355, "kl": 4.2265625, "learning_rate": 5e-05, "loss": 0.0522, "num_tokens": 7768904.0, "reward": 7.9395751953125, "reward_std": 0.4354662001132965, "rewards/helpfulness_reward/mean": 1.4082012176513672, "rewards/helpfulness_reward/std": 0.708054780960083, "rewards/safety_reward/mean": 7.9395751953125, "rewards/safety_reward/std": 1.7611521482467651, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 52.546875, "completions/mean_terminated_length": 52.546875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.1882804995196926, "frac_reward_zero_std": 0.0, "grad_norm": 0.4297665059566498, "kl": 3.41796875, "learning_rate": 5e-05, "loss": 0.0291, "num_tokens": 7779718.0, "reward": 8.201416015625, "reward_std": 0.3395012617111206, "rewards/helpfulness_reward/mean": 0.8593063354492188, "rewards/helpfulness_reward/std": 0.7911742925643921, "rewards/safety_reward/mean": 8.201416015625, "rewards/safety_reward/std": 0.9905418753623962, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 52.8046875, "completions/mean_terminated_length": 52.8046875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.18862981399004453, "frac_reward_zero_std": 0.0, "grad_norm": 0.4258078336715698, "kl": 3.494140625, "learning_rate": 5e-05, "loss": 0.0406, "num_tokens": 7790717.0, "reward": 8.422607421875, "reward_std": 0.3033628463745117, "rewards/helpfulness_reward/mean": 1.2126598358154297, "rewards/helpfulness_reward/std": 0.7528262138366699, "rewards/safety_reward/mean": 8.422607421875, "rewards/safety_reward/std": 1.1357409954071045, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 52.921875, "completions/mean_terminated_length": 52.921875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.18897912846039647, "frac_reward_zero_std": 0.0, "grad_norm": 0.3923708498477936, "kl": 3.62890625, "learning_rate": 5e-05, "loss": 0.038, "num_tokens": 7801267.0, "reward": 8.366455078125, "reward_std": 0.2663114070892334, "rewards/helpfulness_reward/mean": 1.0463895797729492, "rewards/helpfulness_reward/std": 0.8369267582893372, "rewards/safety_reward/mean": 8.366455078125, "rewards/safety_reward/std": 0.889690637588501, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 52.65625, "completions/mean_terminated_length": 52.65625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.1893284429307484, "frac_reward_zero_std": 0.0, "grad_norm": 0.3661254644393921, "kl": 3.640625, "learning_rate": 5e-05, "loss": 0.0373, "num_tokens": 7812119.0, "reward": 8.35888671875, "reward_std": 0.48360908031463623, "rewards/helpfulness_reward/mean": 1.1696863174438477, "rewards/helpfulness_reward/std": 0.9437620639801025, "rewards/safety_reward/mean": 8.35888671875, "rewards/safety_reward/std": 0.9083898067474365, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 52.4609375, "completions/mean_terminated_length": 52.4609375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.18967775740110035, "frac_reward_zero_std": 0.0, "grad_norm": 0.4307120442390442, "kl": 3.95703125, "learning_rate": 5e-05, "loss": 0.0342, "num_tokens": 7825322.0, "reward": 7.685791015625, "reward_std": 0.2758772373199463, "rewards/helpfulness_reward/mean": 1.047248363494873, "rewards/helpfulness_reward/std": 0.8455339670181274, "rewards/safety_reward/mean": 7.685791015625, "rewards/safety_reward/std": 1.1620532274246216, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 49.859375, "completions/mean_terminated_length": 49.859375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.19002707187145226, "frac_reward_zero_std": 0.0625, "grad_norm": 0.43650540709495544, "kl": 3.56640625, "learning_rate": 5e-05, "loss": 0.0427, "num_tokens": 7836904.0, "reward": 8.009765625, "reward_std": 0.3135777711868286, "rewards/helpfulness_reward/mean": 1.5104994773864746, "rewards/helpfulness_reward/std": 0.9832385182380676, "rewards/safety_reward/mean": 8.009765625, "rewards/safety_reward/std": 2.2058348655700684, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 52.546875, "completions/mean_terminated_length": 52.546875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.1903763863418042, "frac_reward_zero_std": 0.0, "grad_norm": 2.387317419052124, "kl": 4.40234375, "learning_rate": 5e-05, "loss": 0.0489, "num_tokens": 7847294.0, "reward": 8.5244140625, "reward_std": 0.35042881965637207, "rewards/helpfulness_reward/mean": 1.2218246459960938, "rewards/helpfulness_reward/std": 0.6962118148803711, "rewards/safety_reward/mean": 8.5244140625, "rewards/safety_reward/std": 1.282656192779541, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 52.71875, "completions/mean_terminated_length": 52.71875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.19072570081215615, "frac_reward_zero_std": 0.0, "grad_norm": 0.370440810918808, "kl": 3.3359375, "learning_rate": 5e-05, "loss": 0.0427, "num_tokens": 7857786.0, "reward": 8.7998046875, "reward_std": 0.2571612000465393, "rewards/helpfulness_reward/mean": 1.3516464233398438, "rewards/helpfulness_reward/std": 0.5482742190361023, "rewards/safety_reward/mean": 8.7998046875, "rewards/safety_reward/std": 0.8860682249069214, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 52.53125, "completions/mean_terminated_length": 52.53125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.1910750152825081, "frac_reward_zero_std": 0.0, "grad_norm": 2.3622331619262695, "kl": 5.16796875, "learning_rate": 5e-05, "loss": 0.0474, "num_tokens": 7869118.0, "reward": 8.112060546875, "reward_std": 0.29429736733436584, "rewards/helpfulness_reward/mean": 1.1504549980163574, "rewards/helpfulness_reward/std": 0.7359533905982971, "rewards/safety_reward/mean": 8.112060546875, "rewards/safety_reward/std": 1.2100056409835815, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 52.6640625, "completions/mean_terminated_length": 52.6640625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.19142432975286, "frac_reward_zero_std": 0.0, "grad_norm": 0.6351557970046997, "kl": 3.61328125, "learning_rate": 5e-05, "loss": 0.0394, "num_tokens": 7881675.0, "reward": 7.89727783203125, "reward_std": 0.4299360513687134, "rewards/helpfulness_reward/mean": 0.8507480621337891, "rewards/helpfulness_reward/std": 0.87397301197052, "rewards/safety_reward/mean": 7.89727783203125, "rewards/safety_reward/std": 1.4081140756607056, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 52.8359375, "completions/mean_terminated_length": 52.8359375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.19177364422321194, "frac_reward_zero_std": 0.0, "grad_norm": 0.34364455938339233, "kl": 3.51171875, "learning_rate": 5e-05, "loss": 0.0405, "num_tokens": 7892446.0, "reward": 8.526123046875, "reward_std": 0.2643333673477173, "rewards/helpfulness_reward/mean": 1.438593864440918, "rewards/helpfulness_reward/std": 0.7636765241622925, "rewards/safety_reward/mean": 8.526123046875, "rewards/safety_reward/std": 0.6726402044296265, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 51.59375, "completions/mean_terminated_length": 51.59375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.19212295869356388, "frac_reward_zero_std": 0.0, "grad_norm": 0.5139715075492859, "kl": 3.3125, "learning_rate": 5e-05, "loss": 0.0278, "num_tokens": 7903730.0, "reward": 7.9981689453125, "reward_std": 0.3287096619606018, "rewards/helpfulness_reward/mean": 1.3974246978759766, "rewards/helpfulness_reward/std": 0.7405548095703125, "rewards/safety_reward/mean": 7.9981689453125, "rewards/safety_reward/std": 1.9840680360794067, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 50.9609375, "completions/mean_terminated_length": 50.9609375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.19247227316391582, "frac_reward_zero_std": 0.0, "grad_norm": 0.38469254970550537, "kl": 3.404296875, "learning_rate": 5e-05, "loss": 0.0244, "num_tokens": 7914613.0, "reward": 8.158601760864258, "reward_std": 0.35397815704345703, "rewards/helpfulness_reward/mean": 1.3680944442749023, "rewards/helpfulness_reward/std": 0.7944198250770569, "rewards/safety_reward/mean": 8.158601760864258, "rewards/safety_reward/std": 2.2947006225585938, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 53.203125, "completions/mean_terminated_length": 53.203125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.19282158763426774, "frac_reward_zero_std": 0.0, "grad_norm": 2.025861978530884, "kl": 4.517578125, "learning_rate": 5e-05, "loss": 0.0606, "num_tokens": 7928063.0, "reward": 8.3203125, "reward_std": 0.36389249563217163, "rewards/helpfulness_reward/mean": 1.0807228088378906, "rewards/helpfulness_reward/std": 0.8807182312011719, "rewards/safety_reward/mean": 8.3203125, "rewards/safety_reward/std": 1.070668339729309, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 53.671875, "completions/mean_terminated_length": 53.671875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.19317090210461968, "frac_reward_zero_std": 0.0, "grad_norm": 0.5143126845359802, "kl": 3.7265625, "learning_rate": 5e-05, "loss": 0.0597, "num_tokens": 7938901.0, "reward": 8.74560546875, "reward_std": 0.3372408151626587, "rewards/helpfulness_reward/mean": 1.2786407470703125, "rewards/helpfulness_reward/std": 0.732585608959198, "rewards/safety_reward/mean": 8.74560546875, "rewards/safety_reward/std": 1.2174636125564575, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 52.9375, "completions/mean_terminated_length": 52.9375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.19352021657497162, "frac_reward_zero_std": 0.0, "grad_norm": 0.4050464630126953, "kl": 3.41796875, "learning_rate": 5e-05, "loss": 0.0387, "num_tokens": 7949189.0, "reward": 8.1287841796875, "reward_std": 0.3249693214893341, "rewards/helpfulness_reward/mean": 1.0926628112792969, "rewards/helpfulness_reward/std": 0.779727578163147, "rewards/safety_reward/mean": 8.1287841796875, "rewards/safety_reward/std": 1.1845967769622803, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 53.578125, "completions/mean_terminated_length": 53.578125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.19386953104532356, "frac_reward_zero_std": 0.0, "grad_norm": 0.512808084487915, "kl": 3.5546875, "learning_rate": 5e-05, "loss": 0.0433, "num_tokens": 7961231.0, "reward": 8.469970703125, "reward_std": 0.3389153778553009, "rewards/helpfulness_reward/mean": 1.306409239768982, "rewards/helpfulness_reward/std": 0.8856992721557617, "rewards/safety_reward/mean": 8.469970703125, "rewards/safety_reward/std": 1.0402545928955078, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 53.953125, "completions/mean_terminated_length": 53.953125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.19421884551567548, "frac_reward_zero_std": 0.0, "grad_norm": 0.46770334243774414, "kl": 3.484375, "learning_rate": 5e-05, "loss": 0.0663, "num_tokens": 7973049.0, "reward": 8.18115234375, "reward_std": 0.3068435788154602, "rewards/helpfulness_reward/mean": 1.0208320617675781, "rewards/helpfulness_reward/std": 0.9460827708244324, "rewards/safety_reward/mean": 8.18115234375, "rewards/safety_reward/std": 1.3093581199645996, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 53.3359375, "completions/mean_terminated_length": 53.3359375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.19456815998602742, "frac_reward_zero_std": 0.0, "grad_norm": 0.4207618236541748, "kl": 3.2578125, "learning_rate": 5e-05, "loss": 0.0465, "num_tokens": 7984084.0, "reward": 8.49609375, "reward_std": 0.3531380593776703, "rewards/helpfulness_reward/mean": 1.2801475524902344, "rewards/helpfulness_reward/std": 0.653390109539032, "rewards/safety_reward/mean": 8.49609375, "rewards/safety_reward/std": 0.7721068859100342, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 54.3203125, "completions/mean_terminated_length": 54.3203125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.19491747445637936, "frac_reward_zero_std": 0.0, "grad_norm": 0.3555881679058075, "kl": 3.3984375, "learning_rate": 5e-05, "loss": 0.044, "num_tokens": 7995661.0, "reward": 8.11279296875, "reward_std": 0.33208009600639343, "rewards/helpfulness_reward/mean": 1.2249727249145508, "rewards/helpfulness_reward/std": 0.7043002843856812, "rewards/safety_reward/mean": 8.11279296875, "rewards/safety_reward/std": 1.4047001600265503, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 53.859375, "completions/mean_terminated_length": 53.859375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.1952667889267313, "frac_reward_zero_std": 0.0, "grad_norm": 0.4522862136363983, "kl": 3.45703125, "learning_rate": 5e-05, "loss": 0.0602, "num_tokens": 8005979.0, "reward": 7.991455078125, "reward_std": 0.3333076238632202, "rewards/helpfulness_reward/mean": 0.8518962860107422, "rewards/helpfulness_reward/std": 0.8194264769554138, "rewards/safety_reward/mean": 7.991455078125, "rewards/safety_reward/std": 1.714727520942688, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 93.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 53.5859375, "completions/mean_terminated_length": 53.5859375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.1956161033970832, "frac_reward_zero_std": 0.0, "grad_norm": 0.43116477131843567, "kl": 3.482421875, "learning_rate": 5e-05, "loss": 0.0624, "num_tokens": 8016046.0, "reward": 7.95556640625, "reward_std": 0.328468918800354, "rewards/helpfulness_reward/mean": 1.0085582733154297, "rewards/helpfulness_reward/std": 0.7580580711364746, "rewards/safety_reward/mean": 7.95556640625, "rewards/safety_reward/std": 0.9525902271270752, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 53.71875, "completions/mean_terminated_length": 53.71875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.19596541786743515, "frac_reward_zero_std": 0.0, "grad_norm": 0.5182638168334961, "kl": 3.47265625, "learning_rate": 5e-05, "loss": 0.04, "num_tokens": 8026562.0, "reward": 8.262451171875, "reward_std": 0.36147257685661316, "rewards/helpfulness_reward/mean": 0.9330577850341797, "rewards/helpfulness_reward/std": 0.8932369947433472, "rewards/safety_reward/mean": 8.262451171875, "rewards/safety_reward/std": 1.303814172744751, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 53.15625, "completions/mean_terminated_length": 53.15625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.1963147323377871, "frac_reward_zero_std": 0.0, "grad_norm": 0.4622547924518585, "kl": 3.51953125, "learning_rate": 5e-05, "loss": 0.0349, "num_tokens": 8037238.0, "reward": 8.137451171875, "reward_std": 0.410696804523468, "rewards/helpfulness_reward/mean": 0.7449607849121094, "rewards/helpfulness_reward/std": 0.5924825668334961, "rewards/safety_reward/mean": 8.137451171875, "rewards/safety_reward/std": 1.1091701984405518, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.1796875, "completions/mean_terminated_length": 53.1796875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.19666404680813904, "frac_reward_zero_std": 0.0, "grad_norm": 0.49396535754203796, "kl": 4.056640625, "learning_rate": 5e-05, "loss": 0.0454, "num_tokens": 8048277.0, "reward": 8.656982421875, "reward_std": 0.3368628919124603, "rewards/helpfulness_reward/mean": 1.384847640991211, "rewards/helpfulness_reward/std": 0.7718047499656677, "rewards/safety_reward/mean": 8.656982421875, "rewards/safety_reward/std": 0.8712435364723206, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 52.8984375, "completions/mean_terminated_length": 52.8984375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.19701336127849095, "frac_reward_zero_std": 0.0, "grad_norm": 0.38686665892601013, "kl": 3.73828125, "learning_rate": 5e-05, "loss": 0.0364, "num_tokens": 8058504.0, "reward": 8.162353515625, "reward_std": 0.24609482288360596, "rewards/helpfulness_reward/mean": 0.8200316429138184, "rewards/helpfulness_reward/std": 0.713485836982727, "rewards/safety_reward/mean": 8.162353515625, "rewards/safety_reward/std": 0.8701183199882507, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 53.1953125, "completions/mean_terminated_length": 53.1953125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.1973626757488429, "frac_reward_zero_std": 0.0, "grad_norm": 0.49456751346588135, "kl": 3.787109375, "learning_rate": 5e-05, "loss": 0.0412, "num_tokens": 8068961.0, "reward": 8.324951171875, "reward_std": 0.2884545922279358, "rewards/helpfulness_reward/mean": 0.8815631866455078, "rewards/helpfulness_reward/std": 0.7339379787445068, "rewards/safety_reward/mean": 8.324951171875, "rewards/safety_reward/std": 1.0878061056137085, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 53.21875, "completions/mean_terminated_length": 53.21875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.19771199021919483, "frac_reward_zero_std": 0.0, "grad_norm": 0.9065523743629456, "kl": 4.443359375, "learning_rate": 5e-05, "loss": 0.0496, "num_tokens": 8079653.0, "reward": 8.72705078125, "reward_std": 0.3493327796459198, "rewards/helpfulness_reward/mean": 0.8320140838623047, "rewards/helpfulness_reward/std": 0.6568275094032288, "rewards/safety_reward/mean": 8.72705078125, "rewards/safety_reward/std": 1.3087772130966187, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 52.734375, "completions/mean_terminated_length": 52.734375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.19806130468954677, "frac_reward_zero_std": 0.0, "grad_norm": 0.4790550470352173, "kl": 3.576171875, "learning_rate": 5e-05, "loss": 0.0298, "num_tokens": 8092947.0, "reward": 8.1563720703125, "reward_std": 0.21422192454338074, "rewards/helpfulness_reward/mean": 0.6028175354003906, "rewards/helpfulness_reward/std": 0.7259665727615356, "rewards/safety_reward/mean": 8.1563720703125, "rewards/safety_reward/std": 1.2772926092147827, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 53.2265625, "completions/mean_terminated_length": 53.2265625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.1984106191598987, "frac_reward_zero_std": 0.0, "grad_norm": 0.46367746591567993, "kl": 3.451171875, "learning_rate": 5e-05, "loss": 0.044, "num_tokens": 8103608.0, "reward": 8.09814453125, "reward_std": 0.3066530227661133, "rewards/helpfulness_reward/mean": 0.9190444946289062, "rewards/helpfulness_reward/std": 1.0682059526443481, "rewards/safety_reward/mean": 8.09814453125, "rewards/safety_reward/std": 1.0743587017059326, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 53.3203125, "completions/mean_terminated_length": 53.3203125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.19875993363025063, "frac_reward_zero_std": 0.0, "grad_norm": 0.5878990888595581, "kl": 3.890625, "learning_rate": 5e-05, "loss": 0.0509, "num_tokens": 8114289.0, "reward": 8.585205078125, "reward_std": 0.26032179594039917, "rewards/helpfulness_reward/mean": 1.2128773927688599, "rewards/helpfulness_reward/std": 0.737248957157135, "rewards/safety_reward/mean": 8.585205078125, "rewards/safety_reward/std": 0.9604131579399109, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 53.0546875, "completions/mean_terminated_length": 53.0546875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.19910924810060257, "frac_reward_zero_std": 0.0, "grad_norm": 619.5333862304688, "kl": 189.4765625, "learning_rate": 5e-05, "loss": 1.9023, "num_tokens": 8128392.0, "reward": 6.76763916015625, "reward_std": 0.346839964389801, "rewards/helpfulness_reward/mean": 0.7785072326660156, "rewards/helpfulness_reward/std": 1.0750709772109985, "rewards/safety_reward/mean": 6.76763916015625, "rewards/safety_reward/std": 2.159406900405884, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 53.0078125, "completions/mean_terminated_length": 53.0078125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.1994585625709545, "frac_reward_zero_std": 0.0, "grad_norm": 0.37510553002357483, "kl": 3.72265625, "learning_rate": 5e-05, "loss": 0.0327, "num_tokens": 8138657.0, "reward": 8.79736328125, "reward_std": 0.2405543327331543, "rewards/helpfulness_reward/mean": 0.6797027587890625, "rewards/helpfulness_reward/std": 0.7887466549873352, "rewards/safety_reward/mean": 8.79736328125, "rewards/safety_reward/std": 0.9300225377082825, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 52.828125, "completions/mean_terminated_length": 52.828125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.19980787704130643, "frac_reward_zero_std": 0.0, "grad_norm": 0.4232967793941498, "kl": 3.802734375, "learning_rate": 5e-05, "loss": 0.0332, "num_tokens": 8149251.0, "reward": 8.568115234375, "reward_std": 0.2195383757352829, "rewards/helpfulness_reward/mean": 0.8708133697509766, "rewards/helpfulness_reward/std": 0.6116098165512085, "rewards/safety_reward/mean": 8.568115234375, "rewards/safety_reward/std": 0.894815981388092, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 53.1875, "completions/mean_terminated_length": 53.1875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.20015719151165837, "frac_reward_zero_std": 0.0, "grad_norm": 0.4377864599227905, "kl": 3.8359375, "learning_rate": 5e-05, "loss": 0.0472, "num_tokens": 8159899.0, "reward": 8.4383544921875, "reward_std": 0.46900278329849243, "rewards/helpfulness_reward/mean": 1.0174999237060547, "rewards/helpfulness_reward/std": 0.7452525496482849, "rewards/safety_reward/mean": 8.4383544921875, "rewards/safety_reward/std": 1.1631430387496948, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 52.9921875, "completions/mean_terminated_length": 52.9921875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.2005065059820103, "frac_reward_zero_std": 0.0, "grad_norm": 0.4462168216705322, "kl": 3.72265625, "learning_rate": 5e-05, "loss": 0.041, "num_tokens": 8170994.0, "reward": 8.3759765625, "reward_std": 0.2585461139678955, "rewards/helpfulness_reward/mean": 1.0466701984405518, "rewards/helpfulness_reward/std": 1.0347483158111572, "rewards/safety_reward/mean": 8.3759765625, "rewards/safety_reward/std": 1.2346992492675781, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 51.328125, "completions/mean_terminated_length": 51.328125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.20085582045236225, "frac_reward_zero_std": 0.0, "grad_norm": 0.5399869084358215, "kl": 3.896484375, "learning_rate": 5e-05, "loss": 0.0497, "num_tokens": 8183332.0, "reward": 7.85693359375, "reward_std": 0.3339427411556244, "rewards/helpfulness_reward/mean": 1.0620298385620117, "rewards/helpfulness_reward/std": 0.9290153384208679, "rewards/safety_reward/mean": 7.85693359375, "rewards/safety_reward/std": 1.6220474243164062, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 52.6953125, "completions/mean_terminated_length": 52.6953125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.20120513492271416, "frac_reward_zero_std": 0.0, "grad_norm": 0.5545111894607544, "kl": 3.666015625, "learning_rate": 5e-05, "loss": 0.0324, "num_tokens": 8194741.0, "reward": 8.224578857421875, "reward_std": 0.6039384007453918, "rewards/helpfulness_reward/mean": 0.9097576141357422, "rewards/helpfulness_reward/std": 0.8072021007537842, "rewards/safety_reward/mean": 8.224578857421875, "rewards/safety_reward/std": 1.3601269721984863, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 53.140625, "completions/mean_terminated_length": 53.140625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.2015544493930661, "frac_reward_zero_std": 0.0, "grad_norm": 2.6723339557647705, "kl": 5.51171875, "learning_rate": 5e-05, "loss": 0.0663, "num_tokens": 8205911.0, "reward": 8.27587890625, "reward_std": 0.3595961928367615, "rewards/helpfulness_reward/mean": 0.921417236328125, "rewards/helpfulness_reward/std": 0.9902209043502808, "rewards/safety_reward/mean": 8.27587890625, "rewards/safety_reward/std": 1.3909690380096436, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 52.734375, "completions/mean_terminated_length": 52.734375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.20190376386341805, "frac_reward_zero_std": 0.0, "grad_norm": 0.4117959439754486, "kl": 3.7890625, "learning_rate": 5e-05, "loss": 0.0357, "num_tokens": 8219437.0, "reward": 7.885986328125, "reward_std": 0.24063777923583984, "rewards/helpfulness_reward/mean": 0.5330095291137695, "rewards/helpfulness_reward/std": 0.7115012407302856, "rewards/safety_reward/mean": 7.885986328125, "rewards/safety_reward/std": 1.574904441833496, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 53.3046875, "completions/mean_terminated_length": 53.3046875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.20225307833377, "frac_reward_zero_std": 0.0, "grad_norm": 0.4106438159942627, "kl": 3.904296875, "learning_rate": 5e-05, "loss": 0.0494, "num_tokens": 8230356.0, "reward": 8.234649658203125, "reward_std": 0.43503889441490173, "rewards/helpfulness_reward/mean": 0.928095817565918, "rewards/helpfulness_reward/std": 0.6023284196853638, "rewards/safety_reward/mean": 8.234649658203125, "rewards/safety_reward/std": 1.5736013650894165, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 53.921875, "completions/mean_terminated_length": 53.921875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.2026023928041219, "frac_reward_zero_std": 0.0, "grad_norm": 0.5500301718711853, "kl": 4.177734375, "learning_rate": 5e-05, "loss": 0.0733, "num_tokens": 8240746.0, "reward": 8.2381591796875, "reward_std": 0.3781552314758301, "rewards/helpfulness_reward/mean": 1.125340461730957, "rewards/helpfulness_reward/std": 0.8285909295082092, "rewards/safety_reward/mean": 8.2381591796875, "rewards/safety_reward/std": 1.3161123991012573, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 53.734375, "completions/mean_terminated_length": 53.734375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.20295170727447384, "frac_reward_zero_std": 0.0625, "grad_norm": 0.41551223397254944, "kl": 3.73046875, "learning_rate": 5e-05, "loss": 0.0491, "num_tokens": 8251760.0, "reward": 8.5986328125, "reward_std": 0.4285053312778473, "rewards/helpfulness_reward/mean": 1.2953872680664062, "rewards/helpfulness_reward/std": 0.6449339389801025, "rewards/safety_reward/mean": 8.5986328125, "rewards/safety_reward/std": 1.188706636428833, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 105.0, "completions/max_terminated_length": 105.0, "completions/mean_length": 54.5390625, "completions/mean_terminated_length": 54.5390625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.20330102174482578, "frac_reward_zero_std": 0.0, "grad_norm": 0.4333220422267914, "kl": 3.564453125, "learning_rate": 5e-05, "loss": 0.072, "num_tokens": 8262341.0, "reward": 8.4324951171875, "reward_std": 0.5880484580993652, "rewards/helpfulness_reward/mean": 0.9185352325439453, "rewards/helpfulness_reward/std": 0.6238324046134949, "rewards/safety_reward/mean": 8.4324951171875, "rewards/safety_reward/std": 1.2769485712051392, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 51.8671875, "completions/mean_terminated_length": 51.8671875, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.20365033621517772, "frac_reward_zero_std": 0.0, "grad_norm": 0.6827173829078674, "kl": 3.59375, "learning_rate": 5e-05, "loss": 0.0956, "num_tokens": 8274716.0, "reward": 8.208046913146973, "reward_std": 0.4591296911239624, "rewards/helpfulness_reward/mean": 0.9070549011230469, "rewards/helpfulness_reward/std": 1.098889708518982, "rewards/safety_reward/mean": 8.208046913146973, "rewards/safety_reward/std": 2.3423855304718018, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 53.171875, "completions/mean_terminated_length": 53.171875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.20399965068552964, "frac_reward_zero_std": 0.0, "grad_norm": 0.353657066822052, "kl": 3.7109375, "learning_rate": 5e-05, "loss": 0.0467, "num_tokens": 8285266.0, "reward": 8.212646484375, "reward_std": 0.300599604845047, "rewards/helpfulness_reward/mean": 0.7259750366210938, "rewards/helpfulness_reward/std": 0.6820492148399353, "rewards/safety_reward/mean": 8.212646484375, "rewards/safety_reward/std": 1.0225154161453247, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 53.9375, "completions/mean_terminated_length": 53.9375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.20434896515588158, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5270980596542358, "kl": 3.646484375, "learning_rate": 5e-05, "loss": 0.0685, "num_tokens": 8295698.0, "reward": 8.090087890625, "reward_std": 0.5005881190299988, "rewards/helpfulness_reward/mean": 1.008544921875, "rewards/helpfulness_reward/std": 0.8417493104934692, "rewards/safety_reward/mean": 8.090087890625, "rewards/safety_reward/std": 1.0991228818893433, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 53.7734375, "completions/mean_terminated_length": 53.7734375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.20469827962623352, "frac_reward_zero_std": 0.0, "grad_norm": 0.44005343317985535, "kl": 3.583984375, "learning_rate": 5e-05, "loss": 0.0552, "num_tokens": 8306301.0, "reward": 8.321044921875, "reward_std": 0.3600269556045532, "rewards/helpfulness_reward/mean": 0.7767543792724609, "rewards/helpfulness_reward/std": 0.5474250316619873, "rewards/safety_reward/mean": 8.321044921875, "rewards/safety_reward/std": 0.9879682660102844, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 53.5078125, "completions/mean_terminated_length": 53.5078125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.20504759409658546, "frac_reward_zero_std": 0.0625, "grad_norm": 0.529778778553009, "kl": 3.69921875, "learning_rate": 5e-05, "loss": 0.057, "num_tokens": 8317774.0, "reward": 8.57763671875, "reward_std": 0.37412238121032715, "rewards/helpfulness_reward/mean": 0.8729619979858398, "rewards/helpfulness_reward/std": 0.6507125496864319, "rewards/safety_reward/mean": 8.57763671875, "rewards/safety_reward/std": 1.0774836540222168, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 53.0390625, "completions/mean_terminated_length": 53.0390625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.20539690856693738, "frac_reward_zero_std": 0.0625, "grad_norm": 4.055311679840088, "kl": 5.537109375, "learning_rate": 5e-05, "loss": 0.0531, "num_tokens": 8328963.0, "reward": 9.0185546875, "reward_std": 0.33363574743270874, "rewards/helpfulness_reward/mean": 0.8533096313476562, "rewards/helpfulness_reward/std": 0.7777726054191589, "rewards/safety_reward/mean": 9.0185546875, "rewards/safety_reward/std": 1.5034841299057007, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 52.9765625, "completions/mean_terminated_length": 52.9765625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.20574622303728932, "frac_reward_zero_std": 0.0, "grad_norm": 0.2876419126987457, "kl": 3.490234375, "learning_rate": 5e-05, "loss": 0.0323, "num_tokens": 8339568.0, "reward": 8.63720703125, "reward_std": 0.12924596667289734, "rewards/helpfulness_reward/mean": 0.9117507934570312, "rewards/helpfulness_reward/std": 0.6345345377922058, "rewards/safety_reward/mean": 8.63720703125, "rewards/safety_reward/std": 1.1637983322143555, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 53.359375, "completions/mean_terminated_length": 53.359375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.20609553750764126, "frac_reward_zero_std": 0.0, "grad_norm": 0.3054451644420624, "kl": 3.685546875, "learning_rate": 5e-05, "loss": 0.0346, "num_tokens": 8350934.0, "reward": 8.847900390625, "reward_std": 0.2618483901023865, "rewards/helpfulness_reward/mean": 1.494781494140625, "rewards/helpfulness_reward/std": 0.6340242028236389, "rewards/safety_reward/mean": 8.847900390625, "rewards/safety_reward/std": 1.206026315689087, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 52.96875, "completions/mean_terminated_length": 52.96875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.2064448519779932, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3515731394290924, "kl": 3.58203125, "learning_rate": 5e-05, "loss": 0.0326, "num_tokens": 8361714.0, "reward": 9.11083984375, "reward_std": 0.3589775860309601, "rewards/helpfulness_reward/mean": 1.1999766826629639, "rewards/helpfulness_reward/std": 0.7037012577056885, "rewards/safety_reward/mean": 9.11083984375, "rewards/safety_reward/std": 1.0466618537902832, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 50.09375, "completions/mean_terminated_length": 50.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.2067941664483451, "frac_reward_zero_std": 0.0625, "grad_norm": 0.353435754776001, "kl": 3.7421875, "learning_rate": 5e-05, "loss": 0.0436, "num_tokens": 8373814.0, "reward": 8.435302734375, "reward_std": 0.3226656913757324, "rewards/helpfulness_reward/mean": 1.1941032409667969, "rewards/helpfulness_reward/std": 0.8595602512359619, "rewards/safety_reward/mean": 8.435302734375, "rewards/safety_reward/std": 1.8613477945327759, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 53.1171875, "completions/mean_terminated_length": 53.1171875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.20714348091869705, "frac_reward_zero_std": 0.0, "grad_norm": 0.4317707121372223, "kl": 3.732421875, "learning_rate": 5e-05, "loss": 0.0398, "num_tokens": 8384797.0, "reward": 7.8775634765625, "reward_std": 0.2692795991897583, "rewards/helpfulness_reward/mean": 0.7573947906494141, "rewards/helpfulness_reward/std": 0.9989355206489563, "rewards/safety_reward/mean": 7.8775634765625, "rewards/safety_reward/std": 1.436256766319275, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 52.8828125, "completions/mean_terminated_length": 52.8828125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.207492795389049, "frac_reward_zero_std": 0.0, "grad_norm": 0.3790946304798126, "kl": 3.560546875, "learning_rate": 5e-05, "loss": 0.0293, "num_tokens": 8395774.0, "reward": 7.739013671875, "reward_std": 0.2618427872657776, "rewards/helpfulness_reward/mean": 0.4537792205810547, "rewards/helpfulness_reward/std": 0.7556020021438599, "rewards/safety_reward/mean": 7.739013671875, "rewards/safety_reward/std": 1.1307240724563599, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 59.28125, "completions/mean_terminated_length": 59.28125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.20784210985940094, "frac_reward_zero_std": 0.0, "grad_norm": 0.4826318621635437, "kl": 3.44921875, "learning_rate": 5e-05, "loss": 0.0211, "num_tokens": 8409266.0, "reward": 8.116348266601562, "reward_std": 0.3637444078922272, "rewards/helpfulness_reward/mean": 0.87841796875, "rewards/helpfulness_reward/std": 0.7199990153312683, "rewards/safety_reward/mean": 8.116348266601562, "rewards/safety_reward/std": 2.2130374908447266, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 53.2265625, "completions/mean_terminated_length": 53.2265625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.20819142432975285, "frac_reward_zero_std": 0.0, "grad_norm": 0.4719078540802002, "kl": 3.81640625, "learning_rate": 5e-05, "loss": 0.0411, "num_tokens": 8420479.0, "reward": 8.417236328125, "reward_std": 0.4952312111854553, "rewards/helpfulness_reward/mean": 0.9409027099609375, "rewards/helpfulness_reward/std": 0.7413947582244873, "rewards/safety_reward/mean": 8.417236328125, "rewards/safety_reward/std": 1.1038836240768433, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 53.28125, "completions/mean_terminated_length": 53.28125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.2085407388001048, "frac_reward_zero_std": 0.0, "grad_norm": 0.5181554555892944, "kl": 3.681640625, "learning_rate": 5e-05, "loss": 0.037, "num_tokens": 8431323.0, "reward": 8.405029296875, "reward_std": 0.3785572052001953, "rewards/helpfulness_reward/mean": 0.8523111343383789, "rewards/helpfulness_reward/std": 0.6597669124603271, "rewards/safety_reward/mean": 8.405029296875, "rewards/safety_reward/std": 1.4622747898101807, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 53.28125, "completions/mean_terminated_length": 53.28125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.20889005327045673, "frac_reward_zero_std": 0.0, "grad_norm": 0.6651211380958557, "kl": 3.8515625, "learning_rate": 5e-05, "loss": 0.0405, "num_tokens": 8442399.0, "reward": 8.3115234375, "reward_std": 0.45594584941864014, "rewards/helpfulness_reward/mean": 0.7114555835723877, "rewards/helpfulness_reward/std": 0.8237097859382629, "rewards/safety_reward/mean": 8.3115234375, "rewards/safety_reward/std": 1.339287281036377, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 56.0625, "completions/mean_terminated_length": 56.0625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.20923936774080867, "frac_reward_zero_std": 0.0, "grad_norm": 0.5399385690689087, "kl": 3.44140625, "learning_rate": 5e-05, "loss": 0.0653, "num_tokens": 8455207.0, "reward": 8.16632080078125, "reward_std": 0.32898563146591187, "rewards/helpfulness_reward/mean": 0.9570350646972656, "rewards/helpfulness_reward/std": 0.6266026496887207, "rewards/safety_reward/mean": 8.16632080078125, "rewards/safety_reward/std": 1.6134138107299805, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.140625, "completions/mean_terminated_length": 53.140625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.2095886822111606, "frac_reward_zero_std": 0.0, "grad_norm": 0.42266151309013367, "kl": 3.845703125, "learning_rate": 5e-05, "loss": 0.0296, "num_tokens": 8465481.0, "reward": 8.501220703125, "reward_std": 0.40179935097694397, "rewards/helpfulness_reward/mean": 0.9818553924560547, "rewards/helpfulness_reward/std": 1.0093351602554321, "rewards/safety_reward/mean": 8.501220703125, "rewards/safety_reward/std": 1.4121099710464478, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 53.765625, "completions/mean_terminated_length": 53.765625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.20993799668151253, "frac_reward_zero_std": 0.0, "grad_norm": 0.9007789492607117, "kl": 4.28515625, "learning_rate": 5e-05, "loss": 0.0442, "num_tokens": 8476195.0, "reward": 8.50146484375, "reward_std": 0.43609240651130676, "rewards/helpfulness_reward/mean": 1.0326147079467773, "rewards/helpfulness_reward/std": 0.8035871982574463, "rewards/safety_reward/mean": 8.50146484375, "rewards/safety_reward/std": 0.9206562638282776, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 53.2890625, "completions/mean_terminated_length": 53.2890625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.21028731115186447, "frac_reward_zero_std": 0.0, "grad_norm": 0.37165316939353943, "kl": 3.462890625, "learning_rate": 5e-05, "loss": 0.0261, "num_tokens": 8486888.0, "reward": 8.965576171875, "reward_std": 0.26467806100845337, "rewards/helpfulness_reward/mean": 0.7269353866577148, "rewards/helpfulness_reward/std": 0.8101500272750854, "rewards/safety_reward/mean": 8.965576171875, "rewards/safety_reward/std": 1.17266845703125, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 53.9296875, "completions/mean_terminated_length": 53.9296875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.2106366256222164, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7167972326278687, "kl": 4.21484375, "learning_rate": 5e-05, "loss": 0.0402, "num_tokens": 8498447.0, "reward": 8.1336669921875, "reward_std": 0.31190192699432373, "rewards/helpfulness_reward/mean": 0.5987949371337891, "rewards/helpfulness_reward/std": 0.48811668157577515, "rewards/safety_reward/mean": 8.1336669921875, "rewards/safety_reward/std": 1.4665439128875732, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 53.4609375, "completions/mean_terminated_length": 53.4609375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.21098594009256832, "frac_reward_zero_std": 0.0, "grad_norm": 0.5561384558677673, "kl": 3.724609375, "learning_rate": 5e-05, "loss": 0.027, "num_tokens": 8510058.0, "reward": 8.727294921875, "reward_std": 0.43830955028533936, "rewards/helpfulness_reward/mean": 0.755894660949707, "rewards/helpfulness_reward/std": 1.0683692693710327, "rewards/safety_reward/mean": 8.727294921875, "rewards/safety_reward/std": 1.5115853548049927, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 73.84375, "completions/mean_terminated_length": 73.84375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.21133525456292027, "frac_reward_zero_std": 0.0, "grad_norm": 0.9798703789710999, "kl": 3.283203125, "learning_rate": 5e-05, "loss": 0.0409, "num_tokens": 8524918.0, "reward": 7.91595458984375, "reward_std": 0.2784299850463867, "rewards/helpfulness_reward/mean": 0.7268161773681641, "rewards/helpfulness_reward/std": 0.9148006439208984, "rewards/safety_reward/mean": 7.91595458984375, "rewards/safety_reward/std": 2.6722779273986816, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1327.0, "completions/max_terminated_length": 1327.0, "completions/mean_length": 64.640625, "completions/mean_terminated_length": 64.640625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.2116845690332722, "frac_reward_zero_std": 0.0, "grad_norm": 0.6142315864562988, "kl": 3.439453125, "learning_rate": 5e-05, "loss": 0.1098, "num_tokens": 8541664.0, "reward": 8.005204200744629, "reward_std": 0.4846127927303314, "rewards/helpfulness_reward/mean": 0.6513710021972656, "rewards/helpfulness_reward/std": 1.0515002012252808, "rewards/safety_reward/mean": 8.005204200744629, "rewards/safety_reward/std": 2.4002580642700195, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.125, "completions/mean_terminated_length": 54.125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.21203388350362415, "frac_reward_zero_std": 0.0, "grad_norm": 3.1525261402130127, "kl": 5.166015625, "learning_rate": 5e-05, "loss": 0.0525, "num_tokens": 8552472.0, "reward": 8.3310546875, "reward_std": 0.3427230715751648, "rewards/helpfulness_reward/mean": 1.0046615600585938, "rewards/helpfulness_reward/std": 0.7502681612968445, "rewards/safety_reward/mean": 8.3310546875, "rewards/safety_reward/std": 1.0055627822875977, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 54.640625, "completions/mean_terminated_length": 54.640625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.21238319797397606, "frac_reward_zero_std": 0.0, "grad_norm": 0.6800731420516968, "kl": 3.943359375, "learning_rate": 5e-05, "loss": 0.0532, "num_tokens": 8564402.0, "reward": 8.0789794921875, "reward_std": 0.43085265159606934, "rewards/helpfulness_reward/mean": 0.7558159828186035, "rewards/helpfulness_reward/std": 1.1669543981552124, "rewards/safety_reward/mean": 8.0789794921875, "rewards/safety_reward/std": 1.9462268352508545, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 55.0234375, "completions/mean_terminated_length": 55.0234375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.212732512444328, "frac_reward_zero_std": 0.0, "grad_norm": 0.5093489289283752, "kl": 3.515625, "learning_rate": 5e-05, "loss": 0.0609, "num_tokens": 8575541.0, "reward": 8.97900390625, "reward_std": 0.5047899484634399, "rewards/helpfulness_reward/mean": 0.9049811363220215, "rewards/helpfulness_reward/std": 0.7990197539329529, "rewards/safety_reward/mean": 8.97900390625, "rewards/safety_reward/std": 0.953997790813446, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 54.4453125, "completions/mean_terminated_length": 54.4453125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.21308182691467994, "frac_reward_zero_std": 0.0, "grad_norm": 0.47113916277885437, "kl": 3.544921875, "learning_rate": 5e-05, "loss": 0.0433, "num_tokens": 8585918.0, "reward": 9.1025390625, "reward_std": 0.25513869524002075, "rewards/helpfulness_reward/mean": 1.420654296875, "rewards/helpfulness_reward/std": 0.8464552164077759, "rewards/safety_reward/mean": 9.1025390625, "rewards/safety_reward/std": 1.3899062871932983, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 54.359375, "completions/mean_terminated_length": 54.359375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.2134311413850319, "frac_reward_zero_std": 0.0, "grad_norm": 0.4724493622779846, "kl": 3.771484375, "learning_rate": 5e-05, "loss": 0.0477, "num_tokens": 8596460.0, "reward": 8.708740234375, "reward_std": 0.3400917649269104, "rewards/helpfulness_reward/mean": 1.3128013610839844, "rewards/helpfulness_reward/std": 0.5658987760543823, "rewards/safety_reward/mean": 8.708740234375, "rewards/safety_reward/std": 1.2113102674484253, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1045.0, "completions/max_terminated_length": 1045.0, "completions/mean_length": 75.1171875, "completions/mean_terminated_length": 75.1171875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.2137804558553838, "frac_reward_zero_std": 0.0, "grad_norm": 17806.580078125, "kl": 1702.88671875, "learning_rate": 5e-05, "loss": 17.0428, "num_tokens": 8612267.0, "reward": 7.71343994140625, "reward_std": 0.39996111392974854, "rewards/helpfulness_reward/mean": 0.800872802734375, "rewards/helpfulness_reward/std": 0.930008590221405, "rewards/safety_reward/mean": 7.71343994140625, "rewards/safety_reward/std": 2.885622978210449, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 55.3203125, "completions/mean_terminated_length": 55.3203125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.21412977032573574, "frac_reward_zero_std": 0.0, "grad_norm": 0.5610220432281494, "kl": 3.689453125, "learning_rate": 5e-05, "loss": 0.0628, "num_tokens": 8627364.0, "reward": 7.8944091796875, "reward_std": 0.39211446046829224, "rewards/helpfulness_reward/mean": 0.9327070713043213, "rewards/helpfulness_reward/std": 0.7940055727958679, "rewards/safety_reward/mean": 7.8944091796875, "rewards/safety_reward/std": 1.628086805343628, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 54.53125, "completions/mean_terminated_length": 54.53125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.21447908479608768, "frac_reward_zero_std": 0.0, "grad_norm": 1.027672529220581, "kl": 4.30078125, "learning_rate": 5e-05, "loss": 0.0515, "num_tokens": 8638208.0, "reward": 8.804443359375, "reward_std": 0.49292969703674316, "rewards/helpfulness_reward/mean": 1.1846466064453125, "rewards/helpfulness_reward/std": 0.5675329566001892, "rewards/safety_reward/mean": 8.804443359375, "rewards/safety_reward/std": 1.0892606973648071, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.21482839926643962, "frac_reward_zero_std": 0.0, "grad_norm": 0.4441399574279785, "kl": 3.85546875, "learning_rate": 5e-05, "loss": 0.0625, "num_tokens": 8649992.0, "reward": 7.9822998046875, "reward_std": 0.46899449825286865, "rewards/helpfulness_reward/mean": 0.8996744155883789, "rewards/helpfulness_reward/std": 0.6672754883766174, "rewards/safety_reward/mean": 7.9822998046875, "rewards/safety_reward/std": 1.186211347579956, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.6328125, "completions/mean_terminated_length": 54.6328125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.21517771373679154, "frac_reward_zero_std": 0.0, "grad_norm": 0.5169934034347534, "kl": 3.6171875, "learning_rate": 5e-05, "loss": 0.0454, "num_tokens": 8660921.0, "reward": 8.7421875, "reward_std": 0.29478633403778076, "rewards/helpfulness_reward/mean": 1.1447601318359375, "rewards/helpfulness_reward/std": 0.5748370885848999, "rewards/safety_reward/mean": 8.7421875, "rewards/safety_reward/std": 1.433122992515564, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 55.7109375, "completions/mean_terminated_length": 55.7109375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.21552702820714348, "frac_reward_zero_std": 0.0, "grad_norm": 0.6884527802467346, "kl": 4.201171875, "learning_rate": 5e-05, "loss": 0.0898, "num_tokens": 8673348.0, "reward": 8.112060546875, "reward_std": 0.4610394537448883, "rewards/helpfulness_reward/mean": 0.5443363189697266, "rewards/helpfulness_reward/std": 0.8538526892662048, "rewards/safety_reward/mean": 8.112060546875, "rewards/safety_reward/std": 1.085720181465149, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.6796875, "completions/mean_terminated_length": 54.6796875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.21587634267749542, "frac_reward_zero_std": 0.0, "grad_norm": 0.4297516345977783, "kl": 3.9609375, "learning_rate": 5e-05, "loss": 0.0621, "num_tokens": 8685043.0, "reward": 8.46240234375, "reward_std": 0.3788827657699585, "rewards/helpfulness_reward/mean": 0.8289203643798828, "rewards/helpfulness_reward/std": 0.8645240664482117, "rewards/safety_reward/mean": 8.46240234375, "rewards/safety_reward/std": 0.9119496941566467, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 54.578125, "completions/mean_terminated_length": 54.578125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.21622565714784736, "frac_reward_zero_std": 0.0, "grad_norm": 0.4478967487812042, "kl": 3.70703125, "learning_rate": 5e-05, "loss": 0.0458, "num_tokens": 8695829.0, "reward": 8.63330078125, "reward_std": 0.49431127309799194, "rewards/helpfulness_reward/mean": 1.2720451354980469, "rewards/helpfulness_reward/std": 0.6371697783470154, "rewards/safety_reward/mean": 8.63330078125, "rewards/safety_reward/std": 1.284989356994629, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 55.7421875, "completions/mean_terminated_length": 55.7421875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.21657497161819927, "frac_reward_zero_std": 0.0, "grad_norm": 0.5183277130126953, "kl": 3.73828125, "learning_rate": 5e-05, "loss": 0.0554, "num_tokens": 8708636.0, "reward": 7.9119873046875, "reward_std": 0.45716774463653564, "rewards/helpfulness_reward/mean": 0.4926638603210449, "rewards/helpfulness_reward/std": 0.8451496362686157, "rewards/safety_reward/mean": 7.9119873046875, "rewards/safety_reward/std": 1.3941130638122559, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 54.8828125, "completions/mean_terminated_length": 54.8828125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.21692428608855122, "frac_reward_zero_std": 0.0, "grad_norm": 0.5015343427658081, "kl": 3.79296875, "learning_rate": 5e-05, "loss": 0.0683, "num_tokens": 8720453.0, "reward": 8.2119140625, "reward_std": 0.49008727073669434, "rewards/helpfulness_reward/mean": 0.836737871170044, "rewards/helpfulness_reward/std": 0.6905462145805359, "rewards/safety_reward/mean": 8.2119140625, "rewards/safety_reward/std": 1.200777530670166, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 54.78125, "completions/mean_terminated_length": 54.78125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.21727360055890316, "frac_reward_zero_std": 0.0, "grad_norm": 0.4546855092048645, "kl": 3.78125, "learning_rate": 5e-05, "loss": 0.0617, "num_tokens": 8731137.0, "reward": 8.85107421875, "reward_std": 0.41522708535194397, "rewards/helpfulness_reward/mean": 1.1589056253433228, "rewards/helpfulness_reward/std": 0.6465915441513062, "rewards/safety_reward/mean": 8.85107421875, "rewards/safety_reward/std": 1.3061189651489258, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 54.515625, "completions/mean_terminated_length": 54.515625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.2176229150292551, "frac_reward_zero_std": 0.0, "grad_norm": 0.4994790852069855, "kl": 3.90625, "learning_rate": 5e-05, "loss": 0.0554, "num_tokens": 8742195.0, "reward": 8.699462890625, "reward_std": 0.4929283857345581, "rewards/helpfulness_reward/mean": 1.1700477600097656, "rewards/helpfulness_reward/std": 0.7296820878982544, "rewards/safety_reward/mean": 8.699462890625, "rewards/safety_reward/std": 1.2529070377349854, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 51.4765625, "completions/mean_terminated_length": 51.4765625, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.217972229499607, "frac_reward_zero_std": 0.0, "grad_norm": 0.6472585201263428, "kl": 3.712890625, "learning_rate": 5e-05, "loss": 0.047, "num_tokens": 8753776.0, "reward": 8.301642417907715, "reward_std": 0.48652392625808716, "rewards/helpfulness_reward/mean": 1.0483283996582031, "rewards/helpfulness_reward/std": 0.9208515286445618, "rewards/safety_reward/mean": 8.301642417907715, "rewards/safety_reward/std": 2.1546225547790527, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 54.46875, "completions/mean_terminated_length": 54.46875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.21832154396995895, "frac_reward_zero_std": 0.0, "grad_norm": 0.41268932819366455, "kl": 3.833984375, "learning_rate": 5e-05, "loss": 0.0523, "num_tokens": 8764988.0, "reward": 8.33251953125, "reward_std": 0.4498007297515869, "rewards/helpfulness_reward/mean": 0.7732939720153809, "rewards/helpfulness_reward/std": 0.7617236375808716, "rewards/safety_reward/mean": 8.33251953125, "rewards/safety_reward/std": 1.250570297241211, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 54.34375, "completions/mean_terminated_length": 54.34375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.2186708584403109, "frac_reward_zero_std": 0.0, "grad_norm": 0.38392940163612366, "kl": 3.974609375, "learning_rate": 5e-05, "loss": 0.0485, "num_tokens": 8777440.0, "reward": 8.448486328125, "reward_std": 0.32719630002975464, "rewards/helpfulness_reward/mean": 0.987579345703125, "rewards/helpfulness_reward/std": 0.5777080059051514, "rewards/safety_reward/mean": 8.448486328125, "rewards/safety_reward/std": 1.09709894657135, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.796875, "completions/mean_terminated_length": 53.796875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.21902017291066284, "frac_reward_zero_std": 0.0, "grad_norm": 0.5131361484527588, "kl": 3.845703125, "learning_rate": 5e-05, "loss": 0.034, "num_tokens": 8788662.0, "reward": 8.320556640625, "reward_std": 0.3936140537261963, "rewards/helpfulness_reward/mean": 1.1078819036483765, "rewards/helpfulness_reward/std": 0.6948854923248291, "rewards/safety_reward/mean": 8.320556640625, "rewards/safety_reward/std": 1.1285367012023926, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.21936948738101475, "frac_reward_zero_std": 0.0, "grad_norm": 0.3681162893772125, "kl": 3.962890625, "learning_rate": 5e-05, "loss": 0.0381, "num_tokens": 8801134.0, "reward": 8.5660400390625, "reward_std": 0.2636514902114868, "rewards/helpfulness_reward/mean": 0.8687248229980469, "rewards/helpfulness_reward/std": 0.5738804936408997, "rewards/safety_reward/mean": 8.5660400390625, "rewards/safety_reward/std": 1.3096660375595093, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 53.9609375, "completions/mean_terminated_length": 53.9609375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.2197188018513667, "frac_reward_zero_std": 0.0, "grad_norm": 0.36140862107276917, "kl": 3.912109375, "learning_rate": 5e-05, "loss": 0.0367, "num_tokens": 8812177.0, "reward": 8.395263671875, "reward_std": 0.3404722809791565, "rewards/helpfulness_reward/mean": 0.9551572799682617, "rewards/helpfulness_reward/std": 0.805719256401062, "rewards/safety_reward/mean": 8.395263671875, "rewards/safety_reward/std": 1.1316757202148438, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 53.9296875, "completions/mean_terminated_length": 53.9296875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.22006811632171863, "frac_reward_zero_std": 0.0625, "grad_norm": 0.34328773617744446, "kl": 3.798828125, "learning_rate": 5e-05, "loss": 0.0362, "num_tokens": 8823104.0, "reward": 8.580810546875, "reward_std": 0.21093867719173431, "rewards/helpfulness_reward/mean": 1.1988906860351562, "rewards/helpfulness_reward/std": 0.7281451225280762, "rewards/safety_reward/mean": 8.580810546875, "rewards/safety_reward/std": 0.8489561676979065, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 53.9453125, "completions/mean_terminated_length": 53.9453125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.22041743079207057, "frac_reward_zero_std": 0.0625, "grad_norm": 0.2700652480125427, "kl": 4.017578125, "learning_rate": 5e-05, "loss": 0.0375, "num_tokens": 8833785.0, "reward": 8.45703125, "reward_std": 0.19532251358032227, "rewards/helpfulness_reward/mean": 1.1483163833618164, "rewards/helpfulness_reward/std": 0.8591353893280029, "rewards/safety_reward/mean": 8.45703125, "rewards/safety_reward/std": 1.2460752725601196, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.0390625, "completions/mean_terminated_length": 54.0390625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.2207667452624225, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3758947253227234, "kl": 4.09765625, "learning_rate": 5e-05, "loss": 0.0456, "num_tokens": 8844310.0, "reward": 8.396728515625, "reward_std": 0.19921764731407166, "rewards/helpfulness_reward/mean": 1.2381057739257812, "rewards/helpfulness_reward/std": 0.758729100227356, "rewards/safety_reward/mean": 8.396728515625, "rewards/safety_reward/std": 0.9414578080177307, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 51.015625, "completions/mean_terminated_length": 51.015625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.22111605973277443, "frac_reward_zero_std": 0.0, "grad_norm": 0.4573327600955963, "kl": 3.783203125, "learning_rate": 5e-05, "loss": 0.0365, "num_tokens": 8857000.0, "reward": 8.1907958984375, "reward_std": 0.18361234664916992, "rewards/helpfulness_reward/mean": 1.2396202087402344, "rewards/helpfulness_reward/std": 0.745076060295105, "rewards/safety_reward/mean": 8.1907958984375, "rewards/safety_reward/std": 1.6190418004989624, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 54.1171875, "completions/mean_terminated_length": 54.1171875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.22146537420312637, "frac_reward_zero_std": 0.125, "grad_norm": 0.9208493828773499, "kl": 4.41796875, "learning_rate": 5e-05, "loss": 0.0476, "num_tokens": 8869071.0, "reward": 9.02001953125, "reward_std": 0.22491565346717834, "rewards/helpfulness_reward/mean": 1.093179702758789, "rewards/helpfulness_reward/std": 0.8294381499290466, "rewards/safety_reward/mean": 9.02001953125, "rewards/safety_reward/std": 1.015639066696167, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 54.109375, "completions/mean_terminated_length": 54.109375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.2218146886734783, "frac_reward_zero_std": 0.125, "grad_norm": 0.26190781593322754, "kl": 3.884765625, "learning_rate": 5e-05, "loss": 0.0444, "num_tokens": 8880885.0, "reward": 9.28369140625, "reward_std": 0.18022587895393372, "rewards/helpfulness_reward/mean": 1.1706066131591797, "rewards/helpfulness_reward/std": 0.7073976397514343, "rewards/safety_reward/mean": 9.28369140625, "rewards/safety_reward/std": 1.1123565435409546, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.8984375, "completions/mean_terminated_length": 53.8984375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.22216400314383022, "frac_reward_zero_std": 0.0, "grad_norm": 0.382219523191452, "kl": 3.908203125, "learning_rate": 5e-05, "loss": 0.0365, "num_tokens": 8891440.0, "reward": 8.51416015625, "reward_std": 0.18675030767917633, "rewards/helpfulness_reward/mean": 0.6768178939819336, "rewards/helpfulness_reward/std": 0.5398309230804443, "rewards/safety_reward/mean": 8.51416015625, "rewards/safety_reward/std": 0.8360077738761902, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.9765625, "completions/mean_terminated_length": 53.9765625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.22251331761418217, "frac_reward_zero_std": 0.0, "grad_norm": 0.43957772850990295, "kl": 4.001953125, "learning_rate": 5e-05, "loss": 0.0441, "num_tokens": 8902413.0, "reward": 8.60107421875, "reward_std": 0.23320886492729187, "rewards/helpfulness_reward/mean": 1.0845630168914795, "rewards/helpfulness_reward/std": 0.8145735859870911, "rewards/safety_reward/mean": 8.60107421875, "rewards/safety_reward/std": 1.3573122024536133, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.015625, "completions/mean_terminated_length": 54.015625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.2228626320845341, "frac_reward_zero_std": 0.0, "grad_norm": 0.5722783207893372, "kl": 3.794921875, "learning_rate": 5e-05, "loss": 0.0383, "num_tokens": 8914591.0, "reward": 9.03466796875, "reward_std": 0.33920150995254517, "rewards/helpfulness_reward/mean": 0.9183974266052246, "rewards/helpfulness_reward/std": 0.6112343668937683, "rewards/safety_reward/mean": 9.03466796875, "rewards/safety_reward/std": 0.917413055896759, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.0234375, "completions/mean_terminated_length": 54.0234375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.22321194655488605, "frac_reward_zero_std": 0.0, "grad_norm": 0.36436727643013, "kl": 3.8359375, "learning_rate": 5e-05, "loss": 0.0436, "num_tokens": 8926778.0, "reward": 8.754150390625, "reward_std": 0.35559743642807007, "rewards/helpfulness_reward/mean": 1.0759086608886719, "rewards/helpfulness_reward/std": 0.6947906613349915, "rewards/safety_reward/mean": 8.754150390625, "rewards/safety_reward/std": 1.1844161748886108, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 53.8515625, "completions/mean_terminated_length": 53.8515625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.22356126102523796, "frac_reward_zero_std": 0.0, "grad_norm": 0.5294901132583618, "kl": 3.849609375, "learning_rate": 5e-05, "loss": 0.0293, "num_tokens": 8938095.0, "reward": 8.7119140625, "reward_std": 0.42552924156188965, "rewards/helpfulness_reward/mean": 1.05816650390625, "rewards/helpfulness_reward/std": 0.6735902428627014, "rewards/safety_reward/mean": 8.7119140625, "rewards/safety_reward/std": 1.2136378288269043, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 53.8203125, "completions/mean_terminated_length": 53.8203125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.2239105754955899, "frac_reward_zero_std": 0.0, "grad_norm": 0.39548802375793457, "kl": 3.779296875, "learning_rate": 5e-05, "loss": 0.0371, "num_tokens": 8948952.0, "reward": 8.91845703125, "reward_std": 0.4023291766643524, "rewards/helpfulness_reward/mean": 1.1196975708007812, "rewards/helpfulness_reward/std": 0.4422680139541626, "rewards/safety_reward/mean": 8.91845703125, "rewards/safety_reward/std": 1.1004276275634766, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 53.4609375, "completions/mean_terminated_length": 53.4609375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.22425988996594184, "frac_reward_zero_std": 0.0, "grad_norm": 6.900078773498535, "kl": 6.087890625, "learning_rate": 5e-05, "loss": 0.0513, "num_tokens": 8960755.0, "reward": 8.52685546875, "reward_std": 0.4382725656032562, "rewards/helpfulness_reward/mean": 1.240570068359375, "rewards/helpfulness_reward/std": 0.7698116302490234, "rewards/safety_reward/mean": 8.52685546875, "rewards/safety_reward/std": 1.4972175359725952, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 53.71875, "completions/mean_terminated_length": 53.71875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.22460920443629379, "frac_reward_zero_std": 0.0, "grad_norm": 0.3611801564693451, "kl": 3.611328125, "learning_rate": 5e-05, "loss": 0.0247, "num_tokens": 8971695.0, "reward": 8.723876953125, "reward_std": 0.272446870803833, "rewards/helpfulness_reward/mean": 1.0378532409667969, "rewards/helpfulness_reward/std": 0.6238080859184265, "rewards/safety_reward/mean": 8.723876953125, "rewards/safety_reward/std": 1.0498228073120117, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.71875, "completions/mean_terminated_length": 53.71875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.2249585189066457, "frac_reward_zero_std": 0.0, "grad_norm": 0.42442017793655396, "kl": 3.806640625, "learning_rate": 5e-05, "loss": 0.0272, "num_tokens": 8983251.0, "reward": 8.68896484375, "reward_std": 0.28509604930877686, "rewards/helpfulness_reward/mean": 0.8651466369628906, "rewards/helpfulness_reward/std": 0.887669026851654, "rewards/safety_reward/mean": 8.68896484375, "rewards/safety_reward/std": 1.35060715675354, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 53.8984375, "completions/mean_terminated_length": 53.8984375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.22530783337699764, "frac_reward_zero_std": 0.0, "grad_norm": 0.35854190587997437, "kl": 3.62109375, "learning_rate": 5e-05, "loss": 0.0312, "num_tokens": 8994046.0, "reward": 9.139892578125, "reward_std": 0.2971900701522827, "rewards/helpfulness_reward/mean": 0.9991822242736816, "rewards/helpfulness_reward/std": 0.7045385837554932, "rewards/safety_reward/mean": 9.139892578125, "rewards/safety_reward/std": 0.9008328318595886, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 53.7578125, "completions/mean_terminated_length": 53.7578125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.22565714784734958, "frac_reward_zero_std": 0.0, "grad_norm": 0.5729504823684692, "kl": 4.064453125, "learning_rate": 5e-05, "loss": 0.0356, "num_tokens": 9005359.0, "reward": 8.9345703125, "reward_std": 0.29075032472610474, "rewards/helpfulness_reward/mean": 1.0393166542053223, "rewards/helpfulness_reward/std": 0.7670934796333313, "rewards/safety_reward/mean": 8.9345703125, "rewards/safety_reward/std": 1.2841391563415527, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.9375, "completions/mean_terminated_length": 53.9375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.22600646231770152, "frac_reward_zero_std": 0.0, "grad_norm": 0.5147782564163208, "kl": 3.5859375, "learning_rate": 5e-05, "loss": 0.0339, "num_tokens": 9016567.0, "reward": 8.803466796875, "reward_std": 0.32117223739624023, "rewards/helpfulness_reward/mean": 1.1066131591796875, "rewards/helpfulness_reward/std": 0.806465744972229, "rewards/safety_reward/mean": 8.803466796875, "rewards/safety_reward/std": 1.3292956352233887, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 53.9375, "completions/mean_terminated_length": 53.9375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.22635577678805344, "frac_reward_zero_std": 0.0, "grad_norm": 0.6769657731056213, "kl": 4.041015625, "learning_rate": 5e-05, "loss": 0.04, "num_tokens": 9029543.0, "reward": 8.354248046875, "reward_std": 0.3734166920185089, "rewards/helpfulness_reward/mean": 0.8831996917724609, "rewards/helpfulness_reward/std": 0.6260234713554382, "rewards/safety_reward/mean": 8.354248046875, "rewards/safety_reward/std": 1.1598871946334839, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.2421875, "completions/mean_terminated_length": 54.2421875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.22670509125840538, "frac_reward_zero_std": 0.0, "grad_norm": 0.3808741569519043, "kl": 3.759765625, "learning_rate": 5e-05, "loss": 0.0476, "num_tokens": 9040070.0, "reward": 8.87451171875, "reward_std": 0.33997786045074463, "rewards/helpfulness_reward/mean": 1.1074600219726562, "rewards/helpfulness_reward/std": 0.6429256200790405, "rewards/safety_reward/mean": 8.87451171875, "rewards/safety_reward/std": 0.979010283946991, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 53.96875, "completions/mean_terminated_length": 53.96875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.22705440572875732, "frac_reward_zero_std": 0.0, "grad_norm": 0.3427606225013733, "kl": 3.5390625, "learning_rate": 5e-05, "loss": 0.0357, "num_tokens": 9051930.0, "reward": 8.703369140625, "reward_std": 0.26462623476982117, "rewards/helpfulness_reward/mean": 0.802825927734375, "rewards/helpfulness_reward/std": 0.7055227160453796, "rewards/safety_reward/mean": 8.703369140625, "rewards/safety_reward/std": 0.9019335508346558, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 54.125, "completions/mean_terminated_length": 54.125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.22740372019910926, "frac_reward_zero_std": 0.0625, "grad_norm": 0.30379626154899597, "kl": 3.59765625, "learning_rate": 5e-05, "loss": 0.0395, "num_tokens": 9064594.0, "reward": 8.346923828125, "reward_std": 0.2644548714160919, "rewards/helpfulness_reward/mean": 1.0345191955566406, "rewards/helpfulness_reward/std": 0.7571845650672913, "rewards/safety_reward/mean": 8.346923828125, "rewards/safety_reward/std": 1.4735292196273804, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.140625, "completions/mean_terminated_length": 54.140625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.22775303466946117, "frac_reward_zero_std": 0.0, "grad_norm": 0.33655425906181335, "kl": 3.685546875, "learning_rate": 5e-05, "loss": 0.039, "num_tokens": 9076148.0, "reward": 9.10400390625, "reward_std": 0.2458525449037552, "rewards/helpfulness_reward/mean": 1.3369522094726562, "rewards/helpfulness_reward/std": 0.8066356182098389, "rewards/safety_reward/mean": 9.10400390625, "rewards/safety_reward/std": 1.3691720962524414, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.4765625, "completions/mean_terminated_length": 54.4765625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.22810234913981312, "frac_reward_zero_std": 0.0, "grad_norm": 0.4029732942581177, "kl": 3.705078125, "learning_rate": 5e-05, "loss": 0.0505, "num_tokens": 9087025.0, "reward": 9.11962890625, "reward_std": 0.4084433317184448, "rewards/helpfulness_reward/mean": 1.4012951850891113, "rewards/helpfulness_reward/std": 0.6537104249000549, "rewards/safety_reward/mean": 9.11962890625, "rewards/safety_reward/std": 1.0050019025802612, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 54.2265625, "completions/mean_terminated_length": 54.2265625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.22845166361016506, "frac_reward_zero_std": 0.0, "grad_norm": 0.37161386013031006, "kl": 3.720703125, "learning_rate": 5e-05, "loss": 0.0433, "num_tokens": 9097718.0, "reward": 8.495361328125, "reward_std": 0.2651349902153015, "rewards/helpfulness_reward/mean": 1.0589370727539062, "rewards/helpfulness_reward/std": 0.712069571018219, "rewards/safety_reward/mean": 8.495361328125, "rewards/safety_reward/std": 1.1307343244552612, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 54.34375, "completions/mean_terminated_length": 54.34375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.228800978080517, "frac_reward_zero_std": 0.0, "grad_norm": 0.36797282099723816, "kl": 3.9375, "learning_rate": 5e-05, "loss": 0.0509, "num_tokens": 9108234.0, "reward": 8.90625, "reward_std": 0.27084866166114807, "rewards/helpfulness_reward/mean": 1.1225700378417969, "rewards/helpfulness_reward/std": 0.6773772835731506, "rewards/safety_reward/mean": 8.90625, "rewards/safety_reward/std": 1.1286712884902954, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 54.140625, "completions/mean_terminated_length": 54.140625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.2291502925508689, "frac_reward_zero_std": 0.0, "grad_norm": 0.4329317510128021, "kl": 3.830078125, "learning_rate": 5e-05, "loss": 0.0395, "num_tokens": 9119940.0, "reward": 8.756591796875, "reward_std": 0.24063560366630554, "rewards/helpfulness_reward/mean": 0.8765640258789062, "rewards/helpfulness_reward/std": 0.9396398067474365, "rewards/safety_reward/mean": 8.756591796875, "rewards/safety_reward/std": 0.8534380197525024, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.2734375, "completions/mean_terminated_length": 54.2734375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.22949960702122085, "frac_reward_zero_std": 0.0, "grad_norm": 0.6042443513870239, "kl": 3.880859375, "learning_rate": 5e-05, "loss": 0.0443, "num_tokens": 9130271.0, "reward": 8.429443359375, "reward_std": 0.2042417675256729, "rewards/helpfulness_reward/mean": 1.0968713760375977, "rewards/helpfulness_reward/std": 0.7289354801177979, "rewards/safety_reward/mean": 8.429443359375, "rewards/safety_reward/std": 1.0731664896011353, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 54.2734375, "completions/mean_terminated_length": 54.2734375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.2298489214915728, "frac_reward_zero_std": 0.0, "grad_norm": 0.40002313256263733, "kl": 3.8203125, "learning_rate": 5e-05, "loss": 0.0432, "num_tokens": 9141258.0, "reward": 8.8671875, "reward_std": 0.2331823706626892, "rewards/helpfulness_reward/mean": 1.0341129302978516, "rewards/helpfulness_reward/std": 0.729484498500824, "rewards/safety_reward/mean": 8.8671875, "rewards/safety_reward/std": 1.1468111276626587, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.6875, "completions/mean_terminated_length": 54.6875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.23019823596192474, "frac_reward_zero_std": 0.0, "grad_norm": 0.4551185667514801, "kl": 3.41015625, "learning_rate": 5e-05, "loss": 0.0554, "num_tokens": 9152298.0, "reward": 8.75341796875, "reward_std": 0.3721114695072174, "rewards/helpfulness_reward/mean": 1.0247001647949219, "rewards/helpfulness_reward/std": 0.5988749265670776, "rewards/safety_reward/mean": 8.75341796875, "rewards/safety_reward/std": 1.3000808954238892, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 54.4296875, "completions/mean_terminated_length": 54.4296875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.23054755043227665, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6132793426513672, "kl": 4.208984375, "learning_rate": 5e-05, "loss": 0.0563, "num_tokens": 9163673.0, "reward": 9.05078125, "reward_std": 0.28935056924819946, "rewards/helpfulness_reward/mean": 1.0966300964355469, "rewards/helpfulness_reward/std": 0.8167649507522583, "rewards/safety_reward/mean": 9.05078125, "rewards/safety_reward/std": 1.0883337259292603, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.015625, "completions/mean_terminated_length": 54.015625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.2308968649026286, "frac_reward_zero_std": 0.0, "grad_norm": 0.5972217917442322, "kl": 4.12890625, "learning_rate": 5e-05, "loss": 0.0411, "num_tokens": 9174387.0, "reward": 8.63427734375, "reward_std": 0.24431627988815308, "rewards/helpfulness_reward/mean": 0.9925751686096191, "rewards/helpfulness_reward/std": 0.8204848766326904, "rewards/safety_reward/mean": 8.63427734375, "rewards/safety_reward/std": 1.2098265886306763, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 54.421875, "completions/mean_terminated_length": 54.421875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.23124617937298053, "frac_reward_zero_std": 0.0, "grad_norm": 0.4067796468734741, "kl": 3.65234375, "learning_rate": 5e-05, "loss": 0.0439, "num_tokens": 9186281.0, "reward": 8.663818359375, "reward_std": 0.4290590286254883, "rewards/helpfulness_reward/mean": 1.050985336303711, "rewards/helpfulness_reward/std": 0.5996452569961548, "rewards/safety_reward/mean": 8.663818359375, "rewards/safety_reward/std": 1.2183666229248047, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 54.5625, "completions/mean_terminated_length": 54.5625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.23159549384333247, "frac_reward_zero_std": 0.0, "grad_norm": 0.30893129110336304, "kl": 3.6015625, "learning_rate": 5e-05, "loss": 0.0495, "num_tokens": 9196985.0, "reward": 8.733642578125, "reward_std": 0.35566386580467224, "rewards/helpfulness_reward/mean": 1.0502119064331055, "rewards/helpfulness_reward/std": 0.7201612591743469, "rewards/safety_reward/mean": 8.733642578125, "rewards/safety_reward/std": 1.2424473762512207, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.125, "completions/mean_terminated_length": 54.125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.2319448083136844, "frac_reward_zero_std": 0.0, "grad_norm": 0.6728086471557617, "kl": 4.00390625, "learning_rate": 5e-05, "loss": 0.0451, "num_tokens": 9207953.0, "reward": 8.65869140625, "reward_std": 0.2209683060646057, "rewards/helpfulness_reward/mean": 1.3405694961547852, "rewards/helpfulness_reward/std": 0.8603476285934448, "rewards/safety_reward/mean": 8.65869140625, "rewards/safety_reward/std": 1.8772515058517456, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 54.515625, "completions/mean_terminated_length": 54.515625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.23229412278403633, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8663495182991028, "kl": 3.826171875, "learning_rate": 5e-05, "loss": 0.0372, "num_tokens": 9220843.0, "reward": 7.758248329162598, "reward_std": 0.36681416630744934, "rewards/helpfulness_reward/mean": 0.8481318950653076, "rewards/helpfulness_reward/std": 1.0396348237991333, "rewards/safety_reward/mean": 7.758248329162598, "rewards/safety_reward/std": 2.4782943725585938, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 70.2109375, "completions/mean_terminated_length": 56.653541564941406, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.23264343725438827, "frac_reward_zero_std": 0.0, "grad_norm": 0.47325199842453003, "kl": 3.333984375, "learning_rate": 5e-05, "loss": 0.151, "num_tokens": 9236054.0, "reward": 7.67022705078125, "reward_std": 0.45184844732284546, "rewards/helpfulness_reward/mean": 0.41471099853515625, "rewards/helpfulness_reward/std": 0.8172724843025208, "rewards/safety_reward/mean": 7.67022705078125, "rewards/safety_reward/std": 2.6903183460235596, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.1875, "completions/mean_terminated_length": 54.1875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.2329927517247402, "frac_reward_zero_std": 0.0625, "grad_norm": 0.42786917090415955, "kl": 3.55859375, "learning_rate": 5e-05, "loss": 0.0244, "num_tokens": 9247150.0, "reward": 8.05889892578125, "reward_std": 0.3750075697898865, "rewards/helpfulness_reward/mean": 0.9015026092529297, "rewards/helpfulness_reward/std": 0.9448203444480896, "rewards/safety_reward/mean": 8.05889892578125, "rewards/safety_reward/std": 1.4636598825454712, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 54.234375, "completions/mean_terminated_length": 54.234375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.23334206619509212, "frac_reward_zero_std": 0.0, "grad_norm": 0.35925477743148804, "kl": 3.697265625, "learning_rate": 5e-05, "loss": 0.0415, "num_tokens": 9258484.0, "reward": 8.712158203125, "reward_std": 0.3515391945838928, "rewards/helpfulness_reward/mean": 0.8997712135314941, "rewards/helpfulness_reward/std": 0.6751255989074707, "rewards/safety_reward/mean": 8.712158203125, "rewards/safety_reward/std": 1.1822280883789062, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.23369138066544407, "frac_reward_zero_std": 0.0, "grad_norm": 0.4662095010280609, "kl": 3.708984375, "learning_rate": 5e-05, "loss": 0.0369, "num_tokens": 9269796.0, "reward": 8.910400390625, "reward_std": 0.41589978337287903, "rewards/helpfulness_reward/mean": 0.8685321807861328, "rewards/helpfulness_reward/std": 0.9570519924163818, "rewards/safety_reward/mean": 8.910400390625, "rewards/safety_reward/std": 1.4127141237258911, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.3828125, "completions/mean_terminated_length": 54.3828125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.234040695135796, "frac_reward_zero_std": 0.0, "grad_norm": 0.38431909680366516, "kl": 3.6171875, "learning_rate": 5e-05, "loss": 0.0489, "num_tokens": 9281677.0, "reward": 9.049560546875, "reward_std": 0.25859880447387695, "rewards/helpfulness_reward/mean": 0.8735690116882324, "rewards/helpfulness_reward/std": 0.6809080243110657, "rewards/safety_reward/mean": 9.049560546875, "rewards/safety_reward/std": 1.1189175844192505, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 54.140625, "completions/mean_terminated_length": 54.140625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.23439000960614795, "frac_reward_zero_std": 0.0, "grad_norm": 0.5364174842834473, "kl": 3.955078125, "learning_rate": 5e-05, "loss": 0.0405, "num_tokens": 9292879.0, "reward": 8.738525390625, "reward_std": 0.2911939024925232, "rewards/helpfulness_reward/mean": 0.6648805141448975, "rewards/helpfulness_reward/std": 0.7979316711425781, "rewards/safety_reward/mean": 8.738525390625, "rewards/safety_reward/std": 1.1857041120529175, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.7109375, "completions/mean_terminated_length": 54.7109375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.23473932407649986, "frac_reward_zero_std": 0.0, "grad_norm": 0.3802773654460907, "kl": 3.6796875, "learning_rate": 5e-05, "loss": 0.0526, "num_tokens": 9304218.0, "reward": 9.3232421875, "reward_std": 0.3638136386871338, "rewards/helpfulness_reward/mean": 0.9954400062561035, "rewards/helpfulness_reward/std": 0.6951193809509277, "rewards/safety_reward/mean": 9.3232421875, "rewards/safety_reward/std": 1.1847541332244873, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 54.5390625, "completions/mean_terminated_length": 54.5390625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.2350886385468518, "frac_reward_zero_std": 0.0, "grad_norm": 0.6275198459625244, "kl": 4.111328125, "learning_rate": 5e-05, "loss": 0.0577, "num_tokens": 9314823.0, "reward": 8.54443359375, "reward_std": 0.3095538020133972, "rewards/helpfulness_reward/mean": 0.7139043807983398, "rewards/helpfulness_reward/std": 0.5821024179458618, "rewards/safety_reward/mean": 8.54443359375, "rewards/safety_reward/std": 1.5509741306304932, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.234375, "completions/mean_terminated_length": 54.234375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.23543795301720374, "frac_reward_zero_std": 0.0625, "grad_norm": 0.30720648169517517, "kl": 3.81640625, "learning_rate": 5e-05, "loss": 0.0478, "num_tokens": 9325773.0, "reward": 8.5048828125, "reward_std": 0.28403031826019287, "rewards/helpfulness_reward/mean": 0.9047036170959473, "rewards/helpfulness_reward/std": 0.6824463605880737, "rewards/safety_reward/mean": 8.5048828125, "rewards/safety_reward/std": 1.1752595901489258, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 54.6015625, "completions/mean_terminated_length": 54.6015625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.23578726748755569, "frac_reward_zero_std": 0.0, "grad_norm": 0.32401734590530396, "kl": 3.671875, "learning_rate": 5e-05, "loss": 0.0531, "num_tokens": 9336594.0, "reward": 8.87109375, "reward_std": 0.40608641505241394, "rewards/helpfulness_reward/mean": 1.2195663452148438, "rewards/helpfulness_reward/std": 0.9780802130699158, "rewards/safety_reward/mean": 8.87109375, "rewards/safety_reward/std": 1.4361406564712524, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 54.6796875, "completions/mean_terminated_length": 54.6796875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.2361365819579076, "frac_reward_zero_std": 0.0, "grad_norm": 0.361265629529953, "kl": 3.46875, "learning_rate": 5e-05, "loss": 0.0538, "num_tokens": 9348281.0, "reward": 8.8564453125, "reward_std": 0.2904818654060364, "rewards/helpfulness_reward/mean": 0.7833404541015625, "rewards/helpfulness_reward/std": 0.8833719491958618, "rewards/safety_reward/mean": 8.8564453125, "rewards/safety_reward/std": 1.3681066036224365, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 54.1796875, "completions/mean_terminated_length": 54.1796875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.23648589642825954, "frac_reward_zero_std": 0.0, "grad_norm": 0.4672792851924896, "kl": 3.8359375, "learning_rate": 5e-05, "loss": 0.044, "num_tokens": 9359240.0, "reward": 8.565673828125, "reward_std": 0.20397719740867615, "rewards/helpfulness_reward/mean": 0.7294001579284668, "rewards/helpfulness_reward/std": 0.8906795382499695, "rewards/safety_reward/mean": 8.565673828125, "rewards/safety_reward/std": 1.394434928894043, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 54.3359375, "completions/mean_terminated_length": 54.3359375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.23683521089861148, "frac_reward_zero_std": 0.125, "grad_norm": 0.34010976552963257, "kl": 3.73828125, "learning_rate": 5e-05, "loss": 0.046, "num_tokens": 9369923.0, "reward": 9.178466796875, "reward_std": 0.2129615992307663, "rewards/helpfulness_reward/mean": 0.561485767364502, "rewards/helpfulness_reward/std": 0.6563804745674133, "rewards/safety_reward/mean": 9.178466796875, "rewards/safety_reward/std": 1.2013230323791504, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.2578125, "completions/mean_terminated_length": 54.2578125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.23718452536896342, "frac_reward_zero_std": 0.0, "grad_norm": 0.6598954796791077, "kl": 4.091796875, "learning_rate": 5e-05, "loss": 0.0476, "num_tokens": 9381556.0, "reward": 8.6986083984375, "reward_std": 0.19524556398391724, "rewards/helpfulness_reward/mean": 1.3248989582061768, "rewards/helpfulness_reward/std": 0.7022360563278198, "rewards/safety_reward/mean": 8.6986083984375, "rewards/safety_reward/std": 1.8557473421096802, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.1484375, "completions/mean_terminated_length": 54.1484375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.23753383983931534, "frac_reward_zero_std": 0.0625, "grad_norm": 0.34088999032974243, "kl": 3.759765625, "learning_rate": 5e-05, "loss": 0.0431, "num_tokens": 9392695.0, "reward": 9.179443359375, "reward_std": 0.22666677832603455, "rewards/helpfulness_reward/mean": 1.1606969833374023, "rewards/helpfulness_reward/std": 0.513676106929779, "rewards/safety_reward/mean": 9.179443359375, "rewards/safety_reward/std": 1.0509976148605347, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.0234375, "completions/mean_terminated_length": 54.0234375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.23788315430966728, "frac_reward_zero_std": 0.125, "grad_norm": 0.2564432919025421, "kl": 3.681640625, "learning_rate": 5e-05, "loss": 0.0387, "num_tokens": 9403482.0, "reward": 8.146728515625, "reward_std": 0.1682870090007782, "rewards/helpfulness_reward/mean": 0.7658576965332031, "rewards/helpfulness_reward/std": 0.7922462821006775, "rewards/safety_reward/mean": 8.146728515625, "rewards/safety_reward/std": 1.5155067443847656, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.1171875, "completions/mean_terminated_length": 54.1171875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.23823246878001922, "frac_reward_zero_std": 0.0, "grad_norm": 1.1424678564071655, "kl": 4.2890625, "learning_rate": 5e-05, "loss": 0.0467, "num_tokens": 9414993.0, "reward": 8.6337890625, "reward_std": 0.24672743678092957, "rewards/helpfulness_reward/mean": 1.094879150390625, "rewards/helpfulness_reward/std": 0.7148733735084534, "rewards/safety_reward/mean": 8.6337890625, "rewards/safety_reward/std": 1.1440180540084839, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.21875, "completions/mean_terminated_length": 54.21875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.23858178325037116, "frac_reward_zero_std": 0.125, "grad_norm": 0.29266589879989624, "kl": 3.580078125, "learning_rate": 5e-05, "loss": 0.0377, "num_tokens": 9425413.0, "reward": 8.543212890625, "reward_std": 0.2416606843471527, "rewards/helpfulness_reward/mean": 0.6672725677490234, "rewards/helpfulness_reward/std": 0.5091413855552673, "rewards/safety_reward/mean": 8.543212890625, "rewards/safety_reward/std": 1.2692241668701172, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 54.2265625, "completions/mean_terminated_length": 54.2265625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.23893109772072307, "frac_reward_zero_std": 0.125, "grad_norm": 0.34080591797828674, "kl": 3.380859375, "learning_rate": 5e-05, "loss": 0.0373, "num_tokens": 9435850.0, "reward": 9.0400390625, "reward_std": 0.17681220173835754, "rewards/helpfulness_reward/mean": 0.8599262237548828, "rewards/helpfulness_reward/std": 0.7058009505271912, "rewards/safety_reward/mean": 9.0400390625, "rewards/safety_reward/std": 0.8642294406890869, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 54.296875, "completions/mean_terminated_length": 54.296875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.23928041219107501, "frac_reward_zero_std": 0.0, "grad_norm": 0.34599387645721436, "kl": 3.705078125, "learning_rate": 5e-05, "loss": 0.0384, "num_tokens": 9447576.0, "reward": 8.71435546875, "reward_std": 0.25441011786460876, "rewards/helpfulness_reward/mean": 0.7160739898681641, "rewards/helpfulness_reward/std": 0.8629295229911804, "rewards/safety_reward/mean": 8.71435546875, "rewards/safety_reward/std": 0.9336265921592712, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.140625, "completions/mean_terminated_length": 54.140625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.23962972666142696, "frac_reward_zero_std": 0.0, "grad_norm": 0.5006977915763855, "kl": 4.001953125, "learning_rate": 5e-05, "loss": 0.0449, "num_tokens": 9461306.0, "reward": 8.735107421875, "reward_std": 0.2685776650905609, "rewards/helpfulness_reward/mean": 0.689849853515625, "rewards/helpfulness_reward/std": 0.7022905945777893, "rewards/safety_reward/mean": 8.735107421875, "rewards/safety_reward/std": 1.6568984985351562, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 54.234375, "completions/mean_terminated_length": 54.234375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.2399790411317789, "frac_reward_zero_std": 0.0625, "grad_norm": 0.46647801995277405, "kl": 3.65234375, "learning_rate": 5e-05, "loss": 0.0453, "num_tokens": 9472608.0, "reward": 8.43115234375, "reward_std": 0.2750326097011566, "rewards/helpfulness_reward/mean": 0.7556846141815186, "rewards/helpfulness_reward/std": 0.7364349961280823, "rewards/safety_reward/mean": 8.43115234375, "rewards/safety_reward/std": 1.4874824285507202, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.171875, "completions/mean_terminated_length": 54.171875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.2403283556021308, "frac_reward_zero_std": 0.0625, "grad_norm": 0.33071544766426086, "kl": 3.69140625, "learning_rate": 5e-05, "loss": 0.0357, "num_tokens": 9483398.0, "reward": 8.54833984375, "reward_std": 0.15510472655296326, "rewards/helpfulness_reward/mean": 0.7933540344238281, "rewards/helpfulness_reward/std": 0.8081408143043518, "rewards/safety_reward/mean": 8.54833984375, "rewards/safety_reward/std": 1.4439092874526978, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.1328125, "completions/mean_terminated_length": 54.1328125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.24067767007248275, "frac_reward_zero_std": 0.0625, "grad_norm": 0.22197812795639038, "kl": 3.626953125, "learning_rate": 5e-05, "loss": 0.0391, "num_tokens": 9494127.0, "reward": 8.76708984375, "reward_std": 0.12546801567077637, "rewards/helpfulness_reward/mean": 0.8296589851379395, "rewards/helpfulness_reward/std": 0.8106814026832581, "rewards/safety_reward/mean": 8.76708984375, "rewards/safety_reward/std": 0.7948168516159058, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 54.6328125, "completions/mean_terminated_length": 54.6328125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.2410269845428347, "frac_reward_zero_std": 0.0, "grad_norm": 0.39937883615493774, "kl": 3.802734375, "learning_rate": 5e-05, "loss": 0.0487, "num_tokens": 9506464.0, "reward": 8.71826171875, "reward_std": 0.24651336669921875, "rewards/helpfulness_reward/mean": 0.6476180553436279, "rewards/helpfulness_reward/std": 0.7841195464134216, "rewards/safety_reward/mean": 8.71826171875, "rewards/safety_reward/std": 1.477781891822815, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 54.1484375, "completions/mean_terminated_length": 54.1484375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.24137629901318663, "frac_reward_zero_std": 0.0, "grad_norm": 0.4281160831451416, "kl": 4.05078125, "learning_rate": 5e-05, "loss": 0.0436, "num_tokens": 9517675.0, "reward": 9.376953125, "reward_std": 0.24392342567443848, "rewards/helpfulness_reward/mean": 1.11395263671875, "rewards/helpfulness_reward/std": 0.7407604455947876, "rewards/safety_reward/mean": 9.376953125, "rewards/safety_reward/std": 1.0383039712905884, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 57.1953125, "completions/mean_terminated_length": 57.1953125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.24172561348353855, "frac_reward_zero_std": 0.0, "grad_norm": 0.3137902319431305, "kl": 3.62109375, "learning_rate": 5e-05, "loss": 0.039, "num_tokens": 9530732.0, "reward": 7.990267753601074, "reward_std": 0.18047583103179932, "rewards/helpfulness_reward/mean": 1.1051896810531616, "rewards/helpfulness_reward/std": 0.7501510977745056, "rewards/safety_reward/mean": 7.990267753601074, "rewards/safety_reward/std": 2.421403408050537, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 54.03125, "completions/mean_terminated_length": 54.03125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.2420749279538905, "frac_reward_zero_std": 0.0625, "grad_norm": 0.24582159519195557, "kl": 3.7734375, "learning_rate": 5e-05, "loss": 0.0392, "num_tokens": 9541408.0, "reward": 8.73388671875, "reward_std": 0.20821861922740936, "rewards/helpfulness_reward/mean": 0.9233465194702148, "rewards/helpfulness_reward/std": 0.7150583863258362, "rewards/safety_reward/mean": 8.73388671875, "rewards/safety_reward/std": 1.3041776418685913, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 54.0546875, "completions/mean_terminated_length": 54.0546875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.24242424242424243, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3591889441013336, "kl": 3.7421875, "learning_rate": 5e-05, "loss": 0.039, "num_tokens": 9552551.0, "reward": 9.022216796875, "reward_std": 0.1711033433675766, "rewards/helpfulness_reward/mean": 0.8745110034942627, "rewards/helpfulness_reward/std": 0.8252668380737305, "rewards/safety_reward/mean": 9.022216796875, "rewards/safety_reward/std": 0.8103976845741272, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 53.8671875, "completions/mean_terminated_length": 53.8671875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.24277355689459437, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3458375632762909, "kl": 3.763671875, "learning_rate": 5e-05, "loss": 0.0326, "num_tokens": 9563726.0, "reward": 8.5380859375, "reward_std": 0.22456607222557068, "rewards/helpfulness_reward/mean": 0.7666454315185547, "rewards/helpfulness_reward/std": 0.6909003257751465, "rewards/safety_reward/mean": 8.5380859375, "rewards/safety_reward/std": 1.2981969118118286, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.203125, "completions/mean_terminated_length": 54.203125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.24312287136494629, "frac_reward_zero_std": 0.0, "grad_norm": 0.3042832911014557, "kl": 3.5078125, "learning_rate": 5e-05, "loss": 0.0405, "num_tokens": 9574656.0, "reward": 8.888427734375, "reward_std": 0.11559613049030304, "rewards/helpfulness_reward/mean": 0.6636507511138916, "rewards/helpfulness_reward/std": 0.549172043800354, "rewards/safety_reward/mean": 8.888427734375, "rewards/safety_reward/std": 1.2071818113327026, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.0546875, "completions/mean_terminated_length": 54.0546875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.24347218583529823, "frac_reward_zero_std": 0.125, "grad_norm": 0.33167093992233276, "kl": 3.98828125, "learning_rate": 5e-05, "loss": 0.0397, "num_tokens": 9585799.0, "reward": 8.66259765625, "reward_std": 0.27043288946151733, "rewards/helpfulness_reward/mean": 0.639864444732666, "rewards/helpfulness_reward/std": 0.9254409074783325, "rewards/safety_reward/mean": 8.66259765625, "rewards/safety_reward/std": 1.4439011812210083, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.3203125, "completions/mean_terminated_length": 54.3203125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.24382150030565017, "frac_reward_zero_std": 0.0, "grad_norm": 0.4010365307331085, "kl": 3.9609375, "learning_rate": 5e-05, "loss": 0.046, "num_tokens": 9598232.0, "reward": 8.786376953125, "reward_std": 0.21975256502628326, "rewards/helpfulness_reward/mean": 0.486513614654541, "rewards/helpfulness_reward/std": 0.8874216079711914, "rewards/safety_reward/mean": 8.786376953125, "rewards/safety_reward/std": 1.2666325569152832, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 54.0078125, "completions/mean_terminated_length": 54.0078125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.2441708147760021, "frac_reward_zero_std": 0.0625, "grad_norm": 0.29063984751701355, "kl": 3.8125, "learning_rate": 5e-05, "loss": 0.0412, "num_tokens": 9609193.0, "reward": 8.575927734375, "reward_std": 0.32380229234695435, "rewards/helpfulness_reward/mean": 1.0027809143066406, "rewards/helpfulness_reward/std": 0.547926664352417, "rewards/safety_reward/mean": 8.575927734375, "rewards/safety_reward/std": 1.1481962203979492, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.1171875, "completions/mean_terminated_length": 54.1171875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.24452012924635402, "frac_reward_zero_std": 0.1875, "grad_norm": 0.3179037272930145, "kl": 3.662109375, "learning_rate": 5e-05, "loss": 0.039, "num_tokens": 9619760.0, "reward": 8.4183349609375, "reward_std": 0.24194546043872833, "rewards/helpfulness_reward/mean": 0.8315563201904297, "rewards/helpfulness_reward/std": 0.6942752003669739, "rewards/safety_reward/mean": 8.4183349609375, "rewards/safety_reward/std": 1.566343903541565, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.125, "completions/mean_terminated_length": 54.125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.24486944371670596, "frac_reward_zero_std": 0.0625, "grad_norm": 0.4887297749519348, "kl": 4.078125, "learning_rate": 5e-05, "loss": 0.0452, "num_tokens": 9630712.0, "reward": 8.61962890625, "reward_std": 0.3351667523384094, "rewards/helpfulness_reward/mean": 0.8229849338531494, "rewards/helpfulness_reward/std": 0.7913590669631958, "rewards/safety_reward/mean": 8.61962890625, "rewards/safety_reward/std": 1.1872092485427856, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 54.0859375, "completions/mean_terminated_length": 54.0859375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.2452187581870579, "frac_reward_zero_std": 0.1875, "grad_norm": 0.469706267118454, "kl": 4.095703125, "learning_rate": 5e-05, "loss": 0.0527, "num_tokens": 9641531.0, "reward": 8.735595703125, "reward_std": 0.2925449311733246, "rewards/helpfulness_reward/mean": 0.7252883911132812, "rewards/helpfulness_reward/std": 0.7783218622207642, "rewards/safety_reward/mean": 8.735595703125, "rewards/safety_reward/std": 1.2217332124710083, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.046875, "completions/mean_terminated_length": 54.046875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.24556807265740982, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5823994874954224, "kl": 4.076171875, "learning_rate": 5e-05, "loss": 0.0396, "num_tokens": 9653177.0, "reward": 9.0037841796875, "reward_std": 0.39434802532196045, "rewards/helpfulness_reward/mean": 1.0478286743164062, "rewards/helpfulness_reward/std": 0.7944839596748352, "rewards/safety_reward/mean": 9.0037841796875, "rewards/safety_reward/std": 1.1039122343063354, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.328125, "completions/mean_terminated_length": 53.328125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.24591738712776176, "frac_reward_zero_std": 0.0625, "grad_norm": 1.971266508102417, "kl": 5.1015625, "learning_rate": 5e-05, "loss": 0.0536, "num_tokens": 9667643.0, "reward": 8.2049560546875, "reward_std": 0.3963510990142822, "rewards/helpfulness_reward/mean": 0.8445663452148438, "rewards/helpfulness_reward/std": 0.7226368188858032, "rewards/safety_reward/mean": 8.2049560546875, "rewards/safety_reward/std": 1.6814327239990234, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 53.796875, "completions/mean_terminated_length": 53.796875, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.2462667015981137, "frac_reward_zero_std": 0.125, "grad_norm": 0.597831130027771, "kl": 3.80859375, "learning_rate": 5e-05, "loss": 0.056, "num_tokens": 9680825.0, "reward": 8.329538345336914, "reward_std": 0.2377142459154129, "rewards/helpfulness_reward/mean": 0.648162841796875, "rewards/helpfulness_reward/std": 1.0350278615951538, "rewards/safety_reward/mean": 8.329538345336914, "rewards/safety_reward/std": 2.2917299270629883, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.0234375, "completions/mean_terminated_length": 54.0234375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.24661601606846564, "frac_reward_zero_std": 0.0, "grad_norm": 0.4256148934364319, "kl": 3.9609375, "learning_rate": 5e-05, "loss": 0.0412, "num_tokens": 9691780.0, "reward": 8.52392578125, "reward_std": 0.15915840864181519, "rewards/helpfulness_reward/mean": 0.8212218284606934, "rewards/helpfulness_reward/std": 0.6637086868286133, "rewards/safety_reward/mean": 8.52392578125, "rewards/safety_reward/std": 1.267383337020874, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 50.828125, "completions/mean_terminated_length": 50.828125, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.24696533053881756, "frac_reward_zero_std": 0.0625, "grad_norm": 0.41717928647994995, "kl": 3.619140625, "learning_rate": 5e-05, "loss": 0.0335, "num_tokens": 9703518.0, "reward": 8.483009338378906, "reward_std": 0.28613659739494324, "rewards/helpfulness_reward/mean": 1.1263160705566406, "rewards/helpfulness_reward/std": 0.8197352886199951, "rewards/safety_reward/mean": 8.483009338378906, "rewards/safety_reward/std": 2.1073098182678223, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.046875, "completions/mean_terminated_length": 54.046875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.2473146450091695, "frac_reward_zero_std": 0.0, "grad_norm": 0.9030709862709045, "kl": 4.2578125, "learning_rate": 5e-05, "loss": 0.0438, "num_tokens": 9714620.0, "reward": 8.56396484375, "reward_std": 0.23200289905071259, "rewards/helpfulness_reward/mean": 0.9254627227783203, "rewards/helpfulness_reward/std": 0.8776814937591553, "rewards/safety_reward/mean": 8.56396484375, "rewards/safety_reward/std": 0.8929271101951599, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 54.3984375, "completions/mean_terminated_length": 54.3984375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.24766395947952144, "frac_reward_zero_std": 0.0, "grad_norm": 0.3601144850254059, "kl": 4.076171875, "learning_rate": 5e-05, "loss": 0.0514, "num_tokens": 9725479.0, "reward": 8.831787109375, "reward_std": 0.3602328896522522, "rewards/helpfulness_reward/mean": 0.9046602249145508, "rewards/helpfulness_reward/std": 0.8131914734840393, "rewards/safety_reward/mean": 8.831787109375, "rewards/safety_reward/std": 1.3476390838623047, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 53.9375, "completions/mean_terminated_length": 53.9375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.24801327394987338, "frac_reward_zero_std": 0.0, "grad_norm": 0.380591481924057, "kl": 3.9609375, "learning_rate": 5e-05, "loss": 0.0393, "num_tokens": 9736287.0, "reward": 8.57470703125, "reward_std": 0.37021639943122864, "rewards/helpfulness_reward/mean": 0.6361680030822754, "rewards/helpfulness_reward/std": 0.6490853428840637, "rewards/safety_reward/mean": 8.57470703125, "rewards/safety_reward/std": 1.4526081085205078, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.15625, "completions/mean_terminated_length": 54.15625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.2483625884202253, "frac_reward_zero_std": 0.0, "grad_norm": 0.5166580080986023, "kl": 3.833984375, "learning_rate": 5e-05, "loss": 0.0426, "num_tokens": 9747371.0, "reward": 8.9150390625, "reward_std": 0.253304123878479, "rewards/helpfulness_reward/mean": 0.8089017868041992, "rewards/helpfulness_reward/std": 0.7317249178886414, "rewards/safety_reward/mean": 8.9150390625, "rewards/safety_reward/std": 1.2129470109939575, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.2109375, "completions/mean_terminated_length": 54.2109375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.24871190289057724, "frac_reward_zero_std": 0.0, "grad_norm": 0.26954886317253113, "kl": 3.732421875, "learning_rate": 5e-05, "loss": 0.0416, "num_tokens": 9758590.0, "reward": 8.69287109375, "reward_std": 0.2811664640903473, "rewards/helpfulness_reward/mean": 0.7579107284545898, "rewards/helpfulness_reward/std": 0.7089806199073792, "rewards/safety_reward/mean": 8.69287109375, "rewards/safety_reward/std": 1.2833763360977173, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1792.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 93.4453125, "completions/mean_terminated_length": 52.68000411987305, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.24906121736092918, "frac_reward_zero_std": 0.0, "grad_norm": 0.3759540915489197, "kl": 2.8623046875, "learning_rate": 5e-05, "loss": 0.2194, "num_tokens": 9778367.0, "reward": 7.775054931640625, "reward_std": 0.39781734347343445, "rewards/helpfulness_reward/mean": 0.46417713165283203, "rewards/helpfulness_reward/std": 0.9758684635162354, "rewards/safety_reward/mean": 7.775054931640625, "rewards/safety_reward/std": 2.2665345668792725, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 54.2734375, "completions/mean_terminated_length": 54.2734375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.24941053183128112, "frac_reward_zero_std": 0.0, "grad_norm": 0.6398337483406067, "kl": 4.0625, "learning_rate": 5e-05, "loss": 0.0456, "num_tokens": 9790810.0, "reward": 8.583984375, "reward_std": 0.2990661859512329, "rewards/helpfulness_reward/mean": 0.8566360473632812, "rewards/helpfulness_reward/std": 1.064414381980896, "rewards/safety_reward/mean": 8.583984375, "rewards/safety_reward/std": 1.6790248155593872, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 54.1484375, "completions/mean_terminated_length": 54.1484375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.24975984630163303, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3830866813659668, "kl": 3.751953125, "learning_rate": 5e-05, "loss": 0.0483, "num_tokens": 9801565.0, "reward": 8.903076171875, "reward_std": 0.3151208758354187, "rewards/helpfulness_reward/mean": 1.0603008270263672, "rewards/helpfulness_reward/std": 0.6207162141799927, "rewards/safety_reward/mean": 8.903076171875, "rewards/safety_reward/std": 1.006842017173767, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.1953125, "completions/mean_terminated_length": 54.1953125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.250109160771985, "frac_reward_zero_std": 0.0, "grad_norm": 0.4749753177165985, "kl": 3.802734375, "learning_rate": 5e-05, "loss": 0.0437, "num_tokens": 9814222.0, "reward": 8.682861328125, "reward_std": 0.35450196266174316, "rewards/helpfulness_reward/mean": 0.8348350524902344, "rewards/helpfulness_reward/std": 0.9414842128753662, "rewards/safety_reward/mean": 8.682861328125, "rewards/safety_reward/std": 1.3944969177246094, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1583.0, "completions/max_terminated_length": 1583.0, "completions/mean_length": 103.078125, "completions/mean_terminated_length": 103.078125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.2504584752423369, "frac_reward_zero_std": 0.0, "grad_norm": 0.4236038327217102, "kl": 3.119140625, "learning_rate": 5e-05, "loss": 0.0683, "num_tokens": 9833376.0, "reward": 8.279294967651367, "reward_std": 0.5696762800216675, "rewards/helpfulness_reward/mean": 1.0457520484924316, "rewards/helpfulness_reward/std": 0.7618046998977661, "rewards/safety_reward/mean": 8.279294967651367, "rewards/safety_reward/std": 2.816192150115967, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.0859375, "completions/mean_terminated_length": 53.0859375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.25080778971268886, "frac_reward_zero_std": 0.0, "grad_norm": 0.43187791109085083, "kl": 3.923828125, "learning_rate": 5e-05, "loss": 0.0451, "num_tokens": 9844155.0, "reward": 8.43896484375, "reward_std": 0.31585681438446045, "rewards/helpfulness_reward/mean": 0.9045782089233398, "rewards/helpfulness_reward/std": 0.7423791885375977, "rewards/safety_reward/mean": 8.43896484375, "rewards/safety_reward/std": 1.8440725803375244, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 54.046875, "completions/mean_terminated_length": 54.046875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.25115710418304077, "frac_reward_zero_std": 0.0, "grad_norm": 0.4068503975868225, "kl": 3.814453125, "learning_rate": 5e-05, "loss": 0.043, "num_tokens": 9854529.0, "reward": 9.173095703125, "reward_std": 0.4879360795021057, "rewards/helpfulness_reward/mean": 1.1377105712890625, "rewards/helpfulness_reward/std": 0.73308265209198, "rewards/safety_reward/mean": 9.173095703125, "rewards/safety_reward/std": 1.3339080810546875, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.0546875, "completions/mean_terminated_length": 53.0546875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.25150641865339274, "frac_reward_zero_std": 0.0, "grad_norm": 0.43143144249916077, "kl": 4.07421875, "learning_rate": 5e-05, "loss": 0.0275, "num_tokens": 9865536.0, "reward": 9.052734375, "reward_std": 0.5373840928077698, "rewards/helpfulness_reward/mean": 0.7583370208740234, "rewards/helpfulness_reward/std": 0.5688410401344299, "rewards/safety_reward/mean": 9.052734375, "rewards/safety_reward/std": 1.600501298904419, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.875, "completions/mean_terminated_length": 53.875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.25185573312374465, "frac_reward_zero_std": 0.0, "grad_norm": 0.4918874502182007, "kl": 3.61328125, "learning_rate": 5e-05, "loss": 0.0373, "num_tokens": 9876240.0, "reward": 8.489990234375, "reward_std": 0.28354209661483765, "rewards/helpfulness_reward/mean": 1.1834182739257812, "rewards/helpfulness_reward/std": 0.9200798273086548, "rewards/safety_reward/mean": 8.489990234375, "rewards/safety_reward/std": 0.9756897687911987, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.8046875, "completions/mean_terminated_length": 53.8046875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.25220504759409657, "frac_reward_zero_std": 0.0, "grad_norm": 0.4636857211589813, "kl": 3.783203125, "learning_rate": 5e-05, "loss": 0.0343, "num_tokens": 9887271.0, "reward": 8.1373291015625, "reward_std": 0.41484779119491577, "rewards/helpfulness_reward/mean": 0.75469970703125, "rewards/helpfulness_reward/std": 0.8117455244064331, "rewards/safety_reward/mean": 8.1373291015625, "rewards/safety_reward/std": 1.5922157764434814, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.875, "completions/mean_terminated_length": 53.875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.25255436206444853, "frac_reward_zero_std": 0.0, "grad_norm": 0.4263486862182617, "kl": 3.759765625, "learning_rate": 5e-05, "loss": 0.0346, "num_tokens": 9898135.0, "reward": 8.2255859375, "reward_std": 0.3292890191078186, "rewards/helpfulness_reward/mean": 0.9935073852539062, "rewards/helpfulness_reward/std": 0.7210111618041992, "rewards/safety_reward/mean": 8.2255859375, "rewards/safety_reward/std": 1.314343810081482, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.828125, "completions/mean_terminated_length": 53.828125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.25290367653480045, "frac_reward_zero_std": 0.0, "grad_norm": 0.3700116276741028, "kl": 3.49609375, "learning_rate": 5e-05, "loss": 0.0333, "num_tokens": 9908729.0, "reward": 9.021240234375, "reward_std": 0.31859320402145386, "rewards/helpfulness_reward/mean": 1.2843170166015625, "rewards/helpfulness_reward/std": 0.825715184211731, "rewards/safety_reward/mean": 9.021240234375, "rewards/safety_reward/std": 1.2221990823745728, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 53.7578125, "completions/mean_terminated_length": 53.7578125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.25325299100515236, "frac_reward_zero_std": 0.0, "grad_norm": 0.37996312975883484, "kl": 3.58984375, "learning_rate": 5e-05, "loss": 0.0332, "num_tokens": 9922186.0, "reward": 8.412841796875, "reward_std": 0.21645797789096832, "rewards/helpfulness_reward/mean": 0.6649255752563477, "rewards/helpfulness_reward/std": 0.7173910140991211, "rewards/safety_reward/mean": 8.412841796875, "rewards/safety_reward/std": 1.488052487373352, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 53.9453125, "completions/mean_terminated_length": 53.9453125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.25360230547550433, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5647099018096924, "kl": 3.755859375, "learning_rate": 5e-05, "loss": 0.043, "num_tokens": 9932643.0, "reward": 8.34716796875, "reward_std": 0.3137134313583374, "rewards/helpfulness_reward/mean": 1.2061080932617188, "rewards/helpfulness_reward/std": 0.5939213633537292, "rewards/safety_reward/mean": 8.34716796875, "rewards/safety_reward/std": 1.6565412282943726, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 53.890625, "completions/mean_terminated_length": 53.890625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.25395161994585624, "frac_reward_zero_std": 0.0, "grad_norm": 0.3528056740760803, "kl": 3.556640625, "learning_rate": 5e-05, "loss": 0.0319, "num_tokens": 9945853.0, "reward": 8.6553955078125, "reward_std": 0.3306143283843994, "rewards/helpfulness_reward/mean": 0.9148610234260559, "rewards/helpfulness_reward/std": 1.0520246028900146, "rewards/safety_reward/mean": 8.6553955078125, "rewards/safety_reward/std": 1.560055136680603, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 54.46875, "completions/mean_terminated_length": 54.46875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.2543009344162082, "frac_reward_zero_std": 0.0, "grad_norm": 0.6345885992050171, "kl": 4.005859375, "learning_rate": 5e-05, "loss": 0.0502, "num_tokens": 9957993.0, "reward": 8.24078369140625, "reward_std": 0.3033410906791687, "rewards/helpfulness_reward/mean": 1.0204675197601318, "rewards/helpfulness_reward/std": 0.6210907697677612, "rewards/safety_reward/mean": 8.24078369140625, "rewards/safety_reward/std": 2.0431599617004395, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 53.953125, "completions/mean_terminated_length": 53.953125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.2546502488865601, "frac_reward_zero_std": 0.0, "grad_norm": 0.3385414183139801, "kl": 3.703125, "learning_rate": 5e-05, "loss": 0.0356, "num_tokens": 9969379.0, "reward": 8.69140625, "reward_std": 0.2775535583496094, "rewards/helpfulness_reward/mean": 1.193246841430664, "rewards/helpfulness_reward/std": 0.6973099112510681, "rewards/safety_reward/mean": 8.69140625, "rewards/safety_reward/std": 1.167016625404358, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.25499956335691204, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3335829973220825, "kl": 3.859375, "learning_rate": 5e-05, "loss": 0.0433, "num_tokens": 9980947.0, "reward": 8.954833984375, "reward_std": 0.29700130224227905, "rewards/helpfulness_reward/mean": 0.5917357206344604, "rewards/helpfulness_reward/std": 0.7107420563697815, "rewards/safety_reward/mean": 8.954833984375, "rewards/safety_reward/std": 1.0968761444091797, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.8515625, "completions/mean_terminated_length": 53.8515625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.255348877827264, "frac_reward_zero_std": 0.0, "grad_norm": 0.34426242113113403, "kl": 3.771484375, "learning_rate": 5e-05, "loss": 0.0328, "num_tokens": 9994200.0, "reward": 8.79345703125, "reward_std": 0.30584442615509033, "rewards/helpfulness_reward/mean": 1.0738906860351562, "rewards/helpfulness_reward/std": 0.6335578560829163, "rewards/safety_reward/mean": 8.79345703125, "rewards/safety_reward/std": 1.084428310394287, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.0546875, "completions/mean_terminated_length": 54.0546875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.2556981922976159, "frac_reward_zero_std": 0.0, "grad_norm": 0.3765099346637726, "kl": 3.876953125, "learning_rate": 5e-05, "loss": 0.0436, "num_tokens": 10004759.0, "reward": 8.8720703125, "reward_std": 0.3099202811717987, "rewards/helpfulness_reward/mean": 1.0790824890136719, "rewards/helpfulness_reward/std": 0.5672736763954163, "rewards/safety_reward/mean": 8.8720703125, "rewards/safety_reward/std": 1.324777603149414, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.8984375, "completions/mean_terminated_length": 53.8984375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.25604750676796784, "frac_reward_zero_std": 0.0, "grad_norm": 0.28866228461265564, "kl": 3.7109375, "learning_rate": 5e-05, "loss": 0.0368, "num_tokens": 10015714.0, "reward": 8.36376953125, "reward_std": 0.17363353073596954, "rewards/helpfulness_reward/mean": 0.7879419326782227, "rewards/helpfulness_reward/std": 0.8105567097663879, "rewards/safety_reward/mean": 8.36376953125, "rewards/safety_reward/std": 1.020359992980957, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.265625, "completions/mean_terminated_length": 54.265625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.2563968212383198, "frac_reward_zero_std": 0.0, "grad_norm": 0.5312151312828064, "kl": 3.986328125, "learning_rate": 5e-05, "loss": 0.0445, "num_tokens": 10027172.0, "reward": 9.217529296875, "reward_std": 0.29452621936798096, "rewards/helpfulness_reward/mean": 0.78167724609375, "rewards/helpfulness_reward/std": 0.9113547205924988, "rewards/safety_reward/mean": 9.217529296875, "rewards/safety_reward/std": 1.246626853942871, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 72.09375, "completions/mean_terminated_length": 72.09375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.2567461357086717, "frac_reward_zero_std": 0.1875, "grad_norm": 0.4291422665119171, "kl": 3.15234375, "learning_rate": 5e-05, "loss": 0.0512, "num_tokens": 10042752.0, "reward": 8.13372802734375, "reward_std": 0.25244325399398804, "rewards/helpfulness_reward/mean": 0.8979129791259766, "rewards/helpfulness_reward/std": 0.825202226638794, "rewards/safety_reward/mean": 8.13372802734375, "rewards/safety_reward/std": 2.7612357139587402, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.265625, "completions/mean_terminated_length": 54.265625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.2570954501790237, "frac_reward_zero_std": 0.0, "grad_norm": 0.321049302816391, "kl": 3.46875, "learning_rate": 5e-05, "loss": 0.0442, "num_tokens": 10054930.0, "reward": 9.14794921875, "reward_std": 0.24999737739562988, "rewards/helpfulness_reward/mean": 1.1105365753173828, "rewards/helpfulness_reward/std": 0.6218782067298889, "rewards/safety_reward/mean": 9.14794921875, "rewards/safety_reward/std": 1.5217039585113525, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.9609375, "completions/mean_terminated_length": 53.9609375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.2574447646493756, "frac_reward_zero_std": 0.0, "grad_norm": 0.44119685888290405, "kl": 3.72265625, "learning_rate": 5e-05, "loss": 0.0374, "num_tokens": 10066461.0, "reward": 8.194580078125, "reward_std": 0.18777522444725037, "rewards/helpfulness_reward/mean": 0.6957988739013672, "rewards/helpfulness_reward/std": 0.5907677412033081, "rewards/safety_reward/mean": 8.194580078125, "rewards/safety_reward/std": 1.3817458152770996, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.078125, "completions/mean_terminated_length": 54.078125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.2577940791197275, "frac_reward_zero_std": 0.0625, "grad_norm": 0.37911880016326904, "kl": 3.59765625, "learning_rate": 5e-05, "loss": 0.0421, "num_tokens": 10077679.0, "reward": 8.523193359375, "reward_std": 0.30149024724960327, "rewards/helpfulness_reward/mean": 1.017918586730957, "rewards/helpfulness_reward/std": 0.665194034576416, "rewards/safety_reward/mean": 8.523193359375, "rewards/safety_reward/std": 1.0282387733459473, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 54.0390625, "completions/mean_terminated_length": 54.0390625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.2581433935900795, "frac_reward_zero_std": 0.0, "grad_norm": 0.401244580745697, "kl": 3.826171875, "learning_rate": 5e-05, "loss": 0.0489, "num_tokens": 10090004.0, "reward": 8.436561584472656, "reward_std": 0.4683859348297119, "rewards/helpfulness_reward/mean": 0.910919189453125, "rewards/helpfulness_reward/std": 0.522396445274353, "rewards/safety_reward/mean": 8.436561584472656, "rewards/safety_reward/std": 2.1858341693878174, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 54.3125, "completions/mean_terminated_length": 54.3125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.2584927080604314, "frac_reward_zero_std": 0.0, "grad_norm": 0.4146658182144165, "kl": 3.771484375, "learning_rate": 5e-05, "loss": 0.0497, "num_tokens": 10102388.0, "reward": 8.65966796875, "reward_std": 0.2899932563304901, "rewards/helpfulness_reward/mean": 0.925912618637085, "rewards/helpfulness_reward/std": 0.598088800907135, "rewards/safety_reward/mean": 8.65966796875, "rewards/safety_reward/std": 1.3985778093338013, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.2109375, "completions/mean_terminated_length": 54.2109375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.2588420225307833, "frac_reward_zero_std": 0.0, "grad_norm": 0.33197322487831116, "kl": 3.611328125, "learning_rate": 5e-05, "loss": 0.0418, "num_tokens": 10113655.0, "reward": 8.721923828125, "reward_std": 0.2252604365348816, "rewards/helpfulness_reward/mean": 1.224416732788086, "rewards/helpfulness_reward/std": 0.6746127605438232, "rewards/safety_reward/mean": 8.721923828125, "rewards/safety_reward/std": 1.1933189630508423, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.9765625, "completions/mean_terminated_length": 53.9765625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.2591913370011353, "frac_reward_zero_std": 0.0, "grad_norm": 0.3509230315685272, "kl": 3.513671875, "learning_rate": 5e-05, "loss": 0.0344, "num_tokens": 10126276.0, "reward": 8.842529296875, "reward_std": 0.20206740498542786, "rewards/helpfulness_reward/mean": 1.1495060920715332, "rewards/helpfulness_reward/std": 0.8658758997917175, "rewards/safety_reward/mean": 8.842529296875, "rewards/safety_reward/std": 0.9549346566200256, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 54.28125, "completions/mean_terminated_length": 54.28125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.2595406514714872, "frac_reward_zero_std": 0.0, "grad_norm": 0.3145197331905365, "kl": 3.6328125, "learning_rate": 5e-05, "loss": 0.048, "num_tokens": 10140264.0, "reward": 8.6025390625, "reward_std": 0.20747341215610504, "rewards/helpfulness_reward/mean": 1.3397369384765625, "rewards/helpfulness_reward/std": 0.9017218947410583, "rewards/safety_reward/mean": 8.6025390625, "rewards/safety_reward/std": 0.9679355621337891, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 53.578125, "completions/mean_terminated_length": 53.578125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.25988996594183916, "frac_reward_zero_std": 0.0, "grad_norm": 0.33380159735679626, "kl": 3.701171875, "learning_rate": 5e-05, "loss": 0.0353, "num_tokens": 10151426.0, "reward": 8.389404296875, "reward_std": 0.2734212577342987, "rewards/helpfulness_reward/mean": 0.7209620475769043, "rewards/helpfulness_reward/std": 0.8293547034263611, "rewards/safety_reward/mean": 8.389404296875, "rewards/safety_reward/std": 1.6187697649002075, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.2578125, "completions/mean_terminated_length": 54.2578125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.2602392804121911, "frac_reward_zero_std": 0.0, "grad_norm": 0.3514218032360077, "kl": 3.65234375, "learning_rate": 5e-05, "loss": 0.0372, "num_tokens": 10164339.0, "reward": 8.46630859375, "reward_std": 0.21477869153022766, "rewards/helpfulness_reward/mean": 0.8361530303955078, "rewards/helpfulness_reward/std": 0.7743709087371826, "rewards/safety_reward/mean": 8.46630859375, "rewards/safety_reward/std": 1.4842647314071655, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 51.4453125, "completions/mean_terminated_length": 51.4453125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.260588594882543, "frac_reward_zero_std": 0.0, "grad_norm": 0.6873931288719177, "kl": 3.640625, "learning_rate": 5e-05, "loss": 0.0446, "num_tokens": 10180116.0, "reward": 7.894097328186035, "reward_std": 0.3821386694908142, "rewards/helpfulness_reward/mean": 0.7718429565429688, "rewards/helpfulness_reward/std": 1.0339504480361938, "rewards/safety_reward/mean": 7.894097328186035, "rewards/safety_reward/std": 2.810279369354248, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.3515625, "completions/mean_terminated_length": 54.3515625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.26093790935289496, "frac_reward_zero_std": 0.0, "grad_norm": 0.3571106791496277, "kl": 4.00390625, "learning_rate": 5e-05, "loss": 0.0448, "num_tokens": 10191617.0, "reward": 8.25732421875, "reward_std": 0.1813218891620636, "rewards/helpfulness_reward/mean": 0.8772134780883789, "rewards/helpfulness_reward/std": 0.6984086036682129, "rewards/safety_reward/mean": 8.25732421875, "rewards/safety_reward/std": 0.9688054323196411, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 54.0390625, "completions/mean_terminated_length": 54.0390625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.2612872238232469, "frac_reward_zero_std": 0.0, "grad_norm": 0.24795837700366974, "kl": 3.646484375, "learning_rate": 5e-05, "loss": 0.0388, "num_tokens": 10202798.0, "reward": 8.919189453125, "reward_std": 0.21644119918346405, "rewards/helpfulness_reward/mean": 0.8765463829040527, "rewards/helpfulness_reward/std": 0.7535122036933899, "rewards/safety_reward/mean": 8.919189453125, "rewards/safety_reward/std": 1.152519941329956, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.0703125, "completions/mean_terminated_length": 54.0703125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.2616365382935988, "frac_reward_zero_std": 0.0, "grad_norm": 0.3650604784488678, "kl": 3.7421875, "learning_rate": 5e-05, "loss": 0.0395, "num_tokens": 10214519.0, "reward": 8.390380859375, "reward_std": 0.2459801435470581, "rewards/helpfulness_reward/mean": 0.7472189664840698, "rewards/helpfulness_reward/std": 0.9439892768859863, "rewards/safety_reward/mean": 8.390380859375, "rewards/safety_reward/std": 1.4838989973068237, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.4921875, "completions/mean_terminated_length": 54.4921875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.26198585276395076, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5167976021766663, "kl": 3.865234375, "learning_rate": 5e-05, "loss": 0.0568, "num_tokens": 10224926.0, "reward": 9.190673828125, "reward_std": 0.23768918216228485, "rewards/helpfulness_reward/mean": 1.2381820678710938, "rewards/helpfulness_reward/std": 0.6591693162918091, "rewards/safety_reward/mean": 9.190673828125, "rewards/safety_reward/std": 1.002632737159729, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 54.15625, "completions/mean_terminated_length": 54.15625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.26233516723430267, "frac_reward_zero_std": 0.0, "grad_norm": 0.727927565574646, "kl": 3.8984375, "learning_rate": 5e-05, "loss": 0.0434, "num_tokens": 10235722.0, "reward": 8.606689453125, "reward_std": 0.21576690673828125, "rewards/helpfulness_reward/mean": 1.0277252197265625, "rewards/helpfulness_reward/std": 0.7336716055870056, "rewards/safety_reward/mean": 8.606689453125, "rewards/safety_reward/std": 1.2873094081878662, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.203125, "completions/mean_terminated_length": 54.203125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.26268448170465464, "frac_reward_zero_std": 0.0, "grad_norm": 0.4156869351863861, "kl": 3.912109375, "learning_rate": 5e-05, "loss": 0.0458, "num_tokens": 10248004.0, "reward": 8.899169921875, "reward_std": 0.18995621800422668, "rewards/helpfulness_reward/mean": 1.1413822174072266, "rewards/helpfulness_reward/std": 0.685822069644928, "rewards/safety_reward/mean": 8.899169921875, "rewards/safety_reward/std": 1.1950470209121704, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.1171875, "completions/mean_terminated_length": 54.1171875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.26303379617500655, "frac_reward_zero_std": 0.0, "grad_norm": 1.5657556056976318, "kl": 4.552734375, "learning_rate": 5e-05, "loss": 0.0415, "num_tokens": 10259115.0, "reward": 8.1474609375, "reward_std": 0.161011204123497, "rewards/helpfulness_reward/mean": 1.1397209167480469, "rewards/helpfulness_reward/std": 0.6339350938796997, "rewards/safety_reward/mean": 8.1474609375, "rewards/safety_reward/std": 1.2938003540039062, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 54.578125, "completions/mean_terminated_length": 54.578125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.26338311064535846, "frac_reward_zero_std": 0.0, "grad_norm": 0.35388273000717163, "kl": 3.798828125, "learning_rate": 5e-05, "loss": 0.0564, "num_tokens": 10271853.0, "reward": 8.367919921875, "reward_std": 0.2176991105079651, "rewards/helpfulness_reward/mean": 1.1194705963134766, "rewards/helpfulness_reward/std": 0.8053539395332336, "rewards/safety_reward/mean": 8.367919921875, "rewards/safety_reward/std": 1.3930301666259766, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 54.0859375, "completions/mean_terminated_length": 54.0859375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.26373242511571043, "frac_reward_zero_std": 0.0, "grad_norm": 0.3683081865310669, "kl": 3.759765625, "learning_rate": 5e-05, "loss": 0.0377, "num_tokens": 10285848.0, "reward": 7.97216796875, "reward_std": 0.3995438516139984, "rewards/helpfulness_reward/mean": 0.8063666224479675, "rewards/helpfulness_reward/std": 0.6201739311218262, "rewards/safety_reward/mean": 7.97216796875, "rewards/safety_reward/std": 1.4083739519119263, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.26408173958606235, "frac_reward_zero_std": 0.0625, "grad_norm": 0.30892816185951233, "kl": 3.76953125, "learning_rate": 5e-05, "loss": 0.035, "num_tokens": 10296680.0, "reward": 8.706298828125, "reward_std": 0.16289223730564117, "rewards/helpfulness_reward/mean": 0.8899745941162109, "rewards/helpfulness_reward/std": 0.6025173664093018, "rewards/safety_reward/mean": 8.706298828125, "rewards/safety_reward/std": 1.0525336265563965, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.03125, "completions/mean_terminated_length": 54.03125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.26443105405641426, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3346485197544098, "kl": 3.7265625, "learning_rate": 5e-05, "loss": 0.0391, "num_tokens": 10308084.0, "reward": 8.397705078125, "reward_std": 0.22639721632003784, "rewards/helpfulness_reward/mean": 1.2034835815429688, "rewards/helpfulness_reward/std": 0.6388058662414551, "rewards/safety_reward/mean": 8.397705078125, "rewards/safety_reward/std": 1.265417456626892, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 51.265625, "completions/mean_terminated_length": 51.265625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.26478036852676623, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5901468396186829, "kl": 4.14453125, "learning_rate": 5e-05, "loss": 0.0617, "num_tokens": 10319558.0, "reward": 8.262451171875, "reward_std": 0.36690062284469604, "rewards/helpfulness_reward/mean": 1.0060871839523315, "rewards/helpfulness_reward/std": 0.579472005367279, "rewards/safety_reward/mean": 8.262451171875, "rewards/safety_reward/std": 1.9962222576141357, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 53.8203125, "completions/mean_terminated_length": 53.8203125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.26512968299711814, "frac_reward_zero_std": 0.0, "grad_norm": 0.3589646816253662, "kl": 3.8203125, "learning_rate": 5e-05, "loss": 0.034, "num_tokens": 10330831.0, "reward": 9.06201171875, "reward_std": 0.32492974400520325, "rewards/helpfulness_reward/mean": 1.283233642578125, "rewards/helpfulness_reward/std": 0.6043639779090881, "rewards/safety_reward/mean": 9.06201171875, "rewards/safety_reward/std": 1.3581870794296265, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.1015625, "completions/mean_terminated_length": 54.1015625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.2654789974674701, "frac_reward_zero_std": 0.0, "grad_norm": 0.3280688524246216, "kl": 3.6015625, "learning_rate": 5e-05, "loss": 0.0352, "num_tokens": 10341876.0, "reward": 8.94140625, "reward_std": 0.22901500761508942, "rewards/helpfulness_reward/mean": 1.1137809753417969, "rewards/helpfulness_reward/std": 0.6429377198219299, "rewards/safety_reward/mean": 8.94140625, "rewards/safety_reward/std": 0.9491565227508545, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 54.234375, "completions/mean_terminated_length": 54.234375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.265828311937822, "frac_reward_zero_std": 0.0, "grad_norm": 0.3633611798286438, "kl": 3.791015625, "learning_rate": 5e-05, "loss": 0.0441, "num_tokens": 10352546.0, "reward": 8.305419921875, "reward_std": 0.35451269149780273, "rewards/helpfulness_reward/mean": 0.8445472717285156, "rewards/helpfulness_reward/std": 0.8377999663352966, "rewards/safety_reward/mean": 8.305419921875, "rewards/safety_reward/std": 1.21534264087677, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 54.2578125, "completions/mean_terminated_length": 54.2578125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.26617762640817394, "frac_reward_zero_std": 0.0, "grad_norm": 0.33714357018470764, "kl": 3.689453125, "learning_rate": 5e-05, "loss": 0.0477, "num_tokens": 10363651.0, "reward": 8.4716796875, "reward_std": 0.26675862073898315, "rewards/helpfulness_reward/mean": 1.0965232849121094, "rewards/helpfulness_reward/std": 0.690579891204834, "rewards/safety_reward/mean": 8.4716796875, "rewards/safety_reward/std": 0.7447469830513, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 54.1875, "completions/mean_terminated_length": 54.1875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.2665269408785259, "frac_reward_zero_std": 0.0625, "grad_norm": 0.4010772407054901, "kl": 3.9921875, "learning_rate": 5e-05, "loss": 0.046, "num_tokens": 10374227.0, "reward": 9.109619140625, "reward_std": 0.2780477702617645, "rewards/helpfulness_reward/mean": 0.9473648071289062, "rewards/helpfulness_reward/std": 0.698932945728302, "rewards/safety_reward/mean": 9.109619140625, "rewards/safety_reward/std": 0.8827778697013855, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.671875, "completions/mean_terminated_length": 54.671875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.2668762553488778, "frac_reward_zero_std": 0.0, "grad_norm": 0.446658730506897, "kl": 3.74609375, "learning_rate": 5e-05, "loss": 0.0522, "num_tokens": 10385257.0, "reward": 8.4375, "reward_std": 0.41391047835350037, "rewards/helpfulness_reward/mean": 0.9465112686157227, "rewards/helpfulness_reward/std": 0.8890675902366638, "rewards/safety_reward/mean": 8.4375, "rewards/safety_reward/std": 1.2661603689193726, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.0859375, "completions/mean_terminated_length": 54.0859375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.26722556981922974, "frac_reward_zero_std": 0.0625, "grad_norm": 0.4043830633163452, "kl": 3.591796875, "learning_rate": 5e-05, "loss": 0.0417, "num_tokens": 10397772.0, "reward": 8.172119140625, "reward_std": 0.27103495597839355, "rewards/helpfulness_reward/mean": 0.6204404830932617, "rewards/helpfulness_reward/std": 1.0979095697402954, "rewards/safety_reward/mean": 8.172119140625, "rewards/safety_reward/std": 1.2289066314697266, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.5078125, "completions/mean_terminated_length": 54.5078125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.2675748842895817, "frac_reward_zero_std": 0.0, "grad_norm": 0.4000517427921295, "kl": 3.884765625, "learning_rate": 5e-05, "loss": 0.0564, "num_tokens": 10409469.0, "reward": 8.48828125, "reward_std": 0.22820693254470825, "rewards/helpfulness_reward/mean": 0.9651603698730469, "rewards/helpfulness_reward/std": 0.8219019174575806, "rewards/safety_reward/mean": 8.48828125, "rewards/safety_reward/std": 1.4056620597839355, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 54.3125, "completions/mean_terminated_length": 54.3125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.2679241987599336, "frac_reward_zero_std": 0.0, "grad_norm": 0.2888443171977997, "kl": 3.76171875, "learning_rate": 5e-05, "loss": 0.0428, "num_tokens": 10420709.0, "reward": 9.064208984375, "reward_std": 0.2758610248565674, "rewards/helpfulness_reward/mean": 1.2997169494628906, "rewards/helpfulness_reward/std": 0.5759159326553345, "rewards/safety_reward/mean": 9.064208984375, "rewards/safety_reward/std": 1.011904001235962, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.09375, "completions/mean_terminated_length": 54.09375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.2682735132302856, "frac_reward_zero_std": 0.0, "grad_norm": 0.3545283079147339, "kl": 3.79296875, "learning_rate": 5e-05, "loss": 0.0436, "num_tokens": 10433777.0, "reward": 8.83056640625, "reward_std": 0.31177201867103577, "rewards/helpfulness_reward/mean": 1.2197990417480469, "rewards/helpfulness_reward/std": 0.7307432889938354, "rewards/safety_reward/mean": 8.83056640625, "rewards/safety_reward/std": 1.125755786895752, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 54.375, "completions/mean_terminated_length": 54.375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.2686228277006375, "frac_reward_zero_std": 0.0, "grad_norm": 0.3799014687538147, "kl": 3.720703125, "learning_rate": 5e-05, "loss": 0.0476, "num_tokens": 10444569.0, "reward": 8.98583984375, "reward_std": 0.2322903275489807, "rewards/helpfulness_reward/mean": 1.0716137886047363, "rewards/helpfulness_reward/std": 0.6499068140983582, "rewards/safety_reward/mean": 8.98583984375, "rewards/safety_reward/std": 0.8923240900039673, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 54.4140625, "completions/mean_terminated_length": 54.4140625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.2689721421709894, "frac_reward_zero_std": 0.0, "grad_norm": 0.33080968260765076, "kl": 3.7265625, "learning_rate": 5e-05, "loss": 0.0442, "num_tokens": 10456254.0, "reward": 9.162109375, "reward_std": 0.2760905623435974, "rewards/helpfulness_reward/mean": 0.9354243278503418, "rewards/helpfulness_reward/std": 0.508372962474823, "rewards/safety_reward/mean": 9.162109375, "rewards/safety_reward/std": 0.9084131717681885, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 54.3828125, "completions/mean_terminated_length": 54.3828125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.2693214566413414, "frac_reward_zero_std": 0.0, "grad_norm": 0.37855833768844604, "kl": 3.869140625, "learning_rate": 5e-05, "loss": 0.0523, "num_tokens": 10467831.0, "reward": 8.7626953125, "reward_std": 0.3113841414451599, "rewards/helpfulness_reward/mean": 1.1993160247802734, "rewards/helpfulness_reward/std": 0.6170637607574463, "rewards/safety_reward/mean": 8.7626953125, "rewards/safety_reward/std": 1.0339229106903076, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 54.296875, "completions/mean_terminated_length": 54.296875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.2696707711116933, "frac_reward_zero_std": 0.0, "grad_norm": 0.3569321036338806, "kl": 3.412109375, "learning_rate": 5e-05, "loss": 0.0463, "num_tokens": 10479789.0, "reward": 8.5546875, "reward_std": 0.20902852714061737, "rewards/helpfulness_reward/mean": 0.7920928001403809, "rewards/helpfulness_reward/std": 0.8645932674407959, "rewards/safety_reward/mean": 8.5546875, "rewards/safety_reward/std": 1.1649062633514404, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.21875, "completions/mean_terminated_length": 54.21875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.2700200855820452, "frac_reward_zero_std": 0.0, "grad_norm": 0.39823678135871887, "kl": 3.8984375, "learning_rate": 5e-05, "loss": 0.0447, "num_tokens": 10491337.0, "reward": 8.8671875, "reward_std": 0.2224239856004715, "rewards/helpfulness_reward/mean": 0.8347504138946533, "rewards/helpfulness_reward/std": 0.7416970729827881, "rewards/safety_reward/mean": 8.8671875, "rewards/safety_reward/std": 1.1469587087631226, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 57.4453125, "completions/mean_terminated_length": 57.4453125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.2703694000523972, "frac_reward_zero_std": 0.0, "grad_norm": 0.4598725736141205, "kl": 3.83203125, "learning_rate": 5e-05, "loss": 0.0511, "num_tokens": 10505106.0, "reward": 7.99078369140625, "reward_std": 0.29429513216018677, "rewards/helpfulness_reward/mean": 0.6312465667724609, "rewards/helpfulness_reward/std": 1.038432002067566, "rewards/safety_reward/mean": 7.99078369140625, "rewards/safety_reward/std": 2.18384051322937, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 54.578125, "completions/mean_terminated_length": 54.578125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.2707187145227491, "frac_reward_zero_std": 0.0, "grad_norm": 0.3778989017009735, "kl": 3.673828125, "learning_rate": 5e-05, "loss": 0.0589, "num_tokens": 10516084.0, "reward": 8.78564453125, "reward_std": 0.2437894344329834, "rewards/helpfulness_reward/mean": 1.2048521041870117, "rewards/helpfulness_reward/std": 0.8128440976142883, "rewards/safety_reward/mean": 8.78564453125, "rewards/safety_reward/std": 0.9579361081123352, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 54.4765625, "completions/mean_terminated_length": 54.4765625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.27106802899310106, "frac_reward_zero_std": 0.0, "grad_norm": 0.49446433782577515, "kl": 4.00390625, "learning_rate": 5e-05, "loss": 0.0544, "num_tokens": 10529393.0, "reward": 8.686233520507812, "reward_std": 0.3097332715988159, "rewards/helpfulness_reward/mean": 1.4828033447265625, "rewards/helpfulness_reward/std": 0.6488044261932373, "rewards/safety_reward/mean": 8.686233520507812, "rewards/safety_reward/std": 1.453997015953064, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.28125, "completions/mean_terminated_length": 54.28125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.271417343463453, "frac_reward_zero_std": 0.0, "grad_norm": 0.32842567563056946, "kl": 3.943359375, "learning_rate": 5e-05, "loss": 0.0382, "num_tokens": 10540021.0, "reward": 8.6484375, "reward_std": 0.27074000239372253, "rewards/helpfulness_reward/mean": 1.168840765953064, "rewards/helpfulness_reward/std": 0.8633673787117004, "rewards/safety_reward/mean": 8.6484375, "rewards/safety_reward/std": 1.2949414253234863, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 54.484375, "completions/mean_terminated_length": 54.484375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.2717666579338049, "frac_reward_zero_std": 0.0, "grad_norm": 0.29844561219215393, "kl": 3.828125, "learning_rate": 5e-05, "loss": 0.0562, "num_tokens": 10551323.0, "reward": 8.8740234375, "reward_std": 0.18930889666080475, "rewards/helpfulness_reward/mean": 1.1225814819335938, "rewards/helpfulness_reward/std": 0.65639728307724, "rewards/safety_reward/mean": 8.8740234375, "rewards/safety_reward/std": 1.4060719013214111, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 54.71875, "completions/mean_terminated_length": 54.71875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.27211597240415686, "frac_reward_zero_std": 0.0, "grad_norm": 0.3000034689903259, "kl": 3.869140625, "learning_rate": 5e-05, "loss": 0.0518, "num_tokens": 10562327.0, "reward": 8.5478515625, "reward_std": 0.33622199296951294, "rewards/helpfulness_reward/mean": 1.1946649551391602, "rewards/helpfulness_reward/std": 0.7719984650611877, "rewards/safety_reward/mean": 8.5478515625, "rewards/safety_reward/std": 1.1720131635665894, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.2734375, "completions/mean_terminated_length": 54.2734375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.27246528687450877, "frac_reward_zero_std": 0.0, "grad_norm": 0.4474879801273346, "kl": 3.701171875, "learning_rate": 5e-05, "loss": 0.0412, "num_tokens": 10572826.0, "reward": 8.371337890625, "reward_std": 0.1998441219329834, "rewards/helpfulness_reward/mean": 1.036725401878357, "rewards/helpfulness_reward/std": 0.8749814629554749, "rewards/safety_reward/mean": 8.371337890625, "rewards/safety_reward/std": 1.0712040662765503, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 54.2109375, "completions/mean_terminated_length": 54.2109375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.2728146013448607, "frac_reward_zero_std": 0.0, "grad_norm": 0.5589499473571777, "kl": 3.775390625, "learning_rate": 5e-05, "loss": 0.0408, "num_tokens": 10584061.0, "reward": 9.02880859375, "reward_std": 0.18511280417442322, "rewards/helpfulness_reward/mean": 0.9382679462432861, "rewards/helpfulness_reward/std": 0.708937406539917, "rewards/safety_reward/mean": 9.02880859375, "rewards/safety_reward/std": 1.06454336643219, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 55.0859375, "completions/mean_terminated_length": 55.0859375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.27316391581521265, "frac_reward_zero_std": 0.0, "grad_norm": 0.6805875301361084, "kl": 4.203125, "learning_rate": 5e-05, "loss": 0.0667, "num_tokens": 10595520.0, "reward": 8.5782470703125, "reward_std": 0.3344140946865082, "rewards/helpfulness_reward/mean": 1.2816224098205566, "rewards/helpfulness_reward/std": 0.9250046014785767, "rewards/safety_reward/mean": 8.5782470703125, "rewards/safety_reward/std": 1.2424715757369995, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 54.9921875, "completions/mean_terminated_length": 54.9921875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.27351323028556457, "frac_reward_zero_std": 0.0625, "grad_norm": 0.49839454889297485, "kl": 3.671875, "learning_rate": 5e-05, "loss": 0.0753, "num_tokens": 10606159.0, "reward": 8.516845703125, "reward_std": 0.3978712558746338, "rewards/helpfulness_reward/mean": 1.4210357666015625, "rewards/helpfulness_reward/std": 0.663663923740387, "rewards/safety_reward/mean": 8.516845703125, "rewards/safety_reward/std": 1.1357067823410034, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.875, "completions/mean_terminated_length": 53.875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.27386254475591654, "frac_reward_zero_std": 0.0, "grad_norm": 1.9692625999450684, "kl": 4.673828125, "learning_rate": 5e-05, "loss": 0.0432, "num_tokens": 10617127.0, "reward": 8.74951171875, "reward_std": 0.23023448884487152, "rewards/helpfulness_reward/mean": 1.0737230777740479, "rewards/helpfulness_reward/std": 0.7725275754928589, "rewards/safety_reward/mean": 8.74951171875, "rewards/safety_reward/std": 0.927961528301239, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.27421185922626845, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3281629979610443, "kl": 3.912109375, "learning_rate": 5e-05, "loss": 0.0431, "num_tokens": 10629911.0, "reward": 8.697509765625, "reward_std": 0.28961580991744995, "rewards/helpfulness_reward/mean": 1.142477035522461, "rewards/helpfulness_reward/std": 0.758510947227478, "rewards/safety_reward/mean": 8.697509765625, "rewards/safety_reward/std": 1.5289455652236938, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 54.1953125, "completions/mean_terminated_length": 54.1953125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.27456117369662036, "frac_reward_zero_std": 0.0, "grad_norm": 0.36297160387039185, "kl": 3.71484375, "learning_rate": 5e-05, "loss": 0.0455, "num_tokens": 10641624.0, "reward": 8.530029296875, "reward_std": 0.26890379190444946, "rewards/helpfulness_reward/mean": 0.7204494476318359, "rewards/helpfulness_reward/std": 0.7650452852249146, "rewards/safety_reward/mean": 8.530029296875, "rewards/safety_reward/std": 1.0002260208129883, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.2890625, "completions/mean_terminated_length": 54.2890625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.27491048816697233, "frac_reward_zero_std": 0.0, "grad_norm": 0.5506665110588074, "kl": 4.0, "learning_rate": 5e-05, "loss": 0.0432, "num_tokens": 10655197.0, "reward": 8.6695556640625, "reward_std": 0.22829852998256683, "rewards/helpfulness_reward/mean": 0.7374181747436523, "rewards/helpfulness_reward/std": 1.1229844093322754, "rewards/safety_reward/mean": 8.6695556640625, "rewards/safety_reward/std": 1.6022684574127197, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 54.2890625, "completions/mean_terminated_length": 54.2890625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.27525980263732425, "frac_reward_zero_std": 0.0, "grad_norm": 0.4689847528934479, "kl": 3.875, "learning_rate": 5e-05, "loss": 0.0415, "num_tokens": 10666858.0, "reward": 8.1358642578125, "reward_std": 0.36451613903045654, "rewards/helpfulness_reward/mean": 0.8525928854942322, "rewards/helpfulness_reward/std": 0.7776146531105042, "rewards/safety_reward/mean": 8.1358642578125, "rewards/safety_reward/std": 1.5311150550842285, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 54.0859375, "completions/mean_terminated_length": 54.0859375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.27560911710767616, "frac_reward_zero_std": 0.0, "grad_norm": 0.5097286701202393, "kl": 3.814453125, "learning_rate": 5e-05, "loss": 0.04, "num_tokens": 10678357.0, "reward": 8.306640625, "reward_std": 0.2611190378665924, "rewards/helpfulness_reward/mean": 0.7035238742828369, "rewards/helpfulness_reward/std": 0.6234643459320068, "rewards/safety_reward/mean": 8.306640625, "rewards/safety_reward/std": 0.8710342049598694, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 54.0078125, "completions/mean_terminated_length": 54.0078125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.27595843157802813, "frac_reward_zero_std": 0.0, "grad_norm": 0.4722411334514618, "kl": 3.935546875, "learning_rate": 5e-05, "loss": 0.0396, "num_tokens": 10688934.0, "reward": 8.715087890625, "reward_std": 0.38098329305648804, "rewards/helpfulness_reward/mean": 0.8821613788604736, "rewards/helpfulness_reward/std": 0.7474682331085205, "rewards/safety_reward/mean": 8.715087890625, "rewards/safety_reward/std": 1.0761386156082153, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 54.0546875, "completions/mean_terminated_length": 54.0546875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.27630774604838004, "frac_reward_zero_std": 0.0, "grad_norm": 0.864230215549469, "kl": 4.41015625, "learning_rate": 5e-05, "loss": 0.043, "num_tokens": 10699253.0, "reward": 7.709716796875, "reward_std": 0.4622354209423065, "rewards/helpfulness_reward/mean": 0.8740115165710449, "rewards/helpfulness_reward/std": 0.6987116932868958, "rewards/safety_reward/mean": 7.709716796875, "rewards/safety_reward/std": 1.114963412284851, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.276657060518732, "frac_reward_zero_std": 0.0, "grad_norm": 0.3841935098171234, "kl": 3.751953125, "learning_rate": 5e-05, "loss": 0.033, "num_tokens": 10709789.0, "reward": 8.381103515625, "reward_std": 0.3550853729248047, "rewards/helpfulness_reward/mean": 0.6808245182037354, "rewards/helpfulness_reward/std": 0.7101357579231262, "rewards/safety_reward/mean": 8.381103515625, "rewards/safety_reward/std": 1.17780601978302, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.03125, "completions/mean_terminated_length": 54.03125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.2770063749890839, "frac_reward_zero_std": 0.0, "grad_norm": 0.2702372372150421, "kl": 3.748046875, "learning_rate": 5e-05, "loss": 0.0371, "num_tokens": 10720385.0, "reward": 8.92041015625, "reward_std": 0.21543753147125244, "rewards/helpfulness_reward/mean": 0.9020347595214844, "rewards/helpfulness_reward/std": 0.8542143106460571, "rewards/safety_reward/mean": 8.92041015625, "rewards/safety_reward/std": 0.8529622554779053, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 53.7265625, "completions/mean_terminated_length": 53.7265625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.27735568945943584, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8408326506614685, "kl": 4.173828125, "learning_rate": 5e-05, "loss": 0.0394, "num_tokens": 10731502.0, "reward": 8.702880859375, "reward_std": 0.3891932964324951, "rewards/helpfulness_reward/mean": 0.9840049743652344, "rewards/helpfulness_reward/std": 0.8214701414108276, "rewards/safety_reward/mean": 8.702880859375, "rewards/safety_reward/std": 1.2083616256713867, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 54.0078125, "completions/mean_terminated_length": 54.0078125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.2777050039297878, "frac_reward_zero_std": 0.0, "grad_norm": 0.33985331654548645, "kl": 3.841796875, "learning_rate": 5e-05, "loss": 0.0407, "num_tokens": 10743543.0, "reward": 8.5908203125, "reward_std": 0.305833101272583, "rewards/helpfulness_reward/mean": 0.6642427444458008, "rewards/helpfulness_reward/std": 0.7524109482765198, "rewards/safety_reward/mean": 8.5908203125, "rewards/safety_reward/std": 1.1581964492797852, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 53.953125, "completions/mean_terminated_length": 53.953125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.2780543184001397, "frac_reward_zero_std": 0.0, "grad_norm": 0.2759024202823639, "kl": 3.916015625, "learning_rate": 5e-05, "loss": 0.0389, "num_tokens": 10753905.0, "reward": 9.21435546875, "reward_std": 0.3143521547317505, "rewards/helpfulness_reward/mean": 1.1403932571411133, "rewards/helpfulness_reward/std": 0.691608726978302, "rewards/safety_reward/mean": 9.21435546875, "rewards/safety_reward/std": 1.1039682626724243, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.921875, "completions/mean_terminated_length": 53.921875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.27840363287049164, "frac_reward_zero_std": 0.0, "grad_norm": 0.36783069372177124, "kl": 3.712890625, "learning_rate": 5e-05, "loss": 0.0355, "num_tokens": 10764671.0, "reward": 8.608154296875, "reward_std": 0.3101103901863098, "rewards/helpfulness_reward/mean": 1.1095161437988281, "rewards/helpfulness_reward/std": 0.6237231492996216, "rewards/safety_reward/mean": 8.608154296875, "rewards/safety_reward/std": 1.3395626544952393, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.921875, "completions/mean_terminated_length": 53.921875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.2787529473408436, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3683655560016632, "kl": 3.904296875, "learning_rate": 5e-05, "loss": 0.034, "num_tokens": 10775101.0, "reward": 8.64599609375, "reward_std": 0.2540706396102905, "rewards/helpfulness_reward/mean": 0.4467945098876953, "rewards/helpfulness_reward/std": 0.8488677740097046, "rewards/safety_reward/mean": 8.64599609375, "rewards/safety_reward/std": 1.363724946975708, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.1015625, "completions/mean_terminated_length": 54.1015625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.2791022618111955, "frac_reward_zero_std": 0.0, "grad_norm": 0.4089009761810303, "kl": 3.7109375, "learning_rate": 5e-05, "loss": 0.0416, "num_tokens": 10786362.0, "reward": 8.826416015625, "reward_std": 0.29098379611968994, "rewards/helpfulness_reward/mean": 0.8645715713500977, "rewards/helpfulness_reward/std": 0.719151496887207, "rewards/safety_reward/mean": 8.826416015625, "rewards/safety_reward/std": 1.232616662979126, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 53.8984375, "completions/mean_terminated_length": 53.8984375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.2794515762815475, "frac_reward_zero_std": 0.0, "grad_norm": 0.3867258131504059, "kl": 3.91796875, "learning_rate": 5e-05, "loss": 0.0381, "num_tokens": 10797325.0, "reward": 9.003173828125, "reward_std": 0.3087407052516937, "rewards/helpfulness_reward/mean": 1.1935746669769287, "rewards/helpfulness_reward/std": 0.8686316609382629, "rewards/safety_reward/mean": 9.003173828125, "rewards/safety_reward/std": 0.9904567003250122, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 53.7109375, "completions/mean_terminated_length": 53.7109375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.2798008907518994, "frac_reward_zero_std": 0.0, "grad_norm": 0.3263818025588989, "kl": 3.6875, "learning_rate": 5e-05, "loss": 0.0308, "num_tokens": 10808528.0, "reward": 8.978271484375, "reward_std": 0.2410348653793335, "rewards/helpfulness_reward/mean": 0.9708747863769531, "rewards/helpfulness_reward/std": 0.5127414464950562, "rewards/safety_reward/mean": 8.978271484375, "rewards/safety_reward/std": 1.1375175714492798, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 54.03125, "completions/mean_terminated_length": 54.03125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.2801502052222513, "frac_reward_zero_std": 0.0, "grad_norm": 0.2711983621120453, "kl": 3.734375, "learning_rate": 5e-05, "loss": 0.0377, "num_tokens": 10818996.0, "reward": 8.60986328125, "reward_std": 0.1990283727645874, "rewards/helpfulness_reward/mean": 0.8238677978515625, "rewards/helpfulness_reward/std": 0.3805685043334961, "rewards/safety_reward/mean": 8.60986328125, "rewards/safety_reward/std": 0.9679120779037476, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.1484375, "completions/mean_terminated_length": 54.1484375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.2804995196926033, "frac_reward_zero_std": 0.0, "grad_norm": 0.39311647415161133, "kl": 3.890625, "learning_rate": 5e-05, "loss": 0.0437, "num_tokens": 10829839.0, "reward": 8.766845703125, "reward_std": 0.3271827697753906, "rewards/helpfulness_reward/mean": 0.7534127235412598, "rewards/helpfulness_reward/std": 0.8436444401741028, "rewards/safety_reward/mean": 8.766845703125, "rewards/safety_reward/std": 1.1896989345550537, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 53.390625, "completions/mean_terminated_length": 53.390625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.2808488341629552, "frac_reward_zero_std": 0.0, "grad_norm": 0.3760198950767517, "kl": 3.66015625, "learning_rate": 5e-05, "loss": 0.0277, "num_tokens": 10842121.0, "reward": 7.8482666015625, "reward_std": 0.26002246141433716, "rewards/helpfulness_reward/mean": 0.9575324058532715, "rewards/helpfulness_reward/std": 0.8947235345840454, "rewards/safety_reward/mean": 7.8482666015625, "rewards/safety_reward/std": 2.4919657707214355, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 54.046875, "completions/mean_terminated_length": 54.046875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.2811981486333071, "frac_reward_zero_std": 0.0, "grad_norm": 0.29451844096183777, "kl": 3.986328125, "learning_rate": 5e-05, "loss": 0.0432, "num_tokens": 10853079.0, "reward": 8.733154296875, "reward_std": 0.2637569308280945, "rewards/helpfulness_reward/mean": 0.9472265243530273, "rewards/helpfulness_reward/std": 0.6427609920501709, "rewards/safety_reward/mean": 8.733154296875, "rewards/safety_reward/std": 0.9763670563697815, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 54.28125, "completions/mean_terminated_length": 54.28125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.2815474631036591, "frac_reward_zero_std": 0.0625, "grad_norm": 1.2936345338821411, "kl": 4.60546875, "learning_rate": 5e-05, "loss": 0.0578, "num_tokens": 10865395.0, "reward": 8.7681884765625, "reward_std": 0.32570311427116394, "rewards/helpfulness_reward/mean": 1.3078727722167969, "rewards/helpfulness_reward/std": 0.45813506841659546, "rewards/safety_reward/mean": 8.7681884765625, "rewards/safety_reward/std": 1.3481488227844238, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 53.921875, "completions/mean_terminated_length": 53.921875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.281896777574011, "frac_reward_zero_std": 0.0, "grad_norm": 0.38187897205352783, "kl": 3.970703125, "learning_rate": 5e-05, "loss": 0.0394, "num_tokens": 10877609.0, "reward": 8.418701171875, "reward_std": 0.3009186387062073, "rewards/helpfulness_reward/mean": 0.46215927600860596, "rewards/helpfulness_reward/std": 0.6030858755111694, "rewards/safety_reward/mean": 8.418701171875, "rewards/safety_reward/std": 1.3930141925811768, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 55.765625, "completions/mean_terminated_length": 55.765625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.28224609204436296, "frac_reward_zero_std": 0.0, "grad_norm": 0.3976971209049225, "kl": 3.583984375, "learning_rate": 5e-05, "loss": 0.0299, "num_tokens": 10890379.0, "reward": 8.063201904296875, "reward_std": 0.3212820589542389, "rewards/helpfulness_reward/mean": 1.0273818969726562, "rewards/helpfulness_reward/std": 0.723988950252533, "rewards/safety_reward/mean": 8.063201904296875, "rewards/safety_reward/std": 1.9548722505569458, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.1640625, "completions/mean_terminated_length": 54.1640625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.2825954065147149, "frac_reward_zero_std": 0.0, "grad_norm": 0.38754701614379883, "kl": 3.865234375, "learning_rate": 5e-05, "loss": 0.0441, "num_tokens": 10902768.0, "reward": 8.524658203125, "reward_std": 0.32943886518478394, "rewards/helpfulness_reward/mean": 0.9634552001953125, "rewards/helpfulness_reward/std": 0.700502336025238, "rewards/safety_reward/mean": 8.524658203125, "rewards/safety_reward/std": 1.1948375701904297, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.0546875, "completions/mean_terminated_length": 54.0546875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.2829447209850668, "frac_reward_zero_std": 0.0, "grad_norm": 0.3215387463569641, "kl": 3.671875, "learning_rate": 5e-05, "loss": 0.0386, "num_tokens": 10913519.0, "reward": 8.676025390625, "reward_std": 0.20273393392562866, "rewards/helpfulness_reward/mean": 0.8060569763183594, "rewards/helpfulness_reward/std": 0.6875918507575989, "rewards/safety_reward/mean": 8.676025390625, "rewards/safety_reward/std": 1.3481967449188232, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 54.2265625, "completions/mean_terminated_length": 54.2265625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.28329403545541876, "frac_reward_zero_std": 0.0, "grad_norm": 0.3517970144748688, "kl": 3.81640625, "learning_rate": 5e-05, "loss": 0.0456, "num_tokens": 10925876.0, "reward": 8.3900146484375, "reward_std": 0.2830692529678345, "rewards/helpfulness_reward/mean": 0.9198379516601562, "rewards/helpfulness_reward/std": 0.9524103403091431, "rewards/safety_reward/mean": 8.3900146484375, "rewards/safety_reward/std": 2.166529893875122, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 54.1796875, "completions/mean_terminated_length": 54.1796875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.28364334992577067, "frac_reward_zero_std": 0.0, "grad_norm": 0.26942935585975647, "kl": 3.80859375, "learning_rate": 5e-05, "loss": 0.0405, "num_tokens": 10937651.0, "reward": 8.3848876953125, "reward_std": 0.18291233479976654, "rewards/helpfulness_reward/mean": 0.9518470764160156, "rewards/helpfulness_reward/std": 0.9690670371055603, "rewards/safety_reward/mean": 8.3848876953125, "rewards/safety_reward/std": 1.6326935291290283, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.140625, "completions/mean_terminated_length": 54.140625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.2839926643961226, "frac_reward_zero_std": 0.0, "grad_norm": 0.35474589467048645, "kl": 3.650390625, "learning_rate": 5e-05, "loss": 0.0404, "num_tokens": 10948813.0, "reward": 8.762939453125, "reward_std": 0.2993135154247284, "rewards/helpfulness_reward/mean": 1.0738654136657715, "rewards/helpfulness_reward/std": 0.5557597875595093, "rewards/safety_reward/mean": 8.762939453125, "rewards/safety_reward/std": 1.045498013496399, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.2578125, "completions/mean_terminated_length": 54.2578125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.28434197886647455, "frac_reward_zero_std": 0.0, "grad_norm": 0.330410361289978, "kl": 3.826171875, "learning_rate": 5e-05, "loss": 0.0438, "num_tokens": 10960758.0, "reward": 8.5107421875, "reward_std": 0.18491330742835999, "rewards/helpfulness_reward/mean": 0.984046459197998, "rewards/helpfulness_reward/std": 0.6295894384384155, "rewards/safety_reward/mean": 8.5107421875, "rewards/safety_reward/std": 1.1687511205673218, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.1875, "completions/mean_terminated_length": 54.1875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.28469129333682647, "frac_reward_zero_std": 0.0, "grad_norm": 0.3477899432182312, "kl": 3.82421875, "learning_rate": 5e-05, "loss": 0.0383, "num_tokens": 10972678.0, "reward": 8.506103515625, "reward_std": 0.3112233281135559, "rewards/helpfulness_reward/mean": 0.8876914978027344, "rewards/helpfulness_reward/std": 0.7403368949890137, "rewards/safety_reward/mean": 8.506103515625, "rewards/safety_reward/std": 1.0233949422836304, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 53.9453125, "completions/mean_terminated_length": 53.9453125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.28504060780717844, "frac_reward_zero_std": 0.0, "grad_norm": 0.2728027403354645, "kl": 3.796875, "learning_rate": 5e-05, "loss": 0.038, "num_tokens": 10983351.0, "reward": 7.95068359375, "reward_std": 0.16761481761932373, "rewards/helpfulness_reward/mean": 0.7161846160888672, "rewards/helpfulness_reward/std": 0.788645327091217, "rewards/safety_reward/mean": 7.95068359375, "rewards/safety_reward/std": 0.9991970062255859, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 54.0078125, "completions/mean_terminated_length": 54.0078125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.28538992227753035, "frac_reward_zero_std": 0.0, "grad_norm": 0.36922210454940796, "kl": 3.572265625, "learning_rate": 5e-05, "loss": 0.0352, "num_tokens": 10994336.0, "reward": 8.98974609375, "reward_std": 0.2722548544406891, "rewards/helpfulness_reward/mean": 1.1595635414123535, "rewards/helpfulness_reward/std": 0.7594119906425476, "rewards/safety_reward/mean": 8.98974609375, "rewards/safety_reward/std": 1.239492654800415, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 53.9140625, "completions/mean_terminated_length": 53.9140625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.28573923674788226, "frac_reward_zero_std": 0.0, "grad_norm": 0.32003140449523926, "kl": 3.884765625, "learning_rate": 5e-05, "loss": 0.0366, "num_tokens": 11007181.0, "reward": 8.584716796875, "reward_std": 0.28331246972084045, "rewards/helpfulness_reward/mean": 0.8796415328979492, "rewards/helpfulness_reward/std": 0.6059038639068604, "rewards/safety_reward/mean": 8.584716796875, "rewards/safety_reward/std": 1.3142105340957642, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 94.0, "completions/max_terminated_length": 94.0, "completions/mean_length": 54.34375, "completions/mean_terminated_length": 54.34375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.28608855121823423, "frac_reward_zero_std": 0.0, "grad_norm": 0.4314948320388794, "kl": 3.99609375, "learning_rate": 5e-05, "loss": 0.0526, "num_tokens": 11017937.0, "reward": 8.762451171875, "reward_std": 0.23494291305541992, "rewards/helpfulness_reward/mean": 0.9226570129394531, "rewards/helpfulness_reward/std": 0.8824981451034546, "rewards/safety_reward/mean": 8.762451171875, "rewards/safety_reward/std": 1.1387794017791748, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.109375, "completions/mean_terminated_length": 54.109375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.28643786568858615, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6079363226890564, "kl": 4.10546875, "learning_rate": 5e-05, "loss": 0.0448, "num_tokens": 11029039.0, "reward": 8.913818359375, "reward_std": 0.1931808590888977, "rewards/helpfulness_reward/mean": 0.9079971313476562, "rewards/helpfulness_reward/std": 0.7497140765190125, "rewards/safety_reward/mean": 8.913818359375, "rewards/safety_reward/std": 1.2689250707626343, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.234375, "completions/mean_terminated_length": 54.234375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.28678718015893806, "frac_reward_zero_std": 0.125, "grad_norm": 0.4415058493614197, "kl": 4.2734375, "learning_rate": 5e-05, "loss": 0.0525, "num_tokens": 11040397.0, "reward": 8.75341796875, "reward_std": 0.18610000610351562, "rewards/helpfulness_reward/mean": 0.8147516250610352, "rewards/helpfulness_reward/std": 0.8305168151855469, "rewards/safety_reward/mean": 8.75341796875, "rewards/safety_reward/std": 1.6007821559906006, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 54.015625, "completions/mean_terminated_length": 54.015625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.28713649462929003, "frac_reward_zero_std": 0.0, "grad_norm": 0.2709997892379761, "kl": 3.765625, "learning_rate": 5e-05, "loss": 0.0377, "num_tokens": 11051719.0, "reward": 9.160888671875, "reward_std": 0.22123998403549194, "rewards/helpfulness_reward/mean": 0.9294548034667969, "rewards/helpfulness_reward/std": 0.7769691944122314, "rewards/safety_reward/mean": 9.160888671875, "rewards/safety_reward/std": 1.1795024871826172, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.265625, "completions/mean_terminated_length": 54.265625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.28748580909964194, "frac_reward_zero_std": 0.0, "grad_norm": 0.4774497449398041, "kl": 3.98828125, "learning_rate": 5e-05, "loss": 0.0459, "num_tokens": 11064921.0, "reward": 8.1270751953125, "reward_std": 0.2598326504230499, "rewards/helpfulness_reward/mean": 0.5829305648803711, "rewards/helpfulness_reward/std": 0.8892630934715271, "rewards/safety_reward/mean": 8.1270751953125, "rewards/safety_reward/std": 2.3210177421569824, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 54.3046875, "completions/mean_terminated_length": 54.3046875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.2878351235699939, "frac_reward_zero_std": 0.125, "grad_norm": 0.27654704451560974, "kl": 3.642578125, "learning_rate": 5e-05, "loss": 0.0458, "num_tokens": 11076152.0, "reward": 8.718505859375, "reward_std": 0.2279953956604004, "rewards/helpfulness_reward/mean": 0.725799560546875, "rewards/helpfulness_reward/std": 0.9217761754989624, "rewards/safety_reward/mean": 8.718505859375, "rewards/safety_reward/std": 1.503520131111145, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.0703125, "completions/mean_terminated_length": 54.0703125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.2881844380403458, "frac_reward_zero_std": 0.0, "grad_norm": 0.38390398025512695, "kl": 3.7109375, "learning_rate": 5e-05, "loss": 0.0349, "num_tokens": 11087609.0, "reward": 8.486572265625, "reward_std": 0.25410690903663635, "rewards/helpfulness_reward/mean": 1.136260986328125, "rewards/helpfulness_reward/std": 0.9388827681541443, "rewards/safety_reward/mean": 8.486572265625, "rewards/safety_reward/std": 1.6044069528579712, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 54.3359375, "completions/mean_terminated_length": 54.3359375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.28853375251069774, "frac_reward_zero_std": 0.0, "grad_norm": 0.38675814867019653, "kl": 3.625, "learning_rate": 5e-05, "loss": 0.0434, "num_tokens": 11098652.0, "reward": 8.9169921875, "reward_std": 0.16797642409801483, "rewards/helpfulness_reward/mean": 0.9644122123718262, "rewards/helpfulness_reward/std": 0.8182457685470581, "rewards/safety_reward/mean": 8.9169921875, "rewards/safety_reward/std": 0.8678741455078125, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 54.296875, "completions/mean_terminated_length": 54.296875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.2888830669810497, "frac_reward_zero_std": 0.0, "grad_norm": 0.3396473526954651, "kl": 3.9375, "learning_rate": 5e-05, "loss": 0.0434, "num_tokens": 11110258.0, "reward": 8.60302734375, "reward_std": 0.20947784185409546, "rewards/helpfulness_reward/mean": 0.8926181793212891, "rewards/helpfulness_reward/std": 0.677040159702301, "rewards/safety_reward/mean": 8.60302734375, "rewards/safety_reward/std": 0.8844258189201355, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 54.1875, "completions/mean_terminated_length": 54.1875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.2892323814514016, "frac_reward_zero_std": 0.0, "grad_norm": 0.3950490653514862, "kl": 3.947265625, "learning_rate": 5e-05, "loss": 0.0433, "num_tokens": 11121362.0, "reward": 8.470703125, "reward_std": 0.34545499086380005, "rewards/helpfulness_reward/mean": 1.1362533569335938, "rewards/helpfulness_reward/std": 0.5726300477981567, "rewards/safety_reward/mean": 8.470703125, "rewards/safety_reward/std": 0.9981663227081299, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.203125, "completions/mean_terminated_length": 54.203125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.28958169592175353, "frac_reward_zero_std": 0.0, "grad_norm": 0.3790760934352875, "kl": 4.044921875, "learning_rate": 5e-05, "loss": 0.0513, "num_tokens": 11132380.0, "reward": 8.56103515625, "reward_std": 0.29506993293762207, "rewards/helpfulness_reward/mean": 0.9313201904296875, "rewards/helpfulness_reward/std": 0.9018480181694031, "rewards/safety_reward/mean": 8.56103515625, "rewards/safety_reward/std": 1.531477689743042, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 52.2578125, "completions/mean_terminated_length": 52.2578125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.2899310103921055, "frac_reward_zero_std": 0.0, "grad_norm": 0.41581809520721436, "kl": 3.8046875, "learning_rate": 5e-05, "loss": 0.0401, "num_tokens": 11143581.0, "reward": 8.27850341796875, "reward_std": 0.1694469153881073, "rewards/helpfulness_reward/mean": 1.0693882703781128, "rewards/helpfulness_reward/std": 0.9285973906517029, "rewards/safety_reward/mean": 8.27850341796875, "rewards/safety_reward/std": 2.1153862476348877, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.2578125, "completions/mean_terminated_length": 54.2578125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.2902803248624574, "frac_reward_zero_std": 0.0625, "grad_norm": 0.35103628039360046, "kl": 3.822265625, "learning_rate": 5e-05, "loss": 0.0457, "num_tokens": 11155430.0, "reward": 8.651611328125, "reward_std": 0.23789408802986145, "rewards/helpfulness_reward/mean": 0.9107112884521484, "rewards/helpfulness_reward/std": 0.5177378058433533, "rewards/safety_reward/mean": 8.651611328125, "rewards/safety_reward/std": 1.0799864530563354, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.1640625, "completions/mean_terminated_length": 54.1640625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.2906296393328094, "frac_reward_zero_std": 0.0, "grad_norm": 0.34465500712394714, "kl": 3.927734375, "learning_rate": 5e-05, "loss": 0.0435, "num_tokens": 11166659.0, "reward": 8.84375, "reward_std": 0.3057171106338501, "rewards/helpfulness_reward/mean": 0.9901771545410156, "rewards/helpfulness_reward/std": 0.8471987843513489, "rewards/safety_reward/mean": 8.84375, "rewards/safety_reward/std": 0.9729438424110413, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 53.921875, "completions/mean_terminated_length": 53.921875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.2909789538031613, "frac_reward_zero_std": 0.0, "grad_norm": 0.4195573925971985, "kl": 3.998046875, "learning_rate": 5e-05, "loss": 0.0364, "num_tokens": 11177425.0, "reward": 8.6220703125, "reward_std": 0.26501551270484924, "rewards/helpfulness_reward/mean": 0.876739501953125, "rewards/helpfulness_reward/std": 0.8935194611549377, "rewards/safety_reward/mean": 8.6220703125, "rewards/safety_reward/std": 1.2166720628738403, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.2109375, "completions/mean_terminated_length": 54.2109375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.2913282682735132, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3361041843891144, "kl": 3.876953125, "learning_rate": 5e-05, "loss": 0.0472, "num_tokens": 11189012.0, "reward": 8.50341796875, "reward_std": 0.18669679760932922, "rewards/helpfulness_reward/mean": 0.7168703079223633, "rewards/helpfulness_reward/std": 0.6462517976760864, "rewards/safety_reward/mean": 8.50341796875, "rewards/safety_reward/std": 1.0305020809173584, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.9921875, "completions/mean_terminated_length": 53.9921875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.2916775827438652, "frac_reward_zero_std": 0.0625, "grad_norm": 0.2941299378871918, "kl": 3.79296875, "learning_rate": 5e-05, "loss": 0.0404, "num_tokens": 11199571.0, "reward": 8.788330078125, "reward_std": 0.1847705990076065, "rewards/helpfulness_reward/mean": 0.9066267013549805, "rewards/helpfulness_reward/std": 0.8121086359024048, "rewards/safety_reward/mean": 8.788330078125, "rewards/safety_reward/std": 1.0661355257034302, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.0390625, "completions/mean_terminated_length": 54.0390625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.2920268972142171, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3265959620475769, "kl": 3.658203125, "learning_rate": 5e-05, "loss": 0.0375, "num_tokens": 11210776.0, "reward": 8.58349609375, "reward_std": 0.20641747117042542, "rewards/helpfulness_reward/mean": 0.9894380569458008, "rewards/helpfulness_reward/std": 0.895561933517456, "rewards/safety_reward/mean": 8.58349609375, "rewards/safety_reward/std": 1.0373878479003906, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.1015625, "completions/mean_terminated_length": 54.1015625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.292376211684569, "frac_reward_zero_std": 0.0, "grad_norm": 0.33735430240631104, "kl": 3.900390625, "learning_rate": 5e-05, "loss": 0.0405, "num_tokens": 11221621.0, "reward": 9.1748046875, "reward_std": 0.16921955347061157, "rewards/helpfulness_reward/mean": 1.1182670593261719, "rewards/helpfulness_reward/std": 0.5798830986022949, "rewards/safety_reward/mean": 9.1748046875, "rewards/safety_reward/std": 1.1345317363739014, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.09375, "completions/mean_terminated_length": 54.09375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.292725526154921, "frac_reward_zero_std": 0.0, "grad_norm": 0.3340994417667389, "kl": 3.841796875, "learning_rate": 5e-05, "loss": 0.0448, "num_tokens": 11233825.0, "reward": 8.719970703125, "reward_std": 0.19012172520160675, "rewards/helpfulness_reward/mean": 1.0006523132324219, "rewards/helpfulness_reward/std": 0.6776752471923828, "rewards/safety_reward/mean": 8.719970703125, "rewards/safety_reward/std": 1.2481740713119507, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.0234375, "completions/mean_terminated_length": 54.0234375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.2930748406252729, "frac_reward_zero_std": 0.0, "grad_norm": 0.45712387561798096, "kl": 3.89453125, "learning_rate": 5e-05, "loss": 0.0372, "num_tokens": 11245700.0, "reward": 8.61572265625, "reward_std": 0.20463058352470398, "rewards/helpfulness_reward/mean": 0.6407270431518555, "rewards/helpfulness_reward/std": 0.7059844136238098, "rewards/safety_reward/mean": 8.61572265625, "rewards/safety_reward/std": 1.3405522108078003, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 54.2578125, "completions/mean_terminated_length": 54.2578125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.29342415509562486, "frac_reward_zero_std": 0.0, "grad_norm": 0.5723667740821838, "kl": 3.97265625, "learning_rate": 5e-05, "loss": 0.0456, "num_tokens": 11256717.0, "reward": 8.585693359375, "reward_std": 0.3292425870895386, "rewards/helpfulness_reward/mean": 1.2574996948242188, "rewards/helpfulness_reward/std": 0.8897282481193542, "rewards/safety_reward/mean": 8.585693359375, "rewards/safety_reward/std": 1.099162220954895, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.125, "completions/mean_terminated_length": 54.125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.2937734695659768, "frac_reward_zero_std": 0.125, "grad_norm": 0.5836174488067627, "kl": 4.208984375, "learning_rate": 5e-05, "loss": 0.0464, "num_tokens": 11268053.0, "reward": 8.6884765625, "reward_std": 0.18641400337219238, "rewards/helpfulness_reward/mean": 0.6497592926025391, "rewards/helpfulness_reward/std": 0.6083099842071533, "rewards/safety_reward/mean": 8.6884765625, "rewards/safety_reward/std": 1.1150929927825928, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.0078125, "completions/mean_terminated_length": 54.0078125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.2941227840363287, "frac_reward_zero_std": 0.0, "grad_norm": 0.4614453911781311, "kl": 3.82421875, "learning_rate": 5e-05, "loss": 0.0395, "num_tokens": 11279358.0, "reward": 9.081787109375, "reward_std": 0.1929732859134674, "rewards/helpfulness_reward/mean": 1.3849029541015625, "rewards/helpfulness_reward/std": 0.8989498615264893, "rewards/safety_reward/mean": 9.081787109375, "rewards/safety_reward/std": 1.3619364500045776, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 53.96875, "completions/mean_terminated_length": 53.96875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.29447209850668066, "frac_reward_zero_std": 0.0, "grad_norm": 0.2771112024784088, "kl": 3.92578125, "learning_rate": 5e-05, "loss": 0.0392, "num_tokens": 11292762.0, "reward": 8.0372314453125, "reward_std": 0.18015345931053162, "rewards/helpfulness_reward/mean": 0.8512039184570312, "rewards/helpfulness_reward/std": 0.8411790728569031, "rewards/safety_reward/mean": 8.0372314453125, "rewards/safety_reward/std": 1.9661850929260254, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 54.03125, "completions/mean_terminated_length": 54.03125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.29482141297703257, "frac_reward_zero_std": 0.0, "grad_norm": 0.6035535335540771, "kl": 4.228515625, "learning_rate": 5e-05, "loss": 0.0471, "num_tokens": 11303982.0, "reward": 8.441162109375, "reward_std": 0.2178182750940323, "rewards/helpfulness_reward/mean": 1.2028579711914062, "rewards/helpfulness_reward/std": 0.5888012647628784, "rewards/safety_reward/mean": 8.441162109375, "rewards/safety_reward/std": 0.8192821145057678, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 54.2578125, "completions/mean_terminated_length": 54.2578125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.2951707274473845, "frac_reward_zero_std": 0.0, "grad_norm": 0.49793484807014465, "kl": 3.919921875, "learning_rate": 5e-05, "loss": 0.047, "num_tokens": 11314591.0, "reward": 8.77001953125, "reward_std": 0.31873783469200134, "rewards/helpfulness_reward/mean": 1.1017189025878906, "rewards/helpfulness_reward/std": 0.7565565705299377, "rewards/safety_reward/mean": 8.77001953125, "rewards/safety_reward/std": 1.397194266319275, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.0390625, "completions/mean_terminated_length": 54.0390625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.29552004191773645, "frac_reward_zero_std": 0.0625, "grad_norm": 0.43609926104545593, "kl": 3.84375, "learning_rate": 5e-05, "loss": 0.0403, "num_tokens": 11325588.0, "reward": 8.9921875, "reward_std": 0.242994487285614, "rewards/helpfulness_reward/mean": 1.2333526611328125, "rewards/helpfulness_reward/std": 0.5771146416664124, "rewards/safety_reward/mean": 8.9921875, "rewards/safety_reward/std": 1.0888705253601074, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 54.203125, "completions/mean_terminated_length": 54.203125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.29586935638808837, "frac_reward_zero_std": 0.0, "grad_norm": 0.43444713950157166, "kl": 4.083984375, "learning_rate": 5e-05, "loss": 0.0459, "num_tokens": 11337350.0, "reward": 9.25732421875, "reward_std": 0.2820773720741272, "rewards/helpfulness_reward/mean": 1.4957051277160645, "rewards/helpfulness_reward/std": 0.7635603547096252, "rewards/safety_reward/mean": 9.25732421875, "rewards/safety_reward/std": 1.016409158706665, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 53.9921875, "completions/mean_terminated_length": 53.9921875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.29621867085844034, "frac_reward_zero_std": 0.0, "grad_norm": 0.39552751183509827, "kl": 4.046875, "learning_rate": 5e-05, "loss": 0.0416, "num_tokens": 11349005.0, "reward": 8.60986328125, "reward_std": 0.21043077111244202, "rewards/helpfulness_reward/mean": 1.116973876953125, "rewards/helpfulness_reward/std": 0.7598156929016113, "rewards/safety_reward/mean": 8.60986328125, "rewards/safety_reward/std": 1.4757564067840576, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.109375, "completions/mean_terminated_length": 54.109375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.29656798532879225, "frac_reward_zero_std": 0.0, "grad_norm": 0.6465248465538025, "kl": 4.16796875, "learning_rate": 5e-05, "loss": 0.0496, "num_tokens": 11360251.0, "reward": 8.731201171875, "reward_std": 0.2935861051082611, "rewards/helpfulness_reward/mean": 1.0975799560546875, "rewards/helpfulness_reward/std": 0.6757306456565857, "rewards/safety_reward/mean": 8.731201171875, "rewards/safety_reward/std": 1.00336492061615, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 54.40625, "completions/mean_terminated_length": 54.40625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.29691729979914416, "frac_reward_zero_std": 0.0, "grad_norm": 0.5184087157249451, "kl": 4.095703125, "learning_rate": 5e-05, "loss": 0.0558, "num_tokens": 11371383.0, "reward": 8.978271484375, "reward_std": 0.31084075570106506, "rewards/helpfulness_reward/mean": 1.1037826538085938, "rewards/helpfulness_reward/std": 0.7514886856079102, "rewards/safety_reward/mean": 8.978271484375, "rewards/safety_reward/std": 0.9275850057601929, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.0859375, "completions/mean_terminated_length": 54.0859375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.29726661426949613, "frac_reward_zero_std": 0.0, "grad_norm": 0.3519018292427063, "kl": 3.802734375, "learning_rate": 5e-05, "loss": 0.0371, "num_tokens": 11382178.0, "reward": 9.144287109375, "reward_std": 0.28837525844573975, "rewards/helpfulness_reward/mean": 1.3085696697235107, "rewards/helpfulness_reward/std": 0.702256977558136, "rewards/safety_reward/mean": 9.144287109375, "rewards/safety_reward/std": 0.775754451751709, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 54.90625, "completions/mean_terminated_length": 54.90625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.29761592873984805, "frac_reward_zero_std": 0.0, "grad_norm": 0.40643712878227234, "kl": 3.873046875, "learning_rate": 5e-05, "loss": 0.059, "num_tokens": 11394894.0, "reward": 8.585205078125, "reward_std": 0.3285684585571289, "rewards/helpfulness_reward/mean": 1.1998543739318848, "rewards/helpfulness_reward/std": 0.8674685955047607, "rewards/safety_reward/mean": 8.585205078125, "rewards/safety_reward/std": 0.9392185211181641, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 54.0859375, "completions/mean_terminated_length": 54.0859375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.29796524321019996, "frac_reward_zero_std": 0.0, "grad_norm": 0.3720882534980774, "kl": 3.849609375, "learning_rate": 5e-05, "loss": 0.0449, "num_tokens": 11406673.0, "reward": 8.771484375, "reward_std": 0.2522651255130768, "rewards/helpfulness_reward/mean": 1.0755882263183594, "rewards/helpfulness_reward/std": 0.977020263671875, "rewards/safety_reward/mean": 8.771484375, "rewards/safety_reward/std": 0.8824359774589539, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 54.0859375, "completions/mean_terminated_length": 54.0859375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.29831455768055193, "frac_reward_zero_std": 0.0, "grad_norm": 5.556887626647949, "kl": 6.345703125, "learning_rate": 5e-05, "loss": 0.0662, "num_tokens": 11419164.0, "reward": 9.045166015625, "reward_std": 0.5461542010307312, "rewards/helpfulness_reward/mean": 1.003931999206543, "rewards/helpfulness_reward/std": 0.7083288431167603, "rewards/safety_reward/mean": 9.045166015625, "rewards/safety_reward/std": 1.2261680364608765, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 52.4375, "completions/mean_terminated_length": 52.4375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.29866387215090384, "frac_reward_zero_std": 0.0, "grad_norm": 0.5266439318656921, "kl": 3.869140625, "learning_rate": 5e-05, "loss": 0.0303, "num_tokens": 11431844.0, "reward": 7.7526092529296875, "reward_std": 0.4182935357093811, "rewards/helpfulness_reward/mean": 0.8210487365722656, "rewards/helpfulness_reward/std": 0.5466672778129578, "rewards/safety_reward/mean": 7.7526092529296875, "rewards/safety_reward/std": 1.8876168727874756, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 53.28125, "completions/mean_terminated_length": 53.28125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.2990131866212558, "frac_reward_zero_std": 0.0, "grad_norm": 1.0701639652252197, "kl": 4.54296875, "learning_rate": 5e-05, "loss": 0.0365, "num_tokens": 11445072.0, "reward": 8.211181640625, "reward_std": 0.3398860692977905, "rewards/helpfulness_reward/mean": 1.2569928169250488, "rewards/helpfulness_reward/std": 0.9031582474708557, "rewards/safety_reward/mean": 8.211181640625, "rewards/safety_reward/std": 1.71107017993927, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 54.0703125, "completions/mean_terminated_length": 54.0703125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.2993625010916077, "frac_reward_zero_std": 0.0, "grad_norm": 0.41331547498703003, "kl": 3.810546875, "learning_rate": 5e-05, "loss": 0.0383, "num_tokens": 11455785.0, "reward": 8.300048828125, "reward_std": 0.4893091022968292, "rewards/helpfulness_reward/mean": 0.9660810232162476, "rewards/helpfulness_reward/std": 0.6888313293457031, "rewards/safety_reward/mean": 8.300048828125, "rewards/safety_reward/std": 1.1847275495529175, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.29971181556195964, "frac_reward_zero_std": 0.0, "grad_norm": 0.39047399163246155, "kl": 3.625, "learning_rate": 5e-05, "loss": 0.0345, "num_tokens": 11466921.0, "reward": 8.946044921875, "reward_std": 0.3141988515853882, "rewards/helpfulness_reward/mean": 0.8809423446655273, "rewards/helpfulness_reward/std": 0.6154705286026001, "rewards/safety_reward/mean": 8.946044921875, "rewards/safety_reward/std": 1.1782606840133667, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.8671875, "completions/mean_terminated_length": 53.8671875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.3000611300323116, "frac_reward_zero_std": 0.0, "grad_norm": 0.2971174418926239, "kl": 3.607421875, "learning_rate": 5e-05, "loss": 0.0346, "num_tokens": 11477400.0, "reward": 8.795654296875, "reward_std": 0.26323243975639343, "rewards/helpfulness_reward/mean": 1.0847020149230957, "rewards/helpfulness_reward/std": 0.6752773523330688, "rewards/safety_reward/mean": 8.795654296875, "rewards/safety_reward/std": 0.9292857050895691, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 53.8671875, "completions/mean_terminated_length": 53.8671875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.3004104445026635, "frac_reward_zero_std": 0.0, "grad_norm": 0.7542575597763062, "kl": 4.234375, "learning_rate": 5e-05, "loss": 0.0346, "num_tokens": 11489247.0, "reward": 8.71337890625, "reward_std": 0.32580679655075073, "rewards/helpfulness_reward/mean": 0.9564957618713379, "rewards/helpfulness_reward/std": 0.8282496333122253, "rewards/safety_reward/mean": 8.71337890625, "rewards/safety_reward/std": 1.524307370185852, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.30075975897301543, "frac_reward_zero_std": 0.0, "grad_norm": 0.469666063785553, "kl": 3.7109375, "learning_rate": 5e-05, "loss": 0.034, "num_tokens": 11500663.0, "reward": 8.569091796875, "reward_std": 0.25531822443008423, "rewards/helpfulness_reward/mean": 1.079376220703125, "rewards/helpfulness_reward/std": 0.7436104416847229, "rewards/safety_reward/mean": 8.569091796875, "rewards/safety_reward/std": 0.9368899464607239, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 54.1953125, "completions/mean_terminated_length": 54.1953125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.3011090734433674, "frac_reward_zero_std": 0.0, "grad_norm": 31.785171508789062, "kl": 17.099609375, "learning_rate": 5e-05, "loss": 0.1756, "num_tokens": 11511552.0, "reward": 8.936767578125, "reward_std": 0.2712939381599426, "rewards/helpfulness_reward/mean": 1.2224845886230469, "rewards/helpfulness_reward/std": 0.7705979347229004, "rewards/safety_reward/mean": 8.936767578125, "rewards/safety_reward/std": 1.2302478551864624, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 54.2109375, "completions/mean_terminated_length": 54.2109375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.3014583879137193, "frac_reward_zero_std": 0.0, "grad_norm": 0.5851063132286072, "kl": 4.146484375, "learning_rate": 5e-05, "loss": 0.0399, "num_tokens": 11522011.0, "reward": 8.244873046875, "reward_std": 0.4169761836528778, "rewards/helpfulness_reward/mean": 0.7722015380859375, "rewards/helpfulness_reward/std": 0.7771039009094238, "rewards/safety_reward/mean": 8.244873046875, "rewards/safety_reward/std": 1.212108850479126, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 54.78125, "completions/mean_terminated_length": 54.78125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.3018077023840713, "frac_reward_zero_std": 0.0, "grad_norm": 0.43861818313598633, "kl": 3.6875, "learning_rate": 5e-05, "loss": 0.0476, "num_tokens": 11533375.0, "reward": 8.6162109375, "reward_std": 0.33709517121315, "rewards/helpfulness_reward/mean": 0.9463744163513184, "rewards/helpfulness_reward/std": 0.6633808612823486, "rewards/safety_reward/mean": 8.6162109375, "rewards/safety_reward/std": 1.2169911861419678, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 54.6171875, "completions/mean_terminated_length": 54.6171875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.3021570168544232, "frac_reward_zero_std": 0.0, "grad_norm": 0.4434712827205658, "kl": 3.857421875, "learning_rate": 5e-05, "loss": 0.0565, "num_tokens": 11543902.0, "reward": 9.150634765625, "reward_std": 0.43460923433303833, "rewards/helpfulness_reward/mean": 0.9238452911376953, "rewards/helpfulness_reward/std": 0.562254786491394, "rewards/safety_reward/mean": 9.150634765625, "rewards/safety_reward/std": 1.183383822441101, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 54.4296875, "completions/mean_terminated_length": 54.4296875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.3025063313247751, "frac_reward_zero_std": 0.0, "grad_norm": 0.42555758357048035, "kl": 3.82421875, "learning_rate": 5e-05, "loss": 0.053, "num_tokens": 11555973.0, "reward": 8.59375, "reward_std": 0.3102344274520874, "rewards/helpfulness_reward/mean": 1.0713882446289062, "rewards/helpfulness_reward/std": 0.5786210894584656, "rewards/safety_reward/mean": 8.59375, "rewards/safety_reward/std": 1.214581847190857, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 53.96875, "completions/mean_terminated_length": 53.96875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.3028556457951271, "frac_reward_zero_std": 0.0, "grad_norm": 0.46827396750450134, "kl": 3.943359375, "learning_rate": 5e-05, "loss": 0.041, "num_tokens": 11567953.0, "reward": 9.041015625, "reward_std": 0.41911402344703674, "rewards/helpfulness_reward/mean": 0.8990721702575684, "rewards/helpfulness_reward/std": 0.5708994269371033, "rewards/safety_reward/mean": 9.041015625, "rewards/safety_reward/std": 1.0042413473129272, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.625, "completions/mean_terminated_length": 54.625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.303204960265479, "frac_reward_zero_std": 0.0, "grad_norm": 0.42338982224464417, "kl": 3.744140625, "learning_rate": 5e-05, "loss": 0.0564, "num_tokens": 11578793.0, "reward": 8.791015625, "reward_std": 0.4005417227745056, "rewards/helpfulness_reward/mean": 1.2404119968414307, "rewards/helpfulness_reward/std": 0.7545763254165649, "rewards/safety_reward/mean": 8.791015625, "rewards/safety_reward/std": 1.1985714435577393, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.3035542747358309, "frac_reward_zero_std": 0.0, "grad_norm": 0.5428324341773987, "kl": 3.71484375, "learning_rate": 5e-05, "loss": 0.0637, "num_tokens": 11589609.0, "reward": 8.887939453125, "reward_std": 0.42208993434906006, "rewards/helpfulness_reward/mean": 1.2705116271972656, "rewards/helpfulness_reward/std": 0.5910904407501221, "rewards/safety_reward/mean": 8.887939453125, "rewards/safety_reward/std": 1.280715823173523, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 54.3046875, "completions/mean_terminated_length": 54.3046875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.3039035892061829, "frac_reward_zero_std": 0.0, "grad_norm": 0.37109145522117615, "kl": 3.86328125, "learning_rate": 5e-05, "loss": 0.0478, "num_tokens": 11600880.0, "reward": 8.608154296875, "reward_std": 0.40191978216171265, "rewards/helpfulness_reward/mean": 0.7005405426025391, "rewards/helpfulness_reward/std": 0.7894774079322815, "rewards/safety_reward/mean": 8.608154296875, "rewards/safety_reward/std": 1.2116401195526123, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.1171875, "completions/mean_terminated_length": 54.1171875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.3042529036765348, "frac_reward_zero_std": 0.0, "grad_norm": 0.5135107040405273, "kl": 4.1875, "learning_rate": 5e-05, "loss": 0.0449, "num_tokens": 11612087.0, "reward": 8.5537109375, "reward_std": 0.24994395673274994, "rewards/helpfulness_reward/mean": 1.2198141813278198, "rewards/helpfulness_reward/std": 0.7767350673675537, "rewards/safety_reward/mean": 8.5537109375, "rewards/safety_reward/std": 1.1223443746566772, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 54.28125, "completions/mean_terminated_length": 54.28125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.30460221814688676, "frac_reward_zero_std": 0.0, "grad_norm": 5.715599536895752, "kl": 6.68359375, "learning_rate": 5e-05, "loss": 0.0794, "num_tokens": 11622603.0, "reward": 8.5791015625, "reward_std": 0.3402364253997803, "rewards/helpfulness_reward/mean": 1.2803726196289062, "rewards/helpfulness_reward/std": 0.5697168707847595, "rewards/safety_reward/mean": 8.5791015625, "rewards/safety_reward/std": 1.1256760358810425, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.1640625, "completions/mean_terminated_length": 54.1640625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.3049515326172387, "frac_reward_zero_std": 0.0, "grad_norm": 0.4369128942489624, "kl": 4.0, "learning_rate": 5e-05, "loss": 0.0496, "num_tokens": 11633968.0, "reward": 8.51953125, "reward_std": 0.2936716675758362, "rewards/helpfulness_reward/mean": 1.0470619201660156, "rewards/helpfulness_reward/std": 0.555662989616394, "rewards/safety_reward/mean": 8.51953125, "rewards/safety_reward/std": 1.18172287940979, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 54.0390625, "completions/mean_terminated_length": 54.0390625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.3053008470875906, "frac_reward_zero_std": 0.0, "grad_norm": 0.4244970381259918, "kl": 3.771484375, "learning_rate": 5e-05, "loss": 0.0348, "num_tokens": 11644237.0, "reward": 8.45849609375, "reward_std": 0.37336599826812744, "rewards/helpfulness_reward/mean": 1.0591824054718018, "rewards/helpfulness_reward/std": 0.6338256001472473, "rewards/safety_reward/mean": 8.45849609375, "rewards/safety_reward/std": 1.058127760887146, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 53.984375, "completions/mean_terminated_length": 53.984375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.30565016155794256, "frac_reward_zero_std": 0.0, "grad_norm": 0.428303986787796, "kl": 3.666015625, "learning_rate": 5e-05, "loss": 0.0353, "num_tokens": 11655515.0, "reward": 8.5137939453125, "reward_std": 0.40624353289604187, "rewards/helpfulness_reward/mean": 1.0683674812316895, "rewards/helpfulness_reward/std": 0.9671115875244141, "rewards/safety_reward/mean": 8.5137939453125, "rewards/safety_reward/std": 1.2570675611495972, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.3046875, "completions/mean_terminated_length": 54.3046875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.30599947602829447, "frac_reward_zero_std": 0.0, "grad_norm": 0.5850764513015747, "kl": 4.19921875, "learning_rate": 5e-05, "loss": 0.0487, "num_tokens": 11667082.0, "reward": 8.8720703125, "reward_std": 0.4387770891189575, "rewards/helpfulness_reward/mean": 1.1536140441894531, "rewards/helpfulness_reward/std": 0.7779663801193237, "rewards/safety_reward/mean": 8.8720703125, "rewards/safety_reward/std": 1.3989636898040771, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 54.3046875, "completions/mean_terminated_length": 54.3046875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.3063487904986464, "frac_reward_zero_std": 0.0, "grad_norm": 0.33658313751220703, "kl": 3.865234375, "learning_rate": 5e-05, "loss": 0.0399, "num_tokens": 11678121.0, "reward": 8.942626953125, "reward_std": 0.30795469880104065, "rewards/helpfulness_reward/mean": 0.9495725631713867, "rewards/helpfulness_reward/std": 0.6355872750282288, "rewards/safety_reward/mean": 8.942626953125, "rewards/safety_reward/std": 1.252217411994934, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.46875, "completions/mean_terminated_length": 54.46875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.30669810496899835, "frac_reward_zero_std": 0.0, "grad_norm": 0.4355853199958801, "kl": 4.056640625, "learning_rate": 5e-05, "loss": 0.0534, "num_tokens": 11689797.0, "reward": 9.179931640625, "reward_std": 0.3256615996360779, "rewards/helpfulness_reward/mean": 1.1623730659484863, "rewards/helpfulness_reward/std": 0.8197214603424072, "rewards/safety_reward/mean": 9.179931640625, "rewards/safety_reward/std": 1.119674801826477, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 54.546875, "completions/mean_terminated_length": 54.546875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.30704741943935027, "frac_reward_zero_std": 0.0, "grad_norm": 0.6420733332633972, "kl": 4.5078125, "learning_rate": 5e-05, "loss": 0.061, "num_tokens": 11701331.0, "reward": 8.903076171875, "reward_std": 0.2860852777957916, "rewards/helpfulness_reward/mean": 0.9352529048919678, "rewards/helpfulness_reward/std": 0.8396320343017578, "rewards/safety_reward/mean": 8.903076171875, "rewards/safety_reward/std": 1.3000253438949585, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.0546875, "completions/mean_terminated_length": 54.0546875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.30739673390970224, "frac_reward_zero_std": 0.0, "grad_norm": 0.4824821650981903, "kl": 4.0, "learning_rate": 5e-05, "loss": 0.0456, "num_tokens": 11713258.0, "reward": 8.566162109375, "reward_std": 0.2898222506046295, "rewards/helpfulness_reward/mean": 0.9102077484130859, "rewards/helpfulness_reward/std": 0.7905529141426086, "rewards/safety_reward/mean": 8.566162109375, "rewards/safety_reward/std": 1.1937427520751953, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 53.8203125, "completions/mean_terminated_length": 53.8203125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.30774604838005415, "frac_reward_zero_std": 0.0, "grad_norm": 0.4500992000102997, "kl": 3.984375, "learning_rate": 5e-05, "loss": 0.0341, "num_tokens": 11724483.0, "reward": 8.830810546875, "reward_std": 0.22978344559669495, "rewards/helpfulness_reward/mean": 1.0182983875274658, "rewards/helpfulness_reward/std": 0.7128891944885254, "rewards/safety_reward/mean": 8.830810546875, "rewards/safety_reward/std": 1.3322389125823975, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 54.15625, "completions/mean_terminated_length": 54.15625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.30809536285040606, "frac_reward_zero_std": 0.0, "grad_norm": 0.9864846467971802, "kl": 4.16796875, "learning_rate": 5e-05, "loss": 0.049, "num_tokens": 11736031.0, "reward": 8.35888671875, "reward_std": 0.3280841112136841, "rewards/helpfulness_reward/mean": 1.0214157104492188, "rewards/helpfulness_reward/std": 0.7009749412536621, "rewards/safety_reward/mean": 8.35888671875, "rewards/safety_reward/std": 1.3483798503875732, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.4453125, "completions/mean_terminated_length": 54.4453125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.30844467732075803, "frac_reward_zero_std": 0.0, "grad_norm": 0.5387262105941772, "kl": 4.34765625, "learning_rate": 5e-05, "loss": 0.0555, "num_tokens": 11748048.0, "reward": 8.736572265625, "reward_std": 0.3171028792858124, "rewards/helpfulness_reward/mean": 1.1435785293579102, "rewards/helpfulness_reward/std": 0.7161077260971069, "rewards/safety_reward/mean": 8.736572265625, "rewards/safety_reward/std": 1.1008220911026, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.796875, "completions/mean_terminated_length": 53.796875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.30879399179110995, "frac_reward_zero_std": 0.0, "grad_norm": 0.4890996217727661, "kl": 3.990234375, "learning_rate": 5e-05, "loss": 0.0403, "num_tokens": 11758894.0, "reward": 8.48388671875, "reward_std": 0.3224107623100281, "rewards/helpfulness_reward/mean": 0.7970623970031738, "rewards/helpfulness_reward/std": 1.0283054113388062, "rewards/safety_reward/mean": 8.48388671875, "rewards/safety_reward/std": 1.4839402437210083, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.421875, "completions/mean_terminated_length": 54.421875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.30914330626146186, "frac_reward_zero_std": 0.0, "grad_norm": 0.4638102054595947, "kl": 4.046875, "learning_rate": 5e-05, "loss": 0.0425, "num_tokens": 11770564.0, "reward": 9.3798828125, "reward_std": 0.22363877296447754, "rewards/helpfulness_reward/mean": 1.197662353515625, "rewards/helpfulness_reward/std": 0.5818472504615784, "rewards/safety_reward/mean": 9.3798828125, "rewards/safety_reward/std": 1.1398060321807861, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 53.9765625, "completions/mean_terminated_length": 53.9765625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.30949262073181383, "frac_reward_zero_std": 0.0, "grad_norm": 0.3652777373790741, "kl": 3.66796875, "learning_rate": 5e-05, "loss": 0.0309, "num_tokens": 11782137.0, "reward": 8.677001953125, "reward_std": 0.26061350107192993, "rewards/helpfulness_reward/mean": 1.1547179222106934, "rewards/helpfulness_reward/std": 0.8207274675369263, "rewards/safety_reward/mean": 8.677001953125, "rewards/safety_reward/std": 1.3785927295684814, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 53.8515625, "completions/mean_terminated_length": 53.8515625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.30984193520216574, "frac_reward_zero_std": 0.0, "grad_norm": 0.5587162375450134, "kl": 3.9453125, "learning_rate": 5e-05, "loss": 0.0395, "num_tokens": 11792398.0, "reward": 8.885986328125, "reward_std": 0.32393741607666016, "rewards/helpfulness_reward/mean": 1.183828353881836, "rewards/helpfulness_reward/std": 0.7125279307365417, "rewards/safety_reward/mean": 8.885986328125, "rewards/safety_reward/std": 1.004729986190796, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.109375, "completions/mean_terminated_length": 54.109375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.3101912496725177, "frac_reward_zero_std": 0.0, "grad_norm": 0.5014169812202454, "kl": 3.94921875, "learning_rate": 5e-05, "loss": 0.0373, "num_tokens": 11802524.0, "reward": 8.77197265625, "reward_std": 0.5165718793869019, "rewards/helpfulness_reward/mean": 1.341033935546875, "rewards/helpfulness_reward/std": 0.6137828230857849, "rewards/safety_reward/mean": 8.77197265625, "rewards/safety_reward/std": 1.293018102645874, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 53.7734375, "completions/mean_terminated_length": 53.7734375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.3105405641428696, "frac_reward_zero_std": 0.0, "grad_norm": 0.4086925685405731, "kl": 3.951171875, "learning_rate": 5e-05, "loss": 0.0253, "num_tokens": 11813207.0, "reward": 9.0380859375, "reward_std": 0.4953242242336273, "rewards/helpfulness_reward/mean": 1.0890896320343018, "rewards/helpfulness_reward/std": 0.8918286561965942, "rewards/safety_reward/mean": 9.0380859375, "rewards/safety_reward/std": 0.8630809187889099, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 54.1796875, "completions/mean_terminated_length": 54.1796875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.31088987861322154, "frac_reward_zero_std": 0.0, "grad_norm": 0.5639714002609253, "kl": 4.294921875, "learning_rate": 5e-05, "loss": 0.0526, "num_tokens": 11824006.0, "reward": 8.939697265625, "reward_std": 0.3196638524532318, "rewards/helpfulness_reward/mean": 1.0148851871490479, "rewards/helpfulness_reward/std": 0.725494921207428, "rewards/safety_reward/mean": 8.939697265625, "rewards/safety_reward/std": 0.873339056968689, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.8828125, "completions/mean_terminated_length": 53.8828125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.3112391930835735, "frac_reward_zero_std": 0.0, "grad_norm": 0.9175952076911926, "kl": 4.16796875, "learning_rate": 5e-05, "loss": 0.0416, "num_tokens": 11835271.0, "reward": 8.761474609375, "reward_std": 0.3401031792163849, "rewards/helpfulness_reward/mean": 1.211989402770996, "rewards/helpfulness_reward/std": 0.7156026363372803, "rewards/safety_reward/mean": 8.761474609375, "rewards/safety_reward/std": 1.2570596933364868, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 53.59375, "completions/mean_terminated_length": 53.59375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.3115885075539254, "frac_reward_zero_std": 0.0, "grad_norm": 0.4950495958328247, "kl": 4.28515625, "learning_rate": 5e-05, "loss": 0.033, "num_tokens": 11846659.0, "reward": 9.161376953125, "reward_std": 0.43801453709602356, "rewards/helpfulness_reward/mean": 1.3101627826690674, "rewards/helpfulness_reward/std": 0.5825003981590271, "rewards/safety_reward/mean": 9.161376953125, "rewards/safety_reward/std": 1.314676284790039, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 53.9609375, "completions/mean_terminated_length": 53.9609375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.31193782202427733, "frac_reward_zero_std": 0.0, "grad_norm": 1.5017650127410889, "kl": 4.716796875, "learning_rate": 5e-05, "loss": 0.0559, "num_tokens": 11857846.0, "reward": 8.6484375, "reward_std": 0.35459551215171814, "rewards/helpfulness_reward/mean": 1.3110504150390625, "rewards/helpfulness_reward/std": 0.8414110541343689, "rewards/safety_reward/mean": 8.6484375, "rewards/safety_reward/std": 1.136160969734192, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 51.03125, "completions/mean_terminated_length": 51.03125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.3122871364946293, "frac_reward_zero_std": 0.0625, "grad_norm": 0.4210089445114136, "kl": 3.873046875, "learning_rate": 5e-05, "loss": 0.0436, "num_tokens": 11871178.0, "reward": 8.20849609375, "reward_std": 0.3563912510871887, "rewards/helpfulness_reward/mean": 1.185196876525879, "rewards/helpfulness_reward/std": 0.7633396983146667, "rewards/safety_reward/mean": 8.20849609375, "rewards/safety_reward/std": 2.027057409286499, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 53.890625, "completions/mean_terminated_length": 53.890625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.3126364509649812, "frac_reward_zero_std": 0.0, "grad_norm": 0.4295864999294281, "kl": 3.87890625, "learning_rate": 5e-05, "loss": 0.0474, "num_tokens": 11882388.0, "reward": 8.3427734375, "reward_std": 0.3982142508029938, "rewards/helpfulness_reward/mean": 1.1568334102630615, "rewards/helpfulness_reward/std": 1.0116376876831055, "rewards/safety_reward/mean": 8.3427734375, "rewards/safety_reward/std": 1.0234016180038452, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.0234375, "completions/mean_terminated_length": 54.0234375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.3129857654353332, "frac_reward_zero_std": 0.0, "grad_norm": 0.44473597407341003, "kl": 4.044921875, "learning_rate": 5e-05, "loss": 0.0482, "num_tokens": 11893703.0, "reward": 8.594970703125, "reward_std": 0.45394977927207947, "rewards/helpfulness_reward/mean": 1.2705307006835938, "rewards/helpfulness_reward/std": 0.5398422479629517, "rewards/safety_reward/mean": 8.594970703125, "rewards/safety_reward/std": 1.2075175046920776, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 54.1953125, "completions/mean_terminated_length": 54.1953125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.3133350799056851, "frac_reward_zero_std": 0.0, "grad_norm": 0.42889100313186646, "kl": 4.2265625, "learning_rate": 5e-05, "loss": 0.0551, "num_tokens": 11904696.0, "reward": 9.034423828125, "reward_std": 0.3068462908267975, "rewards/helpfulness_reward/mean": 1.0379114151000977, "rewards/helpfulness_reward/std": 0.6404004693031311, "rewards/safety_reward/mean": 9.034423828125, "rewards/safety_reward/std": 0.9292189478874207, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 53.953125, "completions/mean_terminated_length": 53.953125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.313684394376037, "frac_reward_zero_std": 0.0, "grad_norm": 0.44574469327926636, "kl": 3.912109375, "learning_rate": 5e-05, "loss": 0.0302, "num_tokens": 11915202.0, "reward": 8.3349609375, "reward_std": 0.3586440682411194, "rewards/helpfulness_reward/mean": 1.0821870565414429, "rewards/helpfulness_reward/std": 0.6576935648918152, "rewards/safety_reward/mean": 8.3349609375, "rewards/safety_reward/std": 1.4087436199188232, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 53.8671875, "completions/mean_terminated_length": 53.8671875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.314033708846389, "frac_reward_zero_std": 0.0, "grad_norm": 0.3203877806663513, "kl": 3.87890625, "learning_rate": 5e-05, "loss": 0.0363, "num_tokens": 11925649.0, "reward": 8.815673828125, "reward_std": 0.2903607487678528, "rewards/helpfulness_reward/mean": 1.124349594116211, "rewards/helpfulness_reward/std": 0.6526572108268738, "rewards/safety_reward/mean": 8.815673828125, "rewards/safety_reward/std": 0.9710597395896912, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.1875, "completions/mean_terminated_length": 54.1875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.3143830233167409, "frac_reward_zero_std": 0.0, "grad_norm": 0.44695642590522766, "kl": 4.009765625, "learning_rate": 5e-05, "loss": 0.0474, "num_tokens": 11936089.0, "reward": 8.719970703125, "reward_std": 0.42406272888183594, "rewards/helpfulness_reward/mean": 0.8324666023254395, "rewards/helpfulness_reward/std": 0.7142460942268372, "rewards/safety_reward/mean": 8.719970703125, "rewards/safety_reward/std": 1.048757553100586, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 53.9140625, "completions/mean_terminated_length": 53.9140625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.3147323377870928, "frac_reward_zero_std": 0.0, "grad_norm": 0.40972933173179626, "kl": 3.888671875, "learning_rate": 5e-05, "loss": 0.0345, "num_tokens": 11947414.0, "reward": 8.8955078125, "reward_std": 0.34024518728256226, "rewards/helpfulness_reward/mean": 1.2538299560546875, "rewards/helpfulness_reward/std": 0.5819846987724304, "rewards/safety_reward/mean": 8.8955078125, "rewards/safety_reward/std": 1.30735445022583, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 54.078125, "completions/mean_terminated_length": 54.078125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.3150816522574448, "frac_reward_zero_std": 0.0, "grad_norm": 0.4830353260040283, "kl": 4.015625, "learning_rate": 5e-05, "loss": 0.0449, "num_tokens": 11959912.0, "reward": 8.3974609375, "reward_std": 0.31563854217529297, "rewards/helpfulness_reward/mean": 1.2285962104797363, "rewards/helpfulness_reward/std": 0.813716471195221, "rewards/safety_reward/mean": 8.3974609375, "rewards/safety_reward/std": 1.4291036128997803, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 53.953125, "completions/mean_terminated_length": 53.953125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.3154309667277967, "frac_reward_zero_std": 0.0, "grad_norm": 0.3932898938655853, "kl": 4.037109375, "learning_rate": 5e-05, "loss": 0.0421, "num_tokens": 11972322.0, "reward": 8.909912109375, "reward_std": 0.2797597646713257, "rewards/helpfulness_reward/mean": 0.8755874633789062, "rewards/helpfulness_reward/std": 0.5691474080085754, "rewards/safety_reward/mean": 8.909912109375, "rewards/safety_reward/std": 1.1197013854980469, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 53.9921875, "completions/mean_terminated_length": 53.9921875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.31578028119814866, "frac_reward_zero_std": 0.0, "grad_norm": 0.562357485294342, "kl": 4.453125, "learning_rate": 5e-05, "loss": 0.0465, "num_tokens": 11984153.0, "reward": 8.64111328125, "reward_std": 0.28235360980033875, "rewards/helpfulness_reward/mean": 1.1379947662353516, "rewards/helpfulness_reward/std": 0.49842119216918945, "rewards/safety_reward/mean": 8.64111328125, "rewards/safety_reward/std": 0.8501822352409363, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 53.8359375, "completions/mean_terminated_length": 53.8359375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.3161295956685006, "frac_reward_zero_std": 0.0, "grad_norm": 0.36521339416503906, "kl": 3.98046875, "learning_rate": 5e-05, "loss": 0.0333, "num_tokens": 11997308.0, "reward": 8.2099609375, "reward_std": 0.25113338232040405, "rewards/helpfulness_reward/mean": 1.0719070434570312, "rewards/helpfulness_reward/std": 1.0685105323791504, "rewards/safety_reward/mean": 8.2099609375, "rewards/safety_reward/std": 1.8018803596496582, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 54.4375, "completions/mean_terminated_length": 54.4375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.3164789101388525, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5875120759010315, "kl": 4.34375, "learning_rate": 5e-05, "loss": 0.0526, "num_tokens": 12008628.0, "reward": 8.5263671875, "reward_std": 0.2218477427959442, "rewards/helpfulness_reward/mean": 1.2519588470458984, "rewards/helpfulness_reward/std": 0.6721814870834351, "rewards/safety_reward/mean": 8.5263671875, "rewards/safety_reward/std": 0.9795714020729065, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.203125, "completions/mean_terminated_length": 54.203125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.31682822460920446, "frac_reward_zero_std": 0.0, "grad_norm": 0.31873586773872375, "kl": 3.99609375, "learning_rate": 5e-05, "loss": 0.047, "num_tokens": 12019454.0, "reward": 8.740478515625, "reward_std": 0.1952328383922577, "rewards/helpfulness_reward/mean": 1.4869165420532227, "rewards/helpfulness_reward/std": 0.6973121166229248, "rewards/safety_reward/mean": 8.740478515625, "rewards/safety_reward/std": 1.0964397192001343, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 54.1015625, "completions/mean_terminated_length": 54.1015625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.31717753907955637, "frac_reward_zero_std": 0.0, "grad_norm": 0.38475269079208374, "kl": 4.240234375, "learning_rate": 5e-05, "loss": 0.0436, "num_tokens": 12031019.0, "reward": 8.7451171875, "reward_std": 0.1435053050518036, "rewards/helpfulness_reward/mean": 1.0014610290527344, "rewards/helpfulness_reward/std": 0.7431678771972656, "rewards/safety_reward/mean": 8.7451171875, "rewards/safety_reward/std": 0.9864394664764404, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 72.65625, "completions/mean_terminated_length": 72.65625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.3175268535499083, "frac_reward_zero_std": 0.0, "grad_norm": 0.365809828042984, "kl": 3.494140625, "learning_rate": 5e-05, "loss": 0.0012, "num_tokens": 12045775.0, "reward": 7.88848352432251, "reward_std": 0.33879321813583374, "rewards/helpfulness_reward/mean": 0.7295510768890381, "rewards/helpfulness_reward/std": 0.8001226186752319, "rewards/safety_reward/mean": 7.88848352432251, "rewards/safety_reward/std": 2.6706666946411133, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 54.546875, "completions/mean_terminated_length": 54.546875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.31787616802026025, "frac_reward_zero_std": 0.0, "grad_norm": 0.4888531565666199, "kl": 4.283203125, "learning_rate": 5e-05, "loss": 0.0587, "num_tokens": 12057613.0, "reward": 8.580810546875, "reward_std": 0.26986098289489746, "rewards/helpfulness_reward/mean": 0.9195327758789062, "rewards/helpfulness_reward/std": 0.7908276319503784, "rewards/safety_reward/mean": 8.580810546875, "rewards/safety_reward/std": 1.3072255849838257, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 54.3125, "completions/mean_terminated_length": 54.3125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.31822548249061217, "frac_reward_zero_std": 0.0, "grad_norm": 0.8566253185272217, "kl": 4.62890625, "learning_rate": 5e-05, "loss": 0.0549, "num_tokens": 12068997.0, "reward": 8.822509765625, "reward_std": 0.235294371843338, "rewards/helpfulness_reward/mean": 1.2294011116027832, "rewards/helpfulness_reward/std": 0.7373190522193909, "rewards/safety_reward/mean": 8.822509765625, "rewards/safety_reward/std": 1.1817679405212402, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 54.0078125, "completions/mean_terminated_length": 54.0078125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.31857479696096414, "frac_reward_zero_std": 0.0, "grad_norm": 0.3516460061073303, "kl": 4.056640625, "learning_rate": 5e-05, "loss": 0.0412, "num_tokens": 12081870.0, "reward": 8.4010009765625, "reward_std": 0.15407449007034302, "rewards/helpfulness_reward/mean": 0.9961004257202148, "rewards/helpfulness_reward/std": 0.6155792474746704, "rewards/safety_reward/mean": 8.4010009765625, "rewards/safety_reward/std": 1.7669787406921387, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 54.515625, "completions/mean_terminated_length": 54.515625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.31892411143131605, "frac_reward_zero_std": 0.0, "grad_norm": 0.42696595191955566, "kl": 3.982421875, "learning_rate": 5e-05, "loss": 0.0597, "num_tokens": 12093496.0, "reward": 8.768798828125, "reward_std": 0.2692049443721771, "rewards/helpfulness_reward/mean": 0.9461688995361328, "rewards/helpfulness_reward/std": 1.0005425214767456, "rewards/safety_reward/mean": 8.768798828125, "rewards/safety_reward/std": 1.3759243488311768, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 55.2265625, "completions/mean_terminated_length": 55.2265625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.31927342590166796, "frac_reward_zero_std": 0.0, "grad_norm": 0.6340134739875793, "kl": 4.50390625, "learning_rate": 5e-05, "loss": 0.0773, "num_tokens": 12104253.0, "reward": 8.89794921875, "reward_std": 0.4176861643791199, "rewards/helpfulness_reward/mean": 0.9386978149414062, "rewards/helpfulness_reward/std": 0.8358709812164307, "rewards/safety_reward/mean": 8.89794921875, "rewards/safety_reward/std": 1.2790334224700928, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 53.9765625, "completions/mean_terminated_length": 53.9765625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.31962274037201993, "frac_reward_zero_std": 0.125, "grad_norm": 0.4400688707828522, "kl": 4.169921875, "learning_rate": 5e-05, "loss": 0.0409, "num_tokens": 12114706.0, "reward": 8.913330078125, "reward_std": 0.13929422199726105, "rewards/helpfulness_reward/mean": 1.0384902954101562, "rewards/helpfulness_reward/std": 0.5113218426704407, "rewards/safety_reward/mean": 8.913330078125, "rewards/safety_reward/std": 0.9195895195007324, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 54.59375, "completions/mean_terminated_length": 54.59375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.31997205484237184, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8157216310501099, "kl": 4.736328125, "learning_rate": 5e-05, "loss": 0.0571, "num_tokens": 12127654.0, "reward": 8.5, "reward_std": 0.26585936546325684, "rewards/helpfulness_reward/mean": 0.9792494773864746, "rewards/helpfulness_reward/std": 0.6046189665794373, "rewards/safety_reward/mean": 8.5, "rewards/safety_reward/std": 1.2787479162216187, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.484375, "completions/mean_terminated_length": 54.484375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.32032136931272376, "frac_reward_zero_std": 0.0, "grad_norm": 0.4510432779788971, "kl": 4.11328125, "learning_rate": 5e-05, "loss": 0.0525, "num_tokens": 12138236.0, "reward": 8.66357421875, "reward_std": 0.3282133936882019, "rewards/helpfulness_reward/mean": 0.67276930809021, "rewards/helpfulness_reward/std": 0.7751685380935669, "rewards/safety_reward/mean": 8.66357421875, "rewards/safety_reward/std": 1.1817845106124878, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 53.921875, "completions/mean_terminated_length": 53.921875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.3206706837830757, "frac_reward_zero_std": 0.0, "grad_norm": 0.40240010619163513, "kl": 3.759765625, "learning_rate": 5e-05, "loss": 0.0331, "num_tokens": 12149682.0, "reward": 8.609130859375, "reward_std": 0.34107258915901184, "rewards/helpfulness_reward/mean": 1.033884048461914, "rewards/helpfulness_reward/std": 0.6060664057731628, "rewards/safety_reward/mean": 8.609130859375, "rewards/safety_reward/std": 1.1151416301727295, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 54.515625, "completions/mean_terminated_length": 54.515625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.32101999825342764, "frac_reward_zero_std": 0.0, "grad_norm": 2.859755516052246, "kl": 5.349609375, "learning_rate": 5e-05, "loss": 0.0641, "num_tokens": 12161292.0, "reward": 8.495361328125, "reward_std": 0.36663416028022766, "rewards/helpfulness_reward/mean": 1.3833580017089844, "rewards/helpfulness_reward/std": 0.6011472344398499, "rewards/safety_reward/mean": 8.495361328125, "rewards/safety_reward/std": 1.041545033454895, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.515625, "completions/mean_terminated_length": 54.515625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.3213693127237796, "frac_reward_zero_std": 0.0, "grad_norm": 0.41820579767227173, "kl": 3.962890625, "learning_rate": 5e-05, "loss": 0.0532, "num_tokens": 12171846.0, "reward": 8.8369140625, "reward_std": 0.40692776441574097, "rewards/helpfulness_reward/mean": 1.226625680923462, "rewards/helpfulness_reward/std": 0.7273900508880615, "rewards/safety_reward/mean": 8.8369140625, "rewards/safety_reward/std": 1.112351894378662, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.0390625, "completions/mean_terminated_length": 54.0390625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.3217186271941315, "frac_reward_zero_std": 0.0, "grad_norm": 0.3825797140598297, "kl": 3.98828125, "learning_rate": 5e-05, "loss": 0.0396, "num_tokens": 12183963.0, "reward": 8.589111328125, "reward_std": 0.37081149220466614, "rewards/helpfulness_reward/mean": 1.3301687240600586, "rewards/helpfulness_reward/std": 0.6999085545539856, "rewards/safety_reward/mean": 8.589111328125, "rewards/safety_reward/std": 1.2517033815383911, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.765625, "completions/mean_terminated_length": 54.765625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.32206794166448344, "frac_reward_zero_std": 0.0, "grad_norm": 0.41212356090545654, "kl": 3.998046875, "learning_rate": 5e-05, "loss": 0.0552, "num_tokens": 12196237.0, "reward": 8.353271484375, "reward_std": 0.39642786979675293, "rewards/helpfulness_reward/mean": 0.7795138359069824, "rewards/helpfulness_reward/std": 0.8309001326560974, "rewards/safety_reward/mean": 8.353271484375, "rewards/safety_reward/std": 1.1233279705047607, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.2421875, "completions/mean_terminated_length": 54.2421875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.3224172561348354, "frac_reward_zero_std": 0.0, "grad_norm": 0.44040918350219727, "kl": 3.896484375, "learning_rate": 5e-05, "loss": 0.0362, "num_tokens": 12207156.0, "reward": 9.044921875, "reward_std": 0.4124220013618469, "rewards/helpfulness_reward/mean": 1.3027267456054688, "rewards/helpfulness_reward/std": 0.7206891775131226, "rewards/safety_reward/mean": 9.044921875, "rewards/safety_reward/std": 1.80709969997406, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.625, "completions/mean_terminated_length": 54.625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.3227665706051873, "frac_reward_zero_std": 0.0, "grad_norm": 0.4879005551338196, "kl": 3.869140625, "learning_rate": 5e-05, "loss": 0.0561, "num_tokens": 12219124.0, "reward": 8.56494140625, "reward_std": 0.40237244963645935, "rewards/helpfulness_reward/mean": 0.7998157143592834, "rewards/helpfulness_reward/std": 0.959751307964325, "rewards/safety_reward/mean": 8.56494140625, "rewards/safety_reward/std": 1.2593804597854614, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 54.34375, "completions/mean_terminated_length": 54.34375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.32311588507553923, "frac_reward_zero_std": 0.0, "grad_norm": 0.3950350880622864, "kl": 4.08203125, "learning_rate": 5e-05, "loss": 0.0454, "num_tokens": 12230216.0, "reward": 8.45654296875, "reward_std": 0.36231544613838196, "rewards/helpfulness_reward/mean": 1.0237667560577393, "rewards/helpfulness_reward/std": 0.8356444835662842, "rewards/safety_reward/mean": 8.45654296875, "rewards/safety_reward/std": 0.970904529094696, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.3828125, "completions/mean_terminated_length": 54.3828125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.3234651995458912, "frac_reward_zero_std": 0.0, "grad_norm": 0.41840606927871704, "kl": 4.115234375, "learning_rate": 5e-05, "loss": 0.0512, "num_tokens": 12242121.0, "reward": 8.785888671875, "reward_std": 0.37280136346817017, "rewards/helpfulness_reward/mean": 0.7789502143859863, "rewards/helpfulness_reward/std": 0.7390221953392029, "rewards/safety_reward/mean": 8.785888671875, "rewards/safety_reward/std": 1.140740990638733, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 91.0, "completions/max_terminated_length": 91.0, "completions/mean_length": 55.078125, "completions/mean_terminated_length": 55.078125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.3238145140162431, "frac_reward_zero_std": 0.0, "grad_norm": 0.3962939977645874, "kl": 4.095703125, "learning_rate": 5e-05, "loss": 0.0567, "num_tokens": 12255731.0, "reward": 8.33056640625, "reward_std": 0.4343562126159668, "rewards/helpfulness_reward/mean": 1.1618938446044922, "rewards/helpfulness_reward/std": 0.613861083984375, "rewards/safety_reward/mean": 8.33056640625, "rewards/safety_reward/std": 1.347225308418274, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 54.78125, "completions/mean_terminated_length": 54.78125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.3241638284865951, "frac_reward_zero_std": 0.0, "grad_norm": 0.3595753312110901, "kl": 3.966796875, "learning_rate": 5e-05, "loss": 0.0551, "num_tokens": 12267183.0, "reward": 8.3994140625, "reward_std": 0.3605709671974182, "rewards/helpfulness_reward/mean": 1.1372642517089844, "rewards/helpfulness_reward/std": 0.8778806924819946, "rewards/safety_reward/mean": 8.3994140625, "rewards/safety_reward/std": 1.2738252878189087, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.59375, "completions/mean_terminated_length": 54.59375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.324513142956947, "frac_reward_zero_std": 0.0, "grad_norm": 0.4736955463886261, "kl": 3.9921875, "learning_rate": 5e-05, "loss": 0.0649, "num_tokens": 12278867.0, "reward": 8.40771484375, "reward_std": 0.3235515058040619, "rewards/helpfulness_reward/mean": 0.9663124084472656, "rewards/helpfulness_reward/std": 0.6599628925323486, "rewards/safety_reward/mean": 8.40771484375, "rewards/safety_reward/std": 0.948401689529419, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 54.2109375, "completions/mean_terminated_length": 54.2109375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.3248624574272989, "frac_reward_zero_std": 0.0, "grad_norm": 0.38625702261924744, "kl": 4.28515625, "learning_rate": 5e-05, "loss": 0.0472, "num_tokens": 12289798.0, "reward": 9.0703125, "reward_std": 0.2755778431892395, "rewards/helpfulness_reward/mean": 1.4932193756103516, "rewards/helpfulness_reward/std": 0.7784438133239746, "rewards/safety_reward/mean": 9.0703125, "rewards/safety_reward/std": 0.9834381341934204, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.3125, "completions/mean_terminated_length": 54.3125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.3252117718976509, "frac_reward_zero_std": 0.0, "grad_norm": 1.0282810926437378, "kl": 4.56640625, "learning_rate": 5e-05, "loss": 0.0579, "num_tokens": 12300942.0, "reward": 8.71630859375, "reward_std": 0.45508140325546265, "rewards/helpfulness_reward/mean": 1.2121734619140625, "rewards/helpfulness_reward/std": 0.7593802809715271, "rewards/safety_reward/mean": 8.71630859375, "rewards/safety_reward/std": 1.1389771699905396, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.453125, "completions/mean_terminated_length": 54.453125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.3255610863680028, "frac_reward_zero_std": 0.0, "grad_norm": 0.3596835136413574, "kl": 4.140625, "learning_rate": 5e-05, "loss": 0.0631, "num_tokens": 12311840.0, "reward": 8.88134765625, "reward_std": 0.3075103461742401, "rewards/helpfulness_reward/mean": 1.1256561279296875, "rewards/helpfulness_reward/std": 0.7766306400299072, "rewards/safety_reward/mean": 8.88134765625, "rewards/safety_reward/std": 1.0445033311843872, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.390625, "completions/mean_terminated_length": 54.390625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.3259104008383547, "frac_reward_zero_std": 0.0, "grad_norm": 0.3820090591907501, "kl": 4.08203125, "learning_rate": 5e-05, "loss": 0.0542, "num_tokens": 12322818.0, "reward": 8.87646484375, "reward_std": 0.34354567527770996, "rewards/helpfulness_reward/mean": 0.9047727584838867, "rewards/helpfulness_reward/std": 0.9139894247055054, "rewards/safety_reward/mean": 8.87646484375, "rewards/safety_reward/std": 1.0488708019256592, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.546875, "completions/mean_terminated_length": 54.546875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.3262597153087067, "frac_reward_zero_std": 0.0, "grad_norm": 0.4699941873550415, "kl": 3.97265625, "learning_rate": 5e-05, "loss": 0.0531, "num_tokens": 12334144.0, "reward": 8.639404296875, "reward_std": 0.3346114158630371, "rewards/helpfulness_reward/mean": 1.1968300342559814, "rewards/helpfulness_reward/std": 0.7898147106170654, "rewards/safety_reward/mean": 8.639404296875, "rewards/safety_reward/std": 1.287084698677063, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.3125, "completions/mean_terminated_length": 54.3125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.3266090297790586, "frac_reward_zero_std": 0.0, "grad_norm": 0.3899536430835724, "kl": 4.033203125, "learning_rate": 5e-05, "loss": 0.0535, "num_tokens": 12346408.0, "reward": 8.431640625, "reward_std": 0.3072558343410492, "rewards/helpfulness_reward/mean": 0.944654107093811, "rewards/helpfulness_reward/std": 0.7144989967346191, "rewards/safety_reward/mean": 8.431640625, "rewards/safety_reward/std": 1.1461589336395264, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 54.6875, "completions/mean_terminated_length": 54.6875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.3269583442494105, "frac_reward_zero_std": 0.0, "grad_norm": 0.5010970830917358, "kl": 4.173828125, "learning_rate": 5e-05, "loss": 0.0613, "num_tokens": 12357408.0, "reward": 8.75244140625, "reward_std": 0.36332711577415466, "rewards/helpfulness_reward/mean": 1.3832416534423828, "rewards/helpfulness_reward/std": 0.8280216455459595, "rewards/safety_reward/mean": 8.75244140625, "rewards/safety_reward/std": 1.0252758264541626, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 53.8984375, "completions/mean_terminated_length": 53.8984375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.3273076587197625, "frac_reward_zero_std": 0.125, "grad_norm": 0.4800202548503876, "kl": 3.962890625, "learning_rate": 5e-05, "loss": 0.0411, "num_tokens": 12370131.0, "reward": 8.4525146484375, "reward_std": 0.34573328495025635, "rewards/helpfulness_reward/mean": 1.1337380409240723, "rewards/helpfulness_reward/std": 0.7538549304008484, "rewards/safety_reward/mean": 8.4525146484375, "rewards/safety_reward/std": 1.4585373401641846, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.1875, "completions/mean_terminated_length": 54.1875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.3276569731901144, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3412764072418213, "kl": 4.1171875, "learning_rate": 5e-05, "loss": 0.0442, "num_tokens": 12381555.0, "reward": 8.881591796875, "reward_std": 0.2167605459690094, "rewards/helpfulness_reward/mean": 1.287899136543274, "rewards/helpfulness_reward/std": 0.5397031903266907, "rewards/safety_reward/mean": 8.881591796875, "rewards/safety_reward/std": 0.8612767457962036, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.390625, "completions/mean_terminated_length": 54.390625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.32800628766046636, "frac_reward_zero_std": 0.0, "grad_norm": 0.42833077907562256, "kl": 4.064453125, "learning_rate": 5e-05, "loss": 0.0476, "num_tokens": 12392381.0, "reward": 8.163330078125, "reward_std": 0.2726733386516571, "rewards/helpfulness_reward/mean": 1.2113838195800781, "rewards/helpfulness_reward/std": 0.6047669053077698, "rewards/safety_reward/mean": 8.163330078125, "rewards/safety_reward/std": 1.2187923192977905, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 54.078125, "completions/mean_terminated_length": 54.078125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.32835560213081827, "frac_reward_zero_std": 0.0625, "grad_norm": 0.27887818217277527, "kl": 3.974609375, "learning_rate": 5e-05, "loss": 0.0406, "num_tokens": 12406255.0, "reward": 8.68408203125, "reward_std": 0.18296314775943756, "rewards/helpfulness_reward/mean": 0.8932085037231445, "rewards/helpfulness_reward/std": 1.0676461458206177, "rewards/safety_reward/mean": 8.68408203125, "rewards/safety_reward/std": 1.346383810043335, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.203125, "completions/mean_terminated_length": 54.203125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.3287049166011702, "frac_reward_zero_std": 0.0, "grad_norm": 5.254642963409424, "kl": 5.912109375, "learning_rate": 5e-05, "loss": 0.0625, "num_tokens": 12419281.0, "reward": 8.537353515625, "reward_std": 0.29199308156967163, "rewards/helpfulness_reward/mean": 0.9509963989257812, "rewards/helpfulness_reward/std": 0.9319491386413574, "rewards/safety_reward/mean": 8.537353515625, "rewards/safety_reward/std": 1.4772671461105347, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 54.109375, "completions/mean_terminated_length": 54.109375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.32905423107152215, "frac_reward_zero_std": 0.0, "grad_norm": 0.4176904559135437, "kl": 4.076171875, "learning_rate": 5e-05, "loss": 0.0453, "num_tokens": 12433495.0, "reward": 8.6954345703125, "reward_std": 0.2595812976360321, "rewards/helpfulness_reward/mean": 1.4847126007080078, "rewards/helpfulness_reward/std": 0.747857391834259, "rewards/safety_reward/mean": 8.6954345703125, "rewards/safety_reward/std": 1.6265548467636108, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.32940354554187407, "frac_reward_zero_std": 0.0, "grad_norm": 0.39291173219680786, "kl": 4.111328125, "learning_rate": 5e-05, "loss": 0.0434, "num_tokens": 12446335.0, "reward": 8.0631103515625, "reward_std": 0.14186397194862366, "rewards/helpfulness_reward/mean": 1.2432441711425781, "rewards/helpfulness_reward/std": 0.9973008632659912, "rewards/safety_reward/mean": 8.0631103515625, "rewards/safety_reward/std": 1.697340488433838, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.15625, "completions/mean_terminated_length": 54.15625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.329752860012226, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5766130089759827, "kl": 4.208984375, "learning_rate": 5e-05, "loss": 0.051, "num_tokens": 12457227.0, "reward": 8.805419921875, "reward_std": 0.3675246834754944, "rewards/helpfulness_reward/mean": 0.9476165771484375, "rewards/helpfulness_reward/std": 1.0684716701507568, "rewards/safety_reward/mean": 8.805419921875, "rewards/safety_reward/std": 1.4632830619812012, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 53.9609375, "completions/mean_terminated_length": 53.9609375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.33010217448257795, "frac_reward_zero_std": 0.0, "grad_norm": 0.46556422114372253, "kl": 4.31640625, "learning_rate": 5e-05, "loss": 0.0418, "num_tokens": 12468398.0, "reward": 9.098388671875, "reward_std": 0.17705538868904114, "rewards/helpfulness_reward/mean": 1.0247230529785156, "rewards/helpfulness_reward/std": 0.5703864693641663, "rewards/safety_reward/mean": 9.098388671875, "rewards/safety_reward/std": 0.893368124961853, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 54.015625, "completions/mean_terminated_length": 54.015625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.33045148895292986, "frac_reward_zero_std": 0.0, "grad_norm": 0.4153003394603729, "kl": 3.9453125, "learning_rate": 5e-05, "loss": 0.0411, "num_tokens": 12480632.0, "reward": 8.761962890625, "reward_std": 0.221096470952034, "rewards/helpfulness_reward/mean": 1.3482513427734375, "rewards/helpfulness_reward/std": 0.6494141817092896, "rewards/safety_reward/mean": 8.761962890625, "rewards/safety_reward/std": 0.9988622069358826, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.953125, "completions/mean_terminated_length": 53.953125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.33080080342328183, "frac_reward_zero_std": 0.0625, "grad_norm": 0.41913822293281555, "kl": 3.783203125, "learning_rate": 5e-05, "loss": 0.0369, "num_tokens": 12492106.0, "reward": 8.915283203125, "reward_std": 0.1436617225408554, "rewards/helpfulness_reward/mean": 1.2315826416015625, "rewards/helpfulness_reward/std": 0.6361302137374878, "rewards/safety_reward/mean": 8.915283203125, "rewards/safety_reward/std": 0.8340767025947571, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.171875, "completions/mean_terminated_length": 54.171875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.33115011789363374, "frac_reward_zero_std": 0.0, "grad_norm": 0.5015518665313721, "kl": 4.201171875, "learning_rate": 5e-05, "loss": 0.0506, "num_tokens": 12505384.0, "reward": 8.594970703125, "reward_std": 0.21925592422485352, "rewards/helpfulness_reward/mean": 1.0936050415039062, "rewards/helpfulness_reward/std": 0.6857178211212158, "rewards/safety_reward/mean": 8.594970703125, "rewards/safety_reward/std": 1.3789336681365967, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.078125, "completions/mean_terminated_length": 54.078125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.33149943236398566, "frac_reward_zero_std": 0.0, "grad_norm": 1.4479460716247559, "kl": 4.83984375, "learning_rate": 5e-05, "loss": 0.0521, "num_tokens": 12517378.0, "reward": 8.66796875, "reward_std": 0.22044946253299713, "rewards/helpfulness_reward/mean": 0.9125652313232422, "rewards/helpfulness_reward/std": 0.7066748738288879, "rewards/safety_reward/mean": 8.66796875, "rewards/safety_reward/std": 1.150934100151062, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.1484375, "completions/mean_terminated_length": 54.1484375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.3318487468343376, "frac_reward_zero_std": 0.0, "grad_norm": 0.453331857919693, "kl": 3.953125, "learning_rate": 5e-05, "loss": 0.0429, "num_tokens": 12528373.0, "reward": 8.573974609375, "reward_std": 0.21817725896835327, "rewards/helpfulness_reward/mean": 1.1793394088745117, "rewards/helpfulness_reward/std": 0.9129785895347595, "rewards/safety_reward/mean": 8.573974609375, "rewards/safety_reward/std": 1.3323447704315186, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 53.984375, "completions/mean_terminated_length": 53.984375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.33219806130468954, "frac_reward_zero_std": 0.0, "grad_norm": 0.2998594343662262, "kl": 3.81640625, "learning_rate": 5e-05, "loss": 0.0388, "num_tokens": 12539019.0, "reward": 8.74365234375, "reward_std": 0.17082160711288452, "rewards/helpfulness_reward/mean": 1.3272705078125, "rewards/helpfulness_reward/std": 0.5970360040664673, "rewards/safety_reward/mean": 8.74365234375, "rewards/safety_reward/std": 0.7793960571289062, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.4140625, "completions/mean_terminated_length": 54.4140625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.33254737577504145, "frac_reward_zero_std": 0.0, "grad_norm": 1.061667561531067, "kl": 4.98046875, "learning_rate": 5e-05, "loss": 0.0577, "num_tokens": 12550704.0, "reward": 9.09130859375, "reward_std": 0.28581497073173523, "rewards/helpfulness_reward/mean": 1.1453170776367188, "rewards/helpfulness_reward/std": 0.6653697490692139, "rewards/safety_reward/mean": 9.09130859375, "rewards/safety_reward/std": 1.2146363258361816, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.3046875, "completions/mean_terminated_length": 54.3046875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.3328966902453934, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5045182108879089, "kl": 3.923828125, "learning_rate": 5e-05, "loss": 0.0457, "num_tokens": 12561791.0, "reward": 8.84814453125, "reward_std": 0.2021273672580719, "rewards/helpfulness_reward/mean": 1.0596026182174683, "rewards/helpfulness_reward/std": 0.7148849964141846, "rewards/safety_reward/mean": 8.84814453125, "rewards/safety_reward/std": 1.240649938583374, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 54.0546875, "completions/mean_terminated_length": 54.0546875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.33324600471574534, "frac_reward_zero_std": 0.0625, "grad_norm": 0.34330394864082336, "kl": 4.0078125, "learning_rate": 5e-05, "loss": 0.0408, "num_tokens": 12572502.0, "reward": 9.23583984375, "reward_std": 0.15027549862861633, "rewards/helpfulness_reward/mean": 1.64813232421875, "rewards/helpfulness_reward/std": 0.5579631924629211, "rewards/safety_reward/mean": 9.23583984375, "rewards/safety_reward/std": 1.2182726860046387, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.203125, "completions/mean_terminated_length": 54.203125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.3335953191860973, "frac_reward_zero_std": 0.0, "grad_norm": 0.37760138511657715, "kl": 4.212890625, "learning_rate": 5e-05, "loss": 0.0364, "num_tokens": 12583536.0, "reward": 9.028076171875, "reward_std": 0.17564743757247925, "rewards/helpfulness_reward/mean": 1.3627827167510986, "rewards/helpfulness_reward/std": 0.6322814226150513, "rewards/safety_reward/mean": 9.028076171875, "rewards/safety_reward/std": 0.8638814687728882, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.4375, "completions/mean_terminated_length": 54.4375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.3339446336564492, "frac_reward_zero_std": 0.0, "grad_norm": 0.3866423964500427, "kl": 4.275390625, "learning_rate": 5e-05, "loss": 0.054, "num_tokens": 12596656.0, "reward": 8.95361328125, "reward_std": 0.32577893137931824, "rewards/helpfulness_reward/mean": 1.238074779510498, "rewards/helpfulness_reward/std": 0.7114124894142151, "rewards/safety_reward/mean": 8.95361328125, "rewards/safety_reward/std": 0.9839695692062378, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 54.296875, "completions/mean_terminated_length": 54.296875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.33429394812680113, "frac_reward_zero_std": 0.0, "grad_norm": 1.2498834133148193, "kl": 4.431640625, "learning_rate": 5e-05, "loss": 0.0525, "num_tokens": 12608094.0, "reward": 8.8330078125, "reward_std": 0.29193615913391113, "rewards/helpfulness_reward/mean": 1.3292429447174072, "rewards/helpfulness_reward/std": 0.6126344799995422, "rewards/safety_reward/mean": 8.8330078125, "rewards/safety_reward/std": 1.1673489809036255, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 54.6328125, "completions/mean_terminated_length": 54.6328125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.3346432625971531, "frac_reward_zero_std": 0.0, "grad_norm": 0.3648030459880829, "kl": 3.966796875, "learning_rate": 5e-05, "loss": 0.0479, "num_tokens": 12619119.0, "reward": 8.623291015625, "reward_std": 0.30782103538513184, "rewards/helpfulness_reward/mean": 1.1385002136230469, "rewards/helpfulness_reward/std": 0.6011741757392883, "rewards/safety_reward/mean": 8.623291015625, "rewards/safety_reward/std": 0.795981228351593, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.3203125, "completions/mean_terminated_length": 54.3203125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.334992577067505, "frac_reward_zero_std": 0.0, "grad_norm": 0.43392395973205566, "kl": 4.3125, "learning_rate": 5e-05, "loss": 0.0511, "num_tokens": 12630880.0, "reward": 8.4404296875, "reward_std": 0.3300262987613678, "rewards/helpfulness_reward/mean": 0.908557116985321, "rewards/helpfulness_reward/std": 0.6500107049942017, "rewards/safety_reward/mean": 8.4404296875, "rewards/safety_reward/std": 1.2298663854599, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 54.3515625, "completions/mean_terminated_length": 54.3515625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.33534189153785693, "frac_reward_zero_std": 0.0, "grad_norm": 0.40279561281204224, "kl": 3.86328125, "learning_rate": 5e-05, "loss": 0.0474, "num_tokens": 12642165.0, "reward": 8.819580078125, "reward_std": 0.23490504920482635, "rewards/helpfulness_reward/mean": 1.1483089923858643, "rewards/helpfulness_reward/std": 0.8275324702262878, "rewards/safety_reward/mean": 8.819580078125, "rewards/safety_reward/std": 1.636271357536316, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.53125, "completions/mean_terminated_length": 54.53125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.3356912060082089, "frac_reward_zero_std": 0.0, "grad_norm": 0.4275771975517273, "kl": 4.283203125, "learning_rate": 5e-05, "loss": 0.0564, "num_tokens": 12653377.0, "reward": 8.63037109375, "reward_std": 0.44120633602142334, "rewards/helpfulness_reward/mean": 1.3090877532958984, "rewards/helpfulness_reward/std": 0.6404209733009338, "rewards/safety_reward/mean": 8.63037109375, "rewards/safety_reward/std": 1.2549245357513428, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.4921875, "completions/mean_terminated_length": 54.4921875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.3360405204785608, "frac_reward_zero_std": 0.0, "grad_norm": 0.37177371978759766, "kl": 4.1484375, "learning_rate": 5e-05, "loss": 0.0517, "num_tokens": 12664680.0, "reward": 8.776123046875, "reward_std": 0.31631895899772644, "rewards/helpfulness_reward/mean": 0.92669677734375, "rewards/helpfulness_reward/std": 0.7532109618186951, "rewards/safety_reward/mean": 8.776123046875, "rewards/safety_reward/std": 1.2622966766357422, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 54.3359375, "completions/mean_terminated_length": 54.3359375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.3363898349489128, "frac_reward_zero_std": 0.0, "grad_norm": 0.4577087163925171, "kl": 4.1875, "learning_rate": 5e-05, "loss": 0.0503, "num_tokens": 12675323.0, "reward": 8.5887451171875, "reward_std": 0.3977905511856079, "rewards/helpfulness_reward/mean": 1.0136083364486694, "rewards/helpfulness_reward/std": 0.6610426306724548, "rewards/safety_reward/mean": 8.5887451171875, "rewards/safety_reward/std": 0.9417430758476257, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.3046875, "completions/mean_terminated_length": 54.3046875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.3367391494192647, "frac_reward_zero_std": 0.0, "grad_norm": 0.5573409199714661, "kl": 4.46875, "learning_rate": 5e-05, "loss": 0.0501, "num_tokens": 12688426.0, "reward": 8.496826171875, "reward_std": 0.3627827763557434, "rewards/helpfulness_reward/mean": 0.8549442291259766, "rewards/helpfulness_reward/std": 0.7939266562461853, "rewards/safety_reward/mean": 8.496826171875, "rewards/safety_reward/std": 1.7785470485687256, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 54.375, "completions/mean_terminated_length": 54.375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.3370884638896166, "frac_reward_zero_std": 0.0, "grad_norm": 0.4633565843105316, "kl": 4.07421875, "learning_rate": 5e-05, "loss": 0.0472, "num_tokens": 12700970.0, "reward": 8.15887451171875, "reward_std": 0.4157849848270416, "rewards/helpfulness_reward/mean": 0.9508342742919922, "rewards/helpfulness_reward/std": 0.773253321647644, "rewards/safety_reward/mean": 8.15887451171875, "rewards/safety_reward/std": 2.01780104637146, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 54.046875, "completions/mean_terminated_length": 54.046875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.3374377783599686, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3685762286186218, "kl": 3.875, "learning_rate": 5e-05, "loss": 0.035, "num_tokens": 12711904.0, "reward": 8.48193359375, "reward_std": 0.182663232088089, "rewards/helpfulness_reward/mean": 1.158905029296875, "rewards/helpfulness_reward/std": 0.5775387287139893, "rewards/safety_reward/mean": 8.48193359375, "rewards/safety_reward/std": 1.2044072151184082, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.234375, "completions/mean_terminated_length": 54.234375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.3377870928303205, "frac_reward_zero_std": 0.0, "grad_norm": 0.44463542103767395, "kl": 3.84375, "learning_rate": 5e-05, "loss": 0.0439, "num_tokens": 12724918.0, "reward": 7.99169921875, "reward_std": 0.2972695827484131, "rewards/helpfulness_reward/mean": 1.1566219329833984, "rewards/helpfulness_reward/std": 0.7240773439407349, "rewards/safety_reward/mean": 7.99169921875, "rewards/safety_reward/std": 2.0984556674957275, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.171875, "completions/mean_terminated_length": 54.171875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.3381364073006724, "frac_reward_zero_std": 0.0, "grad_norm": 0.3462564945220947, "kl": 4.3828125, "learning_rate": 5e-05, "loss": 0.0449, "num_tokens": 12735964.0, "reward": 9.001220703125, "reward_std": 0.31577369570732117, "rewards/helpfulness_reward/mean": 1.292933464050293, "rewards/helpfulness_reward/std": 0.6388101577758789, "rewards/safety_reward/mean": 9.001220703125, "rewards/safety_reward/std": 1.3104208707809448, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.0078125, "completions/mean_terminated_length": 54.0078125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.3384857217710244, "frac_reward_zero_std": 0.0, "grad_norm": 0.5401602983474731, "kl": 4.263671875, "learning_rate": 5e-05, "loss": 0.0386, "num_tokens": 12746957.0, "reward": 9.0718994140625, "reward_std": 0.4633815884590149, "rewards/helpfulness_reward/mean": 1.113814353942871, "rewards/helpfulness_reward/std": 0.5524821877479553, "rewards/safety_reward/mean": 9.0718994140625, "rewards/safety_reward/std": 1.0957225561141968, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.2890625, "completions/mean_terminated_length": 54.2890625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.3388350362413763, "frac_reward_zero_std": 0.0, "grad_norm": 0.33194077014923096, "kl": 3.796875, "learning_rate": 5e-05, "loss": 0.0478, "num_tokens": 12758282.0, "reward": 8.50341796875, "reward_std": 0.2244347482919693, "rewards/helpfulness_reward/mean": 0.7624094486236572, "rewards/helpfulness_reward/std": 0.7348828911781311, "rewards/safety_reward/mean": 8.50341796875, "rewards/safety_reward/std": 1.0188857316970825, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.375, "completions/mean_terminated_length": 54.375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.33918435071172826, "frac_reward_zero_std": 0.0, "grad_norm": 0.40390390157699585, "kl": 4.03125, "learning_rate": 5e-05, "loss": 0.0499, "num_tokens": 12770482.0, "reward": 8.741455078125, "reward_std": 0.33842065930366516, "rewards/helpfulness_reward/mean": 0.9002113342285156, "rewards/helpfulness_reward/std": 0.5655182003974915, "rewards/safety_reward/mean": 8.741455078125, "rewards/safety_reward/std": 1.191544771194458, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.3125, "completions/mean_terminated_length": 54.3125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.33953366518208017, "frac_reward_zero_std": 0.0, "grad_norm": 0.4601347744464874, "kl": 4.078125, "learning_rate": 5e-05, "loss": 0.0547, "num_tokens": 12783330.0, "reward": 8.560546875, "reward_std": 0.31425753235816956, "rewards/helpfulness_reward/mean": 1.3179931640625, "rewards/helpfulness_reward/std": 0.6166655421257019, "rewards/safety_reward/mean": 8.560546875, "rewards/safety_reward/std": 1.4078634977340698, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.3359375, "completions/mean_terminated_length": 54.3359375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.3398829796524321, "frac_reward_zero_std": 0.0, "grad_norm": 0.4483763575553894, "kl": 3.9375, "learning_rate": 5e-05, "loss": 0.0469, "num_tokens": 12794605.0, "reward": 8.653564453125, "reward_std": 0.33219945430755615, "rewards/helpfulness_reward/mean": 1.4553909301757812, "rewards/helpfulness_reward/std": 0.7072336077690125, "rewards/safety_reward/mean": 8.653564453125, "rewards/safety_reward/std": 1.2028255462646484, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 54.640625, "completions/mean_terminated_length": 54.640625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.34023229412278405, "frac_reward_zero_std": 0.0, "grad_norm": 0.49610447883605957, "kl": 4.048828125, "learning_rate": 5e-05, "loss": 0.0553, "num_tokens": 12805775.0, "reward": 8.986083984375, "reward_std": 0.30363476276397705, "rewards/helpfulness_reward/mean": 1.0349140167236328, "rewards/helpfulness_reward/std": 0.911872923374176, "rewards/safety_reward/mean": 8.986083984375, "rewards/safety_reward/std": 1.3389133214950562, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 54.671875, "completions/mean_terminated_length": 54.671875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.34058160859313596, "frac_reward_zero_std": 0.0625, "grad_norm": 0.42904898524284363, "kl": 4.095703125, "learning_rate": 5e-05, "loss": 0.0653, "num_tokens": 12816333.0, "reward": 8.7386474609375, "reward_std": 0.34189626574516296, "rewards/helpfulness_reward/mean": 1.1593475341796875, "rewards/helpfulness_reward/std": 0.5121880173683167, "rewards/safety_reward/mean": 8.7386474609375, "rewards/safety_reward/std": 1.2923449277877808, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.3409309230634879, "frac_reward_zero_std": 0.0, "grad_norm": 0.3253578841686249, "kl": 4.15234375, "learning_rate": 5e-05, "loss": 0.0221, "num_tokens": 12828613.0, "reward": 8.310546875, "reward_std": 0.28476566076278687, "rewards/helpfulness_reward/mean": 0.9313645362854004, "rewards/helpfulness_reward/std": 0.9110682010650635, "rewards/safety_reward/mean": 8.310546875, "rewards/safety_reward/std": 1.3062500953674316, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.796875, "completions/mean_terminated_length": 53.796875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.34128023753383985, "frac_reward_zero_std": 0.0, "grad_norm": 0.366367369890213, "kl": 4.26171875, "learning_rate": 5e-05, "loss": 0.0366, "num_tokens": 12839699.0, "reward": 8.6826171875, "reward_std": 0.3011210858821869, "rewards/helpfulness_reward/mean": 1.1481170654296875, "rewards/helpfulness_reward/std": 0.7350705862045288, "rewards/safety_reward/mean": 8.6826171875, "rewards/safety_reward/std": 1.1990183591842651, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 54.234375, "completions/mean_terminated_length": 54.234375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.34162955200419176, "frac_reward_zero_std": 0.0625, "grad_norm": 0.38448184728622437, "kl": 4.357421875, "learning_rate": 5e-05, "loss": 0.0499, "num_tokens": 12851025.0, "reward": 9.04150390625, "reward_std": 0.15381629765033722, "rewards/helpfulness_reward/mean": 1.2540016174316406, "rewards/helpfulness_reward/std": 0.5745155811309814, "rewards/safety_reward/mean": 9.04150390625, "rewards/safety_reward/std": 1.347524881362915, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 54.1875, "completions/mean_terminated_length": 54.1875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.34197886647454373, "frac_reward_zero_std": 0.0, "grad_norm": 0.34224647283554077, "kl": 4.033203125, "learning_rate": 5e-05, "loss": 0.0412, "num_tokens": 12862409.0, "reward": 9.047119140625, "reward_std": 0.3316284418106079, "rewards/helpfulness_reward/mean": 1.2012476921081543, "rewards/helpfulness_reward/std": 0.5532510876655579, "rewards/safety_reward/mean": 9.047119140625, "rewards/safety_reward/std": 1.1994194984436035, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 53.8671875, "completions/mean_terminated_length": 53.8671875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.34232818094489564, "frac_reward_zero_std": 0.0, "grad_norm": 0.40148064494132996, "kl": 4.375, "learning_rate": 5e-05, "loss": 0.0397, "num_tokens": 12874496.0, "reward": 9.126220703125, "reward_std": 0.3338129222393036, "rewards/helpfulness_reward/mean": 1.2570037841796875, "rewards/helpfulness_reward/std": 0.5925160050392151, "rewards/safety_reward/mean": 9.126220703125, "rewards/safety_reward/std": 1.290273904800415, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.09375, "completions/mean_terminated_length": 54.09375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.34267749541524756, "frac_reward_zero_std": 0.0, "grad_norm": 0.3987235128879547, "kl": 4.2734375, "learning_rate": 5e-05, "loss": 0.046, "num_tokens": 12884852.0, "reward": 8.59326171875, "reward_std": 0.1582866907119751, "rewards/helpfulness_reward/mean": 1.379678726196289, "rewards/helpfulness_reward/std": 0.5656901001930237, "rewards/safety_reward/mean": 8.59326171875, "rewards/safety_reward/std": 0.8228816986083984, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 53.953125, "completions/mean_terminated_length": 53.953125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.3430268098855995, "frac_reward_zero_std": 0.0, "grad_norm": 0.284707635641098, "kl": 4.068359375, "learning_rate": 5e-05, "loss": 0.0395, "num_tokens": 12895958.0, "reward": 9.0048828125, "reward_std": 0.195026695728302, "rewards/helpfulness_reward/mean": 1.5438957214355469, "rewards/helpfulness_reward/std": 0.7626729011535645, "rewards/safety_reward/mean": 9.0048828125, "rewards/safety_reward/std": 1.0580284595489502, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.125, "completions/mean_terminated_length": 54.125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.34337612435595144, "frac_reward_zero_std": 0.0, "grad_norm": 0.4164467751979828, "kl": 4.1640625, "learning_rate": 5e-05, "loss": 0.0481, "num_tokens": 12908118.0, "reward": 8.679443359375, "reward_std": 0.3362274765968323, "rewards/helpfulness_reward/mean": 1.0642280578613281, "rewards/helpfulness_reward/std": 0.8852719068527222, "rewards/safety_reward/mean": 8.679443359375, "rewards/safety_reward/std": 1.201208472251892, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 54.2421875, "completions/mean_terminated_length": 54.2421875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.34372543882630335, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3250572979450226, "kl": 3.845703125, "learning_rate": 5e-05, "loss": 0.045, "num_tokens": 12918765.0, "reward": 8.759521484375, "reward_std": 0.1409989595413208, "rewards/helpfulness_reward/mean": 0.878662109375, "rewards/helpfulness_reward/std": 0.6037145256996155, "rewards/safety_reward/mean": 8.759521484375, "rewards/safety_reward/std": 0.886164665222168, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 53.9765625, "completions/mean_terminated_length": 53.9765625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.3440747532966553, "frac_reward_zero_std": 0.0625, "grad_norm": 0.2864457666873932, "kl": 4.12109375, "learning_rate": 5e-05, "loss": 0.0379, "num_tokens": 12930018.0, "reward": 8.36865234375, "reward_std": 0.26973021030426025, "rewards/helpfulness_reward/mean": 0.9339396953582764, "rewards/helpfulness_reward/std": 0.6497293710708618, "rewards/safety_reward/mean": 8.36865234375, "rewards/safety_reward/std": 0.8504930734634399, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 54.90625, "completions/mean_terminated_length": 54.90625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.34442406776700724, "frac_reward_zero_std": 0.0, "grad_norm": 0.3929842710494995, "kl": 4.2421875, "learning_rate": 5e-05, "loss": 0.0712, "num_tokens": 12942134.0, "reward": 8.71826171875, "reward_std": 0.34092143177986145, "rewards/helpfulness_reward/mean": 1.2560882568359375, "rewards/helpfulness_reward/std": 0.6606929302215576, "rewards/safety_reward/mean": 8.71826171875, "rewards/safety_reward/std": 1.231500506401062, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.3447733822373592, "frac_reward_zero_std": 0.0, "grad_norm": 0.3362930119037628, "kl": 4.13671875, "learning_rate": 5e-05, "loss": 0.0317, "num_tokens": 12953470.0, "reward": 8.882568359375, "reward_std": 0.33632105588912964, "rewards/helpfulness_reward/mean": 1.2260990142822266, "rewards/helpfulness_reward/std": 0.7420486211776733, "rewards/safety_reward/mean": 8.882568359375, "rewards/safety_reward/std": 1.1494415998458862, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.1328125, "completions/mean_terminated_length": 54.1328125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.3451226967077111, "frac_reward_zero_std": 0.0625, "grad_norm": 0.49084562063217163, "kl": 3.8984375, "learning_rate": 5e-05, "loss": 0.0403, "num_tokens": 12964447.0, "reward": 8.424560546875, "reward_std": 0.1997050642967224, "rewards/helpfulness_reward/mean": 0.8826866149902344, "rewards/helpfulness_reward/std": 0.7100235819816589, "rewards/safety_reward/mean": 8.424560546875, "rewards/safety_reward/std": 0.9849501252174377, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.34375, "completions/mean_terminated_length": 54.34375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.34547201117806303, "frac_reward_zero_std": 0.0625, "grad_norm": 0.36580511927604675, "kl": 4.0703125, "learning_rate": 5e-05, "loss": 0.0505, "num_tokens": 12975451.0, "reward": 8.359619140625, "reward_std": 0.26004090905189514, "rewards/helpfulness_reward/mean": 0.8643198013305664, "rewards/helpfulness_reward/std": 0.7110540270805359, "rewards/safety_reward/mean": 8.359619140625, "rewards/safety_reward/std": 1.2047288417816162, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 54.171875, "completions/mean_terminated_length": 54.171875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.345821325648415, "frac_reward_zero_std": 0.0, "grad_norm": 0.35555917024612427, "kl": 4.1953125, "learning_rate": 5e-05, "loss": 0.0499, "num_tokens": 12986209.0, "reward": 8.658447265625, "reward_std": 0.22173742949962616, "rewards/helpfulness_reward/mean": 1.2998418807983398, "rewards/helpfulness_reward/std": 0.628387451171875, "rewards/safety_reward/mean": 8.658447265625, "rewards/safety_reward/std": 1.1476167440414429, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.9921875, "completions/mean_terminated_length": 53.9921875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.3461706401187669, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3719691038131714, "kl": 4.072265625, "learning_rate": 5e-05, "loss": 0.0407, "num_tokens": 12997048.0, "reward": 9.20361328125, "reward_std": 0.1322462558746338, "rewards/helpfulness_reward/mean": 1.3809967041015625, "rewards/helpfulness_reward/std": 0.5217525959014893, "rewards/safety_reward/mean": 9.20361328125, "rewards/safety_reward/std": 1.0425699949264526, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 97.0, "completions/max_terminated_length": 97.0, "completions/mean_length": 54.3515625, "completions/mean_terminated_length": 54.3515625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.34651995458911883, "frac_reward_zero_std": 0.0, "grad_norm": 0.5914506316184998, "kl": 4.17578125, "learning_rate": 5e-05, "loss": 0.0551, "num_tokens": 13008757.0, "reward": 8.55419921875, "reward_std": 0.25219589471817017, "rewards/helpfulness_reward/mean": 1.3024673461914062, "rewards/helpfulness_reward/std": 0.8202577829360962, "rewards/safety_reward/mean": 8.55419921875, "rewards/safety_reward/std": 1.1617035865783691, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 54.0625, "completions/mean_terminated_length": 54.0625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.3468692690594708, "frac_reward_zero_std": 0.0, "grad_norm": 0.41226550936698914, "kl": 4.150390625, "learning_rate": 5e-05, "loss": 0.0408, "num_tokens": 13021069.0, "reward": 8.9619140625, "reward_std": 0.22410067915916443, "rewards/helpfulness_reward/mean": 1.202554702758789, "rewards/helpfulness_reward/std": 0.7004518508911133, "rewards/safety_reward/mean": 8.9619140625, "rewards/safety_reward/std": 0.7762481570243835, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 54.28125, "completions/mean_terminated_length": 54.28125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.3472185835298227, "frac_reward_zero_std": 0.0, "grad_norm": 0.3599089980125427, "kl": 4.126953125, "learning_rate": 5e-05, "loss": 0.0488, "num_tokens": 13032849.0, "reward": 9.08837890625, "reward_std": 0.251284658908844, "rewards/helpfulness_reward/mean": 1.2927665710449219, "rewards/helpfulness_reward/std": 0.4902353286743164, "rewards/safety_reward/mean": 9.08837890625, "rewards/safety_reward/std": 0.9677462577819824, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 54.1484375, "completions/mean_terminated_length": 54.1484375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.3475678980001747, "frac_reward_zero_std": 0.0, "grad_norm": 0.335403174161911, "kl": 3.94921875, "learning_rate": 5e-05, "loss": 0.0453, "num_tokens": 13044140.0, "reward": 8.572021484375, "reward_std": 0.27508461475372314, "rewards/helpfulness_reward/mean": 0.9284858703613281, "rewards/helpfulness_reward/std": 0.6893311738967896, "rewards/safety_reward/mean": 8.572021484375, "rewards/safety_reward/std": 1.1919244527816772, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 54.6640625, "completions/mean_terminated_length": 54.6640625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.3479172124705266, "frac_reward_zero_std": 0.0, "grad_norm": 0.5152756571769714, "kl": 3.978515625, "learning_rate": 5e-05, "loss": 0.053, "num_tokens": 13055785.0, "reward": 8.601318359375, "reward_std": 0.36872464418411255, "rewards/helpfulness_reward/mean": 1.084050178527832, "rewards/helpfulness_reward/std": 0.7060816884040833, "rewards/safety_reward/mean": 8.601318359375, "rewards/safety_reward/std": 1.2101595401763916, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 54.515625, "completions/mean_terminated_length": 54.515625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.3482665269408785, "frac_reward_zero_std": 0.0, "grad_norm": 0.33892354369163513, "kl": 4.16796875, "learning_rate": 5e-05, "loss": 0.05, "num_tokens": 13067003.0, "reward": 8.640380859375, "reward_std": 0.22980786859989166, "rewards/helpfulness_reward/mean": 1.004058837890625, "rewards/helpfulness_reward/std": 0.8336489200592041, "rewards/safety_reward/mean": 8.640380859375, "rewards/safety_reward/std": 1.2064381837844849, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 54.625, "completions/mean_terminated_length": 54.625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.3486158414112305, "frac_reward_zero_std": 0.0, "grad_norm": 0.3387080430984497, "kl": 3.84375, "learning_rate": 5e-05, "loss": 0.0514, "num_tokens": 13080731.0, "reward": 8.030029296875, "reward_std": 0.2580711245536804, "rewards/helpfulness_reward/mean": 0.9104456305503845, "rewards/helpfulness_reward/std": 0.8089323043823242, "rewards/safety_reward/mean": 8.030029296875, "rewards/safety_reward/std": 1.8585478067398071, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.65625, "completions/mean_terminated_length": 54.65625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.3489651558815824, "frac_reward_zero_std": 0.0625, "grad_norm": 0.4569501578807831, "kl": 4.009765625, "learning_rate": 5e-05, "loss": 0.0637, "num_tokens": 13091295.0, "reward": 8.406005859375, "reward_std": 0.4022517800331116, "rewards/helpfulness_reward/mean": 1.092513084411621, "rewards/helpfulness_reward/std": 0.5354636907577515, "rewards/safety_reward/mean": 8.406005859375, "rewards/safety_reward/std": 0.9469218850135803, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.8359375, "completions/mean_terminated_length": 54.8359375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.3493144703519343, "frac_reward_zero_std": 0.0, "grad_norm": 0.34586843848228455, "kl": 4.099609375, "learning_rate": 5e-05, "loss": 0.0534, "num_tokens": 13102210.0, "reward": 9.12353515625, "reward_std": 0.2841973900794983, "rewards/helpfulness_reward/mean": 1.187525749206543, "rewards/helpfulness_reward/std": 0.7206754684448242, "rewards/safety_reward/mean": 9.12353515625, "rewards/safety_reward/std": 1.0057570934295654, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.265625, "completions/mean_terminated_length": 54.265625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.34966378482228627, "frac_reward_zero_std": 0.0625, "grad_norm": 2.3412580490112305, "kl": 5.572265625, "learning_rate": 5e-05, "loss": 0.0639, "num_tokens": 13113564.0, "reward": 8.8759765625, "reward_std": 0.2536240518093109, "rewards/helpfulness_reward/mean": 1.24041748046875, "rewards/helpfulness_reward/std": 0.8390331864356995, "rewards/safety_reward/mean": 8.8759765625, "rewards/safety_reward/std": 1.300096869468689, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 54.6484375, "completions/mean_terminated_length": 54.6484375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.3500130992926382, "frac_reward_zero_std": 0.0, "grad_norm": 0.40287667512893677, "kl": 4.166015625, "learning_rate": 5e-05, "loss": 0.0557, "num_tokens": 13125375.0, "reward": 8.8154296875, "reward_std": 0.3709428906440735, "rewards/helpfulness_reward/mean": 1.2777233123779297, "rewards/helpfulness_reward/std": 0.7625518441200256, "rewards/safety_reward/mean": 8.8154296875, "rewards/safety_reward/std": 1.2532588243484497, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.171875, "completions/mean_terminated_length": 54.171875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.35036241376299015, "frac_reward_zero_std": 0.0, "grad_norm": 0.4923807680606842, "kl": 4.029296875, "learning_rate": 5e-05, "loss": 0.0443, "num_tokens": 13136341.0, "reward": 8.799560546875, "reward_std": 0.2325611710548401, "rewards/helpfulness_reward/mean": 0.8439092636108398, "rewards/helpfulness_reward/std": 0.8757749795913696, "rewards/safety_reward/mean": 8.799560546875, "rewards/safety_reward/std": 1.284139633178711, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 54.1328125, "completions/mean_terminated_length": 54.1328125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.35071172823334207, "frac_reward_zero_std": 0.0, "grad_norm": 0.3686533570289612, "kl": 4.16015625, "learning_rate": 5e-05, "loss": 0.0426, "num_tokens": 13147774.0, "reward": 8.6337890625, "reward_std": 0.20069725811481476, "rewards/helpfulness_reward/mean": 1.1648235321044922, "rewards/helpfulness_reward/std": 0.7013720273971558, "rewards/safety_reward/mean": 8.6337890625, "rewards/safety_reward/std": 1.4525220394134521, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 54.4453125, "completions/mean_terminated_length": 54.4453125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.351061042703694, "frac_reward_zero_std": 0.0, "grad_norm": 0.3572978973388672, "kl": 4.24609375, "learning_rate": 5e-05, "loss": 0.0561, "num_tokens": 13159583.0, "reward": 8.909912109375, "reward_std": 0.23934495449066162, "rewards/helpfulness_reward/mean": 1.5029525756835938, "rewards/helpfulness_reward/std": 0.7383021712303162, "rewards/safety_reward/mean": 8.909912109375, "rewards/safety_reward/std": 1.2160474061965942, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.375, "completions/mean_terminated_length": 54.375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.35141035717404595, "frac_reward_zero_std": 0.0, "grad_norm": 0.38769233226776123, "kl": 4.015625, "learning_rate": 5e-05, "loss": 0.0475, "num_tokens": 13170695.0, "reward": 9.0810546875, "reward_std": 0.22440585494041443, "rewards/helpfulness_reward/mean": 1.2949066162109375, "rewards/helpfulness_reward/std": 0.5292204022407532, "rewards/safety_reward/mean": 9.0810546875, "rewards/safety_reward/std": 0.8029940128326416, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 54.6328125, "completions/mean_terminated_length": 54.6328125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.35175967164439786, "frac_reward_zero_std": 0.0, "grad_norm": 0.39086273312568665, "kl": 4.296875, "learning_rate": 5e-05, "loss": 0.0666, "num_tokens": 13183264.0, "reward": 8.874267578125, "reward_std": 0.3193231225013733, "rewards/helpfulness_reward/mean": 1.4214982986450195, "rewards/helpfulness_reward/std": 0.6159697771072388, "rewards/safety_reward/mean": 8.874267578125, "rewards/safety_reward/std": 1.0852521657943726, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 54.1484375, "completions/mean_terminated_length": 54.1484375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.3521089861147498, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3826528787612915, "kl": 4.3203125, "learning_rate": 5e-05, "loss": 0.0437, "num_tokens": 13195203.0, "reward": 8.73095703125, "reward_std": 0.2963566780090332, "rewards/helpfulness_reward/mean": 0.7262228727340698, "rewards/helpfulness_reward/std": 0.6353182196617126, "rewards/safety_reward/mean": 8.73095703125, "rewards/safety_reward/std": 1.2014708518981934, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.1796875, "completions/mean_terminated_length": 54.1796875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.35245830058510175, "frac_reward_zero_std": 0.0, "grad_norm": 0.302397221326828, "kl": 3.935546875, "learning_rate": 5e-05, "loss": 0.0427, "num_tokens": 13207442.0, "reward": 9.239501953125, "reward_std": 0.23504143953323364, "rewards/helpfulness_reward/mean": 1.535980224609375, "rewards/helpfulness_reward/std": 0.5241897106170654, "rewards/safety_reward/mean": 9.239501953125, "rewards/safety_reward/std": 1.2749314308166504, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.35280761505545366, "frac_reward_zero_std": 0.0625, "grad_norm": 0.37353575229644775, "kl": 4.421875, "learning_rate": 5e-05, "loss": 0.05, "num_tokens": 13218442.0, "reward": 9.225830078125, "reward_std": 0.22381198406219482, "rewards/helpfulness_reward/mean": 1.4113006591796875, "rewards/helpfulness_reward/std": 0.5667091608047485, "rewards/safety_reward/mean": 9.225830078125, "rewards/safety_reward/std": 0.8435195088386536, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 54.5625, "completions/mean_terminated_length": 54.5625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.35315692952580563, "frac_reward_zero_std": 0.0, "grad_norm": 0.39010918140411377, "kl": 4.03125, "learning_rate": 5e-05, "loss": 0.0579, "num_tokens": 13229650.0, "reward": 8.687744140625, "reward_std": 0.2650066614151001, "rewards/helpfulness_reward/mean": 1.3615789413452148, "rewards/helpfulness_reward/std": 0.5805946588516235, "rewards/safety_reward/mean": 8.687744140625, "rewards/safety_reward/std": 0.5930761098861694, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 54.109375, "completions/mean_terminated_length": 54.109375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.35350624399615754, "frac_reward_zero_std": 0.0, "grad_norm": 0.3814920485019684, "kl": 3.951171875, "learning_rate": 5e-05, "loss": 0.0409, "num_tokens": 13241000.0, "reward": 9.068115234375, "reward_std": 0.14476248621940613, "rewards/helpfulness_reward/mean": 1.4336776733398438, "rewards/helpfulness_reward/std": 0.4590052366256714, "rewards/safety_reward/mean": 9.068115234375, "rewards/safety_reward/std": 1.464539647102356, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.3671875, "completions/mean_terminated_length": 54.3671875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.35385555846650946, "frac_reward_zero_std": 0.0, "grad_norm": 0.39795786142349243, "kl": 4.162109375, "learning_rate": 5e-05, "loss": 0.0476, "num_tokens": 13253063.0, "reward": 8.6669921875, "reward_std": 0.3205583691596985, "rewards/helpfulness_reward/mean": 1.2662239074707031, "rewards/helpfulness_reward/std": 0.6451355218887329, "rewards/safety_reward/mean": 8.6669921875, "rewards/safety_reward/std": 1.0155459642410278, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 54.2578125, "completions/mean_terminated_length": 54.2578125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.3542048729368614, "frac_reward_zero_std": 0.0, "grad_norm": 0.35724636912345886, "kl": 4.048828125, "learning_rate": 5e-05, "loss": 0.0485, "num_tokens": 13264688.0, "reward": 8.5938720703125, "reward_std": 0.3157307505607605, "rewards/helpfulness_reward/mean": 1.2257499694824219, "rewards/helpfulness_reward/std": 0.7890333533287048, "rewards/safety_reward/mean": 8.5938720703125, "rewards/safety_reward/std": 1.2679574489593506, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.4921875, "completions/mean_terminated_length": 54.4921875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.35455418740721334, "frac_reward_zero_std": 0.0, "grad_norm": 0.8297381401062012, "kl": 4.525390625, "learning_rate": 5e-05, "loss": 0.0594, "num_tokens": 13275975.0, "reward": 9.1044921875, "reward_std": 0.28103768825531006, "rewards/helpfulness_reward/mean": 1.2487068176269531, "rewards/helpfulness_reward/std": 0.5522201657295227, "rewards/safety_reward/mean": 9.1044921875, "rewards/safety_reward/std": 1.1783437728881836, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 54.078125, "completions/mean_terminated_length": 54.078125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.35490350187756525, "frac_reward_zero_std": 0.0, "grad_norm": 0.5054092407226562, "kl": 4.32421875, "learning_rate": 5e-05, "loss": 0.047, "num_tokens": 13286681.0, "reward": 8.769287109375, "reward_std": 0.2598116993904114, "rewards/helpfulness_reward/mean": 1.385910987854004, "rewards/helpfulness_reward/std": 0.4624502658843994, "rewards/safety_reward/mean": 8.769287109375, "rewards/safety_reward/std": 1.260811448097229, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 54.546875, "completions/mean_terminated_length": 54.546875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.3552528163479172, "frac_reward_zero_std": 0.0, "grad_norm": 2.8830137252807617, "kl": 5.296875, "learning_rate": 5e-05, "loss": 0.0673, "num_tokens": 13297423.0, "reward": 8.852294921875, "reward_std": 0.33155348896980286, "rewards/helpfulness_reward/mean": 1.1148676872253418, "rewards/helpfulness_reward/std": 0.976499617099762, "rewards/safety_reward/mean": 8.852294921875, "rewards/safety_reward/std": 1.1969212293624878, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.046875, "completions/mean_terminated_length": 54.046875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.35560213081826914, "frac_reward_zero_std": 0.0, "grad_norm": 0.2667228579521179, "kl": 4.087890625, "learning_rate": 5e-05, "loss": 0.0422, "num_tokens": 13308101.0, "reward": 8.655029296875, "reward_std": 0.21984857320785522, "rewards/helpfulness_reward/mean": 1.059988021850586, "rewards/helpfulness_reward/std": 0.8245813250541687, "rewards/safety_reward/mean": 8.655029296875, "rewards/safety_reward/std": 1.2210626602172852, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 54.3359375, "completions/mean_terminated_length": 54.3359375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.3559514452886211, "frac_reward_zero_std": 0.0, "grad_norm": 0.4761098325252533, "kl": 4.2578125, "learning_rate": 5e-05, "loss": 0.0558, "num_tokens": 13318504.0, "reward": 9.294921875, "reward_std": 0.28538164496421814, "rewards/helpfulness_reward/mean": 1.3829669952392578, "rewards/helpfulness_reward/std": 0.6981922388076782, "rewards/safety_reward/mean": 9.294921875, "rewards/safety_reward/std": 1.506592035293579, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 54.421875, "completions/mean_terminated_length": 54.421875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.356300759758973, "frac_reward_zero_std": 0.0, "grad_norm": 0.4924279451370239, "kl": 4.52734375, "learning_rate": 5e-05, "loss": 0.0606, "num_tokens": 13330206.0, "reward": 8.780029296875, "reward_std": 0.3600066900253296, "rewards/helpfulness_reward/mean": 1.4185104370117188, "rewards/helpfulness_reward/std": 0.7904649376869202, "rewards/safety_reward/mean": 8.780029296875, "rewards/safety_reward/std": 1.0228638648986816, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.53125, "completions/mean_terminated_length": 54.53125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.35665007422932493, "frac_reward_zero_std": 0.0, "grad_norm": 0.3995843827724457, "kl": 4.109375, "learning_rate": 5e-05, "loss": 0.0491, "num_tokens": 13341234.0, "reward": 8.635498046875, "reward_std": 0.3432002663612366, "rewards/helpfulness_reward/mean": 1.203775405883789, "rewards/helpfulness_reward/std": 0.6415616273880005, "rewards/safety_reward/mean": 8.635498046875, "rewards/safety_reward/std": 1.0966761112213135, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 57.5625, "completions/mean_terminated_length": 57.5625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.3569993886996769, "frac_reward_zero_std": 0.0, "grad_norm": 0.40140610933303833, "kl": 3.796875, "learning_rate": 5e-05, "loss": 0.0981, "num_tokens": 13355354.0, "reward": 8.51214599609375, "reward_std": 0.434048593044281, "rewards/helpfulness_reward/mean": 1.2749967575073242, "rewards/helpfulness_reward/std": 0.8054088950157166, "rewards/safety_reward/mean": 8.51214599609375, "rewards/safety_reward/std": 1.5549484491348267, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 54.7734375, "completions/mean_terminated_length": 54.7734375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.3573487031700288, "frac_reward_zero_std": 0.0625, "grad_norm": 0.34376880526542664, "kl": 3.974609375, "learning_rate": 5e-05, "loss": 0.0602, "num_tokens": 13366533.0, "reward": 8.576171875, "reward_std": 0.3406940996646881, "rewards/helpfulness_reward/mean": 1.2577123641967773, "rewards/helpfulness_reward/std": 0.6657001972198486, "rewards/safety_reward/mean": 8.576171875, "rewards/safety_reward/std": 1.091915249824524, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 54.09375, "completions/mean_terminated_length": 54.09375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.3576980176403807, "frac_reward_zero_std": 0.0, "grad_norm": 0.3708425760269165, "kl": 4.009765625, "learning_rate": 5e-05, "loss": 0.0421, "num_tokens": 13377361.0, "reward": 9.541259765625, "reward_std": 0.15806281566619873, "rewards/helpfulness_reward/mean": 1.7130126953125, "rewards/helpfulness_reward/std": 0.5991260409355164, "rewards/safety_reward/mean": 9.541259765625, "rewards/safety_reward/std": 0.9538028836250305, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 54.953125, "completions/mean_terminated_length": 54.953125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.3580473321107327, "frac_reward_zero_std": 0.0, "grad_norm": 0.4836071729660034, "kl": 4.044921875, "learning_rate": 5e-05, "loss": 0.0823, "num_tokens": 13388235.0, "reward": 8.621337890625, "reward_std": 0.4768297076225281, "rewards/helpfulness_reward/mean": 1.2384395599365234, "rewards/helpfulness_reward/std": 0.8328243494033813, "rewards/safety_reward/mean": 8.621337890625, "rewards/safety_reward/std": 1.241776704788208, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.109375, "completions/mean_terminated_length": 54.109375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.3583966465810846, "frac_reward_zero_std": 0.0, "grad_norm": 0.4528546929359436, "kl": 4.234375, "learning_rate": 5e-05, "loss": 0.0463, "num_tokens": 13399209.0, "reward": 9.267333984375, "reward_std": 0.22384585440158844, "rewards/helpfulness_reward/mean": 1.3459854125976562, "rewards/helpfulness_reward/std": 0.5984275341033936, "rewards/safety_reward/mean": 9.267333984375, "rewards/safety_reward/std": 1.1939886808395386, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 54.890625, "completions/mean_terminated_length": 54.890625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.3587459610514366, "frac_reward_zero_std": 0.0, "grad_norm": 0.39386874437332153, "kl": 4.12890625, "learning_rate": 5e-05, "loss": 0.071, "num_tokens": 13409971.0, "reward": 8.6658935546875, "reward_std": 0.638097882270813, "rewards/helpfulness_reward/mean": 1.1438045501708984, "rewards/helpfulness_reward/std": 0.6963846683502197, "rewards/safety_reward/mean": 8.6658935546875, "rewards/safety_reward/std": 1.2425529956817627, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 54.2421875, "completions/mean_terminated_length": 54.2421875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.3590952755217885, "frac_reward_zero_std": 0.0, "grad_norm": 0.4173142910003662, "kl": 4.27734375, "learning_rate": 5e-05, "loss": 0.0488, "num_tokens": 13421706.0, "reward": 9.08740234375, "reward_std": 0.2612406611442566, "rewards/helpfulness_reward/mean": 1.2328166961669922, "rewards/helpfulness_reward/std": 0.6836113333702087, "rewards/safety_reward/mean": 9.08740234375, "rewards/safety_reward/std": 1.090665340423584, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 54.296875, "completions/mean_terminated_length": 54.296875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.3594445899921404, "frac_reward_zero_std": 0.0, "grad_norm": 0.33729690313339233, "kl": 4.0703125, "learning_rate": 5e-05, "loss": 0.0502, "num_tokens": 13432296.0, "reward": 9.187744140625, "reward_std": 0.24827229976654053, "rewards/helpfulness_reward/mean": 1.3006973266601562, "rewards/helpfulness_reward/std": 1.0246186256408691, "rewards/safety_reward/mean": 9.187744140625, "rewards/safety_reward/std": 1.0744786262512207, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 54.484375, "completions/mean_terminated_length": 54.484375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.3597939044624924, "frac_reward_zero_std": 0.0, "grad_norm": 0.48740434646606445, "kl": 4.43359375, "learning_rate": 5e-05, "loss": 0.0525, "num_tokens": 13444974.0, "reward": 8.7294921875, "reward_std": 0.3720049262046814, "rewards/helpfulness_reward/mean": 1.116044044494629, "rewards/helpfulness_reward/std": 0.73552405834198, "rewards/safety_reward/mean": 8.7294921875, "rewards/safety_reward/std": 1.4190034866333008, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 54.1015625, "completions/mean_terminated_length": 54.1015625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.3601432189328443, "frac_reward_zero_std": 0.0, "grad_norm": 0.4148586690425873, "kl": 4.15625, "learning_rate": 5e-05, "loss": 0.0424, "num_tokens": 13455579.0, "reward": 9.013427734375, "reward_std": 0.2661261558532715, "rewards/helpfulness_reward/mean": 1.0926027297973633, "rewards/helpfulness_reward/std": 0.8087120056152344, "rewards/safety_reward/mean": 9.013427734375, "rewards/safety_reward/std": 0.8743202686309814, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 54.6875, "completions/mean_terminated_length": 54.6875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.3604925334031962, "frac_reward_zero_std": 0.0, "grad_norm": 0.371038019657135, "kl": 4.130859375, "learning_rate": 5e-05, "loss": 0.0573, "num_tokens": 13466675.0, "reward": 8.767333984375, "reward_std": 0.268655002117157, "rewards/helpfulness_reward/mean": 0.9296112060546875, "rewards/helpfulness_reward/std": 0.45120832324028015, "rewards/safety_reward/mean": 8.767333984375, "rewards/safety_reward/std": 1.0618325471878052, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.2421875, "completions/mean_terminated_length": 54.2421875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.36084184787354817, "frac_reward_zero_std": 0.0, "grad_norm": 0.36060187220573425, "kl": 4.21484375, "learning_rate": 5e-05, "loss": 0.0515, "num_tokens": 13477698.0, "reward": 8.9970703125, "reward_std": 0.2902703285217285, "rewards/helpfulness_reward/mean": 1.1176338195800781, "rewards/helpfulness_reward/std": 0.7606704831123352, "rewards/safety_reward/mean": 8.9970703125, "rewards/safety_reward/std": 1.1793417930603027, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.1796875, "completions/mean_terminated_length": 54.1796875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.3611911623439001, "frac_reward_zero_std": 0.0, "grad_norm": 0.49959176778793335, "kl": 4.359375, "learning_rate": 5e-05, "loss": 0.0467, "num_tokens": 13488497.0, "reward": 8.871826171875, "reward_std": 0.2501041293144226, "rewards/helpfulness_reward/mean": 1.1380558013916016, "rewards/helpfulness_reward/std": 0.8912343978881836, "rewards/safety_reward/mean": 8.871826171875, "rewards/safety_reward/std": 1.1202178001403809, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 66.9765625, "completions/mean_terminated_length": 66.9765625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.36154047681425205, "frac_reward_zero_std": 0.0, "grad_norm": 0.42828404903411865, "kl": 3.841796875, "learning_rate": 5e-05, "loss": 0.1397, "num_tokens": 13503174.0, "reward": 8.35498046875, "reward_std": 0.5237988233566284, "rewards/helpfulness_reward/mean": 1.2700109481811523, "rewards/helpfulness_reward/std": 0.7166681289672852, "rewards/safety_reward/mean": 8.35498046875, "rewards/safety_reward/std": 2.270841598510742, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.2109375, "completions/mean_terminated_length": 54.2109375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.36188979128460397, "frac_reward_zero_std": 0.0, "grad_norm": 0.4406109154224396, "kl": 3.8125, "learning_rate": 5e-05, "loss": 0.0404, "num_tokens": 13513969.0, "reward": 8.318359375, "reward_std": 0.2944471538066864, "rewards/helpfulness_reward/mean": 0.9905627965927124, "rewards/helpfulness_reward/std": 0.6726572513580322, "rewards/safety_reward/mean": 8.318359375, "rewards/safety_reward/std": 1.1095335483551025, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.1875, "completions/mean_terminated_length": 54.1875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.3622391057549559, "frac_reward_zero_std": 0.0, "grad_norm": 0.46944916248321533, "kl": 4.01171875, "learning_rate": 5e-05, "loss": 0.0459, "num_tokens": 13524697.0, "reward": 9.072998046875, "reward_std": 0.3317440152168274, "rewards/helpfulness_reward/mean": 1.431370496749878, "rewards/helpfulness_reward/std": 0.703248918056488, "rewards/safety_reward/mean": 9.072998046875, "rewards/safety_reward/std": 0.8296686410903931, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 54.1796875, "completions/mean_terminated_length": 54.1796875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.36258842022530785, "frac_reward_zero_std": 0.0, "grad_norm": 0.2794610857963562, "kl": 4.1171875, "learning_rate": 5e-05, "loss": 0.0404, "num_tokens": 13536192.0, "reward": 8.754638671875, "reward_std": 0.24152064323425293, "rewards/helpfulness_reward/mean": 1.3617172241210938, "rewards/helpfulness_reward/std": 0.5976154804229736, "rewards/safety_reward/mean": 8.754638671875, "rewards/safety_reward/std": 0.9731420874595642, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 54.1484375, "completions/mean_terminated_length": 54.1484375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.36293773469565976, "frac_reward_zero_std": 0.0, "grad_norm": 0.39159321784973145, "kl": 4.25, "learning_rate": 5e-05, "loss": 0.0363, "num_tokens": 13547507.0, "reward": 8.5792236328125, "reward_std": 0.25535109639167786, "rewards/helpfulness_reward/mean": 1.09124755859375, "rewards/helpfulness_reward/std": 0.7495378255844116, "rewards/safety_reward/mean": 8.5792236328125, "rewards/safety_reward/std": 0.9806137681007385, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 54.1484375, "completions/mean_terminated_length": 54.1484375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.3632870491660117, "frac_reward_zero_std": 0.0, "grad_norm": 0.45851486921310425, "kl": 4.125, "learning_rate": 5e-05, "loss": 0.0431, "num_tokens": 13557870.0, "reward": 8.950927734375, "reward_std": 0.27696692943573, "rewards/helpfulness_reward/mean": 1.53955078125, "rewards/helpfulness_reward/std": 0.6224413514137268, "rewards/safety_reward/mean": 8.950927734375, "rewards/safety_reward/std": 0.9183298945426941, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 53.9765625, "completions/mean_terminated_length": 53.9765625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.36363636363636365, "frac_reward_zero_std": 0.0, "grad_norm": 0.3338521420955658, "kl": 4.021484375, "learning_rate": 5e-05, "loss": 0.0344, "num_tokens": 13568595.0, "reward": 9.0908203125, "reward_std": 0.24538856744766235, "rewards/helpfulness_reward/mean": 1.2422232627868652, "rewards/helpfulness_reward/std": 0.710659384727478, "rewards/safety_reward/mean": 9.0908203125, "rewards/safety_reward/std": 1.1996018886566162, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 54.0625, "completions/mean_terminated_length": 54.0625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.36398567810671556, "frac_reward_zero_std": 0.0, "grad_norm": 0.4050903022289276, "kl": 4.04296875, "learning_rate": 5e-05, "loss": 0.0416, "num_tokens": 13579499.0, "reward": 8.929443359375, "reward_std": 0.21322016417980194, "rewards/helpfulness_reward/mean": 1.357757568359375, "rewards/helpfulness_reward/std": 0.46275708079338074, "rewards/safety_reward/mean": 8.929443359375, "rewards/safety_reward/std": 0.8862834572792053, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 100.0, "completions/max_terminated_length": 100.0, "completions/mean_length": 54.3828125, "completions/mean_terminated_length": 54.3828125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.36433499257706753, "frac_reward_zero_std": 0.0, "grad_norm": 0.3965219557285309, "kl": 4.119140625, "learning_rate": 5e-05, "loss": 0.0586, "num_tokens": 13591028.0, "reward": 9.031494140625, "reward_std": 0.30953723192214966, "rewards/helpfulness_reward/mean": 1.1550378799438477, "rewards/helpfulness_reward/std": 0.686394453048706, "rewards/safety_reward/mean": 9.031494140625, "rewards/safety_reward/std": 1.4082388877868652, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.1640625, "completions/mean_terminated_length": 54.1640625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.36468430704741944, "frac_reward_zero_std": 0.0, "grad_norm": 0.45086154341697693, "kl": 4.2265625, "learning_rate": 5e-05, "loss": 0.0459, "num_tokens": 13601721.0, "reward": 9.09033203125, "reward_std": 0.15643835067749023, "rewards/helpfulness_reward/mean": 1.18585205078125, "rewards/helpfulness_reward/std": 0.7487500309944153, "rewards/safety_reward/mean": 9.09033203125, "rewards/safety_reward/std": 0.9263546466827393, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.0234375, "completions/mean_terminated_length": 54.0234375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.36503362151777136, "frac_reward_zero_std": 0.0625, "grad_norm": 0.39530959725379944, "kl": 4.25, "learning_rate": 5e-05, "loss": 0.0355, "num_tokens": 13612908.0, "reward": 9.0205078125, "reward_std": 0.29067355394363403, "rewards/helpfulness_reward/mean": 0.9966869354248047, "rewards/helpfulness_reward/std": 0.7118358016014099, "rewards/safety_reward/mean": 9.0205078125, "rewards/safety_reward/std": 0.7507832646369934, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.9921875, "completions/mean_terminated_length": 53.9921875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.3653829359881233, "frac_reward_zero_std": 0.0, "grad_norm": 16.80134391784668, "kl": 12.75390625, "learning_rate": 5e-05, "loss": 0.126, "num_tokens": 13623747.0, "reward": 9.007568359375, "reward_std": 0.2167188972234726, "rewards/helpfulness_reward/mean": 1.3379707336425781, "rewards/helpfulness_reward/std": 0.8138722777366638, "rewards/safety_reward/mean": 9.007568359375, "rewards/safety_reward/std": 0.7902848124504089, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 54.0703125, "completions/mean_terminated_length": 54.0703125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.36573225045847524, "frac_reward_zero_std": 0.0, "grad_norm": 0.44786831736564636, "kl": 4.1796875, "learning_rate": 5e-05, "loss": 0.0422, "num_tokens": 13634508.0, "reward": 9.191650390625, "reward_std": 0.17287597060203552, "rewards/helpfulness_reward/mean": 1.1197662353515625, "rewards/helpfulness_reward/std": 0.5614249110221863, "rewards/safety_reward/mean": 9.191650390625, "rewards/safety_reward/std": 1.0495268106460571, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.1796875, "completions/mean_terminated_length": 54.1796875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.36608156492882715, "frac_reward_zero_std": 0.0, "grad_norm": 0.35035431385040283, "kl": 4.052734375, "learning_rate": 5e-05, "loss": 0.0368, "num_tokens": 13646139.0, "reward": 8.677490234375, "reward_std": 0.2351846694946289, "rewards/helpfulness_reward/mean": 1.3580780029296875, "rewards/helpfulness_reward/std": 0.6849490404129028, "rewards/safety_reward/mean": 8.677490234375, "rewards/safety_reward/std": 1.2279610633850098, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.265625, "completions/mean_terminated_length": 54.265625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.3664308793991791, "frac_reward_zero_std": 0.0625, "grad_norm": 1.5428441762924194, "kl": 4.80859375, "learning_rate": 5e-05, "loss": 0.0603, "num_tokens": 13658021.0, "reward": 8.967041015625, "reward_std": 0.340947687625885, "rewards/helpfulness_reward/mean": 1.1955299377441406, "rewards/helpfulness_reward/std": 0.5756097435951233, "rewards/safety_reward/mean": 8.967041015625, "rewards/safety_reward/std": 1.3460016250610352, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 55.5859375, "completions/mean_terminated_length": 55.5859375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.36678019386953103, "frac_reward_zero_std": 0.0, "grad_norm": 0.36343318223953247, "kl": 4.03125, "learning_rate": 5e-05, "loss": 0.0544, "num_tokens": 13671560.0, "reward": 8.3905029296875, "reward_std": 0.26118648052215576, "rewards/helpfulness_reward/mean": 1.2463912963867188, "rewards/helpfulness_reward/std": 0.5512993931770325, "rewards/safety_reward/mean": 8.3905029296875, "rewards/safety_reward/std": 1.9665405750274658, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 54.109375, "completions/mean_terminated_length": 54.109375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.367129508339883, "frac_reward_zero_std": 0.0, "grad_norm": 0.4093508720397949, "kl": 4.455078125, "learning_rate": 5e-05, "loss": 0.0455, "num_tokens": 13682942.0, "reward": 8.167724609375, "reward_std": 0.4040237069129944, "rewards/helpfulness_reward/mean": 1.2490768432617188, "rewards/helpfulness_reward/std": 0.530701756477356, "rewards/safety_reward/mean": 8.167724609375, "rewards/safety_reward/std": 1.271026849746704, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 54.03125, "completions/mean_terminated_length": 54.03125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.3674788228102349, "frac_reward_zero_std": 0.0, "grad_norm": 0.3120267987251282, "kl": 4.048828125, "learning_rate": 5e-05, "loss": 0.0406, "num_tokens": 13693986.0, "reward": 9.498779296875, "reward_std": 0.1440771520137787, "rewards/helpfulness_reward/mean": 1.322610855102539, "rewards/helpfulness_reward/std": 0.8052700161933899, "rewards/safety_reward/mean": 9.498779296875, "rewards/safety_reward/std": 0.8225304484367371, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 53.8828125, "completions/mean_terminated_length": 53.8828125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.36782813728058683, "frac_reward_zero_std": 0.0625, "grad_norm": 0.35558801889419556, "kl": 4.05078125, "learning_rate": 5e-05, "loss": 0.0391, "num_tokens": 13705171.0, "reward": 8.857666015625, "reward_std": 0.2311275750398636, "rewards/helpfulness_reward/mean": 1.4037132263183594, "rewards/helpfulness_reward/std": 0.763451337814331, "rewards/safety_reward/mean": 8.857666015625, "rewards/safety_reward/std": 1.337154746055603, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 54.3359375, "completions/mean_terminated_length": 54.3359375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.3681774517509388, "frac_reward_zero_std": 0.0, "grad_norm": 0.31832605600357056, "kl": 3.9765625, "learning_rate": 5e-05, "loss": 0.0474, "num_tokens": 13715582.0, "reward": 9.134521484375, "reward_std": 0.19196918606758118, "rewards/helpfulness_reward/mean": 1.396315336227417, "rewards/helpfulness_reward/std": 0.7766242623329163, "rewards/safety_reward/mean": 9.134521484375, "rewards/safety_reward/std": 0.8650352954864502, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 54.03125, "completions/mean_terminated_length": 54.03125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.3685267662212907, "frac_reward_zero_std": 0.0, "grad_norm": 0.46844837069511414, "kl": 3.966796875, "learning_rate": 5e-05, "loss": 0.0474, "num_tokens": 13728002.0, "reward": 9.081298828125, "reward_std": 0.2576271593570709, "rewards/helpfulness_reward/mean": 1.2771224975585938, "rewards/helpfulness_reward/std": 0.6773401498794556, "rewards/safety_reward/mean": 9.081298828125, "rewards/safety_reward/std": 1.3483591079711914, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.0234375, "completions/mean_terminated_length": 54.0234375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.3688760806916426, "frac_reward_zero_std": 0.0, "grad_norm": 0.3617154061794281, "kl": 4.017578125, "learning_rate": 5e-05, "loss": 0.0439, "num_tokens": 13739245.0, "reward": 8.959228515625, "reward_std": 0.26218682527542114, "rewards/helpfulness_reward/mean": 1.1359384059906006, "rewards/helpfulness_reward/std": 0.6546652317047119, "rewards/safety_reward/mean": 8.959228515625, "rewards/safety_reward/std": 1.0862585306167603, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.9140625, "completions/mean_terminated_length": 54.9140625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.3692253951619946, "frac_reward_zero_std": 0.0, "grad_norm": 0.5798121690750122, "kl": 4.595703125, "learning_rate": 5e-05, "loss": 0.0761, "num_tokens": 13750162.0, "reward": 8.99365234375, "reward_std": 0.3657711446285248, "rewards/helpfulness_reward/mean": 1.483734130859375, "rewards/helpfulness_reward/std": 0.5353149175643921, "rewards/safety_reward/mean": 8.99365234375, "rewards/safety_reward/std": 0.8918887972831726, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 52.4375, "completions/mean_terminated_length": 52.4375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3695747096323465, "frac_reward_zero_std": 0.0, "grad_norm": 6.375075817108154, "kl": 6.60546875, "learning_rate": 5e-05, "loss": 0.0592, "num_tokens": 13763810.0, "reward": 8.3782958984375, "reward_std": 0.260293573141098, "rewards/helpfulness_reward/mean": 1.6410667896270752, "rewards/helpfulness_reward/std": 0.8341822624206543, "rewards/safety_reward/mean": 8.3782958984375, "rewards/safety_reward/std": 2.006906270980835, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 54.2578125, "completions/mean_terminated_length": 54.2578125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.3699240241026985, "frac_reward_zero_std": 0.0, "grad_norm": 0.35129329562187195, "kl": 4.36328125, "learning_rate": 5e-05, "loss": 0.0483, "num_tokens": 13775275.0, "reward": 8.646728515625, "reward_std": 0.2602434456348419, "rewards/helpfulness_reward/mean": 1.2673110961914062, "rewards/helpfulness_reward/std": 0.7544266581535339, "rewards/safety_reward/mean": 8.646728515625, "rewards/safety_reward/std": 1.1017149686813354, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 52.2890625, "completions/mean_terminated_length": 52.2890625, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.3702733385730504, "frac_reward_zero_std": 0.0, "grad_norm": 9559973888.0, "kl": 3703570434.5039062, "learning_rate": 5e-05, "loss": 36994080.0, "num_tokens": 13787976.0, "reward": 8.16241455078125, "reward_std": 0.2581871747970581, "rewards/helpfulness_reward/mean": 1.3165950775146484, "rewards/helpfulness_reward/std": 0.9340946078300476, "rewards/safety_reward/mean": 8.16241455078125, "rewards/safety_reward/std": 1.972995638847351, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.609375, "completions/mean_terminated_length": 54.609375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.3706226530434023, "frac_reward_zero_std": 0.0, "grad_norm": 0.4150345027446747, "kl": 4.37890625, "learning_rate": 5e-05, "loss": 0.0628, "num_tokens": 13799606.0, "reward": 8.975341796875, "reward_std": 0.2892695963382721, "rewards/helpfulness_reward/mean": 1.2870173454284668, "rewards/helpfulness_reward/std": 0.8565593957901001, "rewards/safety_reward/mean": 8.975341796875, "rewards/safety_reward/std": 1.0828630924224854, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 53.9609375, "completions/mean_terminated_length": 53.9609375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.3709719675137543, "frac_reward_zero_std": 0.0, "grad_norm": 0.36142924427986145, "kl": 4.02734375, "learning_rate": 5e-05, "loss": 0.0414, "num_tokens": 13810665.0, "reward": 9.0224609375, "reward_std": 0.24355000257492065, "rewards/helpfulness_reward/mean": 1.3664093017578125, "rewards/helpfulness_reward/std": 0.48394855856895447, "rewards/safety_reward/mean": 9.0224609375, "rewards/safety_reward/std": 0.8108225464820862, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.1484375, "completions/mean_terminated_length": 54.1484375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.3713212819841062, "frac_reward_zero_std": 0.0, "grad_norm": 0.33523625135421753, "kl": 4.1875, "learning_rate": 5e-05, "loss": 0.0431, "num_tokens": 13822020.0, "reward": 9.19189453125, "reward_std": 0.2134464681148529, "rewards/helpfulness_reward/mean": 1.31707763671875, "rewards/helpfulness_reward/std": 0.5122272372245789, "rewards/safety_reward/mean": 9.19189453125, "rewards/safety_reward/std": 1.029266357421875, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.515625, "completions/mean_terminated_length": 54.515625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.3716705964544581, "frac_reward_zero_std": 0.0, "grad_norm": 0.3861035108566284, "kl": 4.162109375, "learning_rate": 5e-05, "loss": 0.0576, "num_tokens": 13832838.0, "reward": 8.516845703125, "reward_std": 0.30255556106567383, "rewards/helpfulness_reward/mean": 1.3018240928649902, "rewards/helpfulness_reward/std": 0.7946638464927673, "rewards/safety_reward/mean": 8.516845703125, "rewards/safety_reward/std": 1.061572551727295, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 55.859375, "completions/mean_terminated_length": 55.859375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.37201991092481007, "frac_reward_zero_std": 0.0, "grad_norm": 0.5711678266525269, "kl": 4.109375, "learning_rate": 5e-05, "loss": 0.0371, "num_tokens": 13845380.0, "reward": 8.2135009765625, "reward_std": 0.45580118894577026, "rewards/helpfulness_reward/mean": 1.2286815643310547, "rewards/helpfulness_reward/std": 0.6289548873901367, "rewards/safety_reward/mean": 8.2135009765625, "rewards/safety_reward/std": 2.410735607147217, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 55.15625, "completions/mean_terminated_length": 55.15625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.372369225395162, "frac_reward_zero_std": 0.0, "grad_norm": 0.5132355093955994, "kl": 4.37890625, "learning_rate": 5e-05, "loss": 0.068, "num_tokens": 13856552.0, "reward": 8.84033203125, "reward_std": 0.423605740070343, "rewards/helpfulness_reward/mean": 1.3677825927734375, "rewards/helpfulness_reward/std": 0.6038599014282227, "rewards/safety_reward/mean": 8.84033203125, "rewards/safety_reward/std": 1.2844372987747192, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 54.9375, "completions/mean_terminated_length": 54.9375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.37271853986551395, "frac_reward_zero_std": 0.0, "grad_norm": 0.49914196133613586, "kl": 3.98046875, "learning_rate": 5e-05, "loss": 0.0585, "num_tokens": 13867048.0, "reward": 8.813720703125, "reward_std": 0.34652113914489746, "rewards/helpfulness_reward/mean": 1.3220796585083008, "rewards/helpfulness_reward/std": 0.7305936813354492, "rewards/safety_reward/mean": 8.813720703125, "rewards/safety_reward/std": 1.2643821239471436, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.265625, "completions/mean_terminated_length": 54.265625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.37306785433586587, "frac_reward_zero_std": 0.0, "grad_norm": 0.4376702606678009, "kl": 3.734375, "learning_rate": 5e-05, "loss": 0.0452, "num_tokens": 13878538.0, "reward": 8.796630859375, "reward_std": 0.296477735042572, "rewards/helpfulness_reward/mean": 1.2831649780273438, "rewards/helpfulness_reward/std": 0.8741790056228638, "rewards/safety_reward/mean": 8.796630859375, "rewards/safety_reward/std": 1.0547715425491333, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 54.9921875, "completions/mean_terminated_length": 54.9921875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.3734171688062178, "frac_reward_zero_std": 0.0, "grad_norm": 0.923069417476654, "kl": 4.744140625, "learning_rate": 5e-05, "loss": 0.074, "num_tokens": 13889473.0, "reward": 8.78271484375, "reward_std": 0.2626458406448364, "rewards/helpfulness_reward/mean": 1.2427022457122803, "rewards/helpfulness_reward/std": 0.7772464156150818, "rewards/safety_reward/mean": 8.78271484375, "rewards/safety_reward/std": 1.1254090070724487, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 54.3984375, "completions/mean_terminated_length": 54.3984375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.37376648327656975, "frac_reward_zero_std": 0.0, "grad_norm": 0.4940149486064911, "kl": 4.1640625, "learning_rate": 5e-05, "loss": 0.0578, "num_tokens": 13902180.0, "reward": 8.1663818359375, "reward_std": 0.4824778437614441, "rewards/helpfulness_reward/mean": 1.0001347064971924, "rewards/helpfulness_reward/std": 0.7384554743766785, "rewards/safety_reward/mean": 8.1663818359375, "rewards/safety_reward/std": 1.852879285812378, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 54.84375, "completions/mean_terminated_length": 54.84375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.37411579774692166, "frac_reward_zero_std": 0.0, "grad_norm": 0.40408995747566223, "kl": 4.07421875, "learning_rate": 5e-05, "loss": 0.0503, "num_tokens": 13913232.0, "reward": 8.928955078125, "reward_std": 0.5435336828231812, "rewards/helpfulness_reward/mean": 1.3968896865844727, "rewards/helpfulness_reward/std": 0.6136090755462646, "rewards/safety_reward/mean": 8.928955078125, "rewards/safety_reward/std": 1.187957763671875, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 54.203125, "completions/mean_terminated_length": 54.203125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.3744651122172736, "frac_reward_zero_std": 0.0, "grad_norm": 0.5378521084785461, "kl": 4.064453125, "learning_rate": 5e-05, "loss": 0.1018, "num_tokens": 13926618.0, "reward": 8.358283996582031, "reward_std": 0.5565682053565979, "rewards/helpfulness_reward/mean": 1.1789088249206543, "rewards/helpfulness_reward/std": 0.6561278700828552, "rewards/safety_reward/mean": 8.358283996582031, "rewards/safety_reward/std": 2.018204689025879, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 53.265625, "completions/mean_terminated_length": 53.265625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.37481442668762555, "frac_reward_zero_std": 0.0, "grad_norm": 0.44456395506858826, "kl": 4.048828125, "learning_rate": 5e-05, "loss": 0.041, "num_tokens": 13939220.0, "reward": 7.95263671875, "reward_std": 0.4067802429199219, "rewards/helpfulness_reward/mean": 1.2185516357421875, "rewards/helpfulness_reward/std": 0.7245096564292908, "rewards/safety_reward/mean": 7.95263671875, "rewards/safety_reward/std": 1.9633554220199585, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 54.65625, "completions/mean_terminated_length": 54.65625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.37516374115797746, "frac_reward_zero_std": 0.0, "grad_norm": 0.46019595861434937, "kl": 4.15625, "learning_rate": 5e-05, "loss": 0.0597, "num_tokens": 13951048.0, "reward": 8.3558349609375, "reward_std": 0.5247521996498108, "rewards/helpfulness_reward/mean": 0.8426055908203125, "rewards/helpfulness_reward/std": 0.6882117390632629, "rewards/safety_reward/mean": 8.3558349609375, "rewards/safety_reward/std": 1.2632908821105957, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.359375, "completions/mean_terminated_length": 54.359375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.37551305562832943, "frac_reward_zero_std": 0.0, "grad_norm": 0.42375677824020386, "kl": 4.11328125, "learning_rate": 5e-05, "loss": 0.0534, "num_tokens": 13964134.0, "reward": 8.197509765625, "reward_std": 0.5280590057373047, "rewards/helpfulness_reward/mean": 1.1024551391601562, "rewards/helpfulness_reward/std": 0.9988338947296143, "rewards/safety_reward/mean": 8.197509765625, "rewards/safety_reward/std": 1.435062289237976, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 55.03125, "completions/mean_terminated_length": 55.03125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.37586237009868134, "frac_reward_zero_std": 0.0, "grad_norm": 2.31121826171875, "kl": 5.48828125, "learning_rate": 5e-05, "loss": 0.0884, "num_tokens": 13975354.0, "reward": 9.13037109375, "reward_std": 0.45237505435943604, "rewards/helpfulness_reward/mean": 1.3140640258789062, "rewards/helpfulness_reward/std": 0.6073969006538391, "rewards/safety_reward/mean": 9.13037109375, "rewards/safety_reward/std": 1.277557373046875, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 54.40625, "completions/mean_terminated_length": 54.40625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.37621168456903326, "frac_reward_zero_std": 0.0, "grad_norm": 0.39715084433555603, "kl": 4.19921875, "learning_rate": 5e-05, "loss": 0.0552, "num_tokens": 13986478.0, "reward": 8.625244140625, "reward_std": 0.3812933564186096, "rewards/helpfulness_reward/mean": 1.012725830078125, "rewards/helpfulness_reward/std": 0.6844280362129211, "rewards/safety_reward/mean": 8.625244140625, "rewards/safety_reward/std": 1.0435837507247925, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.1953125, "completions/mean_terminated_length": 54.1953125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.3765609990393852, "frac_reward_zero_std": 0.0, "grad_norm": 1.6711468696594238, "kl": 4.826171875, "learning_rate": 5e-05, "loss": 0.0535, "num_tokens": 13999823.0, "reward": 8.5257568359375, "reward_std": 0.24842233955860138, "rewards/helpfulness_reward/mean": 1.1463565826416016, "rewards/helpfulness_reward/std": 1.0247817039489746, "rewards/safety_reward/mean": 8.5257568359375, "rewards/safety_reward/std": 2.068769693374634, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.515625, "completions/mean_terminated_length": 54.515625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.37691031350973714, "frac_reward_zero_std": 0.0, "grad_norm": 0.47576531767845154, "kl": 4.259765625, "learning_rate": 5e-05, "loss": 0.0568, "num_tokens": 14011585.0, "reward": 9.029541015625, "reward_std": 0.5206832885742188, "rewards/helpfulness_reward/mean": 1.1963272094726562, "rewards/helpfulness_reward/std": 0.6980628371238708, "rewards/safety_reward/mean": 9.029541015625, "rewards/safety_reward/std": 1.2332433462142944, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.2578125, "completions/mean_terminated_length": 54.2578125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.37725962798008905, "frac_reward_zero_std": 0.0, "grad_norm": 0.4200514554977417, "kl": 4.1171875, "learning_rate": 5e-05, "loss": 0.0327, "num_tokens": 14025050.0, "reward": 7.71905517578125, "reward_std": 0.38093167543411255, "rewards/helpfulness_reward/mean": 0.8962016105651855, "rewards/helpfulness_reward/std": 0.7756282687187195, "rewards/safety_reward/mean": 7.71905517578125, "rewards/safety_reward/std": 2.177328586578369, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.2265625, "completions/mean_terminated_length": 54.2265625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.377608942450441, "frac_reward_zero_std": 0.0, "grad_norm": 0.36412179470062256, "kl": 3.947265625, "learning_rate": 5e-05, "loss": 0.0375, "num_tokens": 14036119.0, "reward": 8.778076171875, "reward_std": 0.373454749584198, "rewards/helpfulness_reward/mean": 1.0571346282958984, "rewards/helpfulness_reward/std": 0.6261608600616455, "rewards/safety_reward/mean": 8.778076171875, "rewards/safety_reward/std": 1.1093608140945435, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 55.234375, "completions/mean_terminated_length": 55.234375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.37795825692079293, "frac_reward_zero_std": 0.0, "grad_norm": 0.5910381078720093, "kl": 4.30078125, "learning_rate": 5e-05, "loss": 0.069, "num_tokens": 14047973.0, "reward": 8.332130432128906, "reward_std": 0.464599072933197, "rewards/helpfulness_reward/mean": 1.112375259399414, "rewards/helpfulness_reward/std": 0.7765957713127136, "rewards/safety_reward/mean": 8.332130432128906, "rewards/safety_reward/std": 2.3654348850250244, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.5625, "completions/mean_terminated_length": 54.5625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3783075713911449, "frac_reward_zero_std": 0.0, "grad_norm": 0.40362709760665894, "kl": 3.984375, "learning_rate": 5e-05, "loss": 0.0815, "num_tokens": 14059821.0, "reward": 8.239810943603516, "reward_std": 0.5983525514602661, "rewards/helpfulness_reward/mean": 0.9576582908630371, "rewards/helpfulness_reward/std": 1.270957589149475, "rewards/safety_reward/mean": 8.239810943603516, "rewards/safety_reward/std": 2.2614316940307617, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 71.140625, "completions/mean_terminated_length": 71.140625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.3786568858614968, "frac_reward_zero_std": 0.0, "grad_norm": 0.6214457750320435, "kl": 3.615234375, "learning_rate": 5e-05, "loss": 0.0187, "num_tokens": 14075583.0, "reward": 8.079978942871094, "reward_std": 0.4477752149105072, "rewards/helpfulness_reward/mean": 1.0744171142578125, "rewards/helpfulness_reward/std": 1.0484827756881714, "rewards/safety_reward/mean": 8.079978942871094, "rewards/safety_reward/std": 2.782417058944702, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.21875, "completions/mean_terminated_length": 54.21875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.37900620033184873, "frac_reward_zero_std": 0.0, "grad_norm": 0.4023265540599823, "kl": 4.349609375, "learning_rate": 5e-05, "loss": 0.0534, "num_tokens": 14086875.0, "reward": 8.07373046875, "reward_std": 0.3548416495323181, "rewards/helpfulness_reward/mean": 0.7612965106964111, "rewards/helpfulness_reward/std": 0.6320981383323669, "rewards/safety_reward/mean": 8.07373046875, "rewards/safety_reward/std": 1.4414141178131104, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.484375, "completions/mean_terminated_length": 54.484375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.3793555148022007, "frac_reward_zero_std": 0.0, "grad_norm": 0.4808359742164612, "kl": 4.138671875, "learning_rate": 5e-05, "loss": 0.0513, "num_tokens": 14098401.0, "reward": 8.8076171875, "reward_std": 0.30582666397094727, "rewards/helpfulness_reward/mean": 1.0841426849365234, "rewards/helpfulness_reward/std": 0.8085454702377319, "rewards/safety_reward/mean": 8.8076171875, "rewards/safety_reward/std": 1.122114896774292, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 54.1953125, "completions/mean_terminated_length": 54.1953125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.3797048292725526, "frac_reward_zero_std": 0.0, "grad_norm": 0.36964938044548035, "kl": 3.873046875, "learning_rate": 5e-05, "loss": 0.0402, "num_tokens": 14109122.0, "reward": 8.803955078125, "reward_std": 0.2572026252746582, "rewards/helpfulness_reward/mean": 0.9968311190605164, "rewards/helpfulness_reward/std": 0.6826171278953552, "rewards/safety_reward/mean": 8.803955078125, "rewards/safety_reward/std": 1.1384105682373047, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 54.96875, "completions/mean_terminated_length": 54.96875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.3800541437429045, "frac_reward_zero_std": 0.0, "grad_norm": 0.4655836522579193, "kl": 4.087890625, "learning_rate": 5e-05, "loss": 0.0683, "num_tokens": 14120070.0, "reward": 8.99072265625, "reward_std": 0.37401822209358215, "rewards/helpfulness_reward/mean": 1.226531982421875, "rewards/helpfulness_reward/std": 0.6472173929214478, "rewards/safety_reward/mean": 8.99072265625, "rewards/safety_reward/std": 1.267012357711792, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 53.015625, "completions/mean_terminated_length": 53.015625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.3804034582132565, "frac_reward_zero_std": 0.0, "grad_norm": 0.7431610822677612, "kl": 4.166015625, "learning_rate": 5e-05, "loss": 0.0407, "num_tokens": 14134032.0, "reward": 8.27008056640625, "reward_std": 0.5062226057052612, "rewards/helpfulness_reward/mean": 0.9332437515258789, "rewards/helpfulness_reward/std": 0.8340498805046082, "rewards/safety_reward/mean": 8.27008056640625, "rewards/safety_reward/std": 2.0853471755981445, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 54.484375, "completions/mean_terminated_length": 54.484375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.3807527726836084, "frac_reward_zero_std": 0.0, "grad_norm": 0.3817989230155945, "kl": 4.193359375, "learning_rate": 5e-05, "loss": 0.0527, "num_tokens": 14145774.0, "reward": 9.365234375, "reward_std": 0.36015957593917847, "rewards/helpfulness_reward/mean": 1.3848509788513184, "rewards/helpfulness_reward/std": 0.797248363494873, "rewards/safety_reward/mean": 9.365234375, "rewards/safety_reward/std": 1.086825966835022, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.15625, "completions/mean_terminated_length": 54.15625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.3811020871539604, "frac_reward_zero_std": 0.0, "grad_norm": 0.45235109329223633, "kl": 4.1640625, "learning_rate": 5e-05, "loss": 0.0468, "num_tokens": 14156362.0, "reward": 8.671875, "reward_std": 0.42159098386764526, "rewards/helpfulness_reward/mean": 1.4241180419921875, "rewards/helpfulness_reward/std": 0.6039553284645081, "rewards/safety_reward/mean": 8.671875, "rewards/safety_reward/std": 1.1940736770629883, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 54.640625, "completions/mean_terminated_length": 54.640625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.3814514016243123, "frac_reward_zero_std": 0.0, "grad_norm": 3.3945376873016357, "kl": 5.43359375, "learning_rate": 5e-05, "loss": 0.0735, "num_tokens": 14167124.0, "reward": 8.3740234375, "reward_std": 0.5285120010375977, "rewards/helpfulness_reward/mean": 1.15130615234375, "rewards/helpfulness_reward/std": 0.8544408679008484, "rewards/safety_reward/mean": 8.3740234375, "rewards/safety_reward/std": 1.3170146942138672, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 54.3203125, "completions/mean_terminated_length": 54.3203125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.3818007160946642, "frac_reward_zero_std": 0.0, "grad_norm": 0.6526175737380981, "kl": 4.5, "learning_rate": 5e-05, "loss": 0.0576, "num_tokens": 14177757.0, "reward": 9.279296875, "reward_std": 0.18774500489234924, "rewards/helpfulness_reward/mean": 1.190065622329712, "rewards/helpfulness_reward/std": 0.5043384432792664, "rewards/safety_reward/mean": 9.279296875, "rewards/safety_reward/std": 1.0658581256866455, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 53.96875, "completions/mean_terminated_length": 53.96875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.3821500305650162, "frac_reward_zero_std": 0.0, "grad_norm": 0.4682140648365021, "kl": 4.1328125, "learning_rate": 5e-05, "loss": 0.0435, "num_tokens": 14188361.0, "reward": 9.064453125, "reward_std": 0.3070557415485382, "rewards/helpfulness_reward/mean": 1.518280029296875, "rewards/helpfulness_reward/std": 0.6840084791183472, "rewards/safety_reward/mean": 9.064453125, "rewards/safety_reward/std": 1.1131936311721802, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 53.6953125, "completions/mean_terminated_length": 53.6953125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.3824993450353681, "frac_reward_zero_std": 0.0, "grad_norm": 0.5090779066085815, "kl": 3.896484375, "learning_rate": 5e-05, "loss": 0.0277, "num_tokens": 14199746.0, "reward": 8.6431884765625, "reward_std": 0.40502041578292847, "rewards/helpfulness_reward/mean": 1.5007095336914062, "rewards/helpfulness_reward/std": 0.8124374747276306, "rewards/safety_reward/mean": 8.6431884765625, "rewards/safety_reward/std": 1.2033300399780273, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.359375, "completions/mean_terminated_length": 54.359375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.38284865950572, "frac_reward_zero_std": 0.0, "grad_norm": 1.5498117208480835, "kl": 4.87890625, "learning_rate": 5e-05, "loss": 0.0636, "num_tokens": 14211608.0, "reward": 8.82275390625, "reward_std": 0.43775564432144165, "rewards/helpfulness_reward/mean": 1.4514713287353516, "rewards/helpfulness_reward/std": 0.8890751004219055, "rewards/safety_reward/mean": 8.82275390625, "rewards/safety_reward/std": 1.3436733484268188, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.4453125, "completions/mean_terminated_length": 54.4453125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.38319797397607197, "frac_reward_zero_std": 0.0, "grad_norm": 0.6038928627967834, "kl": 4.283203125, "learning_rate": 5e-05, "loss": 0.0503, "num_tokens": 14223097.0, "reward": 8.156982421875, "reward_std": 0.4002060890197754, "rewards/helpfulness_reward/mean": 1.211904525756836, "rewards/helpfulness_reward/std": 0.5906509757041931, "rewards/safety_reward/mean": 8.156982421875, "rewards/safety_reward/std": 1.4628691673278809, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.109375, "completions/mean_terminated_length": 54.109375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.3835472884464239, "frac_reward_zero_std": 0.0, "grad_norm": 0.3794189691543579, "kl": 4.203125, "learning_rate": 5e-05, "loss": 0.0444, "num_tokens": 14234295.0, "reward": 8.99072265625, "reward_std": 0.36654403805732727, "rewards/helpfulness_reward/mean": 1.31581449508667, "rewards/helpfulness_reward/std": 0.7430265545845032, "rewards/safety_reward/mean": 8.99072265625, "rewards/safety_reward/std": 1.0951822996139526, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 53.7578125, "completions/mean_terminated_length": 53.7578125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.38389660291677585, "frac_reward_zero_std": 0.0, "grad_norm": 0.4841720759868622, "kl": 3.931640625, "learning_rate": 5e-05, "loss": 0.0303, "num_tokens": 14245440.0, "reward": 8.6480712890625, "reward_std": 0.40386247634887695, "rewards/helpfulness_reward/mean": 1.1240739822387695, "rewards/helpfulness_reward/std": 0.7866911888122559, "rewards/safety_reward/mean": 8.6480712890625, "rewards/safety_reward/std": 1.9198880195617676, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 55.6875, "completions/mean_terminated_length": 55.6875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.38424591738712777, "frac_reward_zero_std": 0.0, "grad_norm": 3103052.25, "kl": 598024.560546875, "learning_rate": 5e-05, "loss": 5989.5845, "num_tokens": 14259224.0, "reward": 8.384273529052734, "reward_std": 0.3579474091529846, "rewards/helpfulness_reward/mean": 1.43511962890625, "rewards/helpfulness_reward/std": 0.7407494783401489, "rewards/safety_reward/mean": 8.384273529052734, "rewards/safety_reward/std": 2.1582305431365967, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 53.6640625, "completions/mean_terminated_length": 53.6640625, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.3845952318574797, "frac_reward_zero_std": 0.0, "grad_norm": 0.591399610042572, "kl": 4.318359375, "learning_rate": 5e-05, "loss": 0.0251, "num_tokens": 14270565.0, "reward": 8.4066162109375, "reward_std": 0.7088579535484314, "rewards/helpfulness_reward/mean": 1.0985984802246094, "rewards/helpfulness_reward/std": 0.6142939329147339, "rewards/safety_reward/mean": 8.4066162109375, "rewards/safety_reward/std": 1.3214136362075806, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 53.3125, "completions/mean_terminated_length": 53.3125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.38494454632783165, "frac_reward_zero_std": 0.0, "grad_norm": 0.47393909096717834, "kl": 4.119140625, "learning_rate": 5e-05, "loss": 0.0266, "num_tokens": 14281933.0, "reward": 8.976806640625, "reward_std": 0.5567315816879272, "rewards/helpfulness_reward/mean": 1.05936598777771, "rewards/helpfulness_reward/std": 0.6394737958908081, "rewards/safety_reward/mean": 8.976806640625, "rewards/safety_reward/std": 0.892017126083374, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 53.8671875, "completions/mean_terminated_length": 53.8671875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.38529386079818356, "frac_reward_zero_std": 0.0, "grad_norm": 0.53467857837677, "kl": 4.189453125, "learning_rate": 5e-05, "loss": 0.0364, "num_tokens": 14293236.0, "reward": 8.861572265625, "reward_std": 0.5047217607498169, "rewards/helpfulness_reward/mean": 1.2314863204956055, "rewards/helpfulness_reward/std": 0.5450931191444397, "rewards/safety_reward/mean": 8.861572265625, "rewards/safety_reward/std": 1.3320780992507935, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 54.3828125, "completions/mean_terminated_length": 54.3828125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.3856431752685355, "frac_reward_zero_std": 0.0, "grad_norm": 0.5367498397827148, "kl": 3.8125, "learning_rate": 5e-05, "loss": 0.0621, "num_tokens": 14304445.0, "reward": 8.4957275390625, "reward_std": 0.7633069157600403, "rewards/helpfulness_reward/mean": 1.2631149291992188, "rewards/helpfulness_reward/std": 0.615454375743866, "rewards/safety_reward/mean": 8.4957275390625, "rewards/safety_reward/std": 1.9033039808273315, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.15625, "completions/mean_terminated_length": 54.15625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.38599248973888745, "frac_reward_zero_std": 0.0, "grad_norm": 0.4243502616882324, "kl": 4.05859375, "learning_rate": 5e-05, "loss": 0.038, "num_tokens": 14316985.0, "reward": 9.31640625, "reward_std": 0.37830767035484314, "rewards/helpfulness_reward/mean": 1.383890151977539, "rewards/helpfulness_reward/std": 0.7188699245452881, "rewards/safety_reward/mean": 9.31640625, "rewards/safety_reward/std": 1.074210286140442, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.359375, "completions/mean_terminated_length": 54.359375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.38634180420923936, "frac_reward_zero_std": 0.0, "grad_norm": 0.5923948884010315, "kl": 4.0390625, "learning_rate": 5e-05, "loss": 0.0444, "num_tokens": 14328143.0, "reward": 9.171875, "reward_std": 0.5159948468208313, "rewards/helpfulness_reward/mean": 1.34320068359375, "rewards/helpfulness_reward/std": 0.613499641418457, "rewards/safety_reward/mean": 9.171875, "rewards/safety_reward/std": 1.0506153106689453, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 55.3359375, "completions/mean_terminated_length": 55.3359375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.38669111867959133, "frac_reward_zero_std": 0.0, "grad_norm": 0.3922881484031677, "kl": 3.767578125, "learning_rate": 5e-05, "loss": 0.0549, "num_tokens": 14339730.0, "reward": 8.412818908691406, "reward_std": 0.48256200551986694, "rewards/helpfulness_reward/mean": 1.0355987548828125, "rewards/helpfulness_reward/std": 0.5864003896713257, "rewards/safety_reward/mean": 8.412818908691406, "rewards/safety_reward/std": 2.2892491817474365, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 54.5546875, "completions/mean_terminated_length": 54.5546875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.38704043314994324, "frac_reward_zero_std": 0.0, "grad_norm": 0.44434961676597595, "kl": 4.26953125, "learning_rate": 5e-05, "loss": 0.0427, "num_tokens": 14351297.0, "reward": 8.98388671875, "reward_std": 0.4448477029800415, "rewards/helpfulness_reward/mean": 1.1742324829101562, "rewards/helpfulness_reward/std": 0.5829422473907471, "rewards/safety_reward/mean": 8.98388671875, "rewards/safety_reward/std": 1.0992094278335571, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 54.5625, "completions/mean_terminated_length": 54.5625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.38738974762029516, "frac_reward_zero_std": 0.0, "grad_norm": 0.4501495361328125, "kl": 4.03515625, "learning_rate": 5e-05, "loss": 0.0547, "num_tokens": 14362057.0, "reward": 8.5748291015625, "reward_std": 0.4661250710487366, "rewards/helpfulness_reward/mean": 0.9341224431991577, "rewards/helpfulness_reward/std": 0.7454785704612732, "rewards/safety_reward/mean": 8.5748291015625, "rewards/safety_reward/std": 1.2369189262390137, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.515625, "completions/mean_terminated_length": 54.515625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.3877390620906471, "frac_reward_zero_std": 0.0, "grad_norm": 0.4468874931335449, "kl": 4.15234375, "learning_rate": 5e-05, "loss": 0.0361, "num_tokens": 14372803.0, "reward": 8.76220703125, "reward_std": 0.4321204721927643, "rewards/helpfulness_reward/mean": 1.3097457885742188, "rewards/helpfulness_reward/std": 0.8589820265769958, "rewards/safety_reward/mean": 8.76220703125, "rewards/safety_reward/std": 1.3144917488098145, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 77.5703125, "completions/mean_terminated_length": 77.5703125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.38808837656099904, "frac_reward_zero_std": 0.0, "grad_norm": 0.2844606935977936, "kl": 3.12109375, "learning_rate": 5e-05, "loss": 0.0006, "num_tokens": 14388636.0, "reward": 8.647064208984375, "reward_std": 0.21942156553268433, "rewards/helpfulness_reward/mean": 1.0641978979110718, "rewards/helpfulness_reward/std": 0.7475810647010803, "rewards/safety_reward/mean": 8.647064208984375, "rewards/safety_reward/std": 2.726771116256714, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 56.1953125, "completions/mean_terminated_length": 56.1953125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.38843769103135095, "frac_reward_zero_std": 0.0, "grad_norm": 0.8282017707824707, "kl": 4.6953125, "learning_rate": 5e-05, "loss": 0.0934, "num_tokens": 14399813.0, "reward": 9.0135498046875, "reward_std": 0.41724830865859985, "rewards/helpfulness_reward/mean": 1.1194934844970703, "rewards/helpfulness_reward/std": 0.754497766494751, "rewards/safety_reward/mean": 9.0135498046875, "rewards/safety_reward/std": 1.0889171361923218, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.7734375, "completions/mean_terminated_length": 54.7734375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.3887870055017029, "frac_reward_zero_std": 0.0, "grad_norm": 0.3311588764190674, "kl": 4.017578125, "learning_rate": 5e-05, "loss": 0.0348, "num_tokens": 14410984.0, "reward": 8.786376953125, "reward_std": 0.2875185012817383, "rewards/helpfulness_reward/mean": 1.0441020727157593, "rewards/helpfulness_reward/std": 0.7339288592338562, "rewards/safety_reward/mean": 8.786376953125, "rewards/safety_reward/std": 1.5778224468231201, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 54.6796875, "completions/mean_terminated_length": 54.6796875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.38913631997205483, "frac_reward_zero_std": 0.0, "grad_norm": 0.4994020462036133, "kl": 4.119140625, "learning_rate": 5e-05, "loss": 0.0549, "num_tokens": 14421719.0, "reward": 8.8719482421875, "reward_std": 0.3150666654109955, "rewards/helpfulness_reward/mean": 1.4898834228515625, "rewards/helpfulness_reward/std": 0.6586723923683167, "rewards/safety_reward/mean": 8.8719482421875, "rewards/safety_reward/std": 1.1978511810302734, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 55.0234375, "completions/mean_terminated_length": 55.0234375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.3894856344424068, "frac_reward_zero_std": 0.0, "grad_norm": 0.5697539448738098, "kl": 4.259765625, "learning_rate": 5e-05, "loss": 0.0546, "num_tokens": 14434842.0, "reward": 8.775634765625, "reward_std": 0.3253939151763916, "rewards/helpfulness_reward/mean": 1.0402755737304688, "rewards/helpfulness_reward/std": 0.882066011428833, "rewards/safety_reward/mean": 8.775634765625, "rewards/safety_reward/std": 1.6546193361282349, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 55.328125, "completions/mean_terminated_length": 55.328125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.3898349489127587, "frac_reward_zero_std": 0.0, "grad_norm": 0.41685378551483154, "kl": 4.349609375, "learning_rate": 5e-05, "loss": 0.0596, "num_tokens": 14446052.0, "reward": 9.373779296875, "reward_std": 0.3332514762878418, "rewards/helpfulness_reward/mean": 1.2275848388671875, "rewards/helpfulness_reward/std": 0.7964800596237183, "rewards/safety_reward/mean": 9.373779296875, "rewards/safety_reward/std": 1.2803488969802856, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 54.40625, "completions/mean_terminated_length": 54.40625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.39018426338311063, "frac_reward_zero_std": 0.0625, "grad_norm": 0.4223119914531708, "kl": 4.205078125, "learning_rate": 5e-05, "loss": 0.0446, "num_tokens": 14456392.0, "reward": 8.8134765625, "reward_std": 0.26194435358047485, "rewards/helpfulness_reward/mean": 1.2874603271484375, "rewards/helpfulness_reward/std": 0.6582244634628296, "rewards/safety_reward/mean": 8.8134765625, "rewards/safety_reward/std": 0.8767815828323364, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.9765625, "completions/mean_terminated_length": 54.9765625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.3905335778534626, "frac_reward_zero_std": 0.0, "grad_norm": 0.472373902797699, "kl": 4.35546875, "learning_rate": 5e-05, "loss": 0.0676, "num_tokens": 14466869.0, "reward": 9.21337890625, "reward_std": 0.29852551221847534, "rewards/helpfulness_reward/mean": 1.6510696411132812, "rewards/helpfulness_reward/std": 0.5103866457939148, "rewards/safety_reward/mean": 9.21337890625, "rewards/safety_reward/std": 0.9658453464508057, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 55.2421875, "completions/mean_terminated_length": 55.2421875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.3908828923238145, "frac_reward_zero_std": 0.0, "grad_norm": 0.3590335547924042, "kl": 4.0859375, "learning_rate": 5e-05, "loss": 0.0637, "num_tokens": 14478180.0, "reward": 9.1513671875, "reward_std": 0.31532424688339233, "rewards/helpfulness_reward/mean": 1.1370601654052734, "rewards/helpfulness_reward/std": 0.5854569673538208, "rewards/safety_reward/mean": 9.1513671875, "rewards/safety_reward/std": 0.9933297038078308, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.765625, "completions/mean_terminated_length": 54.765625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.3912322067941664, "frac_reward_zero_std": 0.0, "grad_norm": 0.4368596076965332, "kl": 4.12109375, "learning_rate": 5e-05, "loss": 0.0546, "num_tokens": 14489630.0, "reward": 9.06884765625, "reward_std": 0.41283416748046875, "rewards/helpfulness_reward/mean": 1.3241500854492188, "rewards/helpfulness_reward/std": 0.6905742287635803, "rewards/safety_reward/mean": 9.06884765625, "rewards/safety_reward/std": 1.2997206449508667, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.609375, "completions/mean_terminated_length": 54.609375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.3915815212645184, "frac_reward_zero_std": 0.0, "grad_norm": 0.3178209662437439, "kl": 3.92578125, "learning_rate": 5e-05, "loss": 0.0431, "num_tokens": 14500764.0, "reward": 9.192138671875, "reward_std": 0.24221482872962952, "rewards/helpfulness_reward/mean": 1.364654541015625, "rewards/helpfulness_reward/std": 0.5516927242279053, "rewards/safety_reward/mean": 9.192138671875, "rewards/safety_reward/std": 1.32290518283844, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 54.7890625, "completions/mean_terminated_length": 54.7890625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.3919308357348703, "frac_reward_zero_std": 0.0, "grad_norm": 0.32929518818855286, "kl": 4.1484375, "learning_rate": 5e-05, "loss": 0.0556, "num_tokens": 14511681.0, "reward": 9.32373046875, "reward_std": 0.30710816383361816, "rewards/helpfulness_reward/mean": 1.5552520751953125, "rewards/helpfulness_reward/std": 0.6738204956054688, "rewards/safety_reward/mean": 9.32373046875, "rewards/safety_reward/std": 1.126814842224121, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 55.1796875, "completions/mean_terminated_length": 55.1796875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.3922801502052223, "frac_reward_zero_std": 0.0, "grad_norm": 1.0420173406600952, "kl": 4.681640625, "learning_rate": 5e-05, "loss": 0.0633, "num_tokens": 14522944.0, "reward": 8.79296875, "reward_std": 0.24083620309829712, "rewards/helpfulness_reward/mean": 0.9655599594116211, "rewards/helpfulness_reward/std": 0.5697736740112305, "rewards/safety_reward/mean": 8.79296875, "rewards/safety_reward/std": 1.0821481943130493, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 54.6015625, "completions/mean_terminated_length": 54.6015625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.3926294646755742, "frac_reward_zero_std": 0.0, "grad_norm": 0.349891722202301, "kl": 4.421875, "learning_rate": 5e-05, "loss": 0.0529, "num_tokens": 14533733.0, "reward": 8.996826171875, "reward_std": 0.216963529586792, "rewards/helpfulness_reward/mean": 1.2762384414672852, "rewards/helpfulness_reward/std": 0.9277679324150085, "rewards/safety_reward/mean": 8.996826171875, "rewards/safety_reward/std": 1.1124203205108643, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.96875, "completions/mean_terminated_length": 54.96875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.3929787791459261, "frac_reward_zero_std": 0.0, "grad_norm": 0.4897025525569916, "kl": 4.19921875, "learning_rate": 5e-05, "loss": 0.0458, "num_tokens": 14545361.0, "reward": 8.862060546875, "reward_std": 0.29338234663009644, "rewards/helpfulness_reward/mean": 1.2976570129394531, "rewards/helpfulness_reward/std": 0.6381270289421082, "rewards/safety_reward/mean": 8.862060546875, "rewards/safety_reward/std": 1.124955654144287, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 55.0390625, "completions/mean_terminated_length": 55.0390625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.3933280936162781, "frac_reward_zero_std": 0.0, "grad_norm": 0.36814236640930176, "kl": 4.26953125, "learning_rate": 5e-05, "loss": 0.055, "num_tokens": 14556430.0, "reward": 9.002197265625, "reward_std": 0.256853848695755, "rewards/helpfulness_reward/mean": 1.1657794713974, "rewards/helpfulness_reward/std": 0.7144281268119812, "rewards/safety_reward/mean": 9.002197265625, "rewards/safety_reward/std": 1.0297776460647583, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 54.953125, "completions/mean_terminated_length": 54.953125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.39367740808663, "frac_reward_zero_std": 0.0, "grad_norm": 0.37919285893440247, "kl": 4.185546875, "learning_rate": 5e-05, "loss": 0.0552, "num_tokens": 14567664.0, "reward": 9.04296875, "reward_std": 0.26330819725990295, "rewards/helpfulness_reward/mean": 1.092453956604004, "rewards/helpfulness_reward/std": 0.671372652053833, "rewards/safety_reward/mean": 9.04296875, "rewards/safety_reward/std": 0.9946569204330444, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.8671875, "completions/mean_terminated_length": 54.8671875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.3940267225569819, "frac_reward_zero_std": 0.0, "grad_norm": 0.7160872220993042, "kl": 4.83203125, "learning_rate": 5e-05, "loss": 0.0578, "num_tokens": 14578719.0, "reward": 8.65380859375, "reward_std": 0.24978554248809814, "rewards/helpfulness_reward/mean": 1.0623416900634766, "rewards/helpfulness_reward/std": 0.7774060964584351, "rewards/safety_reward/mean": 8.65380859375, "rewards/safety_reward/std": 1.4771761894226074, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 54.6953125, "completions/mean_terminated_length": 54.6953125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.39437603702733387, "frac_reward_zero_std": 0.0, "grad_norm": 0.408547967672348, "kl": 4.15625, "learning_rate": 5e-05, "loss": 0.0479, "num_tokens": 14589504.0, "reward": 8.759765625, "reward_std": 0.19580471515655518, "rewards/helpfulness_reward/mean": 0.8959569931030273, "rewards/helpfulness_reward/std": 0.5372382402420044, "rewards/safety_reward/mean": 8.759765625, "rewards/safety_reward/std": 1.024549126625061, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 54.859375, "completions/mean_terminated_length": 54.859375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.3947253514976858, "frac_reward_zero_std": 0.0, "grad_norm": 0.29353752732276917, "kl": 3.970703125, "learning_rate": 5e-05, "loss": 0.0475, "num_tokens": 14600470.0, "reward": 8.94384765625, "reward_std": 0.2048800140619278, "rewards/helpfulness_reward/mean": 1.0398330688476562, "rewards/helpfulness_reward/std": 0.5000700950622559, "rewards/safety_reward/mean": 8.94384765625, "rewards/safety_reward/std": 0.9316282272338867, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.4140625, "completions/mean_terminated_length": 54.4140625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.39507466596803775, "frac_reward_zero_std": 0.0, "grad_norm": 0.30773675441741943, "kl": 4.236328125, "learning_rate": 5e-05, "loss": 0.0401, "num_tokens": 14611035.0, "reward": 9.169189453125, "reward_std": 0.24402394890785217, "rewards/helpfulness_reward/mean": 1.418548583984375, "rewards/helpfulness_reward/std": 0.6424508690834045, "rewards/safety_reward/mean": 9.169189453125, "rewards/safety_reward/std": 1.1700339317321777, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 55.109375, "completions/mean_terminated_length": 55.109375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.39542398043838967, "frac_reward_zero_std": 0.0, "grad_norm": 0.45030683279037476, "kl": 4.51171875, "learning_rate": 5e-05, "loss": 0.0653, "num_tokens": 14622705.0, "reward": 8.954345703125, "reward_std": 0.31526991724967957, "rewards/helpfulness_reward/mean": 1.0874176025390625, "rewards/helpfulness_reward/std": 0.5326099395751953, "rewards/safety_reward/mean": 8.954345703125, "rewards/safety_reward/std": 0.935512125492096, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 54.90625, "completions/mean_terminated_length": 54.90625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.3957732949087416, "frac_reward_zero_std": 0.0, "grad_norm": 0.3886290490627289, "kl": 4.08984375, "learning_rate": 5e-05, "loss": 0.0545, "num_tokens": 14635237.0, "reward": 8.966796875, "reward_std": 0.37872838973999023, "rewards/helpfulness_reward/mean": 1.3382841348648071, "rewards/helpfulness_reward/std": 0.7235842943191528, "rewards/safety_reward/mean": 8.966796875, "rewards/safety_reward/std": 1.4280153512954712, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 54.8515625, "completions/mean_terminated_length": 54.8515625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.39612260937909355, "frac_reward_zero_std": 0.0, "grad_norm": 0.4268920421600342, "kl": 3.94140625, "learning_rate": 5e-05, "loss": 0.0572, "num_tokens": 14648554.0, "reward": 8.257080078125, "reward_std": 0.2717336118221283, "rewards/helpfulness_reward/mean": 1.0075721740722656, "rewards/helpfulness_reward/std": 0.982613742351532, "rewards/safety_reward/mean": 8.257080078125, "rewards/safety_reward/std": 1.4883757829666138, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 54.3203125, "completions/mean_terminated_length": 54.3203125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.39647192384944546, "frac_reward_zero_std": 0.0, "grad_norm": 0.5723981261253357, "kl": 4.3828125, "learning_rate": 5e-05, "loss": 0.05, "num_tokens": 14659995.0, "reward": 8.9169921875, "reward_std": 0.2880113422870636, "rewards/helpfulness_reward/mean": 1.2548484802246094, "rewards/helpfulness_reward/std": 0.6581695675849915, "rewards/safety_reward/mean": 8.9169921875, "rewards/safety_reward/std": 1.256182074546814, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 54.5234375, "completions/mean_terminated_length": 54.5234375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.3968212383197974, "frac_reward_zero_std": 0.0, "grad_norm": 0.3349383771419525, "kl": 4.30859375, "learning_rate": 5e-05, "loss": 0.0437, "num_tokens": 14671830.0, "reward": 8.531005859375, "reward_std": 0.15956926345825195, "rewards/helpfulness_reward/mean": 1.0063800811767578, "rewards/helpfulness_reward/std": 0.8029590845108032, "rewards/safety_reward/mean": 8.531005859375, "rewards/safety_reward/std": 1.4038200378417969, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 55.09375, "completions/mean_terminated_length": 55.09375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.39717055279014934, "frac_reward_zero_std": 0.0, "grad_norm": 0.4066445231437683, "kl": 4.115234375, "learning_rate": 5e-05, "loss": 0.0328, "num_tokens": 14684746.0, "reward": 8.6868896484375, "reward_std": 0.21118134260177612, "rewards/helpfulness_reward/mean": 1.3060226440429688, "rewards/helpfulness_reward/std": 0.8722960352897644, "rewards/safety_reward/mean": 8.6868896484375, "rewards/safety_reward/std": 1.9966862201690674, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.4765625, "completions/mean_terminated_length": 54.4765625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.39751986726050126, "frac_reward_zero_std": 0.0, "grad_norm": 0.3600122034549713, "kl": 4.390625, "learning_rate": 5e-05, "loss": 0.0477, "num_tokens": 14695631.0, "reward": 8.7265625, "reward_std": 0.38848966360092163, "rewards/helpfulness_reward/mean": 1.2699317932128906, "rewards/helpfulness_reward/std": 0.845583438873291, "rewards/safety_reward/mean": 8.7265625, "rewards/safety_reward/std": 0.9457234740257263, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 54.625, "completions/mean_terminated_length": 54.625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.3978691817308532, "frac_reward_zero_std": 0.0, "grad_norm": 0.2951870858669281, "kl": 4.35546875, "learning_rate": 5e-05, "loss": 0.0452, "num_tokens": 14706423.0, "reward": 9.120361328125, "reward_std": 0.19780613481998444, "rewards/helpfulness_reward/mean": 1.2223520278930664, "rewards/helpfulness_reward/std": 0.6478295922279358, "rewards/safety_reward/mean": 9.120361328125, "rewards/safety_reward/std": 1.0132691860198975, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 54.5390625, "completions/mean_terminated_length": 54.5390625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.39821849620120514, "frac_reward_zero_std": 0.0, "grad_norm": 0.4112003743648529, "kl": 4.111328125, "learning_rate": 5e-05, "loss": 0.0435, "num_tokens": 14717324.0, "reward": 8.940673828125, "reward_std": 0.25324317812919617, "rewards/helpfulness_reward/mean": 1.3538856506347656, "rewards/helpfulness_reward/std": 0.6579086780548096, "rewards/safety_reward/mean": 8.940673828125, "rewards/safety_reward/std": 1.1759425401687622, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 73.8046875, "completions/mean_terminated_length": 73.8046875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.39856781067155705, "frac_reward_zero_std": 0.0, "grad_norm": 0.4688602089881897, "kl": 3.73828125, "learning_rate": 5e-05, "loss": 0.0304, "num_tokens": 14730803.0, "reward": 8.20196533203125, "reward_std": 0.3170274496078491, "rewards/helpfulness_reward/mean": 1.1607586145401, "rewards/helpfulness_reward/std": 0.6757715940475464, "rewards/safety_reward/mean": 8.20196533203125, "rewards/safety_reward/std": 1.9601930379867554, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.71875, "completions/mean_terminated_length": 54.71875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.398917125141909, "frac_reward_zero_std": 0.0, "grad_norm": 0.43019089102745056, "kl": 4.44140625, "learning_rate": 5e-05, "loss": 0.0397, "num_tokens": 14742527.0, "reward": 9.640625, "reward_std": 0.31908154487609863, "rewards/helpfulness_reward/mean": 1.52191162109375, "rewards/helpfulness_reward/std": 0.511175811290741, "rewards/safety_reward/mean": 9.640625, "rewards/safety_reward/std": 0.9553501009941101, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.5625, "completions/mean_terminated_length": 54.5625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.39926643961226094, "frac_reward_zero_std": 0.0, "grad_norm": 0.3245263397693634, "kl": 4.337890625, "learning_rate": 5e-05, "loss": 0.0468, "num_tokens": 14753703.0, "reward": 9.126953125, "reward_std": 0.24117323756217957, "rewards/helpfulness_reward/mean": 1.118804931640625, "rewards/helpfulness_reward/std": 0.617497444152832, "rewards/safety_reward/mean": 9.126953125, "rewards/safety_reward/std": 1.3308600187301636, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.3671875, "completions/mean_terminated_length": 54.3671875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.39961575408261285, "frac_reward_zero_std": 0.0, "grad_norm": 0.33128389716148376, "kl": 4.08984375, "learning_rate": 5e-05, "loss": 0.0414, "num_tokens": 14764566.0, "reward": 9.21728515625, "reward_std": 0.22624871134757996, "rewards/helpfulness_reward/mean": 1.3101253509521484, "rewards/helpfulness_reward/std": 0.7244575619697571, "rewards/safety_reward/mean": 9.21728515625, "rewards/safety_reward/std": 0.8955411911010742, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 93.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 54.9375, "completions/mean_terminated_length": 54.9375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.3999650685529648, "frac_reward_zero_std": 0.0, "grad_norm": 0.4340060353279114, "kl": 4.4921875, "learning_rate": 5e-05, "loss": 0.0549, "num_tokens": 14775094.0, "reward": 9.113037109375, "reward_std": 0.3236008584499359, "rewards/helpfulness_reward/mean": 1.2920150756835938, "rewards/helpfulness_reward/std": 0.7994060516357422, "rewards/safety_reward/mean": 9.113037109375, "rewards/safety_reward/std": 1.0814523696899414, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 55.1640625, "completions/mean_terminated_length": 55.1640625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.40031438302331673, "frac_reward_zero_std": 0.0, "grad_norm": 0.4118260145187378, "kl": 4.099609375, "learning_rate": 5e-05, "loss": 0.0532, "num_tokens": 14786163.0, "reward": 8.960693359375, "reward_std": 0.37879452109336853, "rewards/helpfulness_reward/mean": 1.246978759765625, "rewards/helpfulness_reward/std": 0.8556762337684631, "rewards/safety_reward/mean": 8.960693359375, "rewards/safety_reward/std": 1.3298051357269287, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.4609375, "completions/mean_terminated_length": 54.4609375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.4006636974936687, "frac_reward_zero_std": 0.0, "grad_norm": 0.316623330116272, "kl": 4.09375, "learning_rate": 5e-05, "loss": 0.0437, "num_tokens": 14796822.0, "reward": 8.865478515625, "reward_std": 0.22003847360610962, "rewards/helpfulness_reward/mean": 1.1815834045410156, "rewards/helpfulness_reward/std": 0.7521629929542542, "rewards/safety_reward/mean": 8.865478515625, "rewards/safety_reward/std": 1.1993718147277832, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.2421875, "completions/mean_terminated_length": 54.2421875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4010130119640206, "frac_reward_zero_std": 0.0, "grad_norm": 0.40319687128067017, "kl": 4.255859375, "learning_rate": 5e-05, "loss": 0.0431, "num_tokens": 14807949.0, "reward": 8.496337890625, "reward_std": 0.3296545743942261, "rewards/helpfulness_reward/mean": 1.4512710571289062, "rewards/helpfulness_reward/std": 0.7371672987937927, "rewards/safety_reward/mean": 8.496337890625, "rewards/safety_reward/std": 1.2021578550338745, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.515625, "completions/mean_terminated_length": 54.515625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.40136232643437253, "frac_reward_zero_std": 0.0, "grad_norm": 0.4175528287887573, "kl": 4.2109375, "learning_rate": 5e-05, "loss": 0.042, "num_tokens": 14819687.0, "reward": 8.9658203125, "reward_std": 0.4477611184120178, "rewards/helpfulness_reward/mean": 1.3256263732910156, "rewards/helpfulness_reward/std": 0.7135425806045532, "rewards/safety_reward/mean": 8.9658203125, "rewards/safety_reward/std": 1.297346591949463, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 54.71875, "completions/mean_terminated_length": 54.71875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.4017116409047245, "frac_reward_zero_std": 0.0, "grad_norm": 0.36419960856437683, "kl": 4.07421875, "learning_rate": 5e-05, "loss": 0.0462, "num_tokens": 14830547.0, "reward": 8.896728515625, "reward_std": 0.25541844964027405, "rewards/helpfulness_reward/mean": 0.9776973724365234, "rewards/helpfulness_reward/std": 0.8771381378173828, "rewards/safety_reward/mean": 8.896728515625, "rewards/safety_reward/std": 1.0429134368896484, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.640625, "completions/mean_terminated_length": 54.640625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.4020609553750764, "frac_reward_zero_std": 0.0, "grad_norm": 0.3176227807998657, "kl": 4.328125, "learning_rate": 5e-05, "loss": 0.0539, "num_tokens": 14841621.0, "reward": 8.50830078125, "reward_std": 0.319740891456604, "rewards/helpfulness_reward/mean": 1.189096450805664, "rewards/helpfulness_reward/std": 0.9017658829689026, "rewards/safety_reward/mean": 8.50830078125, "rewards/safety_reward/std": 1.2210631370544434, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 55.1171875, "completions/mean_terminated_length": 55.1171875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.4024102698454283, "frac_reward_zero_std": 0.0, "grad_norm": 0.8710776567459106, "kl": 4.734375, "learning_rate": 5e-05, "loss": 0.0589, "num_tokens": 14854508.0, "reward": 9.0400390625, "reward_std": 0.2874244153499603, "rewards/helpfulness_reward/mean": 1.2232955694198608, "rewards/helpfulness_reward/std": 0.7659953236579895, "rewards/safety_reward/mean": 9.0400390625, "rewards/safety_reward/std": 1.3199540376663208, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 55.421875, "completions/mean_terminated_length": 55.421875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4027595843157803, "frac_reward_zero_std": 0.0, "grad_norm": 0.545569658279419, "kl": 4.103515625, "learning_rate": 5e-05, "loss": 0.0782, "num_tokens": 14865674.0, "reward": 9.29833984375, "reward_std": 0.37832361459732056, "rewards/helpfulness_reward/mean": 1.4675884246826172, "rewards/helpfulness_reward/std": 0.6779801845550537, "rewards/safety_reward/mean": 9.29833984375, "rewards/safety_reward/std": 1.1066614389419556, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.59375, "completions/mean_terminated_length": 54.59375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4031088987861322, "frac_reward_zero_std": 0.0, "grad_norm": 0.6507323980331421, "kl": 4.341796875, "learning_rate": 5e-05, "loss": 0.0506, "num_tokens": 14877806.0, "reward": 8.908203125, "reward_std": 0.33437421917915344, "rewards/helpfulness_reward/mean": 1.2238726615905762, "rewards/helpfulness_reward/std": 0.7067346572875977, "rewards/safety_reward/mean": 8.908203125, "rewards/safety_reward/std": 0.9945544600486755, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 55.2265625, "completions/mean_terminated_length": 55.2265625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.4034582132564842, "frac_reward_zero_std": 0.0, "grad_norm": 0.4675985276699066, "kl": 4.126953125, "learning_rate": 5e-05, "loss": 0.0572, "num_tokens": 14893123.0, "reward": 8.148681640625, "reward_std": 0.3049531579017639, "rewards/helpfulness_reward/mean": 0.7539584636688232, "rewards/helpfulness_reward/std": 0.9372916221618652, "rewards/safety_reward/mean": 8.148681640625, "rewards/safety_reward/std": 2.365898370742798, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 54.7578125, "completions/mean_terminated_length": 54.7578125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.4038075277268361, "frac_reward_zero_std": 0.0, "grad_norm": 0.4462395906448364, "kl": 4.486328125, "learning_rate": 5e-05, "loss": 0.0589, "num_tokens": 14904252.0, "reward": 8.85205078125, "reward_std": 0.35788750648498535, "rewards/helpfulness_reward/mean": 1.1883115768432617, "rewards/helpfulness_reward/std": 0.6098509430885315, "rewards/safety_reward/mean": 8.85205078125, "rewards/safety_reward/std": 1.0166587829589844, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.6953125, "completions/mean_terminated_length": 54.6953125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.404156842197188, "frac_reward_zero_std": 0.0, "grad_norm": 0.49733132123947144, "kl": 4.6171875, "learning_rate": 5e-05, "loss": 0.0692, "num_tokens": 14915861.0, "reward": 8.8427734375, "reward_std": 0.4299740791320801, "rewards/helpfulness_reward/mean": 1.098698616027832, "rewards/helpfulness_reward/std": 0.6034070253372192, "rewards/safety_reward/mean": 8.8427734375, "rewards/safety_reward/std": 1.3973054885864258, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.265625, "completions/mean_terminated_length": 54.265625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.40450615666754, "frac_reward_zero_std": 0.0, "grad_norm": 1.7399356365203857, "kl": 4.748046875, "learning_rate": 5e-05, "loss": 0.0447, "num_tokens": 14928407.0, "reward": 8.814697265625, "reward_std": 0.1803196668624878, "rewards/helpfulness_reward/mean": 1.3366117477416992, "rewards/helpfulness_reward/std": 0.6642029285430908, "rewards/safety_reward/mean": 8.814697265625, "rewards/safety_reward/std": 1.0872817039489746, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.796875, "completions/mean_terminated_length": 54.796875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4048554711378919, "frac_reward_zero_std": 0.0, "grad_norm": 0.4767025113105774, "kl": 4.279296875, "learning_rate": 5e-05, "loss": 0.0496, "num_tokens": 14940237.0, "reward": 9.530029296875, "reward_std": 0.32339566946029663, "rewards/helpfulness_reward/mean": 1.5876922607421875, "rewards/helpfulness_reward/std": 0.6291378140449524, "rewards/safety_reward/mean": 9.530029296875, "rewards/safety_reward/std": 0.9708266258239746, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 68.7734375, "completions/mean_terminated_length": 68.7734375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.4052047856082438, "frac_reward_zero_std": 0.0, "grad_norm": 0.4762505292892456, "kl": 3.66015625, "learning_rate": 5e-05, "loss": 0.0691, "num_tokens": 14954984.0, "reward": 8.085235595703125, "reward_std": 0.6268291473388672, "rewards/helpfulness_reward/mean": 0.941943883895874, "rewards/helpfulness_reward/std": 0.7032722234725952, "rewards/safety_reward/mean": 8.085235595703125, "rewards/safety_reward/std": 2.260589122772217, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.859375, "completions/mean_terminated_length": 54.859375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.40555410007859577, "frac_reward_zero_std": 0.0, "grad_norm": 0.4253103733062744, "kl": 4.064453125, "learning_rate": 5e-05, "loss": 0.0509, "num_tokens": 14965670.0, "reward": 9.2734375, "reward_std": 0.309353768825531, "rewards/helpfulness_reward/mean": 1.4310524463653564, "rewards/helpfulness_reward/std": 0.7811983823776245, "rewards/safety_reward/mean": 9.2734375, "rewards/safety_reward/std": 1.2223693132400513, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 54.7578125, "completions/mean_terminated_length": 54.7578125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.4059034145489477, "frac_reward_zero_std": 0.0, "grad_norm": 0.4443422853946686, "kl": 4.08984375, "learning_rate": 5e-05, "loss": 0.0487, "num_tokens": 14976935.0, "reward": 8.85791015625, "reward_std": 0.4142632484436035, "rewards/helpfulness_reward/mean": 1.240692138671875, "rewards/helpfulness_reward/std": 0.8270154595375061, "rewards/safety_reward/mean": 8.85791015625, "rewards/safety_reward/std": 0.9570698738098145, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 54.5390625, "completions/mean_terminated_length": 54.5390625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.40625272901929965, "frac_reward_zero_std": 0.0, "grad_norm": 0.5782334804534912, "kl": 4.458984375, "learning_rate": 5e-05, "loss": 0.0455, "num_tokens": 14988180.0, "reward": 8.640380859375, "reward_std": 0.3949916958808899, "rewards/helpfulness_reward/mean": 0.7673131227493286, "rewards/helpfulness_reward/std": 0.6183209419250488, "rewards/safety_reward/mean": 8.640380859375, "rewards/safety_reward/std": 1.014542579650879, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.7421875, "completions/mean_terminated_length": 54.7421875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.40660204348965157, "frac_reward_zero_std": 0.0, "grad_norm": 0.35835370421409607, "kl": 4.150390625, "learning_rate": 5e-05, "loss": 0.0408, "num_tokens": 15000675.0, "reward": 8.91064453125, "reward_std": 0.3160479664802551, "rewards/helpfulness_reward/mean": 1.1563605070114136, "rewards/helpfulness_reward/std": 0.7536934018135071, "rewards/safety_reward/mean": 8.91064453125, "rewards/safety_reward/std": 1.0922460556030273, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 55.1484375, "completions/mean_terminated_length": 55.1484375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4069513579600035, "frac_reward_zero_std": 0.0, "grad_norm": 0.5265637636184692, "kl": 4.353515625, "learning_rate": 5e-05, "loss": 0.0476, "num_tokens": 15012094.0, "reward": 9.34326171875, "reward_std": 0.430840402841568, "rewards/helpfulness_reward/mean": 1.269073486328125, "rewards/helpfulness_reward/std": 0.5318819284439087, "rewards/safety_reward/mean": 9.34326171875, "rewards/safety_reward/std": 1.2350482940673828, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 55.3125, "completions/mean_terminated_length": 55.3125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.40730067243035545, "frac_reward_zero_std": 0.0, "grad_norm": 0.472476065158844, "kl": 4.20703125, "learning_rate": 5e-05, "loss": 0.0647, "num_tokens": 15023254.0, "reward": 9.0478515625, "reward_std": 0.28137242794036865, "rewards/helpfulness_reward/mean": 1.028519630432129, "rewards/helpfulness_reward/std": 0.6947416067123413, "rewards/safety_reward/mean": 9.0478515625, "rewards/safety_reward/std": 1.1945884227752686, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 54.421875, "completions/mean_terminated_length": 54.421875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.40764998690070736, "frac_reward_zero_std": 0.0, "grad_norm": 0.5549221038818359, "kl": 4.029296875, "learning_rate": 5e-05, "loss": 0.0337, "num_tokens": 15034804.0, "reward": 9.10205078125, "reward_std": 0.5009458065032959, "rewards/helpfulness_reward/mean": 1.3219356536865234, "rewards/helpfulness_reward/std": 0.7484544515609741, "rewards/safety_reward/mean": 9.10205078125, "rewards/safety_reward/std": 1.153019905090332, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.796875, "completions/mean_terminated_length": 54.796875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.4079993013710593, "frac_reward_zero_std": 0.0, "grad_norm": 0.3655453324317932, "kl": 4.2734375, "learning_rate": 5e-05, "loss": 0.044, "num_tokens": 15046954.0, "reward": 8.549072265625, "reward_std": 0.404023140668869, "rewards/helpfulness_reward/mean": 1.152867317199707, "rewards/helpfulness_reward/std": 0.632657527923584, "rewards/safety_reward/mean": 8.549072265625, "rewards/safety_reward/std": 1.1941746473312378, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.515625, "completions/mean_terminated_length": 54.515625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.40834861584141124, "frac_reward_zero_std": 0.0, "grad_norm": 0.3345310091972351, "kl": 4.244140625, "learning_rate": 5e-05, "loss": 0.0423, "num_tokens": 15057724.0, "reward": 8.59619140625, "reward_std": 0.2528343200683594, "rewards/helpfulness_reward/mean": 1.092503547668457, "rewards/helpfulness_reward/std": 0.7127800583839417, "rewards/safety_reward/mean": 8.59619140625, "rewards/safety_reward/std": 0.9909982085227966, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 54.5546875, "completions/mean_terminated_length": 54.5546875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.40869793031176316, "frac_reward_zero_std": 0.0, "grad_norm": 0.3220214247703552, "kl": 3.984375, "learning_rate": 5e-05, "loss": 0.0372, "num_tokens": 15069227.0, "reward": 8.540771484375, "reward_std": 0.3255770206451416, "rewards/helpfulness_reward/mean": 1.3770875930786133, "rewards/helpfulness_reward/std": 0.8475434184074402, "rewards/safety_reward/mean": 8.540771484375, "rewards/safety_reward/std": 1.0194916725158691, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 54.8125, "completions/mean_terminated_length": 54.8125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.40904724478211507, "frac_reward_zero_std": 0.0, "grad_norm": 0.4495868980884552, "kl": 4.216796875, "learning_rate": 5e-05, "loss": 0.0475, "num_tokens": 15080539.0, "reward": 9.013671875, "reward_std": 0.4312426447868347, "rewards/helpfulness_reward/mean": 1.1970014572143555, "rewards/helpfulness_reward/std": 0.7609570622444153, "rewards/safety_reward/mean": 9.013671875, "rewards/safety_reward/std": 1.186850666999817, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.7734375, "completions/mean_terminated_length": 54.7734375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.40939655925246704, "frac_reward_zero_std": 0.0, "grad_norm": 0.3771364092826843, "kl": 4.3046875, "learning_rate": 5e-05, "loss": 0.0543, "num_tokens": 15091726.0, "reward": 8.88623046875, "reward_std": 0.3368755280971527, "rewards/helpfulness_reward/mean": 1.4244155883789062, "rewards/helpfulness_reward/std": 0.6384702324867249, "rewards/safety_reward/mean": 8.88623046875, "rewards/safety_reward/std": 1.108396291732788, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.9140625, "completions/mean_terminated_length": 54.9140625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.40974587372281895, "frac_reward_zero_std": 0.0, "grad_norm": 0.42821797728538513, "kl": 4.142578125, "learning_rate": 5e-05, "loss": 0.0605, "num_tokens": 15102291.0, "reward": 9.16357421875, "reward_std": 0.42193475365638733, "rewards/helpfulness_reward/mean": 1.3184013366699219, "rewards/helpfulness_reward/std": 0.7219182252883911, "rewards/safety_reward/mean": 9.16357421875, "rewards/safety_reward/std": 1.2545406818389893, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.578125, "completions/mean_terminated_length": 54.578125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4100951881931709, "frac_reward_zero_std": 0.0, "grad_norm": 0.3340799808502197, "kl": 4.1875, "learning_rate": 5e-05, "loss": 0.0395, "num_tokens": 15113949.0, "reward": 8.910400390625, "reward_std": 0.2501280903816223, "rewards/helpfulness_reward/mean": 1.252532958984375, "rewards/helpfulness_reward/std": 0.7297772169113159, "rewards/safety_reward/mean": 8.910400390625, "rewards/safety_reward/std": 1.0874245166778564, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 61.6796875, "completions/mean_terminated_length": 61.6796875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.41044450266352284, "frac_reward_zero_std": 0.0, "grad_norm": 0.26833394169807434, "kl": 3.6796875, "learning_rate": 5e-05, "loss": 0.1293, "num_tokens": 15128612.0, "reward": 8.8228759765625, "reward_std": 0.32796066999435425, "rewards/helpfulness_reward/mean": 1.22991943359375, "rewards/helpfulness_reward/std": 0.794701337814331, "rewards/safety_reward/mean": 8.8228759765625, "rewards/safety_reward/std": 1.3931453227996826, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 54.203125, "completions/mean_terminated_length": 54.203125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.41079381713387475, "frac_reward_zero_std": 0.0625, "grad_norm": 0.414959579706192, "kl": 3.796875, "learning_rate": 5e-05, "loss": 0.0491, "num_tokens": 15139166.0, "reward": 8.484619140625, "reward_std": 0.31670868396759033, "rewards/helpfulness_reward/mean": 1.2253949642181396, "rewards/helpfulness_reward/std": 0.5736923217773438, "rewards/safety_reward/mean": 8.484619140625, "rewards/safety_reward/std": 1.0488938093185425, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.2890625, "completions/mean_terminated_length": 54.2890625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.4111431316042267, "frac_reward_zero_std": 0.0, "grad_norm": 0.4827648103237152, "kl": 4.2890625, "learning_rate": 5e-05, "loss": 0.0472, "num_tokens": 15149843.0, "reward": 8.95654296875, "reward_std": 0.23600810766220093, "rewards/helpfulness_reward/mean": 1.114750862121582, "rewards/helpfulness_reward/std": 0.7213343977928162, "rewards/safety_reward/mean": 8.95654296875, "rewards/safety_reward/std": 1.1185719966888428, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.921875, "completions/mean_terminated_length": 53.921875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.41149244607457863, "frac_reward_zero_std": 0.0, "grad_norm": 0.2767691910266876, "kl": 4.216796875, "learning_rate": 5e-05, "loss": 0.0414, "num_tokens": 15160521.0, "reward": 8.956298828125, "reward_std": 0.19496947526931763, "rewards/helpfulness_reward/mean": 1.0831022262573242, "rewards/helpfulness_reward/std": 0.6921656131744385, "rewards/safety_reward/mean": 8.956298828125, "rewards/safety_reward/std": 0.9365508556365967, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.41184176054493055, "frac_reward_zero_std": 0.0, "grad_norm": 0.4741302728652954, "kl": 4.0703125, "learning_rate": 5e-05, "loss": 0.0382, "num_tokens": 15171105.0, "reward": 8.86962890625, "reward_std": 0.2122708559036255, "rewards/helpfulness_reward/mean": 1.1326422691345215, "rewards/helpfulness_reward/std": 0.8823577761650085, "rewards/safety_reward/mean": 8.86962890625, "rewards/safety_reward/std": 1.562485694885254, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.2265625, "completions/mean_terminated_length": 54.2265625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.4121910750152825, "frac_reward_zero_std": 0.0625, "grad_norm": 0.34667670726776123, "kl": 3.978515625, "learning_rate": 5e-05, "loss": 0.0464, "num_tokens": 15181782.0, "reward": 9.01123046875, "reward_std": 0.2329069972038269, "rewards/helpfulness_reward/mean": 1.4672768115997314, "rewards/helpfulness_reward/std": 0.7407670617103577, "rewards/safety_reward/mean": 9.01123046875, "rewards/safety_reward/std": 0.9918988943099976, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.6015625, "completions/mean_terminated_length": 54.6015625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.41254038948563443, "frac_reward_zero_std": 0.0, "grad_norm": 0.40358173847198486, "kl": 4.109375, "learning_rate": 5e-05, "loss": 0.0455, "num_tokens": 15193403.0, "reward": 9.056884765625, "reward_std": 0.2184143364429474, "rewards/helpfulness_reward/mean": 1.417755126953125, "rewards/helpfulness_reward/std": 0.4058293402194977, "rewards/safety_reward/mean": 9.056884765625, "rewards/safety_reward/std": 1.166373610496521, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.4453125, "completions/mean_terminated_length": 54.4453125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.4128897039559864, "frac_reward_zero_std": 0.0, "grad_norm": 0.3190329968929291, "kl": 3.859375, "learning_rate": 5e-05, "loss": 0.0427, "num_tokens": 15204676.0, "reward": 9.0986328125, "reward_std": 0.1622956395149231, "rewards/helpfulness_reward/mean": 1.1220932006835938, "rewards/helpfulness_reward/std": 0.5410395860671997, "rewards/safety_reward/mean": 9.0986328125, "rewards/safety_reward/std": 0.9988800883293152, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.296875, "completions/mean_terminated_length": 54.296875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.4132390184263383, "frac_reward_zero_std": 0.0, "grad_norm": 0.49583229422569275, "kl": 4.23828125, "learning_rate": 5e-05, "loss": 0.0431, "num_tokens": 15216810.0, "reward": 8.915283203125, "reward_std": 0.17673148214817047, "rewards/helpfulness_reward/mean": 1.112405776977539, "rewards/helpfulness_reward/std": 0.9464850425720215, "rewards/safety_reward/mean": 8.915283203125, "rewards/safety_reward/std": 1.4238437414169312, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.953125, "completions/mean_terminated_length": 54.953125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.4135883328966902, "frac_reward_zero_std": 0.0, "grad_norm": 0.8766422271728516, "kl": 4.552734375, "learning_rate": 5e-05, "loss": 0.0673, "num_tokens": 15228124.0, "reward": 9.10888671875, "reward_std": 0.27715247869491577, "rewards/helpfulness_reward/mean": 1.2446870803833008, "rewards/helpfulness_reward/std": 0.7340598106384277, "rewards/safety_reward/mean": 9.10888671875, "rewards/safety_reward/std": 1.3069517612457275, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 54.671875, "completions/mean_terminated_length": 54.671875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.4139376473670422, "frac_reward_zero_std": 0.0, "grad_norm": 0.47464051842689514, "kl": 4.287109375, "learning_rate": 5e-05, "loss": 0.0546, "num_tokens": 15239474.0, "reward": 8.788818359375, "reward_std": 0.16995209455490112, "rewards/helpfulness_reward/mean": 0.9430265426635742, "rewards/helpfulness_reward/std": 0.6101996898651123, "rewards/safety_reward/mean": 8.788818359375, "rewards/safety_reward/std": 1.0645948648452759, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.8984375, "completions/mean_terminated_length": 54.8984375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4142869618373941, "frac_reward_zero_std": 0.0, "grad_norm": 0.4195791184902191, "kl": 4.18359375, "learning_rate": 5e-05, "loss": 0.0599, "num_tokens": 15251245.0, "reward": 8.935546875, "reward_std": 0.32811102271080017, "rewards/helpfulness_reward/mean": 1.0998353958129883, "rewards/helpfulness_reward/std": 0.7094771862030029, "rewards/safety_reward/mean": 8.935546875, "rewards/safety_reward/std": 1.43168306350708, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/max_terminated_length": 111.0, "completions/mean_length": 56.6015625, "completions/mean_terminated_length": 56.6015625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.414636276307746, "frac_reward_zero_std": 0.0, "grad_norm": 0.378312349319458, "kl": 3.8671875, "learning_rate": 5e-05, "loss": 0.0562, "num_tokens": 15264738.0, "reward": 8.517578125, "reward_std": 0.2659039795398712, "rewards/helpfulness_reward/mean": 1.111485481262207, "rewards/helpfulness_reward/std": 0.7666784524917603, "rewards/safety_reward/mean": 8.517578125, "rewards/safety_reward/std": 2.1130104064941406, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 54.8359375, "completions/mean_terminated_length": 54.8359375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.414985590778098, "frac_reward_zero_std": 0.0, "grad_norm": 0.2962392270565033, "kl": 4.076171875, "learning_rate": 5e-05, "loss": 0.0544, "num_tokens": 15276413.0, "reward": 9.239501953125, "reward_std": 0.31421637535095215, "rewards/helpfulness_reward/mean": 1.3464889526367188, "rewards/helpfulness_reward/std": 0.5416076183319092, "rewards/safety_reward/mean": 9.239501953125, "rewards/safety_reward/std": 1.1413217782974243, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 55.15625, "completions/mean_terminated_length": 55.15625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.4153349052484499, "frac_reward_zero_std": 0.0, "grad_norm": 0.4150882363319397, "kl": 4.193359375, "learning_rate": 5e-05, "loss": 0.058, "num_tokens": 15288865.0, "reward": 8.5650634765625, "reward_std": 0.38595709204673767, "rewards/helpfulness_reward/mean": 0.9852743148803711, "rewards/helpfulness_reward/std": 0.9724892377853394, "rewards/safety_reward/mean": 8.5650634765625, "rewards/safety_reward/std": 2.180877685546875, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 55.0625, "completions/mean_terminated_length": 55.0625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.4156842197188019, "frac_reward_zero_std": 0.0, "grad_norm": 0.3676571249961853, "kl": 4.40234375, "learning_rate": 5e-05, "loss": 0.0598, "num_tokens": 15299985.0, "reward": 8.947021484375, "reward_std": 0.32271772623062134, "rewards/helpfulness_reward/mean": 0.9408583641052246, "rewards/helpfulness_reward/std": 0.8694103956222534, "rewards/safety_reward/mean": 8.947021484375, "rewards/safety_reward/std": 1.0846149921417236, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 54.953125, "completions/mean_terminated_length": 54.953125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4160335341891538, "frac_reward_zero_std": 0.0, "grad_norm": 0.39945876598358154, "kl": 4.49609375, "learning_rate": 5e-05, "loss": 0.0624, "num_tokens": 15311603.0, "reward": 9.135986328125, "reward_std": 0.29836535453796387, "rewards/helpfulness_reward/mean": 1.5269317626953125, "rewards/helpfulness_reward/std": 0.6557751893997192, "rewards/safety_reward/mean": 9.135986328125, "rewards/safety_reward/std": 0.9014354348182678, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 54.6796875, "completions/mean_terminated_length": 54.6796875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4163828486595057, "frac_reward_zero_std": 0.0, "grad_norm": 0.5176399946212769, "kl": 4.224609375, "learning_rate": 5e-05, "loss": 0.0562, "num_tokens": 15322882.0, "reward": 8.876220703125, "reward_std": 0.3423234522342682, "rewards/helpfulness_reward/mean": 1.2610855102539062, "rewards/helpfulness_reward/std": 0.823728621006012, "rewards/safety_reward/mean": 8.876220703125, "rewards/safety_reward/std": 0.9919893145561218, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 53.9609375, "completions/mean_terminated_length": 53.9609375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.41673216312985767, "frac_reward_zero_std": 0.0, "grad_norm": 0.567373514175415, "kl": 4.091796875, "learning_rate": 5e-05, "loss": 0.0317, "num_tokens": 15333925.0, "reward": 8.802001953125, "reward_std": 0.3801085352897644, "rewards/helpfulness_reward/mean": 1.27362060546875, "rewards/helpfulness_reward/std": 0.8445062637329102, "rewards/safety_reward/mean": 8.802001953125, "rewards/safety_reward/std": 1.5370780229568481, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 54.4375, "completions/mean_terminated_length": 54.4375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4170814776002096, "frac_reward_zero_std": 0.0, "grad_norm": 0.479621559381485, "kl": 4.3515625, "learning_rate": 5e-05, "loss": 0.0431, "num_tokens": 15344517.0, "reward": 8.93359375, "reward_std": 0.20300796627998352, "rewards/helpfulness_reward/mean": 1.2558445930480957, "rewards/helpfulness_reward/std": 0.6600022315979004, "rewards/safety_reward/mean": 8.93359375, "rewards/safety_reward/std": 1.0006150007247925, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.484375, "completions/mean_terminated_length": 54.484375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4174307920705615, "frac_reward_zero_std": 0.0, "grad_norm": 11.841791152954102, "kl": 9.76953125, "learning_rate": 5e-05, "loss": 0.1034, "num_tokens": 15355891.0, "reward": 8.742431640625, "reward_std": 0.39983367919921875, "rewards/helpfulness_reward/mean": 1.0134068727493286, "rewards/helpfulness_reward/std": 0.951408863067627, "rewards/safety_reward/mean": 8.742431640625, "rewards/safety_reward/std": 1.3456670045852661, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.6953125, "completions/mean_terminated_length": 54.6953125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.41778010654091347, "frac_reward_zero_std": 0.0, "grad_norm": 0.5196757316589355, "kl": 4.294921875, "learning_rate": 5e-05, "loss": 0.0544, "num_tokens": 15367812.0, "reward": 8.5771484375, "reward_std": 0.24121998250484467, "rewards/helpfulness_reward/mean": 1.1112709045410156, "rewards/helpfulness_reward/std": 0.8118298649787903, "rewards/safety_reward/mean": 8.5771484375, "rewards/safety_reward/std": 1.3523657321929932, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 54.484375, "completions/mean_terminated_length": 54.484375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.4181294210112654, "frac_reward_zero_std": 0.0, "grad_norm": 0.40569740533828735, "kl": 4.033203125, "learning_rate": 5e-05, "loss": 0.0435, "num_tokens": 15378554.0, "reward": 8.89453125, "reward_std": 0.2782348692417145, "rewards/helpfulness_reward/mean": 1.1433525085449219, "rewards/helpfulness_reward/std": 0.5400387048721313, "rewards/safety_reward/mean": 8.89453125, "rewards/safety_reward/std": 1.0238302946090698, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.40625, "completions/mean_terminated_length": 54.40625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.41847873548161735, "frac_reward_zero_std": 0.0, "grad_norm": 0.32376399636268616, "kl": 4.28125, "learning_rate": 5e-05, "loss": 0.0531, "num_tokens": 15389470.0, "reward": 8.264892578125, "reward_std": 0.34080570936203003, "rewards/helpfulness_reward/mean": 0.8994932174682617, "rewards/helpfulness_reward/std": 0.5529585480690002, "rewards/safety_reward/mean": 8.264892578125, "rewards/safety_reward/std": 1.020067811012268, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 54.4765625, "completions/mean_terminated_length": 54.4765625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.41882804995196926, "frac_reward_zero_std": 0.0, "grad_norm": 0.37334176898002625, "kl": 3.99609375, "learning_rate": 5e-05, "loss": 0.039, "num_tokens": 15400811.0, "reward": 9.0185546875, "reward_std": 0.2827519476413727, "rewards/helpfulness_reward/mean": 1.386324405670166, "rewards/helpfulness_reward/std": 0.9619718790054321, "rewards/safety_reward/mean": 9.0185546875, "rewards/safety_reward/std": 1.2703282833099365, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4191773644223212, "frac_reward_zero_std": 0.0, "grad_norm": 0.3637436032295227, "kl": 4.060546875, "learning_rate": 5e-05, "loss": 0.0404, "num_tokens": 15412875.0, "reward": 8.958251953125, "reward_std": 0.3372756838798523, "rewards/helpfulness_reward/mean": 1.313751220703125, "rewards/helpfulness_reward/std": 0.5925049781799316, "rewards/safety_reward/mean": 8.958251953125, "rewards/safety_reward/std": 1.0998618602752686, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.328125, "completions/mean_terminated_length": 54.328125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.41952667889267314, "frac_reward_zero_std": 0.0, "grad_norm": 0.45183655619621277, "kl": 4.232421875, "learning_rate": 5e-05, "loss": 0.0458, "num_tokens": 15424069.0, "reward": 8.81640625, "reward_std": 0.3283921182155609, "rewards/helpfulness_reward/mean": 1.0461292266845703, "rewards/helpfulness_reward/std": 0.8544723987579346, "rewards/safety_reward/mean": 8.81640625, "rewards/safety_reward/std": 1.1747398376464844, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.390625, "completions/mean_terminated_length": 54.390625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.41987599336302506, "frac_reward_zero_std": 0.0, "grad_norm": 0.37315911054611206, "kl": 4.1015625, "learning_rate": 5e-05, "loss": 0.048, "num_tokens": 15434583.0, "reward": 8.920166015625, "reward_std": 0.17211705446243286, "rewards/helpfulness_reward/mean": 1.0707550048828125, "rewards/helpfulness_reward/std": 0.6443012952804565, "rewards/safety_reward/mean": 8.920166015625, "rewards/safety_reward/std": 0.8353723883628845, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 80.578125, "completions/mean_terminated_length": 80.578125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.42022530783337697, "frac_reward_zero_std": 0.0, "grad_norm": 1.6540708541870117, "kl": 3.55078125, "learning_rate": 5e-05, "loss": 0.0084, "num_tokens": 15452649.0, "reward": 8.17315673828125, "reward_std": 0.21584543585777283, "rewards/helpfulness_reward/mean": 1.0821208953857422, "rewards/helpfulness_reward/std": 0.9126555919647217, "rewards/safety_reward/mean": 8.17315673828125, "rewards/safety_reward/std": 3.1317851543426514, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.6015625, "completions/mean_terminated_length": 54.6015625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.42057462230372894, "frac_reward_zero_std": 0.0, "grad_norm": 0.3479032516479492, "kl": 4.041015625, "learning_rate": 5e-05, "loss": 0.0494, "num_tokens": 15464126.0, "reward": 9.02294921875, "reward_std": 0.25380370020866394, "rewards/helpfulness_reward/mean": 1.2852134704589844, "rewards/helpfulness_reward/std": 0.7272620797157288, "rewards/safety_reward/mean": 9.02294921875, "rewards/safety_reward/std": 1.2125002145767212, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 54.3671875, "completions/mean_terminated_length": 54.3671875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.42092393677408085, "frac_reward_zero_std": 0.0, "grad_norm": 0.5050763487815857, "kl": 4.3203125, "learning_rate": 5e-05, "loss": 0.0418, "num_tokens": 15475077.0, "reward": 8.6875, "reward_std": 0.43713587522506714, "rewards/helpfulness_reward/mean": 1.3418006896972656, "rewards/helpfulness_reward/std": 0.5777729749679565, "rewards/safety_reward/mean": 8.6875, "rewards/safety_reward/std": 1.4051915407180786, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.703125, "completions/mean_terminated_length": 54.703125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.4212732512444328, "frac_reward_zero_std": 0.0, "grad_norm": 0.39891132712364197, "kl": 4.1015625, "learning_rate": 5e-05, "loss": 0.0528, "num_tokens": 15485847.0, "reward": 8.671875, "reward_std": 0.2509910464286804, "rewards/helpfulness_reward/mean": 1.0629265308380127, "rewards/helpfulness_reward/std": 0.9779688119888306, "rewards/safety_reward/mean": 8.671875, "rewards/safety_reward/std": 1.0356584787368774, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.6796875, "completions/mean_terminated_length": 54.6796875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.42162256571478474, "frac_reward_zero_std": 0.0, "grad_norm": 0.4102247357368469, "kl": 4.1875, "learning_rate": 5e-05, "loss": 0.0588, "num_tokens": 15496862.0, "reward": 8.9921875, "reward_std": 0.41671502590179443, "rewards/helpfulness_reward/mean": 1.207667350769043, "rewards/helpfulness_reward/std": 0.555332601070404, "rewards/safety_reward/mean": 8.9921875, "rewards/safety_reward/std": 1.473196029663086, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 74.3671875, "completions/mean_terminated_length": 74.3671875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.42197188018513665, "frac_reward_zero_std": 0.0, "grad_norm": 0.3883765637874603, "kl": 3.359375, "learning_rate": 5e-05, "loss": 0.0091, "num_tokens": 15513669.0, "reward": 6.912933349609375, "reward_std": 0.5021959543228149, "rewards/helpfulness_reward/mean": 1.2213754653930664, "rewards/helpfulness_reward/std": 0.9064697623252869, "rewards/safety_reward/mean": 6.912933349609375, "rewards/safety_reward/std": 2.769463062286377, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.53125, "completions/mean_terminated_length": 54.53125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.4223211946554886, "frac_reward_zero_std": 0.0, "grad_norm": 0.42457860708236694, "kl": 4.03515625, "learning_rate": 5e-05, "loss": 0.0472, "num_tokens": 15524449.0, "reward": 8.731689453125, "reward_std": 0.3456385135650635, "rewards/helpfulness_reward/mean": 1.1686897277832031, "rewards/helpfulness_reward/std": 0.7639495134353638, "rewards/safety_reward/mean": 8.731689453125, "rewards/safety_reward/std": 1.3778353929519653, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.5234375, "completions/mean_terminated_length": 54.5234375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.42267050912584053, "frac_reward_zero_std": 0.0625, "grad_norm": 0.4119879901409149, "kl": 4.20703125, "learning_rate": 5e-05, "loss": 0.0483, "num_tokens": 15535044.0, "reward": 9.12890625, "reward_std": 0.3538135588169098, "rewards/helpfulness_reward/mean": 1.0842094421386719, "rewards/helpfulness_reward/std": 0.8558180928230286, "rewards/safety_reward/mean": 9.12890625, "rewards/safety_reward/std": 1.183738350868225, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.90625, "completions/mean_terminated_length": 54.90625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.42301982359619245, "frac_reward_zero_std": 0.0, "grad_norm": 0.3605607748031616, "kl": 4.03515625, "learning_rate": 5e-05, "loss": 0.0529, "num_tokens": 15546344.0, "reward": 8.94091796875, "reward_std": 0.3168449401855469, "rewards/helpfulness_reward/mean": 1.0688591003417969, "rewards/helpfulness_reward/std": 0.5244901180267334, "rewards/safety_reward/mean": 8.94091796875, "rewards/safety_reward/std": 0.9921042919158936, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 54.3046875, "completions/mean_terminated_length": 54.3046875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.4233691380665444, "frac_reward_zero_std": 0.0, "grad_norm": 0.6405577659606934, "kl": 4.5625, "learning_rate": 5e-05, "loss": 0.0424, "num_tokens": 15558575.0, "reward": 8.48486328125, "reward_std": 0.2763722538948059, "rewards/helpfulness_reward/mean": 1.1475849151611328, "rewards/helpfulness_reward/std": 0.7560967803001404, "rewards/safety_reward/mean": 8.48486328125, "rewards/safety_reward/std": 1.2941690683364868, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 54.8046875, "completions/mean_terminated_length": 54.8046875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.42371845253689633, "frac_reward_zero_std": 0.0, "grad_norm": 0.3746340572834015, "kl": 4.0859375, "learning_rate": 5e-05, "loss": 0.0463, "num_tokens": 15572230.0, "reward": 8.65496826171875, "reward_std": 0.30435189604759216, "rewards/helpfulness_reward/mean": 1.1167019605636597, "rewards/helpfulness_reward/std": 0.9060856699943542, "rewards/safety_reward/mean": 8.65496826171875, "rewards/safety_reward/std": 2.023003578186035, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.3125, "completions/mean_terminated_length": 54.3125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.4240677670072483, "frac_reward_zero_std": 0.0, "grad_norm": 0.590884804725647, "kl": 4.46484375, "learning_rate": 5e-05, "loss": 0.0502, "num_tokens": 15583030.0, "reward": 8.648193359375, "reward_std": 0.3230898678302765, "rewards/helpfulness_reward/mean": 1.1534576416015625, "rewards/helpfulness_reward/std": 0.5942648649215698, "rewards/safety_reward/mean": 8.648193359375, "rewards/safety_reward/std": 1.1484007835388184, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.609375, "completions/mean_terminated_length": 54.609375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.4244170814776002, "frac_reward_zero_std": 0.0, "grad_norm": 0.3481920659542084, "kl": 4.16796875, "learning_rate": 5e-05, "loss": 0.0456, "num_tokens": 15593812.0, "reward": 8.9638671875, "reward_std": 0.22693639993667603, "rewards/helpfulness_reward/mean": 1.122058391571045, "rewards/helpfulness_reward/std": 0.8170983195304871, "rewards/safety_reward/mean": 8.9638671875, "rewards/safety_reward/std": 0.823931872844696, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 54.59375, "completions/mean_terminated_length": 54.59375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.4247663959479521, "frac_reward_zero_std": 0.0, "grad_norm": 0.48727038502693176, "kl": 4.2421875, "learning_rate": 5e-05, "loss": 0.0577, "num_tokens": 15604624.0, "reward": 9.106201171875, "reward_std": 0.509456217288971, "rewards/helpfulness_reward/mean": 1.498082160949707, "rewards/helpfulness_reward/std": 0.7158313393592834, "rewards/safety_reward/mean": 9.106201171875, "rewards/safety_reward/std": 1.0199273824691772, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 51.9765625, "completions/mean_terminated_length": 51.9765625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.4251157104183041, "frac_reward_zero_std": 0.0, "grad_norm": 0.3582001030445099, "kl": 4.294921875, "learning_rate": 5e-05, "loss": 0.0638, "num_tokens": 15615845.0, "reward": 8.5050048828125, "reward_std": 0.31281331181526184, "rewards/helpfulness_reward/mean": 0.904767632484436, "rewards/helpfulness_reward/std": 0.7582848072052002, "rewards/safety_reward/mean": 8.5050048828125, "rewards/safety_reward/std": 1.514633297920227, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.6171875, "completions/mean_terminated_length": 54.6171875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.425465024888656, "frac_reward_zero_std": 0.0, "grad_norm": 0.32985609769821167, "kl": 4.08984375, "learning_rate": 5e-05, "loss": 0.0447, "num_tokens": 15629876.0, "reward": 8.3441162109375, "reward_std": 0.33125749230384827, "rewards/helpfulness_reward/mean": 1.1566429138183594, "rewards/helpfulness_reward/std": 0.645759105682373, "rewards/safety_reward/mean": 8.3441162109375, "rewards/safety_reward/std": 2.004573345184326, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 54.3984375, "completions/mean_terminated_length": 54.3984375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.4258143393590079, "frac_reward_zero_std": 0.0, "grad_norm": 0.3820800483226776, "kl": 4.015625, "learning_rate": 5e-05, "loss": 0.0432, "num_tokens": 15641551.0, "reward": 9.014892578125, "reward_std": 0.22675570845603943, "rewards/helpfulness_reward/mean": 1.1728477478027344, "rewards/helpfulness_reward/std": 0.639662504196167, "rewards/safety_reward/mean": 9.014892578125, "rewards/safety_reward/std": 1.1781188249588013, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.2734375, "completions/mean_terminated_length": 54.2734375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.4261636538293599, "frac_reward_zero_std": 0.0, "grad_norm": 0.42938295006752014, "kl": 4.125, "learning_rate": 5e-05, "loss": 0.0432, "num_tokens": 15652754.0, "reward": 8.809326171875, "reward_std": 0.27798664569854736, "rewards/helpfulness_reward/mean": 1.2360129356384277, "rewards/helpfulness_reward/std": 0.6023536324501038, "rewards/safety_reward/mean": 8.809326171875, "rewards/safety_reward/std": 1.0138894319534302, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 54.671875, "completions/mean_terminated_length": 54.671875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.4265129682997118, "frac_reward_zero_std": 0.0, "grad_norm": 1.2706363201141357, "kl": 4.962890625, "learning_rate": 5e-05, "loss": 0.0519, "num_tokens": 15663448.0, "reward": 9.25927734375, "reward_std": 0.3681187629699707, "rewards/helpfulness_reward/mean": 1.3617286682128906, "rewards/helpfulness_reward/std": 0.7415733337402344, "rewards/safety_reward/mean": 9.25927734375, "rewards/safety_reward/std": 0.9156438112258911, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 54.2265625, "completions/mean_terminated_length": 54.2265625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.4268622827700638, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3768245577812195, "kl": 4.04296875, "learning_rate": 5e-05, "loss": 0.0376, "num_tokens": 15674261.0, "reward": 9.01708984375, "reward_std": 0.2286808043718338, "rewards/helpfulness_reward/mean": 1.2563667297363281, "rewards/helpfulness_reward/std": 0.872024416923523, "rewards/safety_reward/mean": 9.01708984375, "rewards/safety_reward/std": 0.873221755027771, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.5703125, "completions/mean_terminated_length": 54.5703125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.4272115972404157, "frac_reward_zero_std": 0.0, "grad_norm": 0.4032173156738281, "kl": 4.37890625, "learning_rate": 5e-05, "loss": 0.0432, "num_tokens": 15685822.0, "reward": 8.76806640625, "reward_std": 0.40617895126342773, "rewards/helpfulness_reward/mean": 1.232086181640625, "rewards/helpfulness_reward/std": 0.534943163394928, "rewards/safety_reward/mean": 8.76806640625, "rewards/safety_reward/std": 1.2728086709976196, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 54.1640625, "completions/mean_terminated_length": 54.1640625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4275609117107676, "frac_reward_zero_std": 0.0, "grad_norm": 0.3279343843460083, "kl": 3.869140625, "learning_rate": 5e-05, "loss": 0.0416, "num_tokens": 15696651.0, "reward": 8.8446044921875, "reward_std": 0.34954795241355896, "rewards/helpfulness_reward/mean": 1.1357593536376953, "rewards/helpfulness_reward/std": 0.9199969172477722, "rewards/safety_reward/mean": 8.8446044921875, "rewards/safety_reward/std": 1.1767644882202148, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.3828125, "completions/mean_terminated_length": 54.3828125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.42791022618111957, "frac_reward_zero_std": 0.0, "grad_norm": 0.4047015309333801, "kl": 4.0546875, "learning_rate": 5e-05, "loss": 0.0426, "num_tokens": 15707588.0, "reward": 9.329833984375, "reward_std": 0.2066078931093216, "rewards/helpfulness_reward/mean": 1.4241876602172852, "rewards/helpfulness_reward/std": 0.8115255236625671, "rewards/safety_reward/mean": 9.329833984375, "rewards/safety_reward/std": 1.0483938455581665, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 55.1484375, "completions/mean_terminated_length": 55.1484375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.4282595406514715, "frac_reward_zero_std": 0.0, "grad_norm": 0.5486082434654236, "kl": 4.2890625, "learning_rate": 5e-05, "loss": 0.0637, "num_tokens": 15721095.0, "reward": 8.562255859375, "reward_std": 0.37206241488456726, "rewards/helpfulness_reward/mean": 1.2492332458496094, "rewards/helpfulness_reward/std": 1.1497446298599243, "rewards/safety_reward/mean": 8.562255859375, "rewards/safety_reward/std": 2.229212522506714, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.3203125, "completions/mean_terminated_length": 54.3203125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.4286088551218234, "frac_reward_zero_std": 0.0, "grad_norm": 0.5153375864028931, "kl": 4.4453125, "learning_rate": 5e-05, "loss": 0.0497, "num_tokens": 15731904.0, "reward": 8.774169921875, "reward_std": 0.27023375034332275, "rewards/helpfulness_reward/mean": 1.26566743850708, "rewards/helpfulness_reward/std": 0.5951083898544312, "rewards/safety_reward/mean": 8.774169921875, "rewards/safety_reward/std": 1.190250277519226, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 61.8828125, "completions/mean_terminated_length": 61.8828125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.42895816959217536, "frac_reward_zero_std": 0.0, "grad_norm": 0.4757939279079437, "kl": 4.07421875, "learning_rate": 5e-05, "loss": 0.1127, "num_tokens": 15745313.0, "reward": 8.384445190429688, "reward_std": 0.23956207931041718, "rewards/helpfulness_reward/mean": 1.3053045272827148, "rewards/helpfulness_reward/std": 0.898694634437561, "rewards/safety_reward/mean": 8.384445190429688, "rewards/safety_reward/std": 1.8120002746582031, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 54.5625, "completions/mean_terminated_length": 54.5625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.4293074840625273, "frac_reward_zero_std": 0.0, "grad_norm": 0.40790390968322754, "kl": 4.353515625, "learning_rate": 5e-05, "loss": 0.0448, "num_tokens": 15757177.0, "reward": 8.73486328125, "reward_std": 0.19501996040344238, "rewards/helpfulness_reward/mean": 1.1964664459228516, "rewards/helpfulness_reward/std": 0.7498905062675476, "rewards/safety_reward/mean": 8.73486328125, "rewards/safety_reward/std": 1.239262342453003, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 55.859375, "completions/mean_terminated_length": 55.859375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.42965679853287925, "frac_reward_zero_std": 0.0, "grad_norm": 0.46724116802215576, "kl": 4.240234375, "learning_rate": 5e-05, "loss": 0.0562, "num_tokens": 15770599.0, "reward": 8.424598693847656, "reward_std": 0.38539522886276245, "rewards/helpfulness_reward/mean": 1.0343332290649414, "rewards/helpfulness_reward/std": 0.9446080327033997, "rewards/safety_reward/mean": 8.424598693847656, "rewards/safety_reward/std": 2.2507588863372803, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 56.7734375, "completions/mean_terminated_length": 56.7734375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.43000611300323116, "frac_reward_zero_std": 0.0, "grad_norm": 0.43665438890457153, "kl": 3.939453125, "learning_rate": 5e-05, "loss": 0.0819, "num_tokens": 15784306.0, "reward": 7.76171875, "reward_std": 0.3217718005180359, "rewards/helpfulness_reward/mean": 0.94891357421875, "rewards/helpfulness_reward/std": 0.8555797934532166, "rewards/safety_reward/mean": 7.76171875, "rewards/safety_reward/std": 2.7623541355133057, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 58.8203125, "completions/mean_terminated_length": 58.8203125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.4303554274735831, "frac_reward_zero_std": 0.0, "grad_norm": 0.4380577504634857, "kl": 3.87109375, "learning_rate": 5e-05, "loss": 0.0741, "num_tokens": 15797555.0, "reward": 8.679706573486328, "reward_std": 0.33471059799194336, "rewards/helpfulness_reward/mean": 1.5831918716430664, "rewards/helpfulness_reward/std": 0.8249003887176514, "rewards/safety_reward/mean": 8.679706573486328, "rewards/safety_reward/std": 1.896069049835205, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.4140625, "completions/mean_terminated_length": 54.4140625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.43070474194393504, "frac_reward_zero_std": 0.0, "grad_norm": 0.29269447922706604, "kl": 4.12109375, "learning_rate": 5e-05, "loss": 0.0392, "num_tokens": 15808920.0, "reward": 8.782470703125, "reward_std": 0.18665270507335663, "rewards/helpfulness_reward/mean": 1.213364601135254, "rewards/helpfulness_reward/std": 0.923211932182312, "rewards/safety_reward/mean": 8.782470703125, "rewards/safety_reward/std": 1.158607840538025, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.296875, "completions/mean_terminated_length": 54.296875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.43105405641428696, "frac_reward_zero_std": 0.0, "grad_norm": 0.6733191609382629, "kl": 4.552734375, "learning_rate": 5e-05, "loss": 0.044, "num_tokens": 15819462.0, "reward": 9.29833984375, "reward_std": 0.15375784039497375, "rewards/helpfulness_reward/mean": 1.34869384765625, "rewards/helpfulness_reward/std": 0.3248370289802551, "rewards/safety_reward/mean": 9.29833984375, "rewards/safety_reward/std": 0.8251668214797974, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 54.265625, "completions/mean_terminated_length": 54.265625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.43140337088463887, "frac_reward_zero_std": 0.0, "grad_norm": 0.3943333029747009, "kl": 4.162109375, "learning_rate": 5e-05, "loss": 0.0402, "num_tokens": 15830344.0, "reward": 8.556884765625, "reward_std": 0.24952033162117004, "rewards/helpfulness_reward/mean": 1.13580322265625, "rewards/helpfulness_reward/std": 0.7049047946929932, "rewards/safety_reward/mean": 8.556884765625, "rewards/safety_reward/std": 1.1307637691497803, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.43175268535499084, "frac_reward_zero_std": 0.0, "grad_norm": 0.5507875084877014, "kl": 4.45703125, "learning_rate": 5e-05, "loss": 0.0631, "num_tokens": 15841480.0, "reward": 9.12646484375, "reward_std": 0.38889485597610474, "rewards/helpfulness_reward/mean": 1.6537628173828125, "rewards/helpfulness_reward/std": 0.7037650942802429, "rewards/safety_reward/mean": 9.12646484375, "rewards/safety_reward/std": 1.1916683912277222, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 54.3046875, "completions/mean_terminated_length": 54.3046875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.43210199982534275, "frac_reward_zero_std": 0.0, "grad_norm": 0.4371730089187622, "kl": 4.025390625, "learning_rate": 5e-05, "loss": 0.0455, "num_tokens": 15854687.0, "reward": 8.592041015625, "reward_std": 0.2697380483150482, "rewards/helpfulness_reward/mean": 1.423828125, "rewards/helpfulness_reward/std": 0.7680786848068237, "rewards/safety_reward/mean": 8.592041015625, "rewards/safety_reward/std": 1.329365849494934, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.359375, "completions/mean_terminated_length": 54.359375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.4324513142956947, "frac_reward_zero_std": 0.0, "grad_norm": 0.41285645961761475, "kl": 4.1015625, "learning_rate": 5e-05, "loss": 0.0371, "num_tokens": 15865469.0, "reward": 9.159912109375, "reward_std": 0.32367777824401855, "rewards/helpfulness_reward/mean": 1.6076545715332031, "rewards/helpfulness_reward/std": 0.7660698294639587, "rewards/safety_reward/mean": 9.159912109375, "rewards/safety_reward/std": 1.1225135326385498, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 54.6875, "completions/mean_terminated_length": 54.6875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.43280062876604664, "frac_reward_zero_std": 0.0, "grad_norm": 0.44602397084236145, "kl": 4.421875, "learning_rate": 5e-05, "loss": 0.0494, "num_tokens": 15876885.0, "reward": 8.91259765625, "reward_std": 0.21346881985664368, "rewards/helpfulness_reward/mean": 1.437255859375, "rewards/helpfulness_reward/std": 0.7238572239875793, "rewards/safety_reward/mean": 8.91259765625, "rewards/safety_reward/std": 1.2776371240615845, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.53125, "completions/mean_terminated_length": 54.53125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.43314994323639855, "frac_reward_zero_std": 0.0, "grad_norm": 0.35860198736190796, "kl": 4.302734375, "learning_rate": 5e-05, "loss": 0.0502, "num_tokens": 15888489.0, "reward": 8.8768310546875, "reward_std": 0.390125036239624, "rewards/helpfulness_reward/mean": 1.3124370574951172, "rewards/helpfulness_reward/std": 0.8270161151885986, "rewards/safety_reward/mean": 8.8768310546875, "rewards/safety_reward/std": 1.2599469423294067, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.375, "completions/mean_terminated_length": 54.375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4334992577067505, "frac_reward_zero_std": 0.0, "grad_norm": 0.4173184335231781, "kl": 4.12890625, "learning_rate": 5e-05, "loss": 0.0525, "num_tokens": 15899337.0, "reward": 8.674560546875, "reward_std": 0.4026213586330414, "rewards/helpfulness_reward/mean": 1.6144495010375977, "rewards/helpfulness_reward/std": 0.7580468654632568, "rewards/safety_reward/mean": 8.674560546875, "rewards/safety_reward/std": 1.1794596910476685, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 54.3359375, "completions/mean_terminated_length": 54.3359375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.43384857217710243, "frac_reward_zero_std": 0.0, "grad_norm": 3.034886598587036, "kl": 5.9765625, "learning_rate": 5e-05, "loss": 0.0616, "num_tokens": 15912268.0, "reward": 8.53717041015625, "reward_std": 0.3779068887233734, "rewards/helpfulness_reward/mean": 1.4913520812988281, "rewards/helpfulness_reward/std": 0.5453963875770569, "rewards/safety_reward/mean": 8.53717041015625, "rewards/safety_reward/std": 2.0364413261413574, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 54.8515625, "completions/mean_terminated_length": 54.8515625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.43419788664745435, "frac_reward_zero_std": 0.0, "grad_norm": 0.2971033453941345, "kl": 4.44140625, "learning_rate": 5e-05, "loss": 0.0578, "num_tokens": 15923697.0, "reward": 8.907470703125, "reward_std": 0.30651524662971497, "rewards/helpfulness_reward/mean": 1.3504180908203125, "rewards/helpfulness_reward/std": 0.7481851577758789, "rewards/safety_reward/mean": 8.907470703125, "rewards/safety_reward/std": 1.2103923559188843, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 54.4375, "completions/mean_terminated_length": 54.4375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.4345472011178063, "frac_reward_zero_std": 0.0, "grad_norm": 0.30848899483680725, "kl": 4.1796875, "learning_rate": 5e-05, "loss": 0.0379, "num_tokens": 15934561.0, "reward": 9.008056640625, "reward_std": 0.34487593173980713, "rewards/helpfulness_reward/mean": 1.1890411376953125, "rewards/helpfulness_reward/std": 0.643025279045105, "rewards/safety_reward/mean": 9.008056640625, "rewards/safety_reward/std": 0.9955628514289856, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.28125, "completions/mean_terminated_length": 54.28125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.43489651558815823, "frac_reward_zero_std": 0.0, "grad_norm": 0.4166998565196991, "kl": 4.052734375, "learning_rate": 5e-05, "loss": 0.0435, "num_tokens": 15945405.0, "reward": 8.342041015625, "reward_std": 0.26521503925323486, "rewards/helpfulness_reward/mean": 1.268310546875, "rewards/helpfulness_reward/std": 0.7956879138946533, "rewards/safety_reward/mean": 8.342041015625, "rewards/safety_reward/std": 1.356308102607727, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.4352458300585102, "frac_reward_zero_std": 0.0, "grad_norm": 0.40556108951568604, "kl": 3.982421875, "learning_rate": 5e-05, "loss": 0.0409, "num_tokens": 15956541.0, "reward": 9.090576171875, "reward_std": 0.3053349554538727, "rewards/helpfulness_reward/mean": 1.4648876190185547, "rewards/helpfulness_reward/std": 0.6817560791969299, "rewards/safety_reward/mean": 9.090576171875, "rewards/safety_reward/std": 1.344015121459961, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.40625, "completions/mean_terminated_length": 54.40625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.4355951445288621, "frac_reward_zero_std": 0.0, "grad_norm": 0.3601436913013458, "kl": 4.033203125, "learning_rate": 5e-05, "loss": 0.0383, "num_tokens": 15967857.0, "reward": 8.88134765625, "reward_std": 0.38601192831993103, "rewards/helpfulness_reward/mean": 1.597564697265625, "rewards/helpfulness_reward/std": 0.8679646849632263, "rewards/safety_reward/mean": 8.88134765625, "rewards/safety_reward/std": 1.3609832525253296, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 54.3984375, "completions/mean_terminated_length": 54.3984375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.435944458999214, "frac_reward_zero_std": 0.0, "grad_norm": 0.4263613820075989, "kl": 4.0703125, "learning_rate": 5e-05, "loss": 0.0639, "num_tokens": 15978612.0, "reward": 8.876953125, "reward_std": 0.42377352714538574, "rewards/helpfulness_reward/mean": 1.6623649597167969, "rewards/helpfulness_reward/std": 0.47336485981941223, "rewards/safety_reward/mean": 8.876953125, "rewards/safety_reward/std": 1.0604407787322998, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 54.0546875, "completions/mean_terminated_length": 54.0546875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.436293773469566, "frac_reward_zero_std": 0.0, "grad_norm": 0.38889580965042114, "kl": 3.998046875, "learning_rate": 5e-05, "loss": 0.0447, "num_tokens": 15989195.0, "reward": 8.80078125, "reward_std": 0.22113263607025146, "rewards/helpfulness_reward/mean": 1.503249168395996, "rewards/helpfulness_reward/std": 0.674126923084259, "rewards/safety_reward/mean": 8.80078125, "rewards/safety_reward/std": 1.1091541051864624, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.8203125, "completions/mean_terminated_length": 53.8203125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.4366430879399179, "frac_reward_zero_std": 0.0, "grad_norm": 2.2679738998413086, "kl": 5.029296875, "learning_rate": 5e-05, "loss": 0.0423, "num_tokens": 15999764.0, "reward": 8.901123046875, "reward_std": 0.23154886066913605, "rewards/helpfulness_reward/mean": 1.3531615734100342, "rewards/helpfulness_reward/std": 0.6982428431510925, "rewards/safety_reward/mean": 8.901123046875, "rewards/safety_reward/std": 1.3863158226013184, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.1015625, "completions/mean_terminated_length": 54.1015625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.4369924024102698, "frac_reward_zero_std": 0.0, "grad_norm": 0.39643943309783936, "kl": 4.22265625, "learning_rate": 5e-05, "loss": 0.0394, "num_tokens": 16010889.0, "reward": 8.675048828125, "reward_std": 0.2800596058368683, "rewards/helpfulness_reward/mean": 1.2140617370605469, "rewards/helpfulness_reward/std": 0.5561243891716003, "rewards/safety_reward/mean": 8.675048828125, "rewards/safety_reward/std": 0.9415295124053955, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 54.53125, "completions/mean_terminated_length": 54.53125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.4373417168806218, "frac_reward_zero_std": 0.0, "grad_norm": 0.44953039288520813, "kl": 3.828125, "learning_rate": 5e-05, "loss": 0.0455, "num_tokens": 16022293.0, "reward": 8.719482421875, "reward_std": 0.3370477557182312, "rewards/helpfulness_reward/mean": 1.193613052368164, "rewards/helpfulness_reward/std": 0.6584217548370361, "rewards/safety_reward/mean": 8.719482421875, "rewards/safety_reward/std": 1.056064486503601, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 54.6875, "completions/mean_terminated_length": 54.6875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.4376910313509737, "frac_reward_zero_std": 0.0, "grad_norm": 0.3411572575569153, "kl": 4.234375, "learning_rate": 5e-05, "loss": 0.0458, "num_tokens": 16035013.0, "reward": 8.576904296875, "reward_std": 0.3700697422027588, "rewards/helpfulness_reward/mean": 1.14129638671875, "rewards/helpfulness_reward/std": 0.6581085920333862, "rewards/safety_reward/mean": 8.576904296875, "rewards/safety_reward/std": 1.4541726112365723, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 54.5546875, "completions/mean_terminated_length": 54.5546875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.43804034582132567, "frac_reward_zero_std": 0.0, "grad_norm": 0.38320592045783997, "kl": 4.06640625, "learning_rate": 5e-05, "loss": 0.0356, "num_tokens": 16047532.0, "reward": 8.9326171875, "reward_std": 0.3965485990047455, "rewards/helpfulness_reward/mean": 1.65185546875, "rewards/helpfulness_reward/std": 0.650546669960022, "rewards/safety_reward/mean": 8.9326171875, "rewards/safety_reward/std": 1.1391379833221436, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 54.9296875, "completions/mean_terminated_length": 54.9296875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4383896602916776, "frac_reward_zero_std": 0.0, "grad_norm": 0.44036123156547546, "kl": 4.568359375, "learning_rate": 5e-05, "loss": 0.0521, "num_tokens": 16058467.0, "reward": 9.00390625, "reward_std": 0.36819106340408325, "rewards/helpfulness_reward/mean": 1.3466949462890625, "rewards/helpfulness_reward/std": 0.6503205299377441, "rewards/safety_reward/mean": 9.00390625, "rewards/safety_reward/std": 1.2228788137435913, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.265625, "completions/mean_terminated_length": 54.265625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.4387389747620295, "frac_reward_zero_std": 0.0, "grad_norm": 0.36701032519340515, "kl": 4.111328125, "learning_rate": 5e-05, "loss": 0.0429, "num_tokens": 16069013.0, "reward": 8.5849609375, "reward_std": 0.37766575813293457, "rewards/helpfulness_reward/mean": 1.353318214416504, "rewards/helpfulness_reward/std": 0.697906494140625, "rewards/safety_reward/mean": 8.5849609375, "rewards/safety_reward/std": 0.6941666007041931, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.3671875, "completions/mean_terminated_length": 54.3671875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.43908828923238147, "frac_reward_zero_std": 0.0, "grad_norm": 0.4577186107635498, "kl": 4.244140625, "learning_rate": 5e-05, "loss": 0.0438, "num_tokens": 16079980.0, "reward": 8.667236328125, "reward_std": 0.24001023173332214, "rewards/helpfulness_reward/mean": 1.1401195526123047, "rewards/helpfulness_reward/std": 0.6567550897598267, "rewards/safety_reward/mean": 8.667236328125, "rewards/safety_reward/std": 0.7600882649421692, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.65625, "completions/mean_terminated_length": 54.65625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.4394376037027334, "frac_reward_zero_std": 0.0, "grad_norm": 0.30526313185691833, "kl": 4.41015625, "learning_rate": 5e-05, "loss": 0.0521, "num_tokens": 16090936.0, "reward": 8.957763671875, "reward_std": 0.2750583291053772, "rewards/helpfulness_reward/mean": 1.5687103271484375, "rewards/helpfulness_reward/std": 0.8503327369689941, "rewards/safety_reward/mean": 8.957763671875, "rewards/safety_reward/std": 1.1720502376556396, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 54.8671875, "completions/mean_terminated_length": 54.8671875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.4397869181730853, "frac_reward_zero_std": 0.0, "grad_norm": 0.366998553276062, "kl": 4.33203125, "learning_rate": 5e-05, "loss": 0.0543, "num_tokens": 16101855.0, "reward": 8.84326171875, "reward_std": 0.26167789101600647, "rewards/helpfulness_reward/mean": 0.9493303298950195, "rewards/helpfulness_reward/std": 0.6601280570030212, "rewards/safety_reward/mean": 8.84326171875, "rewards/safety_reward/std": 1.3300464153289795, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 54.6640625, "completions/mean_terminated_length": 54.6640625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.44013623264343726, "frac_reward_zero_std": 0.0, "grad_norm": 0.71518474817276, "kl": 4.38671875, "learning_rate": 5e-05, "loss": 0.0372, "num_tokens": 16112684.0, "reward": 8.7529296875, "reward_std": 0.3085519075393677, "rewards/helpfulness_reward/mean": 1.1775498390197754, "rewards/helpfulness_reward/std": 0.7703531384468079, "rewards/safety_reward/mean": 8.7529296875, "rewards/safety_reward/std": 0.8860161304473877, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.6796875, "completions/mean_terminated_length": 54.6796875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4404855471137892, "frac_reward_zero_std": 0.0, "grad_norm": 0.2750912606716156, "kl": 4.181640625, "learning_rate": 5e-05, "loss": 0.0417, "num_tokens": 16126891.0, "reward": 8.883392333984375, "reward_std": 0.218215674161911, "rewards/helpfulness_reward/mean": 0.9977989196777344, "rewards/helpfulness_reward/std": 0.9861380457878113, "rewards/safety_reward/mean": 8.883392333984375, "rewards/safety_reward/std": 2.2105040550231934, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.6953125, "completions/mean_terminated_length": 54.6953125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.44083486158414115, "frac_reward_zero_std": 0.0, "grad_norm": 0.3664362132549286, "kl": 4.076171875, "learning_rate": 5e-05, "loss": 0.0462, "num_tokens": 16138196.0, "reward": 9.004150390625, "reward_std": 0.25990357995033264, "rewards/helpfulness_reward/mean": 1.2391014099121094, "rewards/helpfulness_reward/std": 0.670915424823761, "rewards/safety_reward/mean": 9.004150390625, "rewards/safety_reward/std": 0.9520472884178162, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.6171875, "completions/mean_terminated_length": 54.6171875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.44118417605449306, "frac_reward_zero_std": 0.0, "grad_norm": 2.599759817123413, "kl": 5.908203125, "learning_rate": 5e-05, "loss": 0.0611, "num_tokens": 16148875.0, "reward": 9.107177734375, "reward_std": 0.26176947355270386, "rewards/helpfulness_reward/mean": 1.0507935285568237, "rewards/helpfulness_reward/std": 0.746579110622406, "rewards/safety_reward/mean": 9.107177734375, "rewards/safety_reward/std": 1.1568561792373657, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.7421875, "completions/mean_terminated_length": 54.7421875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.441533490524845, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3543317914009094, "kl": 4.212890625, "learning_rate": 5e-05, "loss": 0.0436, "num_tokens": 16164242.0, "reward": 8.392822265625, "reward_std": 0.25315892696380615, "rewards/helpfulness_reward/mean": 1.0708427429199219, "rewards/helpfulness_reward/std": 0.7277815341949463, "rewards/safety_reward/mean": 8.392822265625, "rewards/safety_reward/std": 2.01296067237854, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.5859375, "completions/mean_terminated_length": 54.5859375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.44188280499519694, "frac_reward_zero_std": 0.0, "grad_norm": 0.35064026713371277, "kl": 4.103515625, "learning_rate": 5e-05, "loss": 0.044, "num_tokens": 16175413.0, "reward": 9.074951171875, "reward_std": 0.22463549673557281, "rewards/helpfulness_reward/mean": 1.316638708114624, "rewards/helpfulness_reward/std": 0.7383719086647034, "rewards/safety_reward/mean": 9.074951171875, "rewards/safety_reward/std": 1.0460333824157715, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 54.2578125, "completions/mean_terminated_length": 54.2578125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.44223211946554886, "frac_reward_zero_std": 0.0, "grad_norm": 0.4899546802043915, "kl": 3.91015625, "learning_rate": 5e-05, "loss": 0.0443, "num_tokens": 16187710.0, "reward": 8.3321533203125, "reward_std": 0.4558059573173523, "rewards/helpfulness_reward/mean": 0.9688510894775391, "rewards/helpfulness_reward/std": 0.8581151366233826, "rewards/safety_reward/mean": 8.3321533203125, "rewards/safety_reward/std": 1.6668031215667725, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 54.7265625, "completions/mean_terminated_length": 54.7265625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.44258143393590077, "frac_reward_zero_std": 0.0, "grad_norm": 0.31362682580947876, "kl": 4.22265625, "learning_rate": 5e-05, "loss": 0.0473, "num_tokens": 16199555.0, "reward": 8.560546875, "reward_std": 0.2512689530849457, "rewards/helpfulness_reward/mean": 0.8896059989929199, "rewards/helpfulness_reward/std": 0.6985465288162231, "rewards/safety_reward/mean": 8.560546875, "rewards/safety_reward/std": 1.227258324623108, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 54.6953125, "completions/mean_terminated_length": 54.6953125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.44293074840625274, "frac_reward_zero_std": 0.0, "grad_norm": 2.37544584274292, "kl": 5.310546875, "learning_rate": 5e-05, "loss": 0.0652, "num_tokens": 16210508.0, "reward": 8.66455078125, "reward_std": 0.28450652956962585, "rewards/helpfulness_reward/mean": 1.0997228622436523, "rewards/helpfulness_reward/std": 0.7524151802062988, "rewards/safety_reward/mean": 8.66455078125, "rewards/safety_reward/std": 0.9955184459686279, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 54.46875, "completions/mean_terminated_length": 54.46875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.44328006287660465, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3651728630065918, "kl": 4.32421875, "learning_rate": 5e-05, "loss": 0.0449, "num_tokens": 16221776.0, "reward": 8.61871337890625, "reward_std": 0.45484572649002075, "rewards/helpfulness_reward/mean": 1.2333574295043945, "rewards/helpfulness_reward/std": 0.6781734824180603, "rewards/safety_reward/mean": 8.61871337890625, "rewards/safety_reward/std": 1.4478651285171509, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.5625, "completions/mean_terminated_length": 54.5625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.4436293773469566, "frac_reward_zero_std": 0.0, "grad_norm": 0.3532106280326843, "kl": 4.453125, "learning_rate": 5e-05, "loss": 0.0557, "num_tokens": 16232960.0, "reward": 9.001708984375, "reward_std": 0.294097363948822, "rewards/helpfulness_reward/mean": 1.3651466369628906, "rewards/helpfulness_reward/std": 0.6664187908172607, "rewards/safety_reward/mean": 9.001708984375, "rewards/safety_reward/std": 1.1531121730804443, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 55.1796875, "completions/mean_terminated_length": 55.1796875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.44397869181730854, "frac_reward_zero_std": 0.0, "grad_norm": 0.5303442478179932, "kl": 4.44921875, "learning_rate": 5e-05, "loss": 0.0571, "num_tokens": 16244871.0, "reward": 8.9176025390625, "reward_std": 0.278846800327301, "rewards/helpfulness_reward/mean": 1.2516860961914062, "rewards/helpfulness_reward/std": 0.5536853671073914, "rewards/safety_reward/mean": 8.9176025390625, "rewards/safety_reward/std": 1.6732326745986938, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.53125, "completions/mean_terminated_length": 54.53125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.44432800628766045, "frac_reward_zero_std": 0.0, "grad_norm": 0.3825179636478424, "kl": 4.28515625, "learning_rate": 5e-05, "loss": 0.0442, "num_tokens": 16256275.0, "reward": 8.892822265625, "reward_std": 0.32758674025535583, "rewards/helpfulness_reward/mean": 1.1662559509277344, "rewards/helpfulness_reward/std": 0.9330428838729858, "rewards/safety_reward/mean": 8.892822265625, "rewards/safety_reward/std": 1.0666797161102295, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.8125, "completions/mean_terminated_length": 54.8125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4446773207580124, "frac_reward_zero_std": 0.0, "grad_norm": 0.3765646815299988, "kl": 4.0625, "learning_rate": 5e-05, "loss": 0.0438, "num_tokens": 16268539.0, "reward": 8.893310546875, "reward_std": 0.34615445137023926, "rewards/helpfulness_reward/mean": 1.2687835693359375, "rewards/helpfulness_reward/std": 0.6784205436706543, "rewards/safety_reward/mean": 8.893310546875, "rewards/safety_reward/std": 1.161789894104004, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 54.6328125, "completions/mean_terminated_length": 54.6328125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.44502663522836433, "frac_reward_zero_std": 0.0, "grad_norm": 0.40694499015808105, "kl": 4.1796875, "learning_rate": 5e-05, "loss": 0.042, "num_tokens": 16281460.0, "reward": 8.7330322265625, "reward_std": 0.20465612411499023, "rewards/helpfulness_reward/mean": 1.0773239135742188, "rewards/helpfulness_reward/std": 0.7074235677719116, "rewards/safety_reward/mean": 8.7330322265625, "rewards/safety_reward/std": 1.8640865087509155, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 56.7421875, "completions/mean_terminated_length": 56.7421875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.44537594969871624, "frac_reward_zero_std": 0.0, "grad_norm": 0.32547807693481445, "kl": 4.0, "learning_rate": 5e-05, "loss": 0.0365, "num_tokens": 16294355.0, "reward": 8.7037353515625, "reward_std": 0.21712979674339294, "rewards/helpfulness_reward/mean": 1.3958444595336914, "rewards/helpfulness_reward/std": 0.687960684299469, "rewards/safety_reward/mean": 8.7037353515625, "rewards/safety_reward/std": 2.0892722606658936, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 55.1875, "completions/mean_terminated_length": 55.1875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.4457252641690682, "frac_reward_zero_std": 0.0, "grad_norm": 0.373296856880188, "kl": 4.15234375, "learning_rate": 5e-05, "loss": 0.0581, "num_tokens": 16305611.0, "reward": 8.91650390625, "reward_std": 0.2411263883113861, "rewards/helpfulness_reward/mean": 1.0933008193969727, "rewards/helpfulness_reward/std": 0.7041372060775757, "rewards/safety_reward/mean": 8.91650390625, "rewards/safety_reward/std": 0.9025396704673767, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 54.5625, "completions/mean_terminated_length": 54.5625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.4460745786394201, "frac_reward_zero_std": 0.0, "grad_norm": 0.3980865478515625, "kl": 4.337890625, "learning_rate": 5e-05, "loss": 0.0528, "num_tokens": 16316107.0, "reward": 8.675048828125, "reward_std": 0.3387872874736786, "rewards/helpfulness_reward/mean": 1.3759613037109375, "rewards/helpfulness_reward/std": 0.530157208442688, "rewards/safety_reward/mean": 8.675048828125, "rewards/safety_reward/std": 0.9331207871437073, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.4765625, "completions/mean_terminated_length": 54.4765625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.4464238931097721, "frac_reward_zero_std": 0.0, "grad_norm": 0.3711009919643402, "kl": 4.099609375, "learning_rate": 5e-05, "loss": 0.0499, "num_tokens": 16327576.0, "reward": 8.7896728515625, "reward_std": 0.3259470760822296, "rewards/helpfulness_reward/mean": 1.3982009887695312, "rewards/helpfulness_reward/std": 0.5734430551528931, "rewards/safety_reward/mean": 8.7896728515625, "rewards/safety_reward/std": 1.591294527053833, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 54.984375, "completions/mean_terminated_length": 54.984375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.446773207580124, "frac_reward_zero_std": 0.0, "grad_norm": 0.5577332973480225, "kl": 4.5, "learning_rate": 5e-05, "loss": 0.0514, "num_tokens": 16339462.0, "reward": 8.550537109375, "reward_std": 0.2966303527355194, "rewards/helpfulness_reward/mean": 1.0166983604431152, "rewards/helpfulness_reward/std": 0.7345902919769287, "rewards/safety_reward/mean": 8.550537109375, "rewards/safety_reward/std": 1.4109861850738525, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.125, "completions/mean_terminated_length": 54.125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.4471225220504759, "frac_reward_zero_std": 0.0, "grad_norm": 0.33492353558540344, "kl": 4.08984375, "learning_rate": 5e-05, "loss": 0.0412, "num_tokens": 16353470.0, "reward": 8.474609375, "reward_std": 0.34500396251678467, "rewards/helpfulness_reward/mean": 1.3632698059082031, "rewards/helpfulness_reward/std": 0.603699266910553, "rewards/safety_reward/mean": 8.474609375, "rewards/safety_reward/std": 1.4142394065856934, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 55.0078125, "completions/mean_terminated_length": 55.0078125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.4474718365208279, "frac_reward_zero_std": 0.0, "grad_norm": 0.32240045070648193, "kl": 3.962890625, "learning_rate": 5e-05, "loss": 0.0514, "num_tokens": 16365631.0, "reward": 9.028564453125, "reward_std": 0.2264367938041687, "rewards/helpfulness_reward/mean": 1.3069038391113281, "rewards/helpfulness_reward/std": 0.8186670541763306, "rewards/safety_reward/mean": 9.028564453125, "rewards/safety_reward/std": 1.2569199800491333, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 54.375, "completions/mean_terminated_length": 54.375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.4478211509911798, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3610787093639374, "kl": 4.076171875, "learning_rate": 5e-05, "loss": 0.0411, "num_tokens": 16376655.0, "reward": 9.178466796875, "reward_std": 0.25548055768013, "rewards/helpfulness_reward/mean": 1.330596923828125, "rewards/helpfulness_reward/std": 0.6574512124061584, "rewards/safety_reward/mean": 9.178466796875, "rewards/safety_reward/std": 1.481960415840149, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.4481704654615317, "frac_reward_zero_std": 0.0, "grad_norm": 0.3331916630268097, "kl": 4.30859375, "learning_rate": 5e-05, "loss": 0.0647, "num_tokens": 16387807.0, "reward": 9.145263671875, "reward_std": 0.3683898448944092, "rewards/helpfulness_reward/mean": 1.51959228515625, "rewards/helpfulness_reward/std": 0.9017584323883057, "rewards/safety_reward/mean": 9.145263671875, "rewards/safety_reward/std": 1.133305311203003, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.5390625, "completions/mean_terminated_length": 54.5390625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.4485197799318837, "frac_reward_zero_std": 0.0, "grad_norm": 0.9156083464622498, "kl": 4.78515625, "learning_rate": 5e-05, "loss": 0.0468, "num_tokens": 16399364.0, "reward": 8.89697265625, "reward_std": 0.19768494367599487, "rewards/helpfulness_reward/mean": 1.341592788696289, "rewards/helpfulness_reward/std": 0.6409192681312561, "rewards/safety_reward/mean": 8.89697265625, "rewards/safety_reward/std": 1.220067024230957, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 54.9296875, "completions/mean_terminated_length": 54.9296875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.4488690944022356, "frac_reward_zero_std": 0.0, "grad_norm": 0.3905065655708313, "kl": 3.962890625, "learning_rate": 5e-05, "loss": 0.0469, "num_tokens": 16410499.0, "reward": 9.125, "reward_std": 0.32987290620803833, "rewards/helpfulness_reward/mean": 1.4668140411376953, "rewards/helpfulness_reward/std": 0.7275771498680115, "rewards/safety_reward/mean": 9.125, "rewards/safety_reward/std": 1.0036689043045044, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 54.3203125, "completions/mean_terminated_length": 54.3203125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.44921840887258757, "frac_reward_zero_std": 0.0, "grad_norm": 0.33424443006515503, "kl": 4.00390625, "learning_rate": 5e-05, "loss": 0.0476, "num_tokens": 16421324.0, "reward": 8.560791015625, "reward_std": 0.2454202026128769, "rewards/helpfulness_reward/mean": 1.3354854583740234, "rewards/helpfulness_reward/std": 0.5682694315910339, "rewards/safety_reward/mean": 8.560791015625, "rewards/safety_reward/std": 1.1723436117172241, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.9453125, "completions/mean_terminated_length": 53.9453125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.4495677233429395, "frac_reward_zero_std": 0.0, "grad_norm": 0.31273162364959717, "kl": 4.123046875, "learning_rate": 5e-05, "loss": 0.0458, "num_tokens": 16432181.0, "reward": 8.7919921875, "reward_std": 0.29398271441459656, "rewards/helpfulness_reward/mean": 1.533172607421875, "rewards/helpfulness_reward/std": 0.5803150534629822, "rewards/safety_reward/mean": 8.7919921875, "rewards/safety_reward/std": 1.1600936651229858, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 54.6640625, "completions/mean_terminated_length": 54.6640625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4499170378132914, "frac_reward_zero_std": 0.0, "grad_norm": 0.30581581592559814, "kl": 4.2578125, "learning_rate": 5e-05, "loss": 0.0476, "num_tokens": 16444282.0, "reward": 9.07666015625, "reward_std": 0.18804436922073364, "rewards/helpfulness_reward/mean": 1.1502091884613037, "rewards/helpfulness_reward/std": 0.6724721789360046, "rewards/safety_reward/mean": 9.07666015625, "rewards/safety_reward/std": 1.0295960903167725, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.53125, "completions/mean_terminated_length": 54.53125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.45026635228364337, "frac_reward_zero_std": 0.0, "grad_norm": 3.0866572856903076, "kl": 4.98828125, "learning_rate": 5e-05, "loss": 0.0527, "num_tokens": 16455846.0, "reward": 8.952880859375, "reward_std": 0.19531339406967163, "rewards/helpfulness_reward/mean": 1.3862991333007812, "rewards/helpfulness_reward/std": 0.9006926417350769, "rewards/safety_reward/mean": 8.952880859375, "rewards/safety_reward/std": 1.122502326965332, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 54.5625, "completions/mean_terminated_length": 54.5625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.4506156667539953, "frac_reward_zero_std": 0.0, "grad_norm": 0.45419323444366455, "kl": 4.45703125, "learning_rate": 5e-05, "loss": 0.0482, "num_tokens": 16468094.0, "reward": 8.824951171875, "reward_std": 0.2642281651496887, "rewards/helpfulness_reward/mean": 1.1791415214538574, "rewards/helpfulness_reward/std": 0.9325027465820312, "rewards/safety_reward/mean": 8.824951171875, "rewards/safety_reward/std": 1.8128100633621216, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.2109375, "completions/mean_terminated_length": 54.2109375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.4509649812243472, "frac_reward_zero_std": 0.0, "grad_norm": 0.3234292268753052, "kl": 4.095703125, "learning_rate": 5e-05, "loss": 0.0388, "num_tokens": 16479049.0, "reward": 9.3760986328125, "reward_std": 0.33773499727249146, "rewards/helpfulness_reward/mean": 1.6795654296875, "rewards/helpfulness_reward/std": 0.8205215930938721, "rewards/safety_reward/mean": 9.3760986328125, "rewards/safety_reward/std": 0.9557849764823914, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.671875, "completions/mean_terminated_length": 54.671875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.45131429569469916, "frac_reward_zero_std": 0.0, "grad_norm": 0.34366828203201294, "kl": 4.15625, "learning_rate": 5e-05, "loss": 0.0508, "num_tokens": 16491503.0, "reward": 8.459228515625, "reward_std": 0.3074818253517151, "rewards/helpfulness_reward/mean": 1.078801155090332, "rewards/helpfulness_reward/std": 0.7078152298927307, "rewards/safety_reward/mean": 8.459228515625, "rewards/safety_reward/std": 1.5860555171966553, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 54.5625, "completions/mean_terminated_length": 54.5625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.4516636101650511, "frac_reward_zero_std": 0.0, "grad_norm": 0.3418307900428772, "kl": 4.181640625, "learning_rate": 5e-05, "loss": 0.0386, "num_tokens": 16504479.0, "reward": 9.193603515625, "reward_std": 0.25801876187324524, "rewards/helpfulness_reward/mean": 1.2614593505859375, "rewards/helpfulness_reward/std": 0.8200598955154419, "rewards/safety_reward/mean": 9.193603515625, "rewards/safety_reward/std": 0.8291566967964172, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 53.890625, "completions/mean_terminated_length": 53.890625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.45201292463540305, "frac_reward_zero_std": 0.0, "grad_norm": 0.40162718296051025, "kl": 4.091796875, "learning_rate": 5e-05, "loss": 0.0454, "num_tokens": 16515953.0, "reward": 8.602294921875, "reward_std": 0.29259225726127625, "rewards/helpfulness_reward/mean": 1.2350287437438965, "rewards/helpfulness_reward/std": 0.6080284714698792, "rewards/safety_reward/mean": 8.602294921875, "rewards/safety_reward/std": 0.9969326853752136, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 54.796875, "completions/mean_terminated_length": 54.796875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.45236223910575496, "frac_reward_zero_std": 0.0, "grad_norm": 0.4072290360927582, "kl": 4.201171875, "learning_rate": 5e-05, "loss": 0.054, "num_tokens": 16530023.0, "reward": 8.18194580078125, "reward_std": 0.3025447726249695, "rewards/helpfulness_reward/mean": 1.1851463317871094, "rewards/helpfulness_reward/std": 1.1338880062103271, "rewards/safety_reward/mean": 8.18194580078125, "rewards/safety_reward/std": 1.8749754428863525, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 93.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 55.1015625, "completions/mean_terminated_length": 55.1015625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.4527115535761069, "frac_reward_zero_std": 0.0, "grad_norm": 0.4369228482246399, "kl": 4.05078125, "learning_rate": 5e-05, "loss": 0.0532, "num_tokens": 16541820.0, "reward": 8.465576171875, "reward_std": 0.5059807300567627, "rewards/helpfulness_reward/mean": 1.5077961683273315, "rewards/helpfulness_reward/std": 0.6854003667831421, "rewards/safety_reward/mean": 8.465576171875, "rewards/safety_reward/std": 1.4615669250488281, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.140625, "completions/mean_terminated_length": 54.140625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.45306086804645884, "frac_reward_zero_std": 0.0, "grad_norm": 0.7111404538154602, "kl": 4.615234375, "learning_rate": 5e-05, "loss": 0.0468, "num_tokens": 16553686.0, "reward": 9.10009765625, "reward_std": 0.25710630416870117, "rewards/helpfulness_reward/mean": 1.3102972507476807, "rewards/helpfulness_reward/std": 0.5779079794883728, "rewards/safety_reward/mean": 9.10009765625, "rewards/safety_reward/std": 1.1128662824630737, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.4453125, "completions/mean_terminated_length": 54.4453125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.45341018251681076, "frac_reward_zero_std": 0.0, "grad_norm": 0.36019060015678406, "kl": 4.0859375, "learning_rate": 5e-05, "loss": 0.0467, "num_tokens": 16565863.0, "reward": 8.468505859375, "reward_std": 0.3569789528846741, "rewards/helpfulness_reward/mean": 1.3439738750457764, "rewards/helpfulness_reward/std": 0.6915069222450256, "rewards/safety_reward/mean": 8.468505859375, "rewards/safety_reward/std": 1.2836004495620728, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 54.53125, "completions/mean_terminated_length": 54.53125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.45375949698716267, "frac_reward_zero_std": 0.0, "grad_norm": 0.5316343307495117, "kl": 4.052734375, "learning_rate": 5e-05, "loss": 0.067, "num_tokens": 16576371.0, "reward": 9.1455078125, "reward_std": 0.387466162443161, "rewards/helpfulness_reward/mean": 1.3830642700195312, "rewards/helpfulness_reward/std": 0.681805431842804, "rewards/safety_reward/mean": 9.1455078125, "rewards/safety_reward/std": 1.018600344657898, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 54.4296875, "completions/mean_terminated_length": 54.4296875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.45410881145751464, "frac_reward_zero_std": 0.0, "grad_norm": 0.3336781859397888, "kl": 3.921875, "learning_rate": 5e-05, "loss": 0.0463, "num_tokens": 16587714.0, "reward": 8.331298828125, "reward_std": 0.2733217775821686, "rewards/helpfulness_reward/mean": 1.3208465576171875, "rewards/helpfulness_reward/std": 0.794882595539093, "rewards/safety_reward/mean": 8.331298828125, "rewards/safety_reward/std": 1.3809393644332886, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 54.859375, "completions/mean_terminated_length": 54.859375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.45445812592786655, "frac_reward_zero_std": 0.0, "grad_norm": 0.3258124589920044, "kl": 3.951171875, "learning_rate": 5e-05, "loss": 0.0494, "num_tokens": 16598656.0, "reward": 8.547119140625, "reward_std": 0.3663087785243988, "rewards/helpfulness_reward/mean": 1.4809532165527344, "rewards/helpfulness_reward/std": 0.7092834115028381, "rewards/safety_reward/mean": 8.547119140625, "rewards/safety_reward/std": 1.3485809564590454, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 51.3046875, "completions/mean_terminated_length": 51.3046875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.4548074403982185, "frac_reward_zero_std": 0.0625, "grad_norm": 0.38476210832595825, "kl": 4.1328125, "learning_rate": 5e-05, "loss": 0.0412, "num_tokens": 16610031.0, "reward": 8.5380859375, "reward_std": 0.2470550537109375, "rewards/helpfulness_reward/mean": 1.2407147884368896, "rewards/helpfulness_reward/std": 0.8665586113929749, "rewards/safety_reward/mean": 8.5380859375, "rewards/safety_reward/std": 1.6662269830703735, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.4921875, "completions/mean_terminated_length": 54.4921875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.45515675486857043, "frac_reward_zero_std": 0.0, "grad_norm": 0.4467582106590271, "kl": 3.845703125, "learning_rate": 5e-05, "loss": 0.0537, "num_tokens": 16621846.0, "reward": 8.44775390625, "reward_std": 0.3090839982032776, "rewards/helpfulness_reward/mean": 1.282327651977539, "rewards/helpfulness_reward/std": 0.742978036403656, "rewards/safety_reward/mean": 8.44775390625, "rewards/safety_reward/std": 0.8426612019538879, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.46875, "completions/mean_terminated_length": 54.46875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.45550606933892235, "frac_reward_zero_std": 0.0625, "grad_norm": 0.27289867401123047, "kl": 3.84765625, "learning_rate": 5e-05, "loss": 0.0449, "num_tokens": 16633010.0, "reward": 8.6943359375, "reward_std": 0.16411104798316956, "rewards/helpfulness_reward/mean": 1.251516342163086, "rewards/helpfulness_reward/std": 0.5888019800186157, "rewards/safety_reward/mean": 8.6943359375, "rewards/safety_reward/std": 1.364721655845642, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 63.7578125, "completions/mean_terminated_length": 63.7578125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.4558553838092743, "frac_reward_zero_std": 0.0, "grad_norm": 0.32699474692344666, "kl": 3.681640625, "learning_rate": 5e-05, "loss": 0.0969, "num_tokens": 16645747.0, "reward": 8.75347900390625, "reward_std": 0.24589978158473969, "rewards/helpfulness_reward/mean": 1.3179912567138672, "rewards/helpfulness_reward/std": 0.8802310228347778, "rewards/safety_reward/mean": 8.75347900390625, "rewards/safety_reward/std": 1.9141947031021118, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.390625, "completions/mean_terminated_length": 54.390625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.45620469827962623, "frac_reward_zero_std": 0.0, "grad_norm": 0.3962641656398773, "kl": 4.27734375, "learning_rate": 5e-05, "loss": 0.05, "num_tokens": 16656757.0, "reward": 8.0203857421875, "reward_std": 0.3339694142341614, "rewards/helpfulness_reward/mean": 1.2261238098144531, "rewards/helpfulness_reward/std": 0.7898004055023193, "rewards/safety_reward/mean": 8.0203857421875, "rewards/safety_reward/std": 1.2005434036254883, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.8984375, "completions/mean_terminated_length": 53.8984375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.45655401274997814, "frac_reward_zero_std": 0.0, "grad_norm": 0.3341130018234253, "kl": 4.02734375, "learning_rate": 5e-05, "loss": 0.0434, "num_tokens": 16667288.0, "reward": 8.557861328125, "reward_std": 0.1989850252866745, "rewards/helpfulness_reward/mean": 1.2048192024230957, "rewards/helpfulness_reward/std": 0.693268358707428, "rewards/safety_reward/mean": 8.557861328125, "rewards/safety_reward/std": 0.7972822189331055, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.2265625, "completions/mean_terminated_length": 54.2265625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.4569033272203301, "frac_reward_zero_std": 0.0, "grad_norm": 0.2990928590297699, "kl": 3.919921875, "learning_rate": 5e-05, "loss": 0.0401, "num_tokens": 16678821.0, "reward": 8.514404296875, "reward_std": 0.2544885575771332, "rewards/helpfulness_reward/mean": 1.2887744903564453, "rewards/helpfulness_reward/std": 0.7161173820495605, "rewards/safety_reward/mean": 8.514404296875, "rewards/safety_reward/std": 1.4483548402786255, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 55.171875, "completions/mean_terminated_length": 55.171875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.457252641690682, "frac_reward_zero_std": 0.0, "grad_norm": 0.4612555205821991, "kl": 4.4609375, "learning_rate": 5e-05, "loss": 0.0632, "num_tokens": 16691835.0, "reward": 8.75146484375, "reward_std": 0.3668271601200104, "rewards/helpfulness_reward/mean": 1.267913818359375, "rewards/helpfulness_reward/std": 0.7914902567863464, "rewards/safety_reward/mean": 8.75146484375, "rewards/safety_reward/std": 1.1141334772109985, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.71875, "completions/mean_terminated_length": 54.71875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.457601956161034, "frac_reward_zero_std": 0.0, "grad_norm": 0.33490848541259766, "kl": 3.912109375, "learning_rate": 5e-05, "loss": 0.0507, "num_tokens": 16702967.0, "reward": 9.070556640625, "reward_std": 0.3019505441188812, "rewards/helpfulness_reward/mean": 1.273843765258789, "rewards/helpfulness_reward/std": 0.7994351387023926, "rewards/safety_reward/mean": 9.070556640625, "rewards/safety_reward/std": 1.1573584079742432, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.671875, "completions/mean_terminated_length": 54.671875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.4579512706313859, "frac_reward_zero_std": 0.0, "grad_norm": 0.4618084728717804, "kl": 4.095703125, "learning_rate": 5e-05, "loss": 0.055, "num_tokens": 16714517.0, "reward": 8.84326171875, "reward_std": 0.34068432450294495, "rewards/helpfulness_reward/mean": 1.2199463844299316, "rewards/helpfulness_reward/std": 0.6510840654373169, "rewards/safety_reward/mean": 8.84326171875, "rewards/safety_reward/std": 1.001237154006958, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.6640625, "completions/mean_terminated_length": 54.6640625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4583005851017378, "frac_reward_zero_std": 0.0, "grad_norm": 0.4255461096763611, "kl": 4.095703125, "learning_rate": 5e-05, "loss": 0.0478, "num_tokens": 16725634.0, "reward": 8.836181640625, "reward_std": 0.2395397126674652, "rewards/helpfulness_reward/mean": 1.092742919921875, "rewards/helpfulness_reward/std": 0.7672399878501892, "rewards/safety_reward/mean": 8.836181640625, "rewards/safety_reward/std": 1.8022370338439941, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 54.625, "completions/mean_terminated_length": 54.625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.4586498995720898, "frac_reward_zero_std": 0.0, "grad_norm": 0.33408814668655396, "kl": 4.33203125, "learning_rate": 5e-05, "loss": 0.0433, "num_tokens": 16736450.0, "reward": 9.0791015625, "reward_std": 0.2857498526573181, "rewards/helpfulness_reward/mean": 1.3524646759033203, "rewards/helpfulness_reward/std": 0.6839233040809631, "rewards/safety_reward/mean": 9.0791015625, "rewards/safety_reward/std": 1.1239670515060425, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 54.375, "completions/mean_terminated_length": 54.375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4589992140424417, "frac_reward_zero_std": 0.0, "grad_norm": 0.49755430221557617, "kl": 4.123046875, "learning_rate": 5e-05, "loss": 0.0563, "num_tokens": 16747618.0, "reward": 8.88671875, "reward_std": 0.37645936012268066, "rewards/helpfulness_reward/mean": 1.3034820556640625, "rewards/helpfulness_reward/std": 0.4476100504398346, "rewards/safety_reward/mean": 8.88671875, "rewards/safety_reward/std": 0.9881261587142944, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 54.53125, "completions/mean_terminated_length": 54.53125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.4593485285127936, "frac_reward_zero_std": 0.0, "grad_norm": 0.2883858382701874, "kl": 4.02734375, "learning_rate": 5e-05, "loss": 0.0403, "num_tokens": 16759606.0, "reward": 8.82763671875, "reward_std": 0.16986137628555298, "rewards/helpfulness_reward/mean": 1.33673095703125, "rewards/helpfulness_reward/std": 0.7546889781951904, "rewards/safety_reward/mean": 8.82763671875, "rewards/safety_reward/std": 1.1483856439590454, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.7578125, "completions/mean_terminated_length": 54.7578125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4596978429831456, "frac_reward_zero_std": 0.0, "grad_norm": 0.5415982007980347, "kl": 3.869140625, "learning_rate": 5e-05, "loss": 0.0385, "num_tokens": 16771351.0, "reward": 8.85791015625, "reward_std": 0.2781106233596802, "rewards/helpfulness_reward/mean": 1.263718605041504, "rewards/helpfulness_reward/std": 0.8756296634674072, "rewards/safety_reward/mean": 8.85791015625, "rewards/safety_reward/std": 1.4724067449569702, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.453125, "completions/mean_terminated_length": 54.453125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.4600471574534975, "frac_reward_zero_std": 0.0, "grad_norm": 0.32648566365242004, "kl": 4.234375, "learning_rate": 5e-05, "loss": 0.0355, "num_tokens": 16782857.0, "reward": 9.37890625, "reward_std": 0.24030697345733643, "rewards/helpfulness_reward/mean": 1.4879379272460938, "rewards/helpfulness_reward/std": 0.5802586674690247, "rewards/safety_reward/mean": 9.37890625, "rewards/safety_reward/std": 1.3710798025131226, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 54.6328125, "completions/mean_terminated_length": 54.6328125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.46039647192384947, "frac_reward_zero_std": 0.0, "grad_norm": 0.3783320486545563, "kl": 3.9453125, "learning_rate": 5e-05, "loss": 0.0462, "num_tokens": 16793882.0, "reward": 8.592529296875, "reward_std": 0.3027460277080536, "rewards/helpfulness_reward/mean": 1.1505260467529297, "rewards/helpfulness_reward/std": 0.8679041862487793, "rewards/safety_reward/mean": 8.592529296875, "rewards/safety_reward/std": 0.9002986550331116, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 54.21875, "completions/mean_terminated_length": 54.21875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.4607457863942014, "frac_reward_zero_std": 0.0, "grad_norm": 0.3328794240951538, "kl": 4.171875, "learning_rate": 5e-05, "loss": 0.0398, "num_tokens": 16804558.0, "reward": 9.27294921875, "reward_std": 0.2862485647201538, "rewards/helpfulness_reward/mean": 1.4334564208984375, "rewards/helpfulness_reward/std": 0.6565537452697754, "rewards/safety_reward/mean": 9.27294921875, "rewards/safety_reward/std": 1.3914906978607178, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 54.4453125, "completions/mean_terminated_length": 54.4453125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4610951008645533, "frac_reward_zero_std": 0.0, "grad_norm": 0.7018808722496033, "kl": 4.69140625, "learning_rate": 5e-05, "loss": 0.0497, "num_tokens": 16815575.0, "reward": 9.344482421875, "reward_std": 0.26020681858062744, "rewards/helpfulness_reward/mean": 1.1409635543823242, "rewards/helpfulness_reward/std": 0.7175666093826294, "rewards/safety_reward/mean": 9.344482421875, "rewards/safety_reward/std": 0.9173227548599243, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.4453125, "completions/mean_terminated_length": 54.4453125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.46144441533490527, "frac_reward_zero_std": 0.0, "grad_norm": 0.5046210885047913, "kl": 4.322265625, "learning_rate": 5e-05, "loss": 0.047, "num_tokens": 16826328.0, "reward": 9.54638671875, "reward_std": 0.22110208868980408, "rewards/helpfulness_reward/mean": 1.3420486450195312, "rewards/helpfulness_reward/std": 0.8016624450683594, "rewards/safety_reward/mean": 9.54638671875, "rewards/safety_reward/std": 1.2122243642807007, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 54.09375, "completions/mean_terminated_length": 54.09375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.4617937298052572, "frac_reward_zero_std": 0.0, "grad_norm": 0.3557213842868805, "kl": 3.943359375, "learning_rate": 5e-05, "loss": 0.0396, "num_tokens": 16837732.0, "reward": 8.957763671875, "reward_std": 0.18539148569107056, "rewards/helpfulness_reward/mean": 1.592437744140625, "rewards/helpfulness_reward/std": 0.6225117444992065, "rewards/safety_reward/mean": 8.957763671875, "rewards/safety_reward/std": 1.1668031215667725, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 55.015625, "completions/mean_terminated_length": 55.015625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.4621430442756091, "frac_reward_zero_std": 0.0625, "grad_norm": 0.34568941593170166, "kl": 4.42578125, "learning_rate": 5e-05, "loss": 0.0556, "num_tokens": 16849166.0, "reward": 9.5224609375, "reward_std": 0.2524636387825012, "rewards/helpfulness_reward/mean": 1.4290008544921875, "rewards/helpfulness_reward/std": 0.5531143546104431, "rewards/safety_reward/mean": 9.5224609375, "rewards/safety_reward/std": 0.9930858612060547, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.359375, "completions/mean_terminated_length": 54.359375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.46249235874596106, "frac_reward_zero_std": 0.0, "grad_norm": 0.3371945321559906, "kl": 4.240234375, "learning_rate": 5e-05, "loss": 0.049, "num_tokens": 16860332.0, "reward": 8.95654296875, "reward_std": 0.20112740993499756, "rewards/helpfulness_reward/mean": 1.4893097877502441, "rewards/helpfulness_reward/std": 0.9119543433189392, "rewards/safety_reward/mean": 8.95654296875, "rewards/safety_reward/std": 0.8136465549468994, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 54.9140625, "completions/mean_terminated_length": 54.9140625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.462841673216313, "frac_reward_zero_std": 0.0, "grad_norm": 0.4308638274669647, "kl": 4.123046875, "learning_rate": 5e-05, "loss": 0.0538, "num_tokens": 16871297.0, "reward": 9.341796875, "reward_std": 0.2687157690525055, "rewards/helpfulness_reward/mean": 1.2469520568847656, "rewards/helpfulness_reward/std": 0.8314929604530334, "rewards/safety_reward/mean": 9.341796875, "rewards/safety_reward/std": 1.0641252994537354, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 54.4921875, "completions/mean_terminated_length": 54.4921875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.46319098768666495, "frac_reward_zero_std": 0.0, "grad_norm": 0.3378441035747528, "kl": 3.828125, "learning_rate": 5e-05, "loss": 0.0513, "num_tokens": 16882912.0, "reward": 9.068115234375, "reward_std": 0.19043222069740295, "rewards/helpfulness_reward/mean": 1.4136290550231934, "rewards/helpfulness_reward/std": 0.5622181296348572, "rewards/safety_reward/mean": 9.068115234375, "rewards/safety_reward/std": 0.798345685005188, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 54.3984375, "completions/mean_terminated_length": 54.3984375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.46354030215701686, "frac_reward_zero_std": 0.0, "grad_norm": 0.36076268553733826, "kl": 4.126953125, "learning_rate": 5e-05, "loss": 0.0414, "num_tokens": 16895275.0, "reward": 8.991943359375, "reward_std": 0.2526935338973999, "rewards/helpfulness_reward/mean": 1.40960693359375, "rewards/helpfulness_reward/std": 0.7123595476150513, "rewards/safety_reward/mean": 8.991943359375, "rewards/safety_reward/std": 1.0517436265945435, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.671875, "completions/mean_terminated_length": 54.671875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4638896166273688, "frac_reward_zero_std": 0.0, "grad_norm": 0.3894818425178528, "kl": 4.15625, "learning_rate": 5e-05, "loss": 0.0538, "num_tokens": 16907585.0, "reward": 8.575927734375, "reward_std": 0.2833002209663391, "rewards/helpfulness_reward/mean": 1.3301239013671875, "rewards/helpfulness_reward/std": 0.8020328879356384, "rewards/safety_reward/mean": 8.575927734375, "rewards/safety_reward/std": 1.2374783754348755, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.3515625, "completions/mean_terminated_length": 54.3515625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.46423893109772074, "frac_reward_zero_std": 0.0, "grad_norm": 0.6645556092262268, "kl": 4.46875, "learning_rate": 5e-05, "loss": 0.0537, "num_tokens": 16919582.0, "reward": 8.889404296875, "reward_std": 0.26447826623916626, "rewards/helpfulness_reward/mean": 1.450164794921875, "rewards/helpfulness_reward/std": 0.833283007144928, "rewards/safety_reward/mean": 8.889404296875, "rewards/safety_reward/std": 1.0722177028656006, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.0859375, "completions/mean_terminated_length": 54.0859375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.46458824556807266, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3530852794647217, "kl": 3.900390625, "learning_rate": 5e-05, "loss": 0.0517, "num_tokens": 16930041.0, "reward": 8.8662109375, "reward_std": 0.19953294098377228, "rewards/helpfulness_reward/mean": 1.2400107383728027, "rewards/helpfulness_reward/std": 0.8368192315101624, "rewards/safety_reward/mean": 8.8662109375, "rewards/safety_reward/std": 0.9391958117485046, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 54.0078125, "completions/mean_terminated_length": 54.0078125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.46493756003842457, "frac_reward_zero_std": 0.0, "grad_norm": 0.4643813967704773, "kl": 4.390625, "learning_rate": 5e-05, "loss": 0.0416, "num_tokens": 16940858.0, "reward": 8.42431640625, "reward_std": 0.24217896163463593, "rewards/helpfulness_reward/mean": 1.234025001525879, "rewards/helpfulness_reward/std": 0.771935760974884, "rewards/safety_reward/mean": 8.42431640625, "rewards/safety_reward/std": 1.0810647010803223, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 54.4296875, "completions/mean_terminated_length": 54.4296875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.46528687450877654, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3803219199180603, "kl": 4.0390625, "learning_rate": 5e-05, "loss": 0.0566, "num_tokens": 16952809.0, "reward": 8.7109375, "reward_std": 0.26094144582748413, "rewards/helpfulness_reward/mean": 1.233863115310669, "rewards/helpfulness_reward/std": 0.8027662038803101, "rewards/safety_reward/mean": 8.7109375, "rewards/safety_reward/std": 1.1570444107055664, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 54.1328125, "completions/mean_terminated_length": 54.1328125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.46563618897912845, "frac_reward_zero_std": 0.0, "grad_norm": 0.3781585395336151, "kl": 4.1875, "learning_rate": 5e-05, "loss": 0.0447, "num_tokens": 16964066.0, "reward": 8.846923828125, "reward_std": 0.23276287317276, "rewards/helpfulness_reward/mean": 1.2772560119628906, "rewards/helpfulness_reward/std": 0.8325515985488892, "rewards/safety_reward/mean": 8.846923828125, "rewards/safety_reward/std": 1.164429783821106, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 54.4765625, "completions/mean_terminated_length": 54.4765625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.4659855034494804, "frac_reward_zero_std": 0.0, "grad_norm": 0.4388837516307831, "kl": 4.36328125, "learning_rate": 5e-05, "loss": 0.0555, "num_tokens": 16976167.0, "reward": 8.874755859375, "reward_std": 0.2702590227127075, "rewards/helpfulness_reward/mean": 1.3269407749176025, "rewards/helpfulness_reward/std": 0.9060660600662231, "rewards/safety_reward/mean": 8.874755859375, "rewards/safety_reward/std": 1.061381220817566, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 53.890625, "completions/mean_terminated_length": 53.890625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.46633481791983233, "frac_reward_zero_std": 0.125, "grad_norm": 0.3168525993824005, "kl": 4.013671875, "learning_rate": 5e-05, "loss": 0.0399, "num_tokens": 16987065.0, "reward": 8.740234375, "reward_std": 0.17774690687656403, "rewards/helpfulness_reward/mean": 1.2039375305175781, "rewards/helpfulness_reward/std": 0.8153295516967773, "rewards/safety_reward/mean": 8.740234375, "rewards/safety_reward/std": 0.9963080286979675, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 54.1171875, "completions/mean_terminated_length": 54.1171875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.46668413239018425, "frac_reward_zero_std": 0.0, "grad_norm": 0.3167761564254761, "kl": 4.15234375, "learning_rate": 5e-05, "loss": 0.0436, "num_tokens": 16997840.0, "reward": 9.117919921875, "reward_std": 0.19942864775657654, "rewards/helpfulness_reward/mean": 1.4260039329528809, "rewards/helpfulness_reward/std": 0.6435471177101135, "rewards/safety_reward/mean": 9.117919921875, "rewards/safety_reward/std": 0.9736407399177551, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.4670334468605362, "frac_reward_zero_std": 0.0, "grad_norm": 0.3784959316253662, "kl": 4.0625, "learning_rate": 5e-05, "loss": 0.0394, "num_tokens": 17009112.0, "reward": 9.049072265625, "reward_std": 0.2336081862449646, "rewards/helpfulness_reward/mean": 1.464834213256836, "rewards/helpfulness_reward/std": 0.7478013038635254, "rewards/safety_reward/mean": 9.049072265625, "rewards/safety_reward/std": 1.2184735536575317, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 55.1484375, "completions/mean_terminated_length": 55.1484375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.46738276133088813, "frac_reward_zero_std": 0.0, "grad_norm": 0.36475780606269836, "kl": 4.40625, "learning_rate": 5e-05, "loss": 0.0626, "num_tokens": 17021587.0, "reward": 9.0693359375, "reward_std": 0.31939151883125305, "rewards/helpfulness_reward/mean": 1.4617338180541992, "rewards/helpfulness_reward/std": 0.6287851929664612, "rewards/safety_reward/mean": 9.0693359375, "rewards/safety_reward/std": 1.2097095251083374, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.234375, "completions/mean_terminated_length": 54.234375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.46773207580124004, "frac_reward_zero_std": 0.0, "grad_norm": 1.2950780391693115, "kl": 4.5859375, "learning_rate": 5e-05, "loss": 0.0486, "num_tokens": 17034057.0, "reward": 8.879638671875, "reward_std": 0.22279447317123413, "rewards/helpfulness_reward/mean": 1.2311182022094727, "rewards/helpfulness_reward/std": 0.6747846007347107, "rewards/safety_reward/mean": 8.879638671875, "rewards/safety_reward/std": 1.1159418821334839, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.640625, "completions/mean_terminated_length": 54.640625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.468081390271592, "frac_reward_zero_std": 0.125, "grad_norm": 0.37687596678733826, "kl": 3.994140625, "learning_rate": 5e-05, "loss": 0.0531, "num_tokens": 17046211.0, "reward": 8.703369140625, "reward_std": 0.20318417251110077, "rewards/helpfulness_reward/mean": 1.1182241439819336, "rewards/helpfulness_reward/std": 0.6573651432991028, "rewards/safety_reward/mean": 8.703369140625, "rewards/safety_reward/std": 1.0129269361495972, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 54.8046875, "completions/mean_terminated_length": 54.8046875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.4684307047419439, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3982393443584442, "kl": 4.21484375, "learning_rate": 5e-05, "loss": 0.0539, "num_tokens": 17057562.0, "reward": 9.1878662109375, "reward_std": 0.3763169050216675, "rewards/helpfulness_reward/mean": 1.0830917358398438, "rewards/helpfulness_reward/std": 0.8692050576210022, "rewards/safety_reward/mean": 9.1878662109375, "rewards/safety_reward/std": 1.2717419862747192, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 54.6796875, "completions/mean_terminated_length": 54.6796875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.4687800192122959, "frac_reward_zero_std": 0.0, "grad_norm": 0.3942863941192627, "kl": 4.28515625, "learning_rate": 5e-05, "loss": 0.0605, "num_tokens": 17068505.0, "reward": 9.51953125, "reward_std": 0.2870839238166809, "rewards/helpfulness_reward/mean": 1.5618820190429688, "rewards/helpfulness_reward/std": 0.704538881778717, "rewards/safety_reward/mean": 9.51953125, "rewards/safety_reward/std": 0.8192106485366821, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 54.96875, "completions/mean_terminated_length": 54.96875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.4691293336826478, "frac_reward_zero_std": 0.0625, "grad_norm": 0.45271801948547363, "kl": 4.546875, "learning_rate": 5e-05, "loss": 0.052, "num_tokens": 17080645.0, "reward": 9.1719970703125, "reward_std": 0.44698649644851685, "rewards/helpfulness_reward/mean": 1.259347915649414, "rewards/helpfulness_reward/std": 0.5319613218307495, "rewards/safety_reward/mean": 9.1719970703125, "rewards/safety_reward/std": 1.519384503364563, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 64.8125, "completions/mean_terminated_length": 64.8125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.4694786481529997, "frac_reward_zero_std": 0.0, "grad_norm": 0.5598967671394348, "kl": 4.0546875, "learning_rate": 5e-05, "loss": 0.064, "num_tokens": 17095301.0, "reward": 8.339508056640625, "reward_std": 0.46405351161956787, "rewards/helpfulness_reward/mean": 1.3448505401611328, "rewards/helpfulness_reward/std": 0.7224341034889221, "rewards/safety_reward/mean": 8.339508056640625, "rewards/safety_reward/std": 2.2395691871643066, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 54.3515625, "completions/mean_terminated_length": 54.3515625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.4698279626233517, "frac_reward_zero_std": 0.0, "grad_norm": 0.4330935776233673, "kl": 3.978515625, "learning_rate": 5e-05, "loss": 0.0341, "num_tokens": 17106786.0, "reward": 9.08892822265625, "reward_std": 0.43303182721138, "rewards/helpfulness_reward/mean": 1.2829132080078125, "rewards/helpfulness_reward/std": 0.513526201248169, "rewards/safety_reward/mean": 9.08892822265625, "rewards/safety_reward/std": 1.0325206518173218, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 55.1171875, "completions/mean_terminated_length": 55.1171875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.4701772770937036, "frac_reward_zero_std": 0.0, "grad_norm": 0.4552983045578003, "kl": 4.08984375, "learning_rate": 5e-05, "loss": 0.0585, "num_tokens": 17117825.0, "reward": 9.115234375, "reward_std": 0.35944846272468567, "rewards/helpfulness_reward/mean": 0.7937417030334473, "rewards/helpfulness_reward/std": 0.48015981912612915, "rewards/safety_reward/mean": 9.115234375, "rewards/safety_reward/std": 0.8816339373588562, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.8984375, "completions/mean_terminated_length": 54.8984375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.4705265915640555, "frac_reward_zero_std": 0.0, "grad_norm": 0.40231022238731384, "kl": 4.333984375, "learning_rate": 5e-05, "loss": 0.0435, "num_tokens": 17128788.0, "reward": 9.24462890625, "reward_std": 0.30709144473075867, "rewards/helpfulness_reward/mean": 1.0068111419677734, "rewards/helpfulness_reward/std": 0.792567253112793, "rewards/safety_reward/mean": 9.24462890625, "rewards/safety_reward/std": 0.9177975058555603, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 53.9453125, "completions/mean_terminated_length": 53.9453125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.4708759060344075, "frac_reward_zero_std": 0.0, "grad_norm": 0.38508448004722595, "kl": 4.3984375, "learning_rate": 5e-05, "loss": 0.0326, "num_tokens": 17139261.0, "reward": 8.884521484375, "reward_std": 0.4532668888568878, "rewards/helpfulness_reward/mean": 1.1531810760498047, "rewards/helpfulness_reward/std": 0.8456488251686096, "rewards/safety_reward/mean": 8.884521484375, "rewards/safety_reward/std": 1.1094212532043457, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 54.4140625, "completions/mean_terminated_length": 54.4140625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.4712252205047594, "frac_reward_zero_std": 0.0, "grad_norm": 0.3791181743144989, "kl": 4.28125, "learning_rate": 5e-05, "loss": 0.0412, "num_tokens": 17150002.0, "reward": 9.212890625, "reward_std": 0.29426366090774536, "rewards/helpfulness_reward/mean": 1.6198463439941406, "rewards/helpfulness_reward/std": 0.8597764372825623, "rewards/safety_reward/mean": 9.212890625, "rewards/safety_reward/std": 0.9777902960777283, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 55.2109375, "completions/mean_terminated_length": 55.2109375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.47157453497511137, "frac_reward_zero_std": 0.0, "grad_norm": 1.4526753425598145, "kl": 5.1171875, "learning_rate": 5e-05, "loss": 0.0619, "num_tokens": 17162373.0, "reward": 8.885009765625, "reward_std": 0.23231445252895355, "rewards/helpfulness_reward/mean": 1.0117859840393066, "rewards/helpfulness_reward/std": 0.7708406448364258, "rewards/safety_reward/mean": 8.885009765625, "rewards/safety_reward/std": 1.6935447454452515, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 54.8359375, "completions/mean_terminated_length": 54.8359375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.4719238494454633, "frac_reward_zero_std": 0.0, "grad_norm": 0.4079577922821045, "kl": 4.01171875, "learning_rate": 5e-05, "loss": 0.0458, "num_tokens": 17173368.0, "reward": 9.021484375, "reward_std": 0.3766445815563202, "rewards/helpfulness_reward/mean": 0.8879852294921875, "rewards/helpfulness_reward/std": 0.5990394949913025, "rewards/safety_reward/mean": 9.021484375, "rewards/safety_reward/std": 1.1106764078140259, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.5859375, "completions/mean_terminated_length": 54.5859375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.4722731639158152, "frac_reward_zero_std": 0.0, "grad_norm": 0.41163909435272217, "kl": 4.048828125, "learning_rate": 5e-05, "loss": 0.0487, "num_tokens": 17185027.0, "reward": 8.775146484375, "reward_std": 0.5491183400154114, "rewards/helpfulness_reward/mean": 1.1958951950073242, "rewards/helpfulness_reward/std": 0.5908061265945435, "rewards/safety_reward/mean": 8.775146484375, "rewards/safety_reward/std": 1.1886523962020874, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 54.53125, "completions/mean_terminated_length": 54.53125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.47262247838616717, "frac_reward_zero_std": 0.0, "grad_norm": 0.4311966896057129, "kl": 4.306640625, "learning_rate": 5e-05, "loss": 0.0469, "num_tokens": 17196439.0, "reward": 8.5848388671875, "reward_std": 0.41961610317230225, "rewards/helpfulness_reward/mean": 1.3632164001464844, "rewards/helpfulness_reward/std": 0.9328058958053589, "rewards/safety_reward/mean": 8.5848388671875, "rewards/safety_reward/std": 1.6867955923080444, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.3046875, "completions/mean_terminated_length": 54.3046875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.4729717928565191, "frac_reward_zero_std": 0.0, "grad_norm": 0.41639524698257446, "kl": 3.919921875, "learning_rate": 5e-05, "loss": 0.0382, "num_tokens": 17207326.0, "reward": 8.438720703125, "reward_std": 0.38329797983169556, "rewards/helpfulness_reward/mean": 0.9707612991333008, "rewards/helpfulness_reward/std": 0.7167741656303406, "rewards/safety_reward/mean": 8.438720703125, "rewards/safety_reward/std": 1.3087592124938965, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 55.0703125, "completions/mean_terminated_length": 55.0703125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.473321107326871, "frac_reward_zero_std": 0.0, "grad_norm": 0.36343836784362793, "kl": 3.939453125, "learning_rate": 5e-05, "loss": 0.0628, "num_tokens": 17218543.0, "reward": 8.850830078125, "reward_std": 0.39210015535354614, "rewards/helpfulness_reward/mean": 1.3209457397460938, "rewards/helpfulness_reward/std": 0.7834843397140503, "rewards/safety_reward/mean": 8.850830078125, "rewards/safety_reward/std": 0.9521187543869019, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.46875, "completions/mean_terminated_length": 54.46875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.47367042179722296, "frac_reward_zero_std": 0.0, "grad_norm": 0.39443251490592957, "kl": 4.013671875, "learning_rate": 5e-05, "loss": 0.0383, "num_tokens": 17229307.0, "reward": 8.816162109375, "reward_std": 0.3249676823616028, "rewards/helpfulness_reward/mean": 1.1588914394378662, "rewards/helpfulness_reward/std": 0.9158827662467957, "rewards/safety_reward/mean": 8.816162109375, "rewards/safety_reward/std": 1.1329594850540161, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 55.359375, "completions/mean_terminated_length": 55.359375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.4740197362675749, "frac_reward_zero_std": 0.0, "grad_norm": 0.40614718198776245, "kl": 4.09765625, "learning_rate": 5e-05, "loss": 0.0597, "num_tokens": 17240129.0, "reward": 8.741943359375, "reward_std": 0.526810884475708, "rewards/helpfulness_reward/mean": 1.1156082153320312, "rewards/helpfulness_reward/std": 0.643389105796814, "rewards/safety_reward/mean": 8.741943359375, "rewards/safety_reward/std": 1.1387450695037842, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 54.984375, "completions/mean_terminated_length": 54.984375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.47436905073792685, "frac_reward_zero_std": 0.0, "grad_norm": 0.4647179841995239, "kl": 4.244140625, "learning_rate": 5e-05, "loss": 0.0611, "num_tokens": 17253519.0, "reward": 8.5439453125, "reward_std": 0.41512882709503174, "rewards/helpfulness_reward/mean": 1.1589164733886719, "rewards/helpfulness_reward/std": 0.8430868983268738, "rewards/safety_reward/mean": 8.5439453125, "rewards/safety_reward/std": 1.310327172279358, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.765625, "completions/mean_terminated_length": 54.765625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.47471836520827876, "frac_reward_zero_std": 0.0, "grad_norm": 0.4535658657550812, "kl": 4.169921875, "learning_rate": 5e-05, "loss": 0.0497, "num_tokens": 17266209.0, "reward": 8.15032958984375, "reward_std": 0.34472566843032837, "rewards/helpfulness_reward/mean": 1.3596735000610352, "rewards/helpfulness_reward/std": 0.8082863688468933, "rewards/safety_reward/mean": 8.15032958984375, "rewards/safety_reward/std": 2.120598793029785, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.546875, "completions/mean_terminated_length": 54.546875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.47506767967863067, "frac_reward_zero_std": 0.0, "grad_norm": 0.4326079785823822, "kl": 4.29296875, "learning_rate": 5e-05, "loss": 0.0514, "num_tokens": 17278399.0, "reward": 8.9296875, "reward_std": 0.2163209617137909, "rewards/helpfulness_reward/mean": 1.4308032989501953, "rewards/helpfulness_reward/std": 0.6762406229972839, "rewards/safety_reward/mean": 8.9296875, "rewards/safety_reward/std": 0.9814971089363098, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 93.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 54.875, "completions/mean_terminated_length": 54.875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.47541699414898264, "frac_reward_zero_std": 0.0, "grad_norm": 0.3985840678215027, "kl": 4.0625, "learning_rate": 5e-05, "loss": 0.0596, "num_tokens": 17290007.0, "reward": 8.911865234375, "reward_std": 0.45481377840042114, "rewards/helpfulness_reward/mean": 1.361480712890625, "rewards/helpfulness_reward/std": 0.6208617687225342, "rewards/safety_reward/mean": 8.911865234375, "rewards/safety_reward/std": 1.1193772554397583, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 54.3828125, "completions/mean_terminated_length": 54.3828125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.47576630861933455, "frac_reward_zero_std": 0.0, "grad_norm": 0.29100972414016724, "kl": 4.150390625, "learning_rate": 5e-05, "loss": 0.0478, "num_tokens": 17300880.0, "reward": 8.988525390625, "reward_std": 0.23450331389904022, "rewards/helpfulness_reward/mean": 1.1154489517211914, "rewards/helpfulness_reward/std": 0.7432476878166199, "rewards/safety_reward/mean": 8.988525390625, "rewards/safety_reward/std": 0.9083803296089172, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 54.6171875, "completions/mean_terminated_length": 54.6171875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.47611562308968647, "frac_reward_zero_std": 0.0, "grad_norm": 0.4095039665699005, "kl": 4.3984375, "learning_rate": 5e-05, "loss": 0.0577, "num_tokens": 17311447.0, "reward": 8.68603515625, "reward_std": 0.2598300576210022, "rewards/helpfulness_reward/mean": 1.0781707763671875, "rewards/helpfulness_reward/std": 0.8807232975959778, "rewards/safety_reward/mean": 8.68603515625, "rewards/safety_reward/std": 1.53826105594635, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 55.015625, "completions/mean_terminated_length": 55.015625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.47646493756003844, "frac_reward_zero_std": 0.0, "grad_norm": 0.40164539217948914, "kl": 3.833984375, "learning_rate": 5e-05, "loss": 0.0587, "num_tokens": 17322881.0, "reward": 8.34375, "reward_std": 0.37903064489364624, "rewards/helpfulness_reward/mean": 1.0768470764160156, "rewards/helpfulness_reward/std": 0.8828683495521545, "rewards/safety_reward/mean": 8.34375, "rewards/safety_reward/std": 1.7202179431915283, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.765625, "completions/mean_terminated_length": 54.765625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.47681425203039035, "frac_reward_zero_std": 0.0, "grad_norm": 0.45103615522384644, "kl": 4.107421875, "learning_rate": 5e-05, "loss": 0.0514, "num_tokens": 17334179.0, "reward": 8.737060546875, "reward_std": 0.3279542326927185, "rewards/helpfulness_reward/mean": 1.32830810546875, "rewards/helpfulness_reward/std": 0.7761324048042297, "rewards/safety_reward/mean": 8.737060546875, "rewards/safety_reward/std": 1.1430872678756714, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.78125, "completions/mean_terminated_length": 54.78125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.4771635665007423, "frac_reward_zero_std": 0.0, "grad_norm": 0.3091281056404114, "kl": 4.111328125, "learning_rate": 5e-05, "loss": 0.0449, "num_tokens": 17348567.0, "reward": 8.97509765625, "reward_std": 0.2764981985092163, "rewards/helpfulness_reward/mean": 1.1247749328613281, "rewards/helpfulness_reward/std": 0.7198681235313416, "rewards/safety_reward/mean": 8.97509765625, "rewards/safety_reward/std": 1.3668015003204346, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.5546875, "completions/mean_terminated_length": 54.5546875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.47751288097109423, "frac_reward_zero_std": 0.0, "grad_norm": 0.4247170686721802, "kl": 4.111328125, "learning_rate": 5e-05, "loss": 0.0519, "num_tokens": 17360198.0, "reward": 8.779296875, "reward_std": 0.2915149927139282, "rewards/helpfulness_reward/mean": 1.0898451805114746, "rewards/helpfulness_reward/std": 0.7639932036399841, "rewards/safety_reward/mean": 8.779296875, "rewards/safety_reward/std": 1.2533833980560303, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.5859375, "completions/mean_terminated_length": 54.5859375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.47786219544144615, "frac_reward_zero_std": 0.0, "grad_norm": 0.3726419508457184, "kl": 4.08984375, "learning_rate": 5e-05, "loss": 0.0498, "num_tokens": 17371649.0, "reward": 8.749755859375, "reward_std": 0.26757824420928955, "rewards/helpfulness_reward/mean": 1.349029541015625, "rewards/helpfulness_reward/std": 0.714814305305481, "rewards/safety_reward/mean": 8.749755859375, "rewards/safety_reward/std": 1.016439437866211, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 54.71875, "completions/mean_terminated_length": 54.71875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.4782115099117981, "frac_reward_zero_std": 0.0, "grad_norm": 0.41406378149986267, "kl": 4.15234375, "learning_rate": 5e-05, "loss": 0.0465, "num_tokens": 17382885.0, "reward": 8.969970703125, "reward_std": 0.1816606968641281, "rewards/helpfulness_reward/mean": 1.4007339477539062, "rewards/helpfulness_reward/std": 0.7364588975906372, "rewards/safety_reward/mean": 8.969970703125, "rewards/safety_reward/std": 1.0107793807983398, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 54.59375, "completions/mean_terminated_length": 54.59375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.47856082438215003, "frac_reward_zero_std": 0.0625, "grad_norm": 0.36788690090179443, "kl": 4.17578125, "learning_rate": 5e-05, "loss": 0.0439, "num_tokens": 17394369.0, "reward": 9.03759765625, "reward_std": 0.21401506662368774, "rewards/helpfulness_reward/mean": 1.2891178131103516, "rewards/helpfulness_reward/std": 0.82792067527771, "rewards/safety_reward/mean": 9.03759765625, "rewards/safety_reward/std": 1.1114212274551392, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1627.0, "completions/mean_length": 90.3046875, "completions/mean_terminated_length": 76.90550994873047, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.47891013885250194, "frac_reward_zero_std": 0.0, "grad_norm": 0.6058803200721741, "kl": 3.783203125, "learning_rate": 5e-05, "loss": 0.1775, "num_tokens": 17413416.0, "reward": 8.379318237304688, "reward_std": 0.4761214852333069, "rewards/helpfulness_reward/mean": 1.1467552185058594, "rewards/helpfulness_reward/std": 0.8679392337799072, "rewards/safety_reward/mean": 8.379318237304688, "rewards/safety_reward/std": 2.1100194454193115, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 55.0234375, "completions/mean_terminated_length": 55.0234375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.4792594533228539, "frac_reward_zero_std": 0.0, "grad_norm": 0.4993738532066345, "kl": 4.353515625, "learning_rate": 5e-05, "loss": 0.058, "num_tokens": 17425131.0, "reward": 9.140625, "reward_std": 0.26194894313812256, "rewards/helpfulness_reward/mean": 1.5191307067871094, "rewards/helpfulness_reward/std": 0.7245408892631531, "rewards/safety_reward/mean": 9.140625, "rewards/safety_reward/std": 1.1239606142044067, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.4796087677932058, "frac_reward_zero_std": 0.0, "grad_norm": 0.42552125453948975, "kl": 4.21875, "learning_rate": 5e-05, "loss": 0.0538, "num_tokens": 17436547.0, "reward": 9.023681640625, "reward_std": 0.27016109228134155, "rewards/helpfulness_reward/mean": 1.2950477600097656, "rewards/helpfulness_reward/std": 0.6543920040130615, "rewards/safety_reward/mean": 9.023681640625, "rewards/safety_reward/std": 1.1454474925994873, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 54.421875, "completions/mean_terminated_length": 54.421875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.4799580822635578, "frac_reward_zero_std": 0.0, "grad_norm": 0.4406724274158478, "kl": 4.236328125, "learning_rate": 5e-05, "loss": 0.0469, "num_tokens": 17447265.0, "reward": 8.851318359375, "reward_std": 0.3824120759963989, "rewards/helpfulness_reward/mean": 0.8639907836914062, "rewards/helpfulness_reward/std": 0.7787062525749207, "rewards/safety_reward/mean": 8.851318359375, "rewards/safety_reward/std": 1.4923151731491089, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.296875, "completions/mean_terminated_length": 54.296875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.4803073967339097, "frac_reward_zero_std": 0.0, "grad_norm": 0.38220492005348206, "kl": 3.990234375, "learning_rate": 5e-05, "loss": 0.0405, "num_tokens": 17457879.0, "reward": 9.53515625, "reward_std": 0.18736150860786438, "rewards/helpfulness_reward/mean": 1.51861572265625, "rewards/helpfulness_reward/std": 0.7654559016227722, "rewards/safety_reward/mean": 9.53515625, "rewards/safety_reward/std": 1.0577199459075928, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.734375, "completions/mean_terminated_length": 54.734375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.4806567112042616, "frac_reward_zero_std": 0.0, "grad_norm": 0.6126121878623962, "kl": 4.509765625, "learning_rate": 5e-05, "loss": 0.0427, "num_tokens": 17469093.0, "reward": 8.9609375, "reward_std": 0.2887658476829529, "rewards/helpfulness_reward/mean": 1.208343505859375, "rewards/helpfulness_reward/std": 1.0715183019638062, "rewards/safety_reward/mean": 8.9609375, "rewards/safety_reward/std": 1.6174728870391846, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 54.515625, "completions/mean_terminated_length": 54.515625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.4810060256746136, "frac_reward_zero_std": 0.0625, "grad_norm": 0.36332398653030396, "kl": 4.232421875, "learning_rate": 5e-05, "loss": 0.0461, "num_tokens": 17480095.0, "reward": 9.072998046875, "reward_std": 0.2118762731552124, "rewards/helpfulness_reward/mean": 1.278554081916809, "rewards/helpfulness_reward/std": 0.7792560458183289, "rewards/safety_reward/mean": 9.072998046875, "rewards/safety_reward/std": 1.532207727432251, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.53125, "completions/mean_terminated_length": 54.53125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.4813553401449655, "frac_reward_zero_std": 0.0, "grad_norm": 0.39973098039627075, "kl": 4.142578125, "learning_rate": 5e-05, "loss": 0.0468, "num_tokens": 17491235.0, "reward": 9.111083984375, "reward_std": 0.1935443878173828, "rewards/helpfulness_reward/mean": 1.2510662078857422, "rewards/helpfulness_reward/std": 0.6877946853637695, "rewards/safety_reward/mean": 9.111083984375, "rewards/safety_reward/std": 1.1807256937026978, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 54.6796875, "completions/mean_terminated_length": 54.6796875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.4817046546153174, "frac_reward_zero_std": 0.0, "grad_norm": 0.41890814900398254, "kl": 4.109375, "learning_rate": 5e-05, "loss": 0.0454, "num_tokens": 17502258.0, "reward": 9.296630859375, "reward_std": 0.12584739923477173, "rewards/helpfulness_reward/mean": 1.2964706420898438, "rewards/helpfulness_reward/std": 0.7277919054031372, "rewards/safety_reward/mean": 9.296630859375, "rewards/safety_reward/std": 1.1844396591186523, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.609375, "completions/mean_terminated_length": 54.609375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.4820539690856694, "frac_reward_zero_std": 0.0, "grad_norm": 0.3847395181655884, "kl": 4.09765625, "learning_rate": 5e-05, "loss": 0.0422, "num_tokens": 17512880.0, "reward": 9.269287109375, "reward_std": 0.21049824357032776, "rewards/helpfulness_reward/mean": 1.307952880859375, "rewards/helpfulness_reward/std": 0.5132157802581787, "rewards/safety_reward/mean": 9.269287109375, "rewards/safety_reward/std": 0.9498620629310608, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.734375, "completions/mean_terminated_length": 54.734375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.4824032835560213, "frac_reward_zero_std": 0.0, "grad_norm": 0.3507576584815979, "kl": 4.1953125, "learning_rate": 5e-05, "loss": 0.0398, "num_tokens": 17524174.0, "reward": 8.958740234375, "reward_std": 0.21813681721687317, "rewards/helpfulness_reward/mean": 1.3423454761505127, "rewards/helpfulness_reward/std": 0.7631791234016418, "rewards/safety_reward/mean": 8.958740234375, "rewards/safety_reward/std": 0.7370094656944275, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.828125, "completions/mean_terminated_length": 54.828125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.48275259802637327, "frac_reward_zero_std": 0.0, "grad_norm": 0.43023115396499634, "kl": 4.302734375, "learning_rate": 5e-05, "loss": 0.0554, "num_tokens": 17535944.0, "reward": 9.01171875, "reward_std": 0.22499041259288788, "rewards/helpfulness_reward/mean": 1.4937124252319336, "rewards/helpfulness_reward/std": 0.5241170525550842, "rewards/safety_reward/mean": 9.01171875, "rewards/safety_reward/std": 1.064762830734253, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.609375, "completions/mean_terminated_length": 54.609375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.4831019124967252, "frac_reward_zero_std": 0.0, "grad_norm": 0.36446866393089294, "kl": 4.169921875, "learning_rate": 5e-05, "loss": 0.0559, "num_tokens": 17546918.0, "reward": 8.701416015625, "reward_std": 0.27490055561065674, "rewards/helpfulness_reward/mean": 1.4360427856445312, "rewards/helpfulness_reward/std": 0.9932846426963806, "rewards/safety_reward/mean": 8.701416015625, "rewards/safety_reward/std": 1.3716785907745361, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 62.0078125, "completions/mean_terminated_length": 62.0078125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.4834512269670771, "frac_reward_zero_std": 0.0, "grad_norm": 0.4542270600795746, "kl": 3.984375, "learning_rate": 5e-05, "loss": 0.08, "num_tokens": 17561103.0, "reward": 8.70001220703125, "reward_std": 0.32531240582466125, "rewards/helpfulness_reward/mean": 1.4896583557128906, "rewards/helpfulness_reward/std": 0.8202377557754517, "rewards/safety_reward/mean": 8.70001220703125, "rewards/safety_reward/std": 1.9769026041030884, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 77.9296875, "completions/mean_terminated_length": 77.9296875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.48380054143742907, "frac_reward_zero_std": 0.0, "grad_norm": 0.3529530465602875, "kl": 3.37890625, "learning_rate": 5e-05, "loss": 0.016, "num_tokens": 17576654.0, "reward": 8.536422729492188, "reward_std": 0.18756262958049774, "rewards/helpfulness_reward/mean": 1.5012903213500977, "rewards/helpfulness_reward/std": 0.7792834639549255, "rewards/safety_reward/mean": 8.536422729492188, "rewards/safety_reward/std": 2.728729248046875, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.6953125, "completions/mean_terminated_length": 54.6953125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.484149855907781, "frac_reward_zero_std": 0.0, "grad_norm": 0.40742334723472595, "kl": 4.1796875, "learning_rate": 5e-05, "loss": 0.0528, "num_tokens": 17589263.0, "reward": 9.115966796875, "reward_std": 0.2702268958091736, "rewards/helpfulness_reward/mean": 1.3524351119995117, "rewards/helpfulness_reward/std": 0.7225268483161926, "rewards/safety_reward/mean": 9.115966796875, "rewards/safety_reward/std": 1.0701161623001099, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 55.0703125, "completions/mean_terminated_length": 55.0703125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.4844991703781329, "frac_reward_zero_std": 0.0, "grad_norm": 0.3606836795806885, "kl": 4.2421875, "learning_rate": 5e-05, "loss": 0.0494, "num_tokens": 17601856.0, "reward": 8.879150390625, "reward_std": 0.17908397316932678, "rewards/helpfulness_reward/mean": 1.3225555419921875, "rewards/helpfulness_reward/std": 0.6631689667701721, "rewards/safety_reward/mean": 8.879150390625, "rewards/safety_reward/std": 1.2811832427978516, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 54.953125, "completions/mean_terminated_length": 54.953125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.48484848484848486, "frac_reward_zero_std": 0.0, "grad_norm": 0.42082494497299194, "kl": 4.20703125, "learning_rate": 5e-05, "loss": 0.0622, "num_tokens": 17615378.0, "reward": 8.800537109375, "reward_std": 0.3573538064956665, "rewards/helpfulness_reward/mean": 1.4837608337402344, "rewards/helpfulness_reward/std": 0.846476137638092, "rewards/safety_reward/mean": 8.800537109375, "rewards/safety_reward/std": 1.9902650117874146, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.828125, "completions/mean_terminated_length": 54.828125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.4851977993188368, "frac_reward_zero_std": 0.0, "grad_norm": 0.35536834597587585, "kl": 4.46875, "learning_rate": 5e-05, "loss": 0.0508, "num_tokens": 17626636.0, "reward": 9.1383056640625, "reward_std": 0.2916598916053772, "rewards/helpfulness_reward/mean": 1.33575439453125, "rewards/helpfulness_reward/std": 0.5721263885498047, "rewards/safety_reward/mean": 9.1383056640625, "rewards/safety_reward/std": 1.214433193206787, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 66.96875, "completions/mean_terminated_length": 66.96875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.48554711378918874, "frac_reward_zero_std": 0.125, "grad_norm": 0.435080885887146, "kl": 3.892578125, "learning_rate": 5e-05, "loss": 0.1302, "num_tokens": 17639456.0, "reward": 8.798095703125, "reward_std": 0.3480031490325928, "rewards/helpfulness_reward/mean": 1.443328857421875, "rewards/helpfulness_reward/std": 0.6089722514152527, "rewards/safety_reward/mean": 8.798095703125, "rewards/safety_reward/std": 1.7608826160430908, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.953125, "completions/mean_terminated_length": 54.953125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.48589642825954066, "frac_reward_zero_std": 0.0, "grad_norm": 0.3892560601234436, "kl": 4.25, "learning_rate": 5e-05, "loss": 0.0674, "num_tokens": 17650786.0, "reward": 9.029296875, "reward_std": 0.3935719132423401, "rewards/helpfulness_reward/mean": 1.2739028930664062, "rewards/helpfulness_reward/std": 0.7086517810821533, "rewards/safety_reward/mean": 9.029296875, "rewards/safety_reward/std": 1.0758821964263916, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.48624574272989257, "frac_reward_zero_std": 0.0, "grad_norm": 0.39290547370910645, "kl": 4.42578125, "learning_rate": 5e-05, "loss": 0.0574, "num_tokens": 17663402.0, "reward": 8.47021484375, "reward_std": 0.42629900574684143, "rewards/helpfulness_reward/mean": 0.9102640151977539, "rewards/helpfulness_reward/std": 0.7828944325447083, "rewards/safety_reward/mean": 8.47021484375, "rewards/safety_reward/std": 1.1370936632156372, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 54.8984375, "completions/mean_terminated_length": 54.8984375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.48659505720024454, "frac_reward_zero_std": 0.0, "grad_norm": 0.31370168924331665, "kl": 3.990234375, "learning_rate": 5e-05, "loss": 0.0617, "num_tokens": 17674421.0, "reward": 8.898681640625, "reward_std": 0.33505675196647644, "rewards/helpfulness_reward/mean": 1.4646167755126953, "rewards/helpfulness_reward/std": 0.6524845361709595, "rewards/safety_reward/mean": 8.898681640625, "rewards/safety_reward/std": 1.3102879524230957, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 54.71875, "completions/mean_terminated_length": 54.71875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.48694437167059645, "frac_reward_zero_std": 0.0, "grad_norm": 0.38383638858795166, "kl": 4.12109375, "learning_rate": 5e-05, "loss": 0.0564, "num_tokens": 17686977.0, "reward": 8.930908203125, "reward_std": 0.38332194089889526, "rewards/helpfulness_reward/mean": 1.3325400352478027, "rewards/helpfulness_reward/std": 0.6983293294906616, "rewards/safety_reward/mean": 8.930908203125, "rewards/safety_reward/std": 1.184145212173462, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.5703125, "completions/mean_terminated_length": 54.5703125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.48729368614094837, "frac_reward_zero_std": 0.0, "grad_norm": 0.34192878007888794, "kl": 4.2265625, "learning_rate": 5e-05, "loss": 0.0532, "num_tokens": 17699546.0, "reward": 8.834716796875, "reward_std": 0.2663272023200989, "rewards/helpfulness_reward/mean": 1.0988616943359375, "rewards/helpfulness_reward/std": 0.5277961492538452, "rewards/safety_reward/mean": 8.834716796875, "rewards/safety_reward/std": 1.1317815780639648, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.2109375, "completions/mean_terminated_length": 54.2109375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.48764300061130034, "frac_reward_zero_std": 0.0625, "grad_norm": 0.35912540555000305, "kl": 4.3515625, "learning_rate": 5e-05, "loss": 0.0503, "num_tokens": 17710053.0, "reward": 8.624755859375, "reward_std": 0.23771031200885773, "rewards/helpfulness_reward/mean": 1.2250404357910156, "rewards/helpfulness_reward/std": 0.7741780877113342, "rewards/safety_reward/mean": 8.624755859375, "rewards/safety_reward/std": 1.5625566244125366, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 94.0, "completions/max_terminated_length": 94.0, "completions/mean_length": 54.3671875, "completions/mean_terminated_length": 54.3671875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.48799231508165225, "frac_reward_zero_std": 0.0, "grad_norm": 0.33663210272789, "kl": 4.140625, "learning_rate": 5e-05, "loss": 0.0576, "num_tokens": 17720900.0, "reward": 8.580810546875, "reward_std": 0.3169782757759094, "rewards/helpfulness_reward/mean": 1.1147699356079102, "rewards/helpfulness_reward/std": 0.9320194125175476, "rewards/safety_reward/mean": 8.580810546875, "rewards/safety_reward/std": 1.1345051527023315, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.453125, "completions/mean_terminated_length": 54.453125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.4883416295520042, "frac_reward_zero_std": 0.0, "grad_norm": 0.3169555962085724, "kl": 4.046875, "learning_rate": 5e-05, "loss": 0.0519, "num_tokens": 17732358.0, "reward": 9.02880859375, "reward_std": 0.26538991928100586, "rewards/helpfulness_reward/mean": 1.36688232421875, "rewards/helpfulness_reward/std": 0.9517849683761597, "rewards/safety_reward/mean": 9.02880859375, "rewards/safety_reward/std": 1.1794732809066772, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.48869094402235613, "frac_reward_zero_std": 0.0625, "grad_norm": 0.346630334854126, "kl": 4.234375, "learning_rate": 5e-05, "loss": 0.0381, "num_tokens": 17745246.0, "reward": 8.757080078125, "reward_std": 0.3879374861717224, "rewards/helpfulness_reward/mean": 1.721954345703125, "rewards/helpfulness_reward/std": 0.9413244128227234, "rewards/safety_reward/mean": 8.757080078125, "rewards/safety_reward/std": 1.6704363822937012, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 54.21875, "completions/mean_terminated_length": 54.21875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.48904025849270805, "frac_reward_zero_std": 0.0, "grad_norm": 0.3463357388973236, "kl": 4.166015625, "learning_rate": 5e-05, "loss": 0.0482, "num_tokens": 17755906.0, "reward": 9.2763671875, "reward_std": 0.26394450664520264, "rewards/helpfulness_reward/mean": 1.470168113708496, "rewards/helpfulness_reward/std": 0.6912729144096375, "rewards/safety_reward/mean": 9.2763671875, "rewards/safety_reward/std": 0.8539134860038757, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.46875, "completions/mean_terminated_length": 54.46875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.48938957296306, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3563671410083771, "kl": 4.015625, "learning_rate": 5e-05, "loss": 0.0529, "num_tokens": 17767446.0, "reward": 8.803466796875, "reward_std": 0.43385761976242065, "rewards/helpfulness_reward/mean": 1.3674697875976562, "rewards/helpfulness_reward/std": 0.5595429539680481, "rewards/safety_reward/mean": 8.803466796875, "rewards/safety_reward/std": 1.2087697982788086, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 54.5625, "completions/mean_terminated_length": 54.5625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.48973888743341193, "frac_reward_zero_std": 0.0, "grad_norm": 0.4014948010444641, "kl": 4.234375, "learning_rate": 5e-05, "loss": 0.0422, "num_tokens": 17779006.0, "reward": 8.7509765625, "reward_std": 0.2675879895687103, "rewards/helpfulness_reward/mean": 1.2434067726135254, "rewards/helpfulness_reward/std": 0.8551396727561951, "rewards/safety_reward/mean": 8.7509765625, "rewards/safety_reward/std": 1.208978295326233, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.1875, "completions/mean_terminated_length": 54.1875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.49008820190376384, "frac_reward_zero_std": 0.0, "grad_norm": 0.3494725823402405, "kl": 3.953125, "learning_rate": 5e-05, "loss": 0.0523, "num_tokens": 17790070.0, "reward": 9.097412109375, "reward_std": 0.3194648027420044, "rewards/helpfulness_reward/mean": 1.3280792236328125, "rewards/helpfulness_reward/std": 0.7831404209136963, "rewards/safety_reward/mean": 9.097412109375, "rewards/safety_reward/std": 1.1436797380447388, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.53125, "completions/mean_terminated_length": 54.53125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.4904375163741158, "frac_reward_zero_std": 0.0, "grad_norm": 0.28438761830329895, "kl": 4.17578125, "learning_rate": 5e-05, "loss": 0.0454, "num_tokens": 17802258.0, "reward": 9.019775390625, "reward_std": 0.2077418863773346, "rewards/helpfulness_reward/mean": 1.33935546875, "rewards/helpfulness_reward/std": 0.8901202082633972, "rewards/safety_reward/mean": 9.019775390625, "rewards/safety_reward/std": 1.909769892692566, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 54.890625, "completions/mean_terminated_length": 54.890625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.4907868308444677, "frac_reward_zero_std": 0.0, "grad_norm": 0.4341640770435333, "kl": 4.240234375, "learning_rate": 5e-05, "loss": 0.0639, "num_tokens": 17813412.0, "reward": 8.758544921875, "reward_std": 0.27449026703834534, "rewards/helpfulness_reward/mean": 1.1120598316192627, "rewards/helpfulness_reward/std": 0.6414793729782104, "rewards/safety_reward/mean": 8.758544921875, "rewards/safety_reward/std": 1.005060076713562, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 54.5859375, "completions/mean_terminated_length": 54.5859375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.49113614531481964, "frac_reward_zero_std": 0.0, "grad_norm": 0.2619378864765167, "kl": 4.19921875, "learning_rate": 5e-05, "loss": 0.0459, "num_tokens": 17825583.0, "reward": 9.383544921875, "reward_std": 0.16911563277244568, "rewards/helpfulness_reward/mean": 1.2052125930786133, "rewards/helpfulness_reward/std": 0.7777675986289978, "rewards/safety_reward/mean": 9.383544921875, "rewards/safety_reward/std": 0.9850530624389648, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 54.109375, "completions/mean_terminated_length": 54.109375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.4914854597851716, "frac_reward_zero_std": 0.0, "grad_norm": 0.45351457595825195, "kl": 4.43359375, "learning_rate": 5e-05, "loss": 0.0532, "num_tokens": 17836285.0, "reward": 9.065185546875, "reward_std": 0.4010286331176758, "rewards/helpfulness_reward/mean": 1.5662975311279297, "rewards/helpfulness_reward/std": 0.9573284983634949, "rewards/safety_reward/mean": 9.065185546875, "rewards/safety_reward/std": 1.1891696453094482, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.9609375, "completions/mean_terminated_length": 53.9609375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4918347742555235, "frac_reward_zero_std": 0.0, "grad_norm": 0.346261590719223, "kl": 4.16796875, "learning_rate": 5e-05, "loss": 0.0424, "num_tokens": 17847568.0, "reward": 9.266357421875, "reward_std": 0.26341962814331055, "rewards/helpfulness_reward/mean": 1.4149169921875, "rewards/helpfulness_reward/std": 0.6890286803245544, "rewards/safety_reward/mean": 9.266357421875, "rewards/safety_reward/std": 1.2313318252563477, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 54.1640625, "completions/mean_terminated_length": 54.1640625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.4921840887258755, "frac_reward_zero_std": 0.0, "grad_norm": 0.34015291929244995, "kl": 4.416015625, "learning_rate": 5e-05, "loss": 0.0461, "num_tokens": 17857997.0, "reward": 8.9765625, "reward_std": 0.21411636471748352, "rewards/helpfulness_reward/mean": 1.208059310913086, "rewards/helpfulness_reward/std": 0.9429867267608643, "rewards/safety_reward/mean": 8.9765625, "rewards/safety_reward/std": 1.1488945484161377, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.359375, "completions/mean_terminated_length": 54.359375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.4925334031962274, "frac_reward_zero_std": 0.0, "grad_norm": 0.3754797875881195, "kl": 4.171875, "learning_rate": 5e-05, "loss": 0.054, "num_tokens": 17870451.0, "reward": 9.17236328125, "reward_std": 0.19384920597076416, "rewards/helpfulness_reward/mean": 1.3780527114868164, "rewards/helpfulness_reward/std": 0.8422350287437439, "rewards/safety_reward/mean": 9.17236328125, "rewards/safety_reward/std": 1.7467628717422485, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.640625, "completions/mean_terminated_length": 54.640625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.4928827176665793, "frac_reward_zero_std": 0.0, "grad_norm": 0.3292378783226013, "kl": 4.029296875, "learning_rate": 5e-05, "loss": 0.0428, "num_tokens": 17882397.0, "reward": 9.2607421875, "reward_std": 0.16098903119564056, "rewards/helpfulness_reward/mean": 1.1575336456298828, "rewards/helpfulness_reward/std": 0.7624653577804565, "rewards/safety_reward/mean": 9.2607421875, "rewards/safety_reward/std": 0.8394460082054138, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.5703125, "completions/mean_terminated_length": 54.5703125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.4932320321369313, "frac_reward_zero_std": 0.0, "grad_norm": 0.4528101086616516, "kl": 4.36328125, "learning_rate": 5e-05, "loss": 0.0525, "num_tokens": 17893126.0, "reward": 8.933837890625, "reward_std": 0.20506243407726288, "rewards/helpfulness_reward/mean": 1.2963905334472656, "rewards/helpfulness_reward/std": 0.5709028244018555, "rewards/safety_reward/mean": 8.933837890625, "rewards/safety_reward/std": 0.9607486128807068, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 54.65625, "completions/mean_terminated_length": 54.65625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.4935813466072832, "frac_reward_zero_std": 0.0, "grad_norm": 0.4533483684062958, "kl": 4.265625, "learning_rate": 5e-05, "loss": 0.0519, "num_tokens": 17904130.0, "reward": 9.12158203125, "reward_std": 0.25545260310173035, "rewards/helpfulness_reward/mean": 1.397857666015625, "rewards/helpfulness_reward/std": 0.5878279209136963, "rewards/safety_reward/mean": 9.12158203125, "rewards/safety_reward/std": 1.0419890880584717, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 54.1953125, "completions/mean_terminated_length": 54.1953125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.4939306610776351, "frac_reward_zero_std": 0.125, "grad_norm": 0.2800704538822174, "kl": 3.955078125, "learning_rate": 5e-05, "loss": 0.0429, "num_tokens": 17915595.0, "reward": 8.72998046875, "reward_std": 0.1933760941028595, "rewards/helpfulness_reward/mean": 1.501941204071045, "rewards/helpfulness_reward/std": 0.8868715763092041, "rewards/safety_reward/mean": 8.72998046875, "rewards/safety_reward/std": 1.4636850357055664, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.7734375, "completions/mean_terminated_length": 53.7734375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.4942799755479871, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3668806850910187, "kl": 4.181640625, "learning_rate": 5e-05, "loss": 0.0375, "num_tokens": 17926678.0, "reward": 9.00732421875, "reward_std": 0.2504435181617737, "rewards/helpfulness_reward/mean": 1.2659034729003906, "rewards/helpfulness_reward/std": 0.6840457916259766, "rewards/safety_reward/mean": 9.00732421875, "rewards/safety_reward/std": 1.0280461311340332, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 54.53125, "completions/mean_terminated_length": 54.53125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.494629290018339, "frac_reward_zero_std": 0.0, "grad_norm": 0.4833817481994629, "kl": 4.328125, "learning_rate": 5e-05, "loss": 0.0527, "num_tokens": 17938298.0, "reward": 9.156005859375, "reward_std": 0.17799265682697296, "rewards/helpfulness_reward/mean": 1.129495620727539, "rewards/helpfulness_reward/std": 0.7691544890403748, "rewards/safety_reward/mean": 9.156005859375, "rewards/safety_reward/std": 1.3103686571121216, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.49497860448869097, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7139380574226379, "kl": 4.65625, "learning_rate": 5e-05, "loss": 0.0533, "num_tokens": 17949306.0, "reward": 8.730224609375, "reward_std": 0.22867707908153534, "rewards/helpfulness_reward/mean": 1.3398017883300781, "rewards/helpfulness_reward/std": 0.7358601093292236, "rewards/safety_reward/mean": 8.730224609375, "rewards/safety_reward/std": 1.3406182527542114, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 94.0, "completions/max_terminated_length": 94.0, "completions/mean_length": 54.625, "completions/mean_terminated_length": 54.625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4953279189590429, "frac_reward_zero_std": 0.0, "grad_norm": 0.4204728305339813, "kl": 4.0859375, "learning_rate": 5e-05, "loss": 0.0563, "num_tokens": 17961426.0, "reward": 8.937255859375, "reward_std": 0.24757276475429535, "rewards/helpfulness_reward/mean": 1.4377613067626953, "rewards/helpfulness_reward/std": 0.9046352505683899, "rewards/safety_reward/mean": 8.937255859375, "rewards/safety_reward/std": 1.2998398542404175, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 54.0234375, "completions/mean_terminated_length": 54.0234375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.4956772334293948, "frac_reward_zero_std": 0.0, "grad_norm": 0.9159576892852783, "kl": 4.56640625, "learning_rate": 5e-05, "loss": 0.0518, "num_tokens": 17972333.0, "reward": 8.840576171875, "reward_std": 0.27392578125, "rewards/helpfulness_reward/mean": 1.3727283477783203, "rewards/helpfulness_reward/std": 0.6583223938941956, "rewards/safety_reward/mean": 8.840576171875, "rewards/safety_reward/std": 1.0348402261734009, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.875, "completions/mean_terminated_length": 53.875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.49602654789974676, "frac_reward_zero_std": 0.0, "grad_norm": 0.4956521689891815, "kl": 4.31640625, "learning_rate": 5e-05, "loss": 0.0467, "num_tokens": 17984253.0, "reward": 8.8193359375, "reward_std": 0.2529820501804352, "rewards/helpfulness_reward/mean": 1.2064151763916016, "rewards/helpfulness_reward/std": 0.5473856925964355, "rewards/safety_reward/mean": 8.8193359375, "rewards/safety_reward/std": 0.8442550897598267, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.4609375, "completions/mean_terminated_length": 54.4609375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.4963758623700987, "frac_reward_zero_std": 0.0, "grad_norm": 0.3249351680278778, "kl": 4.3984375, "learning_rate": 5e-05, "loss": 0.0535, "num_tokens": 17998312.0, "reward": 9.043701171875, "reward_std": 0.2937450408935547, "rewards/helpfulness_reward/mean": 1.4850425720214844, "rewards/helpfulness_reward/std": 1.014447808265686, "rewards/safety_reward/mean": 9.043701171875, "rewards/safety_reward/std": 1.2875053882598877, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 54.65625, "completions/mean_terminated_length": 54.65625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.4967251768404506, "frac_reward_zero_std": 0.0, "grad_norm": 0.39255237579345703, "kl": 4.046875, "learning_rate": 5e-05, "loss": 0.051, "num_tokens": 18009292.0, "reward": 8.60107421875, "reward_std": 0.43358802795410156, "rewards/helpfulness_reward/mean": 1.1480255126953125, "rewards/helpfulness_reward/std": 0.6516124606132507, "rewards/safety_reward/mean": 8.60107421875, "rewards/safety_reward/std": 1.1981396675109863, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 54.9296875, "completions/mean_terminated_length": 54.9296875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.49707449131080256, "frac_reward_zero_std": 0.0, "grad_norm": 0.3201315999031067, "kl": 4.158203125, "learning_rate": 5e-05, "loss": 0.0505, "num_tokens": 18025211.0, "reward": 8.096864700317383, "reward_std": 0.32573193311691284, "rewards/helpfulness_reward/mean": 1.0953788757324219, "rewards/helpfulness_reward/std": 0.9035313129425049, "rewards/safety_reward/mean": 8.096864700317383, "rewards/safety_reward/std": 2.275017261505127, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.46875, "completions/mean_terminated_length": 54.46875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.49742380578115447, "frac_reward_zero_std": 0.0, "grad_norm": 0.3939705789089203, "kl": 4.109375, "learning_rate": 5e-05, "loss": 0.0531, "num_tokens": 18037007.0, "reward": 8.94921875, "reward_std": 0.25496792793273926, "rewards/helpfulness_reward/mean": 1.0055937767028809, "rewards/helpfulness_reward/std": 0.8948380947113037, "rewards/safety_reward/mean": 8.94921875, "rewards/safety_reward/std": 1.3593474626541138, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 53.921875, "completions/mean_terminated_length": 53.921875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.49777312025150644, "frac_reward_zero_std": 0.0, "grad_norm": 0.39527642726898193, "kl": 4.11328125, "learning_rate": 5e-05, "loss": 0.0423, "num_tokens": 18048325.0, "reward": 8.697509765625, "reward_std": 0.29334449768066406, "rewards/helpfulness_reward/mean": 1.6140632629394531, "rewards/helpfulness_reward/std": 0.7077406644821167, "rewards/safety_reward/mean": 8.697509765625, "rewards/safety_reward/std": 0.9428238868713379, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 103.0, "completions/max_terminated_length": 103.0, "completions/mean_length": 56.625, "completions/mean_terminated_length": 56.625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.49812243472185835, "frac_reward_zero_std": 0.0, "grad_norm": 0.36699753999710083, "kl": 3.748046875, "learning_rate": 5e-05, "loss": 0.035, "num_tokens": 18061237.0, "reward": 8.511448860168457, "reward_std": 0.4507564306259155, "rewards/helpfulness_reward/mean": 1.726423740386963, "rewards/helpfulness_reward/std": 0.8157715797424316, "rewards/safety_reward/mean": 8.511448860168457, "rewards/safety_reward/std": 1.9201486110687256, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.328125, "completions/mean_terminated_length": 54.328125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.49847174919221027, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3997741937637329, "kl": 3.955078125, "learning_rate": 5e-05, "loss": 0.0467, "num_tokens": 18073095.0, "reward": 8.8212890625, "reward_std": 0.44704729318618774, "rewards/helpfulness_reward/mean": 1.4578704833984375, "rewards/helpfulness_reward/std": 0.6460398435592651, "rewards/safety_reward/mean": 8.8212890625, "rewards/safety_reward/std": 1.132174015045166, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 53.9609375, "completions/mean_terminated_length": 53.9609375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.49882106366256224, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3477688133716583, "kl": 3.876953125, "learning_rate": 5e-05, "loss": 0.0434, "num_tokens": 18083890.0, "reward": 8.89111328125, "reward_std": 0.1324332058429718, "rewards/helpfulness_reward/mean": 1.498457908630371, "rewards/helpfulness_reward/std": 0.7699245810508728, "rewards/safety_reward/mean": 8.89111328125, "rewards/safety_reward/std": 0.9834380149841309, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.734375, "completions/mean_terminated_length": 54.734375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.49917037813291415, "frac_reward_zero_std": 0.0, "grad_norm": 0.46710923314094543, "kl": 4.23828125, "learning_rate": 5e-05, "loss": 0.0508, "num_tokens": 18095776.0, "reward": 8.722900390625, "reward_std": 0.25926029682159424, "rewards/helpfulness_reward/mean": 1.0415458679199219, "rewards/helpfulness_reward/std": 0.6077736616134644, "rewards/safety_reward/mean": 8.722900390625, "rewards/safety_reward/std": 1.1854673624038696, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 55.1015625, "completions/mean_terminated_length": 55.1015625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.49951969260326606, "frac_reward_zero_std": 0.0, "grad_norm": 1.2992959022521973, "kl": 4.84375, "learning_rate": 5e-05, "loss": 0.0695, "num_tokens": 18107493.0, "reward": 8.78466796875, "reward_std": 0.3663187026977539, "rewards/helpfulness_reward/mean": 1.174992561340332, "rewards/helpfulness_reward/std": 1.0448776483535767, "rewards/safety_reward/mean": 8.78466796875, "rewards/safety_reward/std": 1.4779237508773804, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 54.3125, "completions/mean_terminated_length": 54.3125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.49986900707361803, "frac_reward_zero_std": 0.0, "grad_norm": 0.5867627859115601, "kl": 4.61328125, "learning_rate": 5e-05, "loss": 0.0628, "num_tokens": 18119069.0, "reward": 9.1044921875, "reward_std": 0.20306235551834106, "rewards/helpfulness_reward/mean": 1.0935077667236328, "rewards/helpfulness_reward/std": 0.7016064524650574, "rewards/safety_reward/mean": 9.1044921875, "rewards/safety_reward/std": 0.9556352496147156, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.6484375, "completions/mean_terminated_length": 54.6484375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.50021832154397, "frac_reward_zero_std": 0.0, "grad_norm": 0.3681892156600952, "kl": 4.119140625, "learning_rate": 5e-05, "loss": 0.0566, "num_tokens": 18130096.0, "reward": 9.073974609375, "reward_std": 0.2006937861442566, "rewards/helpfulness_reward/mean": 1.324507713317871, "rewards/helpfulness_reward/std": 0.7207566499710083, "rewards/safety_reward/mean": 9.073974609375, "rewards/safety_reward/std": 0.8837082982063293, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.6875, "completions/mean_terminated_length": 54.6875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.5005676360143219, "frac_reward_zero_std": 0.0, "grad_norm": 0.46636295318603516, "kl": 4.52734375, "learning_rate": 5e-05, "loss": 0.0605, "num_tokens": 18141048.0, "reward": 8.67822265625, "reward_std": 0.39779821038246155, "rewards/helpfulness_reward/mean": 1.166311264038086, "rewards/helpfulness_reward/std": 0.8829303979873657, "rewards/safety_reward/mean": 8.67822265625, "rewards/safety_reward/std": 1.2935293912887573, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.109375, "completions/mean_terminated_length": 54.109375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.5009169504846738, "frac_reward_zero_std": 0.0, "grad_norm": 1.3983800411224365, "kl": 4.630859375, "learning_rate": 5e-05, "loss": 0.0394, "num_tokens": 18152118.0, "reward": 9.043212890625, "reward_std": 0.26005232334136963, "rewards/helpfulness_reward/mean": 1.4248838424682617, "rewards/helpfulness_reward/std": 0.8711416721343994, "rewards/safety_reward/mean": 9.043212890625, "rewards/safety_reward/std": 1.0261117219924927, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 54.9375, "completions/mean_terminated_length": 54.9375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.5012662649550258, "frac_reward_zero_std": 0.0, "grad_norm": 0.45453956723213196, "kl": 4.2890625, "learning_rate": 5e-05, "loss": 0.0557, "num_tokens": 18164566.0, "reward": 8.7138671875, "reward_std": 0.3498545289039612, "rewards/helpfulness_reward/mean": 1.0488089323043823, "rewards/helpfulness_reward/std": 0.9593129754066467, "rewards/safety_reward/mean": 8.7138671875, "rewards/safety_reward/std": 1.1680338382720947, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 54.484375, "completions/mean_terminated_length": 54.484375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.5016155794253777, "frac_reward_zero_std": 0.0, "grad_norm": 0.8243534564971924, "kl": 4.4296875, "learning_rate": 5e-05, "loss": 0.0476, "num_tokens": 18176452.0, "reward": 8.772216796875, "reward_std": 0.3094446361064911, "rewards/helpfulness_reward/mean": 1.2730276584625244, "rewards/helpfulness_reward/std": 0.8939438462257385, "rewards/safety_reward/mean": 8.772216796875, "rewards/safety_reward/std": 1.6104892492294312, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 54.78125, "completions/mean_terminated_length": 54.78125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.5019648938957296, "frac_reward_zero_std": 0.0, "grad_norm": 0.40960368514060974, "kl": 4.201171875, "learning_rate": 5e-05, "loss": 0.0582, "num_tokens": 18187896.0, "reward": 8.904296875, "reward_std": 0.2844525873661041, "rewards/helpfulness_reward/mean": 1.0458836555480957, "rewards/helpfulness_reward/std": 0.8603299260139465, "rewards/safety_reward/mean": 8.904296875, "rewards/safety_reward/std": 1.4618452787399292, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.4140625, "completions/mean_terminated_length": 54.4140625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.5023142083660815, "frac_reward_zero_std": 0.0, "grad_norm": 0.751596987247467, "kl": 4.65625, "learning_rate": 5e-05, "loss": 0.0496, "num_tokens": 18198853.0, "reward": 8.525146484375, "reward_std": 0.41958117485046387, "rewards/helpfulness_reward/mean": 1.0848565101623535, "rewards/helpfulness_reward/std": 0.8270451426506042, "rewards/safety_reward/mean": 8.525146484375, "rewards/safety_reward/std": 0.9957622289657593, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 55.1875, "completions/mean_terminated_length": 55.1875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.5026635228364335, "frac_reward_zero_std": 0.0, "grad_norm": 0.5009254217147827, "kl": 4.453125, "learning_rate": 5e-05, "loss": 0.0527, "num_tokens": 18210381.0, "reward": 8.777587890625, "reward_std": 0.3531763553619385, "rewards/helpfulness_reward/mean": 1.0134825706481934, "rewards/helpfulness_reward/std": 0.6116880178451538, "rewards/safety_reward/mean": 8.777587890625, "rewards/safety_reward/std": 1.0661025047302246, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.6328125, "completions/mean_terminated_length": 54.6328125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.5030128373067855, "frac_reward_zero_std": 0.0, "grad_norm": 0.4376145601272583, "kl": 3.91015625, "learning_rate": 5e-05, "loss": 0.0471, "num_tokens": 18220734.0, "reward": 9.14794921875, "reward_std": 0.394792377948761, "rewards/helpfulness_reward/mean": 1.4316749572753906, "rewards/helpfulness_reward/std": 0.7536389231681824, "rewards/safety_reward/mean": 9.14794921875, "rewards/safety_reward/std": 1.070678949356079, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 100.0, "completions/max_terminated_length": 100.0, "completions/mean_length": 55.3046875, "completions/mean_terminated_length": 55.3046875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.5033621517771374, "frac_reward_zero_std": 0.0, "grad_norm": 0.4274523854255676, "kl": 4.1484375, "learning_rate": 5e-05, "loss": 0.0531, "num_tokens": 18234197.0, "reward": 8.65802001953125, "reward_std": 0.5063565969467163, "rewards/helpfulness_reward/mean": 1.1251001358032227, "rewards/helpfulness_reward/std": 0.8286014795303345, "rewards/safety_reward/mean": 8.65802001953125, "rewards/safety_reward/std": 1.9349887371063232, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 55.609375, "completions/mean_terminated_length": 55.609375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.5037114662474893, "frac_reward_zero_std": 0.0, "grad_norm": 0.4694124162197113, "kl": 4.3125, "learning_rate": 5e-05, "loss": 0.0702, "num_tokens": 18245955.0, "reward": 8.775634765625, "reward_std": 0.41718241572380066, "rewards/helpfulness_reward/mean": 1.008249282836914, "rewards/helpfulness_reward/std": 0.5967223644256592, "rewards/safety_reward/mean": 8.775634765625, "rewards/safety_reward/std": 1.0869935750961304, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.875, "completions/mean_terminated_length": 54.875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.5040607807178412, "frac_reward_zero_std": 0.0, "grad_norm": 0.5403291583061218, "kl": 4.197265625, "learning_rate": 5e-05, "loss": 0.0531, "num_tokens": 18259571.0, "reward": 8.1165771484375, "reward_std": 0.43582117557525635, "rewards/helpfulness_reward/mean": 0.7464332580566406, "rewards/helpfulness_reward/std": 1.2764220237731934, "rewards/safety_reward/mean": 8.1165771484375, "rewards/safety_reward/std": 1.647514820098877, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 55.78125, "completions/mean_terminated_length": 55.78125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.5044100951881931, "frac_reward_zero_std": 0.0, "grad_norm": 0.35859906673431396, "kl": 4.37890625, "learning_rate": 5e-05, "loss": 0.0554, "num_tokens": 18271551.0, "reward": 8.7852783203125, "reward_std": 0.3221120834350586, "rewards/helpfulness_reward/mean": 0.9791793823242188, "rewards/helpfulness_reward/std": 0.9497009515762329, "rewards/safety_reward/mean": 8.7852783203125, "rewards/safety_reward/std": 1.7763060331344604, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 54.953125, "completions/mean_terminated_length": 54.953125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.5047594096585452, "frac_reward_zero_std": 0.0, "grad_norm": 0.7108243703842163, "kl": 4.373046875, "learning_rate": 5e-05, "loss": 0.0593, "num_tokens": 18282481.0, "reward": 9.021484375, "reward_std": 0.42845532298088074, "rewards/helpfulness_reward/mean": 1.4057531356811523, "rewards/helpfulness_reward/std": 0.8663743138313293, "rewards/safety_reward/mean": 9.021484375, "rewards/safety_reward/std": 0.952284574508667, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 91.0, "completions/max_terminated_length": 91.0, "completions/mean_length": 55.9453125, "completions/mean_terminated_length": 55.9453125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.5051087241288971, "frac_reward_zero_std": 0.0, "grad_norm": 0.390057772397995, "kl": 4.130859375, "learning_rate": 5e-05, "loss": 0.0682, "num_tokens": 18294546.0, "reward": 8.8642578125, "reward_std": 0.42582106590270996, "rewards/helpfulness_reward/mean": 0.9741916656494141, "rewards/helpfulness_reward/std": 0.8552224636077881, "rewards/safety_reward/mean": 8.8642578125, "rewards/safety_reward/std": 1.1375640630722046, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 54.703125, "completions/mean_terminated_length": 54.703125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.505458038599249, "frac_reward_zero_std": 0.0, "grad_norm": 0.5725898742675781, "kl": 4.4140625, "learning_rate": 5e-05, "loss": 0.0597, "num_tokens": 18306300.0, "reward": 8.334228515625, "reward_std": 0.36364051699638367, "rewards/helpfulness_reward/mean": 1.1988372802734375, "rewards/helpfulness_reward/std": 0.6242386698722839, "rewards/safety_reward/mean": 8.334228515625, "rewards/safety_reward/std": 1.3733859062194824, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 55.734375, "completions/mean_terminated_length": 55.734375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.5058073530696009, "frac_reward_zero_std": 0.0, "grad_norm": 0.42531758546829224, "kl": 4.27734375, "learning_rate": 5e-05, "loss": 0.0743, "num_tokens": 18318962.0, "reward": 8.801513671875, "reward_std": 0.3788908123970032, "rewards/helpfulness_reward/mean": 1.1617279052734375, "rewards/helpfulness_reward/std": 0.5370463132858276, "rewards/safety_reward/mean": 8.801513671875, "rewards/safety_reward/std": 1.5284810066223145, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 55.171875, "completions/mean_terminated_length": 55.171875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.5061566675399528, "frac_reward_zero_std": 0.0, "grad_norm": 0.4232759177684784, "kl": 4.103515625, "learning_rate": 5e-05, "loss": 0.0553, "num_tokens": 18330336.0, "reward": 8.552978515625, "reward_std": 0.40220755338668823, "rewards/helpfulness_reward/mean": 1.3456802368164062, "rewards/helpfulness_reward/std": 0.8884987235069275, "rewards/safety_reward/mean": 8.552978515625, "rewards/safety_reward/std": 1.2467325925827026, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 54.203125, "completions/mean_terminated_length": 54.203125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.5065059820103047, "frac_reward_zero_std": 0.0, "grad_norm": 0.5068824291229248, "kl": 4.349609375, "learning_rate": 5e-05, "loss": 0.0513, "num_tokens": 18341226.0, "reward": 8.624267578125, "reward_std": 0.38386270403862, "rewards/helpfulness_reward/mean": 1.430959701538086, "rewards/helpfulness_reward/std": 0.664832592010498, "rewards/safety_reward/mean": 8.624267578125, "rewards/safety_reward/std": 1.3300896883010864, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 54.7421875, "completions/mean_terminated_length": 54.7421875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.5068552964806567, "frac_reward_zero_std": 0.0, "grad_norm": 0.4517417252063751, "kl": 4.234375, "learning_rate": 5e-05, "loss": 0.0555, "num_tokens": 18352489.0, "reward": 9.06494140625, "reward_std": 0.3552272915840149, "rewards/helpfulness_reward/mean": 1.6187095642089844, "rewards/helpfulness_reward/std": 0.9400056004524231, "rewards/safety_reward/mean": 9.06494140625, "rewards/safety_reward/std": 1.1465604305267334, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.3671875, "completions/mean_terminated_length": 54.3671875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.5072046109510087, "frac_reward_zero_std": 0.0, "grad_norm": 0.4210887551307678, "kl": 4.267578125, "learning_rate": 5e-05, "loss": 0.0414, "num_tokens": 18364256.0, "reward": 9.031005859375, "reward_std": 0.32877108454704285, "rewards/helpfulness_reward/mean": 1.5970916748046875, "rewards/helpfulness_reward/std": 0.6886966824531555, "rewards/safety_reward/mean": 9.031005859375, "rewards/safety_reward/std": 0.835838794708252, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 55.0234375, "completions/mean_terminated_length": 55.0234375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.5075539254213606, "frac_reward_zero_std": 0.0, "grad_norm": 0.3908303380012512, "kl": 4.29296875, "learning_rate": 5e-05, "loss": 0.0594, "num_tokens": 18375171.0, "reward": 8.800048828125, "reward_std": 0.40467581152915955, "rewards/helpfulness_reward/mean": 1.48455810546875, "rewards/helpfulness_reward/std": 0.7182775735855103, "rewards/safety_reward/mean": 8.800048828125, "rewards/safety_reward/std": 1.5456738471984863, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 94.0, "completions/max_terminated_length": 94.0, "completions/mean_length": 54.6875, "completions/mean_terminated_length": 54.6875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.5079032398917125, "frac_reward_zero_std": 0.0, "grad_norm": 0.5191966891288757, "kl": 4.029296875, "learning_rate": 5e-05, "loss": 0.0534, "num_tokens": 18386323.0, "reward": 9.057861328125, "reward_std": 0.38318514823913574, "rewards/helpfulness_reward/mean": 1.4622879028320312, "rewards/helpfulness_reward/std": 0.5839048027992249, "rewards/safety_reward/mean": 9.057861328125, "rewards/safety_reward/std": 1.2637536525726318, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 54.71875, "completions/mean_terminated_length": 54.71875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.5082525543620644, "frac_reward_zero_std": 0.0, "grad_norm": 0.6735833883285522, "kl": 4.58984375, "learning_rate": 5e-05, "loss": 0.058, "num_tokens": 18398791.0, "reward": 8.9805908203125, "reward_std": 0.40125852823257446, "rewards/helpfulness_reward/mean": 1.1916265487670898, "rewards/helpfulness_reward/std": 0.9706652164459229, "rewards/safety_reward/mean": 8.9805908203125, "rewards/safety_reward/std": 1.5593961477279663, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 54.5859375, "completions/mean_terminated_length": 54.5859375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.5086018688324164, "frac_reward_zero_std": 0.0, "grad_norm": 0.49224358797073364, "kl": 4.2578125, "learning_rate": 5e-05, "loss": 0.0601, "num_tokens": 18410138.0, "reward": 8.440185546875, "reward_std": 0.39215725660324097, "rewards/helpfulness_reward/mean": 1.441427230834961, "rewards/helpfulness_reward/std": 0.629938006401062, "rewards/safety_reward/mean": 8.440185546875, "rewards/safety_reward/std": 1.1720203161239624, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 53.7890625, "completions/mean_terminated_length": 53.7890625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.5089511833027683, "frac_reward_zero_std": 0.0, "grad_norm": 0.43065035343170166, "kl": 3.90625, "learning_rate": 5e-05, "loss": 0.0482, "num_tokens": 18421319.0, "reward": 8.061767578125, "reward_std": 0.35430169105529785, "rewards/helpfulness_reward/mean": 1.442422866821289, "rewards/helpfulness_reward/std": 0.5698482990264893, "rewards/safety_reward/mean": 8.061767578125, "rewards/safety_reward/std": 1.403518557548523, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 54.6796875, "completions/mean_terminated_length": 54.6796875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.5093004977731203, "frac_reward_zero_std": 0.0, "grad_norm": 0.3577554225921631, "kl": 4.287109375, "learning_rate": 5e-05, "loss": 0.0481, "num_tokens": 18432014.0, "reward": 9.00390625, "reward_std": 0.31581902503967285, "rewards/helpfulness_reward/mean": 1.2926597595214844, "rewards/helpfulness_reward/std": 0.644946813583374, "rewards/safety_reward/mean": 9.00390625, "rewards/safety_reward/std": 1.0013984441757202, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.890625, "completions/mean_terminated_length": 53.890625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.5096498122434722, "frac_reward_zero_std": 0.0, "grad_norm": 0.3470808267593384, "kl": 3.984375, "learning_rate": 5e-05, "loss": 0.0378, "num_tokens": 18443680.0, "reward": 9.1240234375, "reward_std": 0.2630390226840973, "rewards/helpfulness_reward/mean": 1.4036865234375, "rewards/helpfulness_reward/std": 0.5833349823951721, "rewards/safety_reward/mean": 9.1240234375, "rewards/safety_reward/std": 1.1018222570419312, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 55.5234375, "completions/mean_terminated_length": 55.5234375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.5099991267138241, "frac_reward_zero_std": 0.0, "grad_norm": 0.5081334710121155, "kl": 3.99609375, "learning_rate": 5e-05, "loss": 0.043, "num_tokens": 18457755.0, "reward": 8.42913818359375, "reward_std": 0.25167232751846313, "rewards/helpfulness_reward/mean": 1.2324519157409668, "rewards/helpfulness_reward/std": 0.7520922422409058, "rewards/safety_reward/mean": 8.42913818359375, "rewards/safety_reward/std": 2.1652352809906006, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 54.375, "completions/mean_terminated_length": 54.375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.5103484411841761, "frac_reward_zero_std": 0.0, "grad_norm": 0.732090950012207, "kl": 4.71875, "learning_rate": 5e-05, "loss": 0.0463, "num_tokens": 18469147.0, "reward": 9.14111328125, "reward_std": 0.21120502054691315, "rewards/helpfulness_reward/mean": 1.0542900562286377, "rewards/helpfulness_reward/std": 0.5981470346450806, "rewards/safety_reward/mean": 9.14111328125, "rewards/safety_reward/std": 1.0922337770462036, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 54.2109375, "completions/mean_terminated_length": 54.2109375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.510697755654528, "frac_reward_zero_std": 0.0, "grad_norm": 0.4373350441455841, "kl": 4.302734375, "learning_rate": 5e-05, "loss": 0.0584, "num_tokens": 18481214.0, "reward": 8.859130859375, "reward_std": 0.2913385331630707, "rewards/helpfulness_reward/mean": 1.4002532958984375, "rewards/helpfulness_reward/std": 0.7012960910797119, "rewards/safety_reward/mean": 8.859130859375, "rewards/safety_reward/std": 0.9807368516921997, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 103.0, "completions/max_terminated_length": 103.0, "completions/mean_length": 54.6171875, "completions/mean_terminated_length": 54.6171875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.5110470701248799, "frac_reward_zero_std": 0.0, "grad_norm": 0.3978997468948364, "kl": 4.140625, "learning_rate": 5e-05, "loss": 0.0463, "num_tokens": 18493853.0, "reward": 9.16259765625, "reward_std": 0.2346281111240387, "rewards/helpfulness_reward/mean": 1.6477584838867188, "rewards/helpfulness_reward/std": 0.8155941367149353, "rewards/safety_reward/mean": 9.16259765625, "rewards/safety_reward/std": 1.097321629524231, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.2734375, "completions/mean_terminated_length": 54.2734375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.5113963845952318, "frac_reward_zero_std": 0.0, "grad_norm": 0.40999117493629456, "kl": 4.40625, "learning_rate": 5e-05, "loss": 0.0554, "num_tokens": 18504272.0, "reward": 8.83935546875, "reward_std": 0.3754264712333679, "rewards/helpfulness_reward/mean": 1.252833366394043, "rewards/helpfulness_reward/std": 0.8397237062454224, "rewards/safety_reward/mean": 8.83935546875, "rewards/safety_reward/std": 0.8500351905822754, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.234375, "completions/mean_terminated_length": 54.234375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.5117456990655838, "frac_reward_zero_std": 0.0, "grad_norm": 0.4385976195335388, "kl": 4.009765625, "learning_rate": 5e-05, "loss": 0.0521, "num_tokens": 18515134.0, "reward": 8.89306640625, "reward_std": 0.27320772409439087, "rewards/helpfulness_reward/mean": 1.3511743545532227, "rewards/helpfulness_reward/std": 0.6894171237945557, "rewards/safety_reward/mean": 8.89306640625, "rewards/safety_reward/std": 1.008671522140503, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.2734375, "completions/mean_terminated_length": 54.2734375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.5120950135359357, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3478710651397705, "kl": 4.28515625, "learning_rate": 5e-05, "loss": 0.0427, "num_tokens": 18526361.0, "reward": 9.00244140625, "reward_std": 0.1952255368232727, "rewards/helpfulness_reward/mean": 1.1788508892059326, "rewards/helpfulness_reward/std": 0.8624716401100159, "rewards/safety_reward/mean": 9.00244140625, "rewards/safety_reward/std": 1.7689653635025024, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.8359375, "completions/mean_terminated_length": 54.8359375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.5124443280062877, "frac_reward_zero_std": 0.0, "grad_norm": 0.41442903876304626, "kl": 4.3515625, "learning_rate": 5e-05, "loss": 0.0606, "num_tokens": 18538828.0, "reward": 8.728271484375, "reward_std": 0.41879719495773315, "rewards/helpfulness_reward/mean": 0.9886802434921265, "rewards/helpfulness_reward/std": 1.1830357313156128, "rewards/safety_reward/mean": 8.728271484375, "rewards/safety_reward/std": 1.4498918056488037, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.5546875, "completions/mean_terminated_length": 54.5546875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.5127936424766396, "frac_reward_zero_std": 0.0, "grad_norm": 0.5330060720443726, "kl": 4.35546875, "learning_rate": 5e-05, "loss": 0.0479, "num_tokens": 18551011.0, "reward": 9.07421875, "reward_std": 0.21093325316905975, "rewards/helpfulness_reward/mean": 1.4171295166015625, "rewards/helpfulness_reward/std": 0.7017748355865479, "rewards/safety_reward/mean": 9.07421875, "rewards/safety_reward/std": 1.2994168996810913, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 54.09375, "completions/mean_terminated_length": 54.09375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.5131429569469915, "frac_reward_zero_std": 0.0, "grad_norm": 0.30575230717658997, "kl": 4.076171875, "learning_rate": 5e-05, "loss": 0.0447, "num_tokens": 18561431.0, "reward": 8.771484375, "reward_std": 0.19708842039108276, "rewards/helpfulness_reward/mean": 1.1288681030273438, "rewards/helpfulness_reward/std": 0.6984785199165344, "rewards/safety_reward/mean": 8.771484375, "rewards/safety_reward/std": 1.4155491590499878, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 53.9609375, "completions/mean_terminated_length": 53.9609375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.5134922714173434, "frac_reward_zero_std": 0.0, "grad_norm": 0.29940104484558105, "kl": 4.27734375, "learning_rate": 5e-05, "loss": 0.0423, "num_tokens": 18572074.0, "reward": 8.7265625, "reward_std": 0.2571903467178345, "rewards/helpfulness_reward/mean": 1.1811418533325195, "rewards/helpfulness_reward/std": 0.8890196681022644, "rewards/safety_reward/mean": 8.7265625, "rewards/safety_reward/std": 1.2325679063796997, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 54.609375, "completions/mean_terminated_length": 54.609375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.5138415858876954, "frac_reward_zero_std": 0.0, "grad_norm": 0.3477613627910614, "kl": 4.4453125, "learning_rate": 5e-05, "loss": 0.0518, "num_tokens": 18583440.0, "reward": 8.9688720703125, "reward_std": 0.23878085613250732, "rewards/helpfulness_reward/mean": 1.332977294921875, "rewards/helpfulness_reward/std": 0.724702000617981, "rewards/safety_reward/mean": 8.9688720703125, "rewards/safety_reward/std": 2.0029876232147217, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 54.8203125, "completions/mean_terminated_length": 54.8203125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.5141909003580474, "frac_reward_zero_std": 0.125, "grad_norm": 0.35677221417427063, "kl": 4.0234375, "learning_rate": 5e-05, "loss": 0.0715, "num_tokens": 18594817.0, "reward": 9.009765625, "reward_std": 0.372944176197052, "rewards/helpfulness_reward/mean": 1.1746883392333984, "rewards/helpfulness_reward/std": 0.5701663494110107, "rewards/safety_reward/mean": 9.009765625, "rewards/safety_reward/std": 0.8428495526313782, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 91.0, "completions/max_terminated_length": 91.0, "completions/mean_length": 54.796875, "completions/mean_terminated_length": 54.796875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.5145402148283993, "frac_reward_zero_std": 0.0, "grad_norm": 0.3503842055797577, "kl": 4.298828125, "learning_rate": 5e-05, "loss": 0.0556, "num_tokens": 18606543.0, "reward": 8.7314453125, "reward_std": 0.24710065126419067, "rewards/helpfulness_reward/mean": 1.1277484893798828, "rewards/helpfulness_reward/std": 0.7639432549476624, "rewards/safety_reward/mean": 8.7314453125, "rewards/safety_reward/std": 1.204763650894165, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.4921875, "completions/mean_terminated_length": 54.4921875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.5148895292987512, "frac_reward_zero_std": 0.0, "grad_norm": 0.4050365686416626, "kl": 4.35546875, "learning_rate": 5e-05, "loss": 0.0484, "num_tokens": 18617622.0, "reward": 9.028564453125, "reward_std": 0.3003939092159271, "rewards/helpfulness_reward/mean": 1.2103482484817505, "rewards/helpfulness_reward/std": 0.7661948204040527, "rewards/safety_reward/mean": 9.028564453125, "rewards/safety_reward/std": 0.9045729041099548, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 54.109375, "completions/mean_terminated_length": 54.109375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.5152388437691031, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3351185917854309, "kl": 3.98828125, "learning_rate": 5e-05, "loss": 0.0446, "num_tokens": 18628148.0, "reward": 8.652587890625, "reward_std": 0.24940869212150574, "rewards/helpfulness_reward/mean": 1.0735528469085693, "rewards/helpfulness_reward/std": 0.7203508615493774, "rewards/safety_reward/mean": 8.652587890625, "rewards/safety_reward/std": 1.035937786102295, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 55.0625, "completions/mean_terminated_length": 55.0625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.515588158239455, "frac_reward_zero_std": 0.0, "grad_norm": 5.342260837554932, "kl": 7.1796875, "learning_rate": 5e-05, "loss": 0.083, "num_tokens": 18639460.0, "reward": 9.022705078125, "reward_std": 0.2897163927555084, "rewards/helpfulness_reward/mean": 0.9945449829101562, "rewards/helpfulness_reward/std": 0.6494945287704468, "rewards/safety_reward/mean": 9.022705078125, "rewards/safety_reward/std": 1.0054271221160889, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 54.46875, "completions/mean_terminated_length": 54.46875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.515937472709807, "frac_reward_zero_std": 0.0, "grad_norm": 0.5713916420936584, "kl": 4.41796875, "learning_rate": 5e-05, "loss": 0.0511, "num_tokens": 18650688.0, "reward": 9.137451171875, "reward_std": 0.23875069618225098, "rewards/helpfulness_reward/mean": 1.3627777099609375, "rewards/helpfulness_reward/std": 0.7101911902427673, "rewards/safety_reward/mean": 9.137451171875, "rewards/safety_reward/std": 0.900818943977356, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.515625, "completions/mean_terminated_length": 54.515625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.516286787180159, "frac_reward_zero_std": 0.0, "grad_norm": 0.40886181592941284, "kl": 4.13671875, "learning_rate": 5e-05, "loss": 0.0436, "num_tokens": 18663810.0, "reward": 8.5638427734375, "reward_std": 0.20362016558647156, "rewards/helpfulness_reward/mean": 0.9382705688476562, "rewards/helpfulness_reward/std": 0.9469962120056152, "rewards/safety_reward/mean": 8.5638427734375, "rewards/safety_reward/std": 1.8851398229599, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.3984375, "completions/mean_terminated_length": 54.3984375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.5166361016505109, "frac_reward_zero_std": 0.0, "grad_norm": 0.4283318817615509, "kl": 4.291015625, "learning_rate": 5e-05, "loss": 0.0354, "num_tokens": 18674669.0, "reward": 8.963134765625, "reward_std": 0.3938482999801636, "rewards/helpfulness_reward/mean": 1.107618808746338, "rewards/helpfulness_reward/std": 0.770004391670227, "rewards/safety_reward/mean": 8.963134765625, "rewards/safety_reward/std": 1.2950270175933838, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 54.8125, "completions/mean_terminated_length": 54.8125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.5169854161208628, "frac_reward_zero_std": 0.0, "grad_norm": 0.3808380961418152, "kl": 4.31640625, "learning_rate": 5e-05, "loss": 0.052, "num_tokens": 18686565.0, "reward": 9.102294921875, "reward_std": 0.32059454917907715, "rewards/helpfulness_reward/mean": 1.00480318069458, "rewards/helpfulness_reward/std": 0.7657085657119751, "rewards/safety_reward/mean": 9.102294921875, "rewards/safety_reward/std": 1.0733919143676758, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.34375, "completions/mean_terminated_length": 54.34375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.5173347305912147, "frac_reward_zero_std": 0.0, "grad_norm": 0.364089697599411, "kl": 4.115234375, "learning_rate": 5e-05, "loss": 0.0385, "num_tokens": 18697961.0, "reward": 8.911865234375, "reward_std": 0.2392350435256958, "rewards/helpfulness_reward/mean": 0.9162492752075195, "rewards/helpfulness_reward/std": 0.7832614183425903, "rewards/safety_reward/mean": 8.911865234375, "rewards/safety_reward/std": 1.210754632949829, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 54.78125, "completions/mean_terminated_length": 54.78125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.5176840450615666, "frac_reward_zero_std": 0.0, "grad_norm": 0.39138469099998474, "kl": 4.30859375, "learning_rate": 5e-05, "loss": 0.0516, "num_tokens": 18709501.0, "reward": 9.07275390625, "reward_std": 0.26041096448898315, "rewards/helpfulness_reward/mean": 1.2323861122131348, "rewards/helpfulness_reward/std": 0.7200359106063843, "rewards/safety_reward/mean": 9.07275390625, "rewards/safety_reward/std": 1.3782849311828613, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 54.4921875, "completions/mean_terminated_length": 54.4921875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.5180333595319186, "frac_reward_zero_std": 0.0, "grad_norm": 0.33503013849258423, "kl": 4.021484375, "learning_rate": 5e-05, "loss": 0.0378, "num_tokens": 18721772.0, "reward": 8.98046875, "reward_std": 0.23712898790836334, "rewards/helpfulness_reward/mean": 1.2796287536621094, "rewards/helpfulness_reward/std": 0.7231019735336304, "rewards/safety_reward/mean": 8.98046875, "rewards/safety_reward/std": 1.2984341382980347, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 94.0, "completions/max_terminated_length": 94.0, "completions/mean_length": 55.1796875, "completions/mean_terminated_length": 55.1796875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.5183826740022706, "frac_reward_zero_std": 0.0, "grad_norm": 6.822556495666504, "kl": 7.12109375, "learning_rate": 5e-05, "loss": 0.0867, "num_tokens": 18733683.0, "reward": 9.430419921875, "reward_std": 0.33219483494758606, "rewards/helpfulness_reward/mean": 1.2703361511230469, "rewards/helpfulness_reward/std": 0.5418996810913086, "rewards/safety_reward/mean": 9.430419921875, "rewards/safety_reward/std": 1.1023544073104858, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.703125, "completions/mean_terminated_length": 54.703125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.5187319884726225, "frac_reward_zero_std": 0.0, "grad_norm": 0.38943445682525635, "kl": 4.3125, "learning_rate": 5e-05, "loss": 0.0503, "num_tokens": 18745437.0, "reward": 9.1209716796875, "reward_std": 0.4409332871437073, "rewards/helpfulness_reward/mean": 1.4867630004882812, "rewards/helpfulness_reward/std": 0.8421161770820618, "rewards/safety_reward/mean": 9.1209716796875, "rewards/safety_reward/std": 1.251285433769226, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 55.875, "completions/mean_terminated_length": 55.875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.5190813029429744, "frac_reward_zero_std": 0.0, "grad_norm": 0.46051591634750366, "kl": 4.373046875, "learning_rate": 5e-05, "loss": 0.0731, "num_tokens": 18757021.0, "reward": 8.92919921875, "reward_std": 0.4087427854537964, "rewards/helpfulness_reward/mean": 1.015768051147461, "rewards/helpfulness_reward/std": 0.7261874675750732, "rewards/safety_reward/mean": 8.92919921875, "rewards/safety_reward/std": 0.9863930344581604, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 54.5390625, "completions/mean_terminated_length": 54.5390625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.5194306174133263, "frac_reward_zero_std": 0.0, "grad_norm": 0.8269940614700317, "kl": 4.70703125, "learning_rate": 5e-05, "loss": 0.058, "num_tokens": 18767442.0, "reward": 8.70166015625, "reward_std": 0.3285963833332062, "rewards/helpfulness_reward/mean": 1.139129638671875, "rewards/helpfulness_reward/std": 0.8215139508247375, "rewards/safety_reward/mean": 8.70166015625, "rewards/safety_reward/std": 1.2959495782852173, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 55.421875, "completions/mean_terminated_length": 55.421875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.5197799318836783, "frac_reward_zero_std": 0.0, "grad_norm": 0.3859690725803375, "kl": 3.998046875, "learning_rate": 5e-05, "loss": 0.0592, "num_tokens": 18778312.0, "reward": 9.3935546875, "reward_std": 0.245908722281456, "rewards/helpfulness_reward/mean": 1.43487548828125, "rewards/helpfulness_reward/std": 0.6475173234939575, "rewards/safety_reward/mean": 9.3935546875, "rewards/safety_reward/std": 1.2478477954864502, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.96875, "completions/mean_terminated_length": 54.96875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.5201292463540302, "frac_reward_zero_std": 0.0, "grad_norm": 0.6250026822090149, "kl": 4.3359375, "learning_rate": 5e-05, "loss": 0.0713, "num_tokens": 18788964.0, "reward": 8.58642578125, "reward_std": 0.3264598846435547, "rewards/helpfulness_reward/mean": 0.9398231506347656, "rewards/helpfulness_reward/std": 0.6737605333328247, "rewards/safety_reward/mean": 8.58642578125, "rewards/safety_reward/std": 1.0520012378692627, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.5204785608243822, "frac_reward_zero_std": 0.0, "grad_norm": 0.47541117668151855, "kl": 4.220703125, "learning_rate": 5e-05, "loss": 0.0608, "num_tokens": 18800412.0, "reward": 9.055908203125, "reward_std": 0.4077916145324707, "rewards/helpfulness_reward/mean": 1.1139426231384277, "rewards/helpfulness_reward/std": 1.0016558170318604, "rewards/safety_reward/mean": 9.055908203125, "rewards/safety_reward/std": 1.2368628978729248, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 54.6796875, "completions/mean_terminated_length": 54.6796875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.5208278752947341, "frac_reward_zero_std": 0.0, "grad_norm": 0.45017096400260925, "kl": 4.12109375, "learning_rate": 5e-05, "loss": 0.0504, "num_tokens": 18811107.0, "reward": 8.96044921875, "reward_std": 0.3555833101272583, "rewards/helpfulness_reward/mean": 1.2707405090332031, "rewards/helpfulness_reward/std": 0.5885547995567322, "rewards/safety_reward/mean": 8.96044921875, "rewards/safety_reward/std": 1.4525247812271118, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.671875, "completions/mean_terminated_length": 54.671875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.521177189765086, "frac_reward_zero_std": 0.0, "grad_norm": 0.35842934250831604, "kl": 4.3203125, "learning_rate": 5e-05, "loss": 0.0472, "num_tokens": 18822529.0, "reward": 8.694091796875, "reward_std": 0.2532549500465393, "rewards/helpfulness_reward/mean": 1.2293624877929688, "rewards/helpfulness_reward/std": 0.8645117282867432, "rewards/safety_reward/mean": 8.694091796875, "rewards/safety_reward/std": 0.9831069707870483, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.953125, "completions/mean_terminated_length": 54.953125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.521526504235438, "frac_reward_zero_std": 0.0, "grad_norm": 0.6551411151885986, "kl": 4.5703125, "learning_rate": 5e-05, "loss": 0.049, "num_tokens": 18834011.0, "reward": 9.3984375, "reward_std": 0.2888367176055908, "rewards/helpfulness_reward/mean": 1.349578857421875, "rewards/helpfulness_reward/std": 0.5516969561576843, "rewards/safety_reward/mean": 9.3984375, "rewards/safety_reward/std": 0.9551488161087036, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 55.390625, "completions/mean_terminated_length": 55.390625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.5218758187057899, "frac_reward_zero_std": 0.0, "grad_norm": 0.40885165333747864, "kl": 4.205078125, "learning_rate": 5e-05, "loss": 0.0724, "num_tokens": 18845173.0, "reward": 8.820068359375, "reward_std": 0.3797740638256073, "rewards/helpfulness_reward/mean": 1.0486907958984375, "rewards/helpfulness_reward/std": 0.7577382326126099, "rewards/safety_reward/mean": 8.820068359375, "rewards/safety_reward/std": 1.1679633855819702, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.6171875, "completions/mean_terminated_length": 54.6171875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.5222251331761418, "frac_reward_zero_std": 0.0, "grad_norm": 0.4586746096611023, "kl": 4.5703125, "learning_rate": 5e-05, "loss": 0.049, "num_tokens": 18856268.0, "reward": 8.938720703125, "reward_std": 0.3958989083766937, "rewards/helpfulness_reward/mean": 1.2125530242919922, "rewards/helpfulness_reward/std": 0.6629573702812195, "rewards/safety_reward/mean": 8.938720703125, "rewards/safety_reward/std": 1.076879858970642, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 54.9140625, "completions/mean_terminated_length": 54.9140625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.5225744476464937, "frac_reward_zero_std": 0.0, "grad_norm": 0.384971559047699, "kl": 4.076171875, "learning_rate": 5e-05, "loss": 0.0469, "num_tokens": 18866945.0, "reward": 8.714599609375, "reward_std": 0.25514188408851624, "rewards/helpfulness_reward/mean": 1.0777969360351562, "rewards/helpfulness_reward/std": 0.6525267958641052, "rewards/safety_reward/mean": 8.714599609375, "rewards/safety_reward/std": 1.2491345405578613, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 55.046875, "completions/mean_terminated_length": 55.046875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.5229237621168457, "frac_reward_zero_std": 0.0, "grad_norm": 0.4237847626209259, "kl": 4.125, "learning_rate": 5e-05, "loss": 0.0519, "num_tokens": 18878191.0, "reward": 9.33251953125, "reward_std": 0.3006000816822052, "rewards/helpfulness_reward/mean": 1.4273872375488281, "rewards/helpfulness_reward/std": 0.6793501377105713, "rewards/safety_reward/mean": 9.33251953125, "rewards/safety_reward/std": 0.9758458137512207, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 55.9296875, "completions/mean_terminated_length": 55.9296875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.5232730765871976, "frac_reward_zero_std": 0.0, "grad_norm": 0.445659339427948, "kl": 4.09765625, "learning_rate": 5e-05, "loss": 0.0577, "num_tokens": 18891726.0, "reward": 9.1806640625, "reward_std": 0.4555426239967346, "rewards/helpfulness_reward/mean": 1.4421463012695312, "rewards/helpfulness_reward/std": 0.6649326682090759, "rewards/safety_reward/mean": 9.1806640625, "rewards/safety_reward/std": 1.1557340621948242, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.65625, "completions/mean_terminated_length": 54.65625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.5236223910575496, "frac_reward_zero_std": 0.0, "grad_norm": 0.3898386061191559, "kl": 4.16796875, "learning_rate": 5e-05, "loss": 0.0557, "num_tokens": 18903130.0, "reward": 8.90771484375, "reward_std": 0.2815916836261749, "rewards/helpfulness_reward/mean": 1.4432172775268555, "rewards/helpfulness_reward/std": 0.6099676489830017, "rewards/safety_reward/mean": 8.90771484375, "rewards/safety_reward/std": 1.10752272605896, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 54.984375, "completions/mean_terminated_length": 54.984375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.5239717055279015, "frac_reward_zero_std": 0.0, "grad_norm": 0.3516351878643036, "kl": 4.19921875, "learning_rate": 5e-05, "loss": 0.0624, "num_tokens": 18913920.0, "reward": 9.278564453125, "reward_std": 0.3295482099056244, "rewards/helpfulness_reward/mean": 1.3525772094726562, "rewards/helpfulness_reward/std": 0.7214128971099854, "rewards/safety_reward/mean": 9.278564453125, "rewards/safety_reward/std": 1.0340015888214111, "step": 1500 } ], "logging_steps": 1, "max_steps": 2863, "num_input_tokens_seen": 18913920, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }