{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5239717055279015, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1229.0, "completions/max_terminated_length": 1229.0, "completions/mean_length": 459.46875, "completions/mean_terminated_length": 459.46875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.0003493144703519343, "frac_reward_zero_std": 0.0, "grad_norm": 0.09076514095067978, "kl": 0.0, "learning_rate": 5e-05, "loss": 0.0671, "num_tokens": 62644.0, "reward": 0.7454519271850586, "reward_std": 0.7274596691131592, "rewards/helpfulness_reward/mean": 0.7454519271850586, "rewards/helpfulness_reward/std": 0.8803032636642456, "rewards/safety_reward/mean": 2.2877464294433594, "rewards/safety_reward/std": 2.2047319412231445, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1486.0, "completions/max_terminated_length": 1486.0, "completions/mean_length": 404.7265625, "completions/mean_terminated_length": 404.7265625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.0006986289407038686, "frac_reward_zero_std": 0.0, "grad_norm": 0.11754141002893448, "kl": 0.0007853507995605469, "learning_rate": 5e-05, "loss": 0.0115, "num_tokens": 118769.0, "reward": 0.47026586532592773, "reward_std": 0.6061829328536987, "rewards/helpfulness_reward/mean": 0.47026586532592773, "rewards/helpfulness_reward/std": 0.9079115390777588, "rewards/safety_reward/mean": 1.7072210311889648, "rewards/safety_reward/std": 1.6164162158966064, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1454.0, "completions/mean_length": 549.5390625, "completions/mean_terminated_length": 466.7083435058594, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.001047943411055803, "frac_reward_zero_std": 0.0, "grad_norm": 0.08614565432071686, "kl": 0.0006704330444335938, "learning_rate": 5e-05, "loss": 0.0551, "num_tokens": 193830.0, "reward": 0.33064937591552734, "reward_std": 0.7028899788856506, "rewards/helpfulness_reward/mean": 0.33064937591552734, "rewards/helpfulness_reward/std": 1.2106647491455078, "rewards/safety_reward/mean": 1.2706809043884277, "rewards/safety_reward/std": 2.422553062438965, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1397.0, "completions/max_terminated_length": 1397.0, "completions/mean_length": 364.6171875, "completions/mean_terminated_length": 364.6171875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.0013972578814077372, "frac_reward_zero_std": 0.0, "grad_norm": 0.12357155233621597, "kl": 0.0008478164672851562, "learning_rate": 5e-05, "loss": 0.0266, "num_tokens": 244741.0, "reward": 0.49830150604248047, "reward_std": 0.628875732421875, "rewards/helpfulness_reward/mean": 0.49830150604248047, "rewards/helpfulness_reward/std": 1.051396131515503, "rewards/safety_reward/mean": 2.474785804748535, "rewards/safety_reward/std": 2.317692995071411, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1521.0, "completions/max_terminated_length": 1521.0, "completions/mean_length": 497.2265625, "completions/mean_terminated_length": 497.2265625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.0017465723517596716, "frac_reward_zero_std": 0.0, "grad_norm": 0.08808404207229614, "kl": 0.0009441375732421875, "learning_rate": 5e-05, "loss": 0.0563, "num_tokens": 312914.0, "reward": 0.5495474338531494, "reward_std": 0.6789533495903015, "rewards/helpfulness_reward/mean": 0.5495474338531494, "rewards/helpfulness_reward/std": 1.035644292831421, "rewards/safety_reward/mean": 1.4524116516113281, "rewards/safety_reward/std": 2.1978490352630615, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1464.0, "completions/mean_length": 537.3515625, "completions/mean_terminated_length": 527.472412109375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.002095886822111606, "frac_reward_zero_std": 0.0, "grad_norm": 0.07897509634494781, "kl": 0.0009207725524902344, "learning_rate": 5e-05, "loss": 0.0934, "num_tokens": 385311.0, "reward": 0.35894060134887695, "reward_std": 0.5458699464797974, "rewards/helpfulness_reward/mean": 0.35894060134887695, "rewards/helpfulness_reward/std": 0.9135392904281616, "rewards/safety_reward/mean": 1.4208526611328125, "rewards/safety_reward/std": 2.136950969696045, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1330.0, "completions/max_terminated_length": 1330.0, "completions/mean_length": 503.0234375, "completions/mean_terminated_length": 503.0234375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.0024452012924635405, "frac_reward_zero_std": 0.0, "grad_norm": 0.08651446551084518, "kl": 0.0011096000671386719, "learning_rate": 5e-05, "loss": -0.0026, "num_tokens": 453674.0, "reward": 0.052249908447265625, "reward_std": 0.6382900476455688, "rewards/helpfulness_reward/mean": 0.052249908447265625, "rewards/helpfulness_reward/std": 1.0323797464370728, "rewards/safety_reward/mean": 1.6951961517333984, "rewards/safety_reward/std": 2.2446155548095703, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1768.0, "completions/mean_length": 559.140625, "completions/mean_terminated_length": 539.5714721679688, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.0027945157628154744, "frac_reward_zero_std": 0.0, "grad_norm": 0.08693980425596237, "kl": 0.0011515617370605469, "learning_rate": 5e-05, "loss": 0.0467, "num_tokens": 529804.0, "reward": 0.6604931950569153, "reward_std": 0.4710709750652313, "rewards/helpfulness_reward/mean": 0.6604931950569153, "rewards/helpfulness_reward/std": 1.0247236490249634, "rewards/safety_reward/mean": 1.0404624938964844, "rewards/safety_reward/std": 1.9560402631759644, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1567.0, "completions/max_terminated_length": 1567.0, "completions/mean_length": 654.546875, "completions/mean_terminated_length": 654.546875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.003143830233167409, "frac_reward_zero_std": 0.0, "grad_norm": 0.07118160277605057, "kl": 0.0012178421020507812, "learning_rate": 5e-05, "loss": -0.0151, "num_tokens": 617826.0, "reward": 0.12753914296627045, "reward_std": 0.5788777470588684, "rewards/helpfulness_reward/mean": 0.12753914296627045, "rewards/helpfulness_reward/std": 0.9544141888618469, "rewards/safety_reward/mean": 0.9074296951293945, "rewards/safety_reward/std": 2.0247955322265625, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1185.0, "completions/max_terminated_length": 1185.0, "completions/mean_length": 371.7421875, "completions/mean_terminated_length": 371.7421875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.003493144703519343, "frac_reward_zero_std": 0.0, "grad_norm": 0.10895636677742004, "kl": 0.0015716552734375, "learning_rate": 5e-05, "loss": 0.014, "num_tokens": 668921.0, "reward": 0.47046732902526855, "reward_std": 0.614091157913208, "rewards/helpfulness_reward/mean": 0.47046732902526855, "rewards/helpfulness_reward/std": 0.9617099761962891, "rewards/safety_reward/mean": 1.5506501197814941, "rewards/safety_reward/std": 1.9692766666412354, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1508.0, "completions/max_terminated_length": 1508.0, "completions/mean_length": 583.1640625, "completions/mean_terminated_length": 583.1640625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.0038424591738712775, "frac_reward_zero_std": 0.0, "grad_norm": 0.07889501750469208, "kl": 0.0016050338745117188, "learning_rate": 5e-05, "loss": 0.0309, "num_tokens": 749430.0, "reward": 0.19135558605194092, "reward_std": 0.6203348636627197, "rewards/helpfulness_reward/mean": 0.19135558605194092, "rewards/helpfulness_reward/std": 0.8380932807922363, "rewards/safety_reward/mean": 1.179300308227539, "rewards/safety_reward/std": 2.000000476837158, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1229.0, "completions/max_terminated_length": 1229.0, "completions/mean_length": 369.46875, "completions/mean_terminated_length": 369.46875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.004191773644223212, "frac_reward_zero_std": 0.0, "grad_norm": 0.11583568155765533, "kl": 0.0021581649780273438, "learning_rate": 5e-05, "loss": 0.0463, "num_tokens": 801578.0, "reward": 0.6342997550964355, "reward_std": 0.681769609451294, "rewards/helpfulness_reward/mean": 0.6342997550964355, "rewards/helpfulness_reward/std": 0.9147577881813049, "rewards/safety_reward/mean": 2.2300243377685547, "rewards/safety_reward/std": 1.956925392150879, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1254.0, "completions/max_terminated_length": 1254.0, "completions/mean_length": 457.6484375, "completions/mean_terminated_length": 457.6484375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.004541088114575147, "frac_reward_zero_std": 0.0, "grad_norm": 0.09541888535022736, "kl": 0.00211334228515625, "learning_rate": 5e-05, "loss": 0.0628, "num_tokens": 863957.0, "reward": 0.5842044353485107, "reward_std": 0.6296340227127075, "rewards/helpfulness_reward/mean": 0.5842044353485107, "rewards/helpfulness_reward/std": 1.013549566268921, "rewards/safety_reward/mean": 2.0304718017578125, "rewards/safety_reward/std": 2.0636191368103027, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1185.0, "completions/max_terminated_length": 1185.0, "completions/mean_length": 408.8984375, "completions/mean_terminated_length": 408.8984375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.004890402584927081, "frac_reward_zero_std": 0.0, "grad_norm": 0.10889797657728195, "kl": 0.0023403167724609375, "learning_rate": 5e-05, "loss": 0.0946, "num_tokens": 919768.0, "reward": 0.954409122467041, "reward_std": 0.6100387573242188, "rewards/helpfulness_reward/mean": 0.954409122467041, "rewards/helpfulness_reward/std": 0.8955455422401428, "rewards/safety_reward/mean": 2.204028606414795, "rewards/safety_reward/std": 2.087705612182617, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1577.0, "completions/mean_length": 506.78125, "completions/mean_terminated_length": 496.6614074707031, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.0052397170552790145, "frac_reward_zero_std": 0.0, "grad_norm": 0.09085052460432053, "kl": 0.0026702880859375, "learning_rate": 5e-05, "loss": 0.025, "num_tokens": 988972.0, "reward": 0.3684568405151367, "reward_std": 0.6651139259338379, "rewards/helpfulness_reward/mean": 0.3684568405151367, "rewards/helpfulness_reward/std": 1.0326173305511475, "rewards/safety_reward/mean": 1.8189241886138916, "rewards/safety_reward/std": 2.056744337081909, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1714.0, "completions/max_terminated_length": 1714.0, "completions/mean_length": 508.5859375, "completions/mean_terminated_length": 508.5859375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.005589031525630949, "frac_reward_zero_std": 0.0, "grad_norm": 0.09485320001840591, "kl": 0.0024862289428710938, "learning_rate": 5e-05, "loss": -0.0024, "num_tokens": 1059223.0, "reward": 0.6423309445381165, "reward_std": 0.6194648742675781, "rewards/helpfulness_reward/mean": 0.6423309445381165, "rewards/helpfulness_reward/std": 0.9223731160163879, "rewards/safety_reward/mean": 1.9109172821044922, "rewards/safety_reward/std": 2.055133819580078, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1308.0, "completions/max_terminated_length": 1308.0, "completions/mean_length": 514.953125, "completions/mean_terminated_length": 514.953125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.005938345995982883, "frac_reward_zero_std": 0.0, "grad_norm": 0.09729105234146118, "kl": 0.002521514892578125, "learning_rate": 5e-05, "loss": 0.0871, "num_tokens": 1129585.0, "reward": 0.9097509384155273, "reward_std": 0.6492941379547119, "rewards/helpfulness_reward/mean": 0.9097509384155273, "rewards/helpfulness_reward/std": 1.0958693027496338, "rewards/safety_reward/mean": 2.0756940841674805, "rewards/safety_reward/std": 2.258796215057373, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1460.0, "completions/max_terminated_length": 1460.0, "completions/mean_length": 428.828125, "completions/mean_terminated_length": 428.828125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.006287660466334818, "frac_reward_zero_std": 0.0, "grad_norm": 0.11828664690256119, "kl": 0.0038280487060546875, "learning_rate": 5e-05, "loss": -0.0203, "num_tokens": 1188531.0, "reward": 0.8264170289039612, "reward_std": 0.7189738750457764, "rewards/helpfulness_reward/mean": 0.8264170289039612, "rewards/helpfulness_reward/std": 0.8350399136543274, "rewards/safety_reward/mean": 1.6434385776519775, "rewards/safety_reward/std": 1.5733444690704346, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1272.0, "completions/max_terminated_length": 1272.0, "completions/mean_length": 432.65625, "completions/mean_terminated_length": 432.65625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.006636974936686752, "frac_reward_zero_std": 0.0, "grad_norm": 0.12826676666736603, "kl": 0.003948211669921875, "learning_rate": 5e-05, "loss": 0.0552, "num_tokens": 1247399.0, "reward": 0.700002908706665, "reward_std": 0.5022361874580383, "rewards/helpfulness_reward/mean": 0.700002908706665, "rewards/helpfulness_reward/std": 0.9541680812835693, "rewards/safety_reward/mean": 2.01204776763916, "rewards/safety_reward/std": 2.011726140975952, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1777.0, "completions/mean_length": 534.625, "completions/mean_terminated_length": 483.5121765136719, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.006986289407038686, "frac_reward_zero_std": 0.0, "grad_norm": 0.09098071604967117, "kl": 0.0040073394775390625, "learning_rate": 5e-05, "loss": 0.0474, "num_tokens": 1320111.0, "reward": 0.8124549388885498, "reward_std": 0.5625039339065552, "rewards/helpfulness_reward/mean": 0.8124549388885498, "rewards/helpfulness_reward/std": 0.9078332185745239, "rewards/safety_reward/mean": 1.7114911079406738, "rewards/safety_reward/std": 1.9348737001419067, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1488.0, "completions/mean_length": 495.9921875, "completions/mean_terminated_length": 454.1854553222656, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.007335603877390621, "frac_reward_zero_std": 0.0, "grad_norm": 0.08880435675382614, "kl": 0.0040531158447265625, "learning_rate": 5e-05, "loss": 0.0795, "num_tokens": 1387926.0, "reward": 0.5796370506286621, "reward_std": 0.6758288145065308, "rewards/helpfulness_reward/mean": 0.5796370506286621, "rewards/helpfulness_reward/std": 1.1220582723617554, "rewards/safety_reward/mean": 1.4385089874267578, "rewards/safety_reward/std": 2.3652124404907227, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1610.0, "completions/max_terminated_length": 1610.0, "completions/mean_length": 534.875, "completions/mean_terminated_length": 534.875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.007684918347742555, "frac_reward_zero_std": 0.0, "grad_norm": 0.08749895542860031, "kl": 0.004611968994140625, "learning_rate": 5e-05, "loss": 0.1169, "num_tokens": 1460454.0, "reward": 0.755955696105957, "reward_std": 0.6061676740646362, "rewards/helpfulness_reward/mean": 0.755955696105957, "rewards/helpfulness_reward/std": 0.8640915751457214, "rewards/safety_reward/mean": 1.9132728576660156, "rewards/safety_reward/std": 2.3223953247070312, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1672.0, "completions/max_terminated_length": 1672.0, "completions/mean_length": 490.96875, "completions/mean_terminated_length": 490.96875, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.00803423281809449, "frac_reward_zero_std": 0.0, "grad_norm": 0.09438382089138031, "kl": 0.005062103271484375, "learning_rate": 5e-05, "loss": 0.0269, "num_tokens": 1531186.0, "reward": 0.716893196105957, "reward_std": 0.6534146070480347, "rewards/helpfulness_reward/mean": 0.716893196105957, "rewards/helpfulness_reward/std": 1.0092159509658813, "rewards/safety_reward/mean": 1.9752540588378906, "rewards/safety_reward/std": 2.2937815189361572, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1293.0, "completions/max_terminated_length": 1293.0, "completions/mean_length": 449.3046875, "completions/mean_terminated_length": 449.3046875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.008383547288446425, "frac_reward_zero_std": 0.0, "grad_norm": 0.1066155657172203, "kl": 0.00609588623046875, "learning_rate": 5e-05, "loss": 0.0746, "num_tokens": 1595673.0, "reward": 0.7567958831787109, "reward_std": 0.7453399896621704, "rewards/helpfulness_reward/mean": 0.7567958831787109, "rewards/helpfulness_reward/std": 0.9461308121681213, "rewards/safety_reward/mean": 1.7897205352783203, "rewards/safety_reward/std": 1.9160587787628174, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1254.0, "completions/max_terminated_length": 1254.0, "completions/mean_length": 310.40625, "completions/mean_terminated_length": 310.40625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.008732861758798359, "frac_reward_zero_std": 0.0, "grad_norm": 0.16820111870765686, "kl": 0.010711669921875, "learning_rate": 5e-05, "loss": -0.0006, "num_tokens": 1639853.0, "reward": 0.7711200714111328, "reward_std": 0.7448667287826538, "rewards/helpfulness_reward/mean": 0.7711200714111328, "rewards/helpfulness_reward/std": 1.1572437286376953, "rewards/safety_reward/mean": 3.0538742542266846, "rewards/safety_reward/std": 2.3436625003814697, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1142.0, "completions/mean_length": 434.859375, "completions/mean_terminated_length": 344.38336181640625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.009082176229150293, "frac_reward_zero_std": 0.0, "grad_norm": 0.10999217629432678, "kl": 0.0068817138671875, "learning_rate": 5e-05, "loss": 0.0578, "num_tokens": 1699651.0, "reward": 0.5266369581222534, "reward_std": 0.608366847038269, "rewards/helpfulness_reward/mean": 0.5266369581222534, "rewards/helpfulness_reward/std": 0.9621043801307678, "rewards/safety_reward/mean": 1.816028118133545, "rewards/safety_reward/std": 2.086315155029297, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1550.0, "completions/mean_length": 431.609375, "completions/mean_terminated_length": 420.89764404296875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.009431490699502228, "frac_reward_zero_std": 0.0, "grad_norm": 0.09771678596735, "kl": 0.00791168212890625, "learning_rate": 5e-05, "loss": -0.0044, "num_tokens": 1758537.0, "reward": 0.7899468541145325, "reward_std": 0.595039963722229, "rewards/helpfulness_reward/mean": 0.7899468541145325, "rewards/helpfulness_reward/std": 0.9917141795158386, "rewards/safety_reward/mean": 2.161200761795044, "rewards/safety_reward/std": 2.231689453125, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1695.0, "completions/max_terminated_length": 1695.0, "completions/mean_length": 396.515625, "completions/mean_terminated_length": 396.515625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.009780805169854162, "frac_reward_zero_std": 0.0, "grad_norm": 0.14257243275642395, "kl": 0.010860443115234375, "learning_rate": 5e-05, "loss": 0.1319, "num_tokens": 1814851.0, "reward": 0.9465599060058594, "reward_std": 0.6072133183479309, "rewards/helpfulness_reward/mean": 0.9465599060058594, "rewards/helpfulness_reward/std": 1.423966884613037, "rewards/safety_reward/mean": 2.145183563232422, "rewards/safety_reward/std": 2.474485158920288, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1665.0, "completions/max_terminated_length": 1665.0, "completions/mean_length": 570.3046875, "completions/mean_terminated_length": 570.3046875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.010130119640206096, "frac_reward_zero_std": 0.0, "grad_norm": 0.08447562158107758, "kl": 0.0065460205078125, "learning_rate": 5e-05, "loss": 0.0485, "num_tokens": 1892130.0, "reward": 0.6243162155151367, "reward_std": 0.6913121938705444, "rewards/helpfulness_reward/mean": 0.6243162155151367, "rewards/helpfulness_reward/std": 0.8384016156196594, "rewards/safety_reward/mean": 1.495840072631836, "rewards/safety_reward/std": 2.0507538318634033, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1763.0, "completions/mean_length": 634.421875, "completions/mean_terminated_length": 587.3658447265625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.010479434110558029, "frac_reward_zero_std": 0.0, "grad_norm": 0.08622939139604568, "kl": 0.0092010498046875, "learning_rate": 5e-05, "loss": 0.0997, "num_tokens": 1977016.0, "reward": 0.6998007297515869, "reward_std": 0.5700432062149048, "rewards/helpfulness_reward/mean": 0.6998007297515869, "rewards/helpfulness_reward/std": 1.0541359186172485, "rewards/safety_reward/mean": 1.5051467418670654, "rewards/safety_reward/std": 2.601836919784546, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1671.0, "completions/mean_length": 415.9765625, "completions/mean_terminated_length": 382.9520263671875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.010828748580909963, "frac_reward_zero_std": 0.0, "grad_norm": 0.12213493883609772, "kl": 0.0110015869140625, "learning_rate": 5e-05, "loss": 0.1149, "num_tokens": 2035093.0, "reward": 0.8314437866210938, "reward_std": 0.6568882465362549, "rewards/helpfulness_reward/mean": 0.8314437866210938, "rewards/helpfulness_reward/std": 1.046309232711792, "rewards/safety_reward/mean": 1.8558883666992188, "rewards/safety_reward/std": 2.2105157375335693, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1223.0, "completions/max_terminated_length": 1223.0, "completions/mean_length": 351.2734375, "completions/mean_terminated_length": 351.2734375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.011178063051261898, "frac_reward_zero_std": 0.0, "grad_norm": 0.12819261848926544, "kl": 0.01479339599609375, "learning_rate": 5e-05, "loss": 0.0707, "num_tokens": 2084688.0, "reward": 1.0837898254394531, "reward_std": 0.5762518644332886, "rewards/helpfulness_reward/mean": 1.0837898254394531, "rewards/helpfulness_reward/std": 0.9040101170539856, "rewards/safety_reward/mean": 2.63077449798584, "rewards/safety_reward/std": 1.838931679725647, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1540.0, "completions/max_terminated_length": 1540.0, "completions/mean_length": 525.8046875, "completions/mean_terminated_length": 525.8046875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.011527377521613832, "frac_reward_zero_std": 0.0, "grad_norm": 0.09962691366672516, "kl": 0.01282501220703125, "learning_rate": 5e-05, "loss": 0.0834, "num_tokens": 2158271.0, "reward": 0.5863571166992188, "reward_std": 0.6917264461517334, "rewards/helpfulness_reward/mean": 0.5863571166992188, "rewards/helpfulness_reward/std": 1.1196807622909546, "rewards/safety_reward/mean": 1.2297954559326172, "rewards/safety_reward/std": 2.6038780212402344, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1409.0, "completions/max_terminated_length": 1409.0, "completions/mean_length": 368.609375, "completions/mean_terminated_length": 368.609375, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.011876691991965766, "frac_reward_zero_std": 0.0, "grad_norm": 0.12555347383022308, "kl": 0.022674560546875, "learning_rate": 5e-05, "loss": 0.0914, "num_tokens": 2209997.0, "reward": 1.3250885009765625, "reward_std": 0.6235686540603638, "rewards/helpfulness_reward/mean": 1.3250885009765625, "rewards/helpfulness_reward/std": 0.9479987621307373, "rewards/safety_reward/mean": 2.096921682357788, "rewards/safety_reward/std": 1.9525026082992554, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1558.0, "completions/max_terminated_length": 1558.0, "completions/mean_length": 432.4453125, "completions/mean_terminated_length": 432.4453125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.0122260064623177, "frac_reward_zero_std": 0.0, "grad_norm": 0.10635751485824585, "kl": 0.01837921142578125, "learning_rate": 5e-05, "loss": 0.0844, "num_tokens": 2269078.0, "reward": 1.3233375549316406, "reward_std": 0.5845445990562439, "rewards/helpfulness_reward/mean": 1.3233375549316406, "rewards/helpfulness_reward/std": 0.925690770149231, "rewards/safety_reward/mean": 1.85601806640625, "rewards/safety_reward/std": 1.782755970954895, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1628.0, "completions/mean_length": 541.4609375, "completions/mean_terminated_length": 490.6260070800781, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.012575320932669635, "frac_reward_zero_std": 0.0, "grad_norm": 0.10673508048057556, "kl": 0.01645660400390625, "learning_rate": 5e-05, "loss": 0.1238, "num_tokens": 2344505.0, "reward": 1.229992389678955, "reward_std": 0.7448524236679077, "rewards/helpfulness_reward/mean": 1.229992389678955, "rewards/helpfulness_reward/std": 1.0200690031051636, "rewards/safety_reward/mean": 2.2280094623565674, "rewards/safety_reward/std": 1.9870589971542358, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1698.0, "completions/mean_length": 366.359375, "completions/mean_terminated_length": 355.13385009765625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.01292463540302157, "frac_reward_zero_std": 0.0, "grad_norm": 0.194883793592453, "kl": 0.0285797119140625, "learning_rate": 5e-05, "loss": 0.0461, "num_tokens": 2396583.0, "reward": 1.2830345630645752, "reward_std": 0.5716500282287598, "rewards/helpfulness_reward/mean": 1.2830345630645752, "rewards/helpfulness_reward/std": 0.9757908582687378, "rewards/safety_reward/mean": 2.696125030517578, "rewards/safety_reward/std": 1.6522223949432373, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1563.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 509.609375, "completions/mean_terminated_length": 509.609375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.013273949873373504, "frac_reward_zero_std": 0.0, "grad_norm": 0.10136520117521286, "kl": 0.01877593994140625, "learning_rate": 5e-05, "loss": -0.0092, "num_tokens": 2465589.0, "reward": 1.2383918762207031, "reward_std": 0.5625755786895752, "rewards/helpfulness_reward/mean": 1.2383918762207031, "rewards/helpfulness_reward/std": 0.7680402994155884, "rewards/safety_reward/mean": 2.2120723724365234, "rewards/safety_reward/std": 1.5261180400848389, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1217.0, "completions/max_terminated_length": 1217.0, "completions/mean_length": 351.640625, "completions/mean_terminated_length": 351.640625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.013623264343725438, "frac_reward_zero_std": 0.0, "grad_norm": 0.16549207270145416, "kl": 0.0258941650390625, "learning_rate": 5e-05, "loss": 0.1009, "num_tokens": 2515047.0, "reward": 0.9389921426773071, "reward_std": 0.7103508114814758, "rewards/helpfulness_reward/mean": 0.9389921426773071, "rewards/helpfulness_reward/std": 0.952957808971405, "rewards/safety_reward/mean": 2.6290626525878906, "rewards/safety_reward/std": 1.9243297576904297, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1329.0, "completions/max_terminated_length": 1329.0, "completions/mean_length": 299.6328125, "completions/mean_terminated_length": 299.6328125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.013972578814077373, "frac_reward_zero_std": 0.0, "grad_norm": 0.17402124404907227, "kl": 0.03253173828125, "learning_rate": 5e-05, "loss": 0.0035, "num_tokens": 2557120.0, "reward": 1.1265811920166016, "reward_std": 0.5009407997131348, "rewards/helpfulness_reward/mean": 1.1265811920166016, "rewards/helpfulness_reward/std": 1.482442021369934, "rewards/safety_reward/mean": 2.6069016456604004, "rewards/safety_reward/std": 2.200439929962158, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1598.0, "completions/mean_length": 532.7421875, "completions/mean_terminated_length": 522.8267822265625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.014321893284429307, "frac_reward_zero_std": 0.0, "grad_norm": 0.09574317187070847, "kl": 0.018951416015625, "learning_rate": 5e-05, "loss": 0.0787, "num_tokens": 2628807.0, "reward": 1.0789527893066406, "reward_std": 0.5190775394439697, "rewards/helpfulness_reward/mean": 1.0789527893066406, "rewards/helpfulness_reward/std": 0.7808939814567566, "rewards/safety_reward/mean": 1.3713226318359375, "rewards/safety_reward/std": 1.5431783199310303, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1575.0, "completions/mean_length": 530.6171875, "completions/mean_terminated_length": 520.68505859375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.014671207754781241, "frac_reward_zero_std": 0.0, "grad_norm": 0.10205338150262833, "kl": 0.024200439453125, "learning_rate": 5e-05, "loss": 0.0613, "num_tokens": 2702390.0, "reward": 1.1270008087158203, "reward_std": 0.6456426382064819, "rewards/helpfulness_reward/mean": 1.1270008087158203, "rewards/helpfulness_reward/std": 0.9290120601654053, "rewards/safety_reward/mean": 1.391108512878418, "rewards/safety_reward/std": 1.7995991706848145, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1043.0, "completions/max_terminated_length": 1043.0, "completions/mean_length": 425.4296875, "completions/mean_terminated_length": 425.4296875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.015020522225133176, "frac_reward_zero_std": 0.0, "grad_norm": 0.12369070947170258, "kl": 0.031341552734375, "learning_rate": 5e-05, "loss": 0.067, "num_tokens": 2760989.0, "reward": 0.9661941528320312, "reward_std": 0.5028403401374817, "rewards/helpfulness_reward/mean": 0.9661941528320312, "rewards/helpfulness_reward/std": 0.9521198272705078, "rewards/safety_reward/mean": 1.4103879928588867, "rewards/safety_reward/std": 2.4132115840911865, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1639.0, "completions/max_terminated_length": 1639.0, "completions/mean_length": 429.734375, "completions/mean_terminated_length": 429.734375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.01536983669548511, "frac_reward_zero_std": 0.0, "grad_norm": 0.1324411928653717, "kl": 0.0316314697265625, "learning_rate": 5e-05, "loss": 0.0236, "num_tokens": 2820363.0, "reward": 0.9610214233398438, "reward_std": 0.5222424268722534, "rewards/helpfulness_reward/mean": 0.9610214233398438, "rewards/helpfulness_reward/std": 0.7389412522315979, "rewards/safety_reward/mean": 1.8236937522888184, "rewards/safety_reward/std": 1.8606433868408203, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1270.0, "completions/mean_length": 391.4921875, "completions/mean_terminated_length": 380.4645690917969, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.015719151165837046, "frac_reward_zero_std": 0.0, "grad_norm": 0.1350511610507965, "kl": 0.0388946533203125, "learning_rate": 5e-05, "loss": 0.1086, "num_tokens": 2874226.0, "reward": 1.2338829040527344, "reward_std": 0.5195097923278809, "rewards/helpfulness_reward/mean": 1.2338829040527344, "rewards/helpfulness_reward/std": 0.87492835521698, "rewards/safety_reward/mean": 1.7631425857543945, "rewards/safety_reward/std": 1.1868641376495361, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1455.0, "completions/mean_length": 490.4375, "completions/mean_terminated_length": 480.18896484375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.01606846563618898, "frac_reward_zero_std": 0.0, "grad_norm": 0.11127650737762451, "kl": 0.0332489013671875, "learning_rate": 5e-05, "loss": 0.0452, "num_tokens": 2943002.0, "reward": 1.0299229621887207, "reward_std": 0.4882630705833435, "rewards/helpfulness_reward/mean": 1.0299229621887207, "rewards/helpfulness_reward/std": 1.0649478435516357, "rewards/safety_reward/mean": 1.4146428108215332, "rewards/safety_reward/std": 2.0988385677337646, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1620.0, "completions/max_terminated_length": 1620.0, "completions/mean_length": 298.890625, "completions/mean_terminated_length": 298.890625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.016417780106540915, "frac_reward_zero_std": 0.0, "grad_norm": 0.17102943360805511, "kl": 0.0511932373046875, "learning_rate": 5e-05, "loss": 0.0858, "num_tokens": 2985348.0, "reward": 1.3680353164672852, "reward_std": 0.4793702960014343, "rewards/helpfulness_reward/mean": 1.3680353164672852, "rewards/helpfulness_reward/std": 0.9179648756980896, "rewards/safety_reward/mean": 2.0738601684570312, "rewards/safety_reward/std": 1.7742230892181396, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1224.0, "completions/mean_length": 379.0859375, "completions/mean_terminated_length": 367.96063232421875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.01676709457689285, "frac_reward_zero_std": 0.0, "grad_norm": 0.13279303908348083, "kl": 0.047760009765625, "learning_rate": 5e-05, "loss": 0.1391, "num_tokens": 3039175.0, "reward": 1.229203224182129, "reward_std": 0.5646665096282959, "rewards/helpfulness_reward/mean": 1.229203224182129, "rewards/helpfulness_reward/std": 0.9742417335510254, "rewards/safety_reward/mean": 2.138047218322754, "rewards/safety_reward/std": 1.9432933330535889, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1646.0, "completions/max_terminated_length": 1646.0, "completions/mean_length": 472.015625, "completions/mean_terminated_length": 472.015625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.017116409047244784, "frac_reward_zero_std": 0.0, "grad_norm": 0.12837885320186615, "kl": 0.04107666015625, "learning_rate": 5e-05, "loss": 0.0131, "num_tokens": 3103801.0, "reward": 1.1786866188049316, "reward_std": 0.5583391785621643, "rewards/helpfulness_reward/mean": 1.1786866188049316, "rewards/helpfulness_reward/std": 0.8406270146369934, "rewards/safety_reward/mean": 1.98746919631958, "rewards/safety_reward/std": 1.864693284034729, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1027.0, "completions/max_terminated_length": 1027.0, "completions/mean_length": 342.40625, "completions/mean_terminated_length": 342.40625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.017465723517596718, "frac_reward_zero_std": 0.0, "grad_norm": 0.20379285514354706, "kl": 0.064788818359375, "learning_rate": 5e-05, "loss": 0.1588, "num_tokens": 3152077.0, "reward": 1.1552281379699707, "reward_std": 0.6589637398719788, "rewards/helpfulness_reward/mean": 1.1552281379699707, "rewards/helpfulness_reward/std": 1.065624475479126, "rewards/safety_reward/mean": 2.441159248352051, "rewards/safety_reward/std": 1.6953315734863281, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1317.0, "completions/max_terminated_length": 1317.0, "completions/mean_length": 378.59375, "completions/mean_terminated_length": 378.59375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.017815037987948652, "frac_reward_zero_std": 0.0, "grad_norm": 0.13653720915317535, "kl": 0.06591796875, "learning_rate": 5e-05, "loss": 0.1049, "num_tokens": 3204505.0, "reward": 1.3216400146484375, "reward_std": 0.43382591009140015, "rewards/helpfulness_reward/mean": 1.3216400146484375, "rewards/helpfulness_reward/std": 0.8811574578285217, "rewards/safety_reward/mean": 2.2058067321777344, "rewards/safety_reward/std": 1.7907427549362183, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1557.0, "completions/max_terminated_length": 1557.0, "completions/mean_length": 597.75, "completions/mean_terminated_length": 597.75, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.018164352458300587, "frac_reward_zero_std": 0.0, "grad_norm": 0.10871731489896774, "kl": 0.0457763671875, "learning_rate": 5e-05, "loss": 0.0859, "num_tokens": 3285593.0, "reward": 1.0710716247558594, "reward_std": 0.46731579303741455, "rewards/helpfulness_reward/mean": 1.0710716247558594, "rewards/helpfulness_reward/std": 0.8607548475265503, "rewards/safety_reward/mean": 1.4231562614440918, "rewards/safety_reward/std": 1.5250638723373413, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 263.7578125, "completions/mean_terminated_length": 263.7578125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01851366692865252, "frac_reward_zero_std": 0.0, "grad_norm": 0.2464367002248764, "kl": 0.10382080078125, "learning_rate": 5e-05, "loss": 0.0166, "num_tokens": 3323378.0, "reward": 1.6463890075683594, "reward_std": 0.4959027171134949, "rewards/helpfulness_reward/mean": 1.6463890075683594, "rewards/helpfulness_reward/std": 0.8708354830741882, "rewards/safety_reward/mean": 2.7625865936279297, "rewards/safety_reward/std": 1.5102343559265137, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1256.0, "completions/max_terminated_length": 1256.0, "completions/mean_length": 388.40625, "completions/mean_terminated_length": 388.40625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.018862981399004455, "frac_reward_zero_std": 0.0, "grad_norm": 0.12615111470222473, "kl": 0.053955078125, "learning_rate": 5e-05, "loss": 0.0005, "num_tokens": 3376830.0, "reward": 1.689422607421875, "reward_std": 0.4922320246696472, "rewards/helpfulness_reward/mean": 1.689422607421875, "rewards/helpfulness_reward/std": 0.8306581974029541, "rewards/safety_reward/mean": 2.3548927307128906, "rewards/safety_reward/std": 1.6779584884643555, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1536.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 308.2734375, "completions/mean_terminated_length": 308.2734375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.01921229586935639, "frac_reward_zero_std": 0.0, "grad_norm": 0.18868330121040344, "kl": 0.08538818359375, "learning_rate": 5e-05, "loss": -0.0083, "num_tokens": 3422049.0, "reward": 1.2308063507080078, "reward_std": 0.5078868865966797, "rewards/helpfulness_reward/mean": 1.2308063507080078, "rewards/helpfulness_reward/std": 1.0477793216705322, "rewards/safety_reward/mean": 2.3237648010253906, "rewards/safety_reward/std": 1.7479832172393799, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 243.7734375, "completions/mean_terminated_length": 243.7734375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.019561610339708324, "frac_reward_zero_std": 0.0, "grad_norm": 0.22214502096176147, "kl": 0.12103271484375, "learning_rate": 5e-05, "loss": 0.0926, "num_tokens": 3457356.0, "reward": 1.640371322631836, "reward_std": 0.5703814029693604, "rewards/helpfulness_reward/mean": 1.640371322631836, "rewards/helpfulness_reward/std": 0.9053226709365845, "rewards/safety_reward/mean": 2.4862990379333496, "rewards/safety_reward/std": 1.6780401468276978, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1283.0, "completions/max_terminated_length": 1283.0, "completions/mean_length": 286.1328125, "completions/mean_terminated_length": 286.1328125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.01991092481006026, "frac_reward_zero_std": 0.0, "grad_norm": 0.19276562333106995, "kl": 0.11517333984375, "learning_rate": 5e-05, "loss": 0.0736, "num_tokens": 3498653.0, "reward": 1.6489677429199219, "reward_std": 0.4716208577156067, "rewards/helpfulness_reward/mean": 1.6489677429199219, "rewards/helpfulness_reward/std": 0.82706618309021, "rewards/safety_reward/mean": 2.6478729248046875, "rewards/safety_reward/std": 1.5476162433624268, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1380.0, "completions/max_terminated_length": 1380.0, "completions/mean_length": 260.625, "completions/mean_terminated_length": 260.625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.020260239280412193, "frac_reward_zero_std": 0.0, "grad_norm": 0.2172303944826126, "kl": 0.1239013671875, "learning_rate": 5e-05, "loss": 0.1484, "num_tokens": 3536125.0, "reward": 1.4239997863769531, "reward_std": 0.5693310499191284, "rewards/helpfulness_reward/mean": 1.4239997863769531, "rewards/helpfulness_reward/std": 0.8533469438552856, "rewards/safety_reward/mean": 2.5772247314453125, "rewards/safety_reward/std": 1.5289322137832642, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1239.0, "completions/max_terminated_length": 1239.0, "completions/mean_length": 444.59375, "completions/mean_terminated_length": 444.59375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.020609553750764124, "frac_reward_zero_std": 0.0, "grad_norm": 0.12962520122528076, "kl": 0.0672607421875, "learning_rate": 5e-05, "loss": 0.1037, "num_tokens": 3597841.0, "reward": 1.2799654006958008, "reward_std": 0.598444402217865, "rewards/helpfulness_reward/mean": 1.2799654006958008, "rewards/helpfulness_reward/std": 1.0245696306228638, "rewards/safety_reward/mean": 1.7074522972106934, "rewards/safety_reward/std": 1.729634404182434, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1391.0, "completions/max_terminated_length": 1391.0, "completions/mean_length": 358.640625, "completions/mean_terminated_length": 358.640625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.020958868221116058, "frac_reward_zero_std": 0.0, "grad_norm": 0.14917951822280884, "kl": 0.0880126953125, "learning_rate": 5e-05, "loss": 0.0347, "num_tokens": 3648867.0, "reward": 1.2620086669921875, "reward_std": 0.47235044836997986, "rewards/helpfulness_reward/mean": 1.2620086669921875, "rewards/helpfulness_reward/std": 0.9006978869438171, "rewards/safety_reward/mean": 2.3686676025390625, "rewards/safety_reward/std": 1.8266279697418213, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1560.0, "completions/mean_length": 369.921875, "completions/mean_terminated_length": 358.7243957519531, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.021308182691467992, "frac_reward_zero_std": 0.0, "grad_norm": 0.14848929643630981, "kl": 0.08392333984375, "learning_rate": 5e-05, "loss": 0.1653, "num_tokens": 3701961.0, "reward": 1.5207691192626953, "reward_std": 0.5648281574249268, "rewards/helpfulness_reward/mean": 1.5207691192626953, "rewards/helpfulness_reward/std": 1.1624890565872192, "rewards/safety_reward/mean": 2.4251561164855957, "rewards/safety_reward/std": 2.0458357334136963, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1358.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 324.4453125, "completions/mean_terminated_length": 324.4453125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.021657497161819927, "frac_reward_zero_std": 0.0, "grad_norm": 0.1621379554271698, "kl": 0.0933837890625, "learning_rate": 5e-05, "loss": 0.097, "num_tokens": 3747450.0, "reward": 1.4736332893371582, "reward_std": 0.5201050043106079, "rewards/helpfulness_reward/mean": 1.4736332893371582, "rewards/helpfulness_reward/std": 0.9330348968505859, "rewards/safety_reward/mean": 2.222633123397827, "rewards/safety_reward/std": 1.7889293432235718, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 909.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 230.25, "completions/mean_terminated_length": 230.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.02200681163217186, "frac_reward_zero_std": 0.0, "grad_norm": 0.23766537010669708, "kl": 0.12091064453125, "learning_rate": 5e-05, "loss": 0.0261, "num_tokens": 3780586.0, "reward": 1.6259422302246094, "reward_std": 0.4815652072429657, "rewards/helpfulness_reward/mean": 1.6259422302246094, "rewards/helpfulness_reward/std": 0.8390408754348755, "rewards/safety_reward/mean": 2.7666397094726562, "rewards/safety_reward/std": 1.4572186470031738, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1289.0, "completions/max_terminated_length": 1289.0, "completions/mean_length": 388.0390625, "completions/mean_terminated_length": 388.0390625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.022356126102523795, "frac_reward_zero_std": 0.0, "grad_norm": 0.15277305245399475, "kl": 0.0853271484375, "learning_rate": 5e-05, "loss": 0.0629, "num_tokens": 3835719.0, "reward": 1.5184135437011719, "reward_std": 0.4811480939388275, "rewards/helpfulness_reward/mean": 1.5184135437011719, "rewards/helpfulness_reward/std": 0.9071491360664368, "rewards/safety_reward/mean": 2.5554380416870117, "rewards/safety_reward/std": 1.7549431324005127, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 192.8125, "completions/mean_terminated_length": 192.8125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.02270544057287573, "frac_reward_zero_std": 0.0, "grad_norm": 0.26565125584602356, "kl": 0.18115234375, "learning_rate": 5e-05, "loss": 0.0664, "num_tokens": 3864135.0, "reward": 1.8500890731811523, "reward_std": 0.435400128364563, "rewards/helpfulness_reward/mean": 1.8500890731811523, "rewards/helpfulness_reward/std": 0.832516074180603, "rewards/safety_reward/mean": 3.116668701171875, "rewards/safety_reward/std": 1.3366951942443848, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1169.0, "completions/max_terminated_length": 1169.0, "completions/mean_length": 277.0, "completions/mean_terminated_length": 277.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.023054755043227664, "frac_reward_zero_std": 0.0, "grad_norm": 0.22429628670215607, "kl": 0.12298583984375, "learning_rate": 5e-05, "loss": 0.1562, "num_tokens": 3905703.0, "reward": 1.6482975482940674, "reward_std": 0.49603071808815, "rewards/helpfulness_reward/mean": 1.6482975482940674, "rewards/helpfulness_reward/std": 0.7814778685569763, "rewards/safety_reward/mean": 2.6507349014282227, "rewards/safety_reward/std": 1.5591164827346802, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1205.0, "completions/max_terminated_length": 1205.0, "completions/mean_length": 265.046875, "completions/mean_terminated_length": 265.046875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.0234040695135796, "frac_reward_zero_std": 0.0, "grad_norm": 0.2097284495830536, "kl": 0.11236572265625, "learning_rate": 5e-05, "loss": 0.143, "num_tokens": 3944053.0, "reward": 1.9858856201171875, "reward_std": 0.4667302370071411, "rewards/helpfulness_reward/mean": 1.9858856201171875, "rewards/helpfulness_reward/std": 0.770444929599762, "rewards/safety_reward/mean": 2.8260040283203125, "rewards/safety_reward/std": 1.387239933013916, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1377.0, "completions/max_terminated_length": 1377.0, "completions/mean_length": 262.0703125, "completions/mean_terminated_length": 262.0703125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.023753383983931533, "frac_reward_zero_std": 0.0, "grad_norm": 0.19886159896850586, "kl": 0.09619140625, "learning_rate": 5e-05, "loss": 0.1437, "num_tokens": 3983070.0, "reward": 1.6538047790527344, "reward_std": 0.4466201066970825, "rewards/helpfulness_reward/mean": 1.6538047790527344, "rewards/helpfulness_reward/std": 0.9073041677474976, "rewards/safety_reward/mean": 2.4380059242248535, "rewards/safety_reward/std": 1.4862159490585327, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 269.8984375, "completions/mean_terminated_length": 269.8984375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.024102698454283467, "frac_reward_zero_std": 0.0625, "grad_norm": 0.19650407135486603, "kl": 0.13397216796875, "learning_rate": 5e-05, "loss": 0.0545, "num_tokens": 4021833.0, "reward": 1.6572465896606445, "reward_std": 0.40771254897117615, "rewards/helpfulness_reward/mean": 1.6572465896606445, "rewards/helpfulness_reward/std": 0.830784022808075, "rewards/safety_reward/mean": 2.6061580181121826, "rewards/safety_reward/std": 1.2036631107330322, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1051.0, "completions/max_terminated_length": 1051.0, "completions/mean_length": 314.3984375, "completions/mean_terminated_length": 314.3984375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.0244520129246354, "frac_reward_zero_std": 0.0, "grad_norm": 0.3759593367576599, "kl": 0.2392578125, "learning_rate": 5e-05, "loss": 0.072, "num_tokens": 4065916.0, "reward": 1.6162824630737305, "reward_std": 0.4563699960708618, "rewards/helpfulness_reward/mean": 1.6162824630737305, "rewards/helpfulness_reward/std": 0.7826206684112549, "rewards/safety_reward/mean": 2.5575709342956543, "rewards/safety_reward/std": 1.6580103635787964, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1115.0, "completions/mean_length": 375.1015625, "completions/mean_terminated_length": 363.94488525390625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.024801327394987336, "frac_reward_zero_std": 0.0, "grad_norm": 0.13857921957969666, "kl": 0.0810546875, "learning_rate": 5e-05, "loss": 0.0927, "num_tokens": 4117865.0, "reward": 1.5916328430175781, "reward_std": 0.5842174291610718, "rewards/helpfulness_reward/mean": 1.5916328430175781, "rewards/helpfulness_reward/std": 0.9448460340499878, "rewards/safety_reward/mean": 2.2083873748779297, "rewards/safety_reward/std": 1.555126428604126, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1792.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 314.8203125, "completions/mean_terminated_length": 267.1693420410156, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.02515064186533927, "frac_reward_zero_std": 0.0, "grad_norm": 0.1746620386838913, "kl": 0.08807373046875, "learning_rate": 5e-05, "loss": 0.117, "num_tokens": 4165514.0, "reward": 1.650503158569336, "reward_std": 0.5816013813018799, "rewards/helpfulness_reward/mean": 1.650503158569336, "rewards/helpfulness_reward/std": 0.9377087354660034, "rewards/safety_reward/mean": 2.1428909301757812, "rewards/safety_reward/std": 1.8027838468551636, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 258.65625, "completions/mean_terminated_length": 246.58267211914062, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.025499956335691205, "frac_reward_zero_std": 0.0, "grad_norm": 0.24140296876430511, "kl": 0.13458251953125, "learning_rate": 5e-05, "loss": 0.0464, "num_tokens": 4203502.0, "reward": 1.9458894729614258, "reward_std": 0.4518217444419861, "rewards/helpfulness_reward/mean": 1.9458894729614258, "rewards/helpfulness_reward/std": 0.9543213248252869, "rewards/safety_reward/mean": 3.0465850830078125, "rewards/safety_reward/std": 1.637817144393921, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1131.0, "completions/mean_length": 438.1796875, "completions/mean_terminated_length": 427.5196838378906, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.02584927080604314, "frac_reward_zero_std": 0.0, "grad_norm": 0.13869880139827728, "kl": 0.065277099609375, "learning_rate": 5e-05, "loss": 0.1497, "num_tokens": 4262957.0, "reward": 1.4600410461425781, "reward_std": 0.4865608811378479, "rewards/helpfulness_reward/mean": 1.4600410461425781, "rewards/helpfulness_reward/std": 0.917707622051239, "rewards/safety_reward/mean": 2.0285263061523438, "rewards/safety_reward/std": 1.403546690940857, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1166.0, "completions/max_terminated_length": 1166.0, "completions/mean_length": 247.3671875, "completions/mean_terminated_length": 247.3671875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.026198585276395073, "frac_reward_zero_std": 0.0, "grad_norm": 0.18995168805122375, "kl": 0.1097412109375, "learning_rate": 5e-05, "loss": 0.1191, "num_tokens": 4298868.0, "reward": 1.7869491577148438, "reward_std": 0.5469907522201538, "rewards/helpfulness_reward/mean": 1.7869491577148438, "rewards/helpfulness_reward/std": 0.747492253780365, "rewards/safety_reward/mean": 2.9710707664489746, "rewards/safety_reward/std": 1.5256065130233765, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 141.5390625, "completions/mean_terminated_length": 141.5390625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.026547899746747008, "frac_reward_zero_std": 0.0, "grad_norm": 0.2899115979671478, "kl": 0.1693115234375, "learning_rate": 5e-05, "loss": 0.0322, "num_tokens": 4321353.0, "reward": 2.228487014770508, "reward_std": 0.38414856791496277, "rewards/helpfulness_reward/mean": 2.228487014770508, "rewards/helpfulness_reward/std": 0.792380154132843, "rewards/safety_reward/mean": 3.5652809143066406, "rewards/safety_reward/std": 1.2338361740112305, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 197.5859375, "completions/mean_terminated_length": 197.5859375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.026897214217098942, "frac_reward_zero_std": 0.0, "grad_norm": 0.23205158114433289, "kl": 0.12762451171875, "learning_rate": 5e-05, "loss": 0.1607, "num_tokens": 4351356.0, "reward": 2.0656051635742188, "reward_std": 0.5226855278015137, "rewards/helpfulness_reward/mean": 2.0656051635742188, "rewards/helpfulness_reward/std": 0.8057409524917603, "rewards/safety_reward/mean": 3.0725479125976562, "rewards/safety_reward/std": 1.6053743362426758, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1194.0, "completions/max_terminated_length": 1194.0, "completions/mean_length": 275.8671875, "completions/mean_terminated_length": 275.8671875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.027246528687450876, "frac_reward_zero_std": 0.0, "grad_norm": 0.22301548719406128, "kl": 0.11767578125, "learning_rate": 5e-05, "loss": 0.1295, "num_tokens": 4391211.0, "reward": 1.7672367095947266, "reward_std": 0.5238099694252014, "rewards/helpfulness_reward/mean": 1.7672367095947266, "rewards/helpfulness_reward/std": 0.958046019077301, "rewards/safety_reward/mean": 2.7549476623535156, "rewards/safety_reward/std": 1.5882865190505981, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 152.8125, "completions/mean_terminated_length": 152.8125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.02759584315780281, "frac_reward_zero_std": 0.0, "grad_norm": 0.3066589832305908, "kl": 0.188720703125, "learning_rate": 5e-05, "loss": 0.105, "num_tokens": 4414531.0, "reward": 2.089171886444092, "reward_std": 0.5035598278045654, "rewards/helpfulness_reward/mean": 2.089171886444092, "rewards/helpfulness_reward/std": 0.8268926739692688, "rewards/safety_reward/mean": 3.55621337890625, "rewards/safety_reward/std": 1.1761701107025146, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 261.6875, "completions/mean_terminated_length": 261.6875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.027945157628154745, "frac_reward_zero_std": 0.0, "grad_norm": 0.2231369912624359, "kl": 0.1326904296875, "learning_rate": 5e-05, "loss": 0.1479, "num_tokens": 4452323.0, "reward": 1.7393293380737305, "reward_std": 0.49620693922042847, "rewards/helpfulness_reward/mean": 1.7393293380737305, "rewards/helpfulness_reward/std": 0.8206889629364014, "rewards/safety_reward/mean": 3.0989608764648438, "rewards/safety_reward/std": 1.4564167261123657, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 228.5546875, "completions/mean_terminated_length": 228.5546875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.02829447209850668, "frac_reward_zero_std": 0.0, "grad_norm": 0.23337355256080627, "kl": 0.14007568359375, "learning_rate": 5e-05, "loss": 0.0622, "num_tokens": 4485626.0, "reward": 1.695241928100586, "reward_std": 0.4582768678665161, "rewards/helpfulness_reward/mean": 1.695241928100586, "rewards/helpfulness_reward/std": 0.8164451122283936, "rewards/safety_reward/mean": 2.5893783569335938, "rewards/safety_reward/std": 1.330500841140747, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 270.3203125, "completions/mean_terminated_length": 270.3203125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.028643786568858614, "frac_reward_zero_std": 0.0, "grad_norm": 0.18604350090026855, "kl": 0.1097412109375, "learning_rate": 5e-05, "loss": 0.0808, "num_tokens": 4524291.0, "reward": 1.86376953125, "reward_std": 0.5414124131202698, "rewards/helpfulness_reward/mean": 1.86376953125, "rewards/helpfulness_reward/std": 0.8004970550537109, "rewards/safety_reward/mean": 2.852203369140625, "rewards/safety_reward/std": 1.524082064628601, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1237.0, "completions/max_terminated_length": 1237.0, "completions/mean_length": 218.828125, "completions/mean_terminated_length": 218.828125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.02899310103921055, "frac_reward_zero_std": 0.0, "grad_norm": 0.2680920362472534, "kl": 0.1435546875, "learning_rate": 5e-05, "loss": 0.1035, "num_tokens": 4557765.0, "reward": 1.764399766921997, "reward_std": 0.48756176233291626, "rewards/helpfulness_reward/mean": 1.764399766921997, "rewards/helpfulness_reward/std": 1.2719515562057495, "rewards/safety_reward/mean": 3.5308494567871094, "rewards/safety_reward/std": 1.3536105155944824, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 234.1171875, "completions/mean_terminated_length": 234.1171875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.029342415509562483, "frac_reward_zero_std": 0.0, "grad_norm": 0.2099536806344986, "kl": 0.1239013671875, "learning_rate": 5e-05, "loss": -0.0124, "num_tokens": 4591844.0, "reward": 2.1424484252929688, "reward_std": 0.4816861152648926, "rewards/helpfulness_reward/mean": 2.1424484252929688, "rewards/helpfulness_reward/std": 0.9129124879837036, "rewards/safety_reward/mean": 3.1293487548828125, "rewards/safety_reward/std": 1.6247111558914185, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 329.25, "completions/mean_terminated_length": 329.25, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.029691729979914417, "frac_reward_zero_std": 0.0, "grad_norm": 0.17500153183937073, "kl": 0.09881591796875, "learning_rate": 5e-05, "loss": 0.1366, "num_tokens": 4638148.0, "reward": 1.697422981262207, "reward_std": 0.5339616537094116, "rewards/helpfulness_reward/mean": 1.697422981262207, "rewards/helpfulness_reward/std": 1.1048227548599243, "rewards/safety_reward/mean": 2.4356908798217773, "rewards/safety_reward/std": 1.6462233066558838, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 376.140625, "completions/mean_terminated_length": 376.140625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.03004104445026635, "frac_reward_zero_std": 0.0, "grad_norm": 0.16695758700370789, "kl": 0.09765625, "learning_rate": 5e-05, "loss": -0.0419, "num_tokens": 4691398.0, "reward": 1.5586700439453125, "reward_std": 0.47921591997146606, "rewards/helpfulness_reward/mean": 1.5586700439453125, "rewards/helpfulness_reward/std": 0.8118410110473633, "rewards/safety_reward/mean": 2.527036666870117, "rewards/safety_reward/std": 1.7242207527160645, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 199.546875, "completions/mean_terminated_length": 199.546875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.030390358920618286, "frac_reward_zero_std": 0.0, "grad_norm": 0.2502132058143616, "kl": 0.1754150390625, "learning_rate": 5e-05, "loss": 0.1445, "num_tokens": 4725476.0, "reward": 2.006143569946289, "reward_std": 0.5414842963218689, "rewards/helpfulness_reward/mean": 2.006143569946289, "rewards/helpfulness_reward/std": 0.9164220094680786, "rewards/safety_reward/mean": 3.7687339782714844, "rewards/safety_reward/std": 1.5463969707489014, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 186.5078125, "completions/mean_terminated_length": 186.5078125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.03073967339097022, "frac_reward_zero_std": 0.0, "grad_norm": 0.2642494738101959, "kl": 0.19677734375, "learning_rate": 5e-05, "loss": 0.0056, "num_tokens": 4753013.0, "reward": 2.3872852325439453, "reward_std": 0.44217440485954285, "rewards/helpfulness_reward/mean": 2.3872852325439453, "rewards/helpfulness_reward/std": 0.7946711182594299, "rewards/safety_reward/mean": 3.6443328857421875, "rewards/safety_reward/std": 1.5016335248947144, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1649.0, "completions/max_terminated_length": 1649.0, "completions/mean_length": 330.375, "completions/mean_terminated_length": 330.375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.031088987861322154, "frac_reward_zero_std": 0.0, "grad_norm": 0.22474421560764313, "kl": 0.14825439453125, "learning_rate": 5e-05, "loss": 0.1229, "num_tokens": 4802501.0, "reward": 1.93756103515625, "reward_std": 0.5688122510910034, "rewards/helpfulness_reward/mean": 1.93756103515625, "rewards/helpfulness_reward/std": 1.0519322156906128, "rewards/safety_reward/mean": 3.0148963928222656, "rewards/safety_reward/std": 1.950553059577942, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 227.4609375, "completions/mean_terminated_length": 227.4609375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.03143830233167409, "frac_reward_zero_std": 0.0, "grad_norm": 0.23463119566440582, "kl": 0.1512451171875, "learning_rate": 5e-05, "loss": 0.159, "num_tokens": 4837696.0, "reward": 1.979628562927246, "reward_std": 0.5582247972488403, "rewards/helpfulness_reward/mean": 1.979628562927246, "rewards/helpfulness_reward/std": 0.977066159248352, "rewards/safety_reward/mean": 3.2601025104522705, "rewards/safety_reward/std": 1.5845940113067627, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1383.0, "completions/max_terminated_length": 1383.0, "completions/mean_length": 307.328125, "completions/mean_terminated_length": 307.328125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.03178761680202603, "frac_reward_zero_std": 0.0, "grad_norm": 0.1731823831796646, "kl": 0.15777587890625, "learning_rate": 5e-05, "loss": 0.0525, "num_tokens": 4881698.0, "reward": 1.8312034606933594, "reward_std": 0.45915156602859497, "rewards/helpfulness_reward/mean": 1.8312034606933594, "rewards/helpfulness_reward/std": 1.2218247652053833, "rewards/safety_reward/mean": 2.9303455352783203, "rewards/safety_reward/std": 1.9210768938064575, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1299.0, "completions/mean_length": 308.5, "completions/mean_terminated_length": 196.3025360107422, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.03213693127237796, "frac_reward_zero_std": 0.0, "grad_norm": 0.29211804270744324, "kl": 0.1575927734375, "learning_rate": 5e-05, "loss": 0.0732, "num_tokens": 4926498.0, "reward": 2.1590700149536133, "reward_std": 0.46173402667045593, "rewards/helpfulness_reward/mean": 2.1590700149536133, "rewards/helpfulness_reward/std": 1.0044376850128174, "rewards/safety_reward/mean": 3.3811147212982178, "rewards/safety_reward/std": 1.8825689554214478, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 165.9921875, "completions/mean_terminated_length": 165.9921875, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.032486245742729895, "frac_reward_zero_std": 0.0, "grad_norm": 0.30629968643188477, "kl": 0.21337890625, "learning_rate": 5e-05, "loss": 0.0442, "num_tokens": 4951649.0, "reward": 2.1735382080078125, "reward_std": 0.4324687421321869, "rewards/helpfulness_reward/mean": 2.1735382080078125, "rewards/helpfulness_reward/std": 0.8095918893814087, "rewards/safety_reward/mean": 3.2314453125, "rewards/safety_reward/std": 1.4754490852355957, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1374.0, "completions/max_terminated_length": 1374.0, "completions/mean_length": 277.9375, "completions/mean_terminated_length": 277.9375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.03283556021308183, "frac_reward_zero_std": 0.0, "grad_norm": 0.20444150269031525, "kl": 0.1883544921875, "learning_rate": 5e-05, "loss": 0.1265, "num_tokens": 4991273.0, "reward": 2.0363426208496094, "reward_std": 0.49901583790779114, "rewards/helpfulness_reward/mean": 2.0363426208496094, "rewards/helpfulness_reward/std": 0.9952787160873413, "rewards/safety_reward/mean": 3.539142608642578, "rewards/safety_reward/std": 1.5869287252426147, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 178.3359375, "completions/mean_terminated_length": 178.3359375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.033184874683433764, "frac_reward_zero_std": 0.0, "grad_norm": 0.25996193289756775, "kl": 0.1961669921875, "learning_rate": 5e-05, "loss": 0.1416, "num_tokens": 5019812.0, "reward": 2.3440475463867188, "reward_std": 0.5488351583480835, "rewards/helpfulness_reward/mean": 2.3440475463867188, "rewards/helpfulness_reward/std": 0.8292038440704346, "rewards/safety_reward/mean": 3.614990234375, "rewards/safety_reward/std": 1.4963504076004028, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 199.921875, "completions/mean_terminated_length": 199.921875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.0335341891537857, "frac_reward_zero_std": 0.0, "grad_norm": 0.26698076725006104, "kl": 0.23095703125, "learning_rate": 5e-05, "loss": 0.0993, "num_tokens": 5049546.0, "reward": 2.0556583404541016, "reward_std": 0.4320642948150635, "rewards/helpfulness_reward/mean": 2.0556583404541016, "rewards/helpfulness_reward/std": 0.8834826946258545, "rewards/safety_reward/mean": 3.204265594482422, "rewards/safety_reward/std": 1.5329307317733765, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 152.828125, "completions/mean_terminated_length": 152.828125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.03388350362413763, "frac_reward_zero_std": 0.0, "grad_norm": 0.3234156370162964, "kl": 0.2713623046875, "learning_rate": 5e-05, "loss": 0.1577, "num_tokens": 5073052.0, "reward": 2.177321434020996, "reward_std": 0.5261185169219971, "rewards/helpfulness_reward/mean": 2.177321434020996, "rewards/helpfulness_reward/std": 0.7152072787284851, "rewards/safety_reward/mean": 3.796630859375, "rewards/safety_reward/std": 1.2932932376861572, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 211.6015625, "completions/mean_terminated_length": 211.6015625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.03423281809448957, "frac_reward_zero_std": 0.0, "grad_norm": 0.3169298768043518, "kl": 0.20947265625, "learning_rate": 5e-05, "loss": 0.0386, "num_tokens": 5104905.0, "reward": 2.118077516555786, "reward_std": 0.40881210565567017, "rewards/helpfulness_reward/mean": 2.118077516555786, "rewards/helpfulness_reward/std": 0.9331831932067871, "rewards/safety_reward/mean": 3.853240966796875, "rewards/safety_reward/std": 1.7137573957443237, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 163.21875, "completions/mean_terminated_length": 163.21875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.0345821325648415, "frac_reward_zero_std": 0.0, "grad_norm": 0.29003483057022095, "kl": 0.24462890625, "learning_rate": 5e-05, "loss": 0.0708, "num_tokens": 5129941.0, "reward": 2.3309402465820312, "reward_std": 0.49934569001197815, "rewards/helpfulness_reward/mean": 2.3309402465820312, "rewards/helpfulness_reward/std": 0.8648067116737366, "rewards/safety_reward/mean": 3.84283447265625, "rewards/safety_reward/std": 1.342793583869934, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 124.4140625, "completions/mean_terminated_length": 124.4140625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.034931447035193436, "frac_reward_zero_std": 0.0, "grad_norm": 13.396774291992188, "kl": 4.384765625, "learning_rate": 5e-05, "loss": 0.0704, "num_tokens": 5151626.0, "reward": 2.54534912109375, "reward_std": 0.4064963459968567, "rewards/helpfulness_reward/mean": 2.54534912109375, "rewards/helpfulness_reward/std": 0.9748256802558899, "rewards/safety_reward/mean": 3.9301300048828125, "rewards/safety_reward/std": 1.8091834783554077, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1722.0, "completions/mean_length": 273.90625, "completions/mean_terminated_length": 224.9354705810547, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.03528076150554537, "frac_reward_zero_std": 0.0, "grad_norm": 0.32176804542541504, "kl": 0.280029296875, "learning_rate": 5e-05, "loss": 0.0528, "num_tokens": 5193654.0, "reward": 2.1990814208984375, "reward_std": 0.4771266579627991, "rewards/helpfulness_reward/mean": 2.1990814208984375, "rewards/helpfulness_reward/std": 0.9863426089286804, "rewards/safety_reward/mean": 3.6723976135253906, "rewards/safety_reward/std": 1.8151800632476807, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 145.609375, "completions/mean_terminated_length": 132.6456756591797, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.035630075975897305, "frac_reward_zero_std": 0.0, "grad_norm": 0.3691158890724182, "kl": 0.366455078125, "learning_rate": 5e-05, "loss": 0.1876, "num_tokens": 5217620.0, "reward": 2.2330760955810547, "reward_std": 0.45639389753341675, "rewards/helpfulness_reward/mean": 2.2330760955810547, "rewards/helpfulness_reward/std": 0.7059590220451355, "rewards/safety_reward/mean": 3.7785491943359375, "rewards/safety_reward/std": 1.1414637565612793, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 126.1953125, "completions/mean_terminated_length": 126.1953125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.03597939044624924, "frac_reward_zero_std": 0.0, "grad_norm": 0.3772641122341156, "kl": 0.31396484375, "learning_rate": 5e-05, "loss": 0.0355, "num_tokens": 5238157.0, "reward": 2.389162063598633, "reward_std": 0.45773398876190186, "rewards/helpfulness_reward/mean": 2.389162063598633, "rewards/helpfulness_reward/std": 0.8859588503837585, "rewards/safety_reward/mean": 3.6279373168945312, "rewards/safety_reward/std": 1.7872732877731323, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 140.484375, "completions/mean_terminated_length": 140.484375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.03632870491660117, "frac_reward_zero_std": 0.0, "grad_norm": 0.32834020256996155, "kl": 0.34619140625, "learning_rate": 5e-05, "loss": 0.0733, "num_tokens": 5259731.0, "reward": 2.31854248046875, "reward_std": 0.5369431972503662, "rewards/helpfulness_reward/mean": 2.31854248046875, "rewards/helpfulness_reward/std": 1.1403099298477173, "rewards/safety_reward/mean": 4.05621337890625, "rewards/safety_reward/std": 1.3076146841049194, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 105.015625, "completions/mean_terminated_length": 105.015625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.03667801938695311, "frac_reward_zero_std": 0.0, "grad_norm": 0.39006730914115906, "kl": 0.352783203125, "learning_rate": 5e-05, "loss": 0.0128, "num_tokens": 5277925.0, "reward": 2.337803840637207, "reward_std": 0.43414580821990967, "rewards/helpfulness_reward/mean": 2.337803840637207, "rewards/helpfulness_reward/std": 0.7693417072296143, "rewards/safety_reward/mean": 4.21185302734375, "rewards/safety_reward/std": 1.1748980283737183, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 109.2109375, "completions/mean_terminated_length": 109.2109375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.03702733385730504, "frac_reward_zero_std": 0.0, "grad_norm": 0.4409373104572296, "kl": 0.431884765625, "learning_rate": 5e-05, "loss": 0.0513, "num_tokens": 5297512.0, "reward": 2.4869232177734375, "reward_std": 0.4706234037876129, "rewards/helpfulness_reward/mean": 2.4869232177734375, "rewards/helpfulness_reward/std": 0.9020498394966125, "rewards/safety_reward/mean": 4.1917877197265625, "rewards/safety_reward/std": 1.4988971948623657, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1687.0, "completions/mean_length": 166.7578125, "completions/mean_terminated_length": 153.96063232421875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.037376648327656976, "frac_reward_zero_std": 0.0, "grad_norm": 0.3336259126663208, "kl": 0.2640380859375, "learning_rate": 5e-05, "loss": 0.1522, "num_tokens": 5323865.0, "reward": 2.360668182373047, "reward_std": 0.4718075692653656, "rewards/helpfulness_reward/mean": 2.360668182373047, "rewards/helpfulness_reward/std": 0.9266906976699829, "rewards/safety_reward/mean": 3.507232666015625, "rewards/safety_reward/std": 1.8709100484848022, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 124.515625, "completions/mean_terminated_length": 124.515625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.03772596279800891, "frac_reward_zero_std": 0.0, "grad_norm": 0.3711261451244354, "kl": 0.38720703125, "learning_rate": 5e-05, "loss": 0.0414, "num_tokens": 5343803.0, "reward": 2.708099365234375, "reward_std": 0.46468645334243774, "rewards/helpfulness_reward/mean": 2.708099365234375, "rewards/helpfulness_reward/std": 0.7862535119056702, "rewards/safety_reward/mean": 4.566890716552734, "rewards/safety_reward/std": 1.3471018075942993, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 142.9765625, "completions/mean_terminated_length": 142.9765625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.038075277268360845, "frac_reward_zero_std": 0.0, "grad_norm": 0.3427834212779999, "kl": 0.353271484375, "learning_rate": 5e-05, "loss": 0.0944, "num_tokens": 5366592.0, "reward": 2.4370956420898438, "reward_std": 0.4889710247516632, "rewards/helpfulness_reward/mean": 2.4370956420898438, "rewards/helpfulness_reward/std": 0.9714218974113464, "rewards/safety_reward/mean": 3.9238357543945312, "rewards/safety_reward/std": 1.770216703414917, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 142.6015625, "completions/mean_terminated_length": 142.6015625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.03842459173871278, "frac_reward_zero_std": 0.0, "grad_norm": 0.4007651209831238, "kl": 0.34033203125, "learning_rate": 5e-05, "loss": 0.0431, "num_tokens": 5388893.0, "reward": 2.4108104705810547, "reward_std": 0.552263617515564, "rewards/helpfulness_reward/mean": 2.4108104705810547, "rewards/helpfulness_reward/std": 1.0108616352081299, "rewards/safety_reward/mean": 3.88067626953125, "rewards/safety_reward/std": 1.4211548566818237, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 99.25, "completions/mean_terminated_length": 99.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.038773906209064714, "frac_reward_zero_std": 0.0, "grad_norm": 0.4377408027648926, "kl": 0.455322265625, "learning_rate": 5e-05, "loss": 0.0328, "num_tokens": 5406157.0, "reward": 2.6860198974609375, "reward_std": 0.5166409015655518, "rewards/helpfulness_reward/mean": 2.6860198974609375, "rewards/helpfulness_reward/std": 0.7388544082641602, "rewards/safety_reward/mean": 4.49591064453125, "rewards/safety_reward/std": 1.2370184659957886, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 112.75, "completions/mean_terminated_length": 112.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.03912322067941665, "frac_reward_zero_std": 0.0, "grad_norm": 0.37761133909225464, "kl": 0.36962890625, "learning_rate": 5e-05, "loss": 0.0645, "num_tokens": 5424469.0, "reward": 2.4504928588867188, "reward_std": 0.47763916850090027, "rewards/helpfulness_reward/mean": 2.4504928588867188, "rewards/helpfulness_reward/std": 0.9107632040977478, "rewards/safety_reward/mean": 4.18414306640625, "rewards/safety_reward/std": 1.397121787071228, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 126.9921875, "completions/mean_terminated_length": 126.9921875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.03947253514976858, "frac_reward_zero_std": 0.0, "grad_norm": 0.37858447432518005, "kl": 0.38916015625, "learning_rate": 5e-05, "loss": 0.0661, "num_tokens": 5444484.0, "reward": 2.5275449752807617, "reward_std": 0.5873452425003052, "rewards/helpfulness_reward/mean": 2.5275449752807617, "rewards/helpfulness_reward/std": 0.9284306764602661, "rewards/safety_reward/mean": 4.3955078125, "rewards/safety_reward/std": 1.3428330421447754, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 105.96875, "completions/mean_terminated_length": 105.96875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.03982184962012052, "frac_reward_zero_std": 0.0, "grad_norm": 0.4087860584259033, "kl": 0.431396484375, "learning_rate": 5e-05, "loss": 0.0308, "num_tokens": 5462080.0, "reward": 2.4793853759765625, "reward_std": 0.6198524236679077, "rewards/helpfulness_reward/mean": 2.4793853759765625, "rewards/helpfulness_reward/std": 1.0912705659866333, "rewards/safety_reward/mean": 4.2664031982421875, "rewards/safety_reward/std": 1.5384936332702637, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 129.0234375, "completions/mean_terminated_length": 129.0234375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.04017116409047245, "frac_reward_zero_std": 0.0, "grad_norm": 0.3336641192436218, "kl": 0.37158203125, "learning_rate": 5e-05, "loss": 0.0864, "num_tokens": 5482875.0, "reward": 2.806304931640625, "reward_std": 0.5396829843521118, "rewards/helpfulness_reward/mean": 2.806304931640625, "rewards/helpfulness_reward/std": 0.8656386733055115, "rewards/safety_reward/mean": 4.55865478515625, "rewards/safety_reward/std": 1.1892870664596558, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 124.0546875, "completions/mean_terminated_length": 124.0546875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.040520478560824386, "frac_reward_zero_std": 0.0, "grad_norm": 0.3692927658557892, "kl": 0.439697265625, "learning_rate": 5e-05, "loss": 0.1002, "num_tokens": 5501914.0, "reward": 2.838836669921875, "reward_std": 0.4916630983352661, "rewards/helpfulness_reward/mean": 2.838836669921875, "rewards/helpfulness_reward/std": 0.8649582862854004, "rewards/safety_reward/mean": 4.5482177734375, "rewards/safety_reward/std": 1.4128104448318481, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 111.9453125, "completions/mean_terminated_length": 111.9453125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.04086979303117631, "frac_reward_zero_std": 0.0, "grad_norm": 0.36514681577682495, "kl": 0.45361328125, "learning_rate": 5e-05, "loss": 0.0326, "num_tokens": 5520403.0, "reward": 2.937164306640625, "reward_std": 0.4579366147518158, "rewards/helpfulness_reward/mean": 2.937164306640625, "rewards/helpfulness_reward/std": 0.6724352240562439, "rewards/safety_reward/mean": 4.8104248046875, "rewards/safety_reward/std": 0.908993661403656, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 131.4296875, "completions/mean_terminated_length": 131.4296875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.04121910750152825, "frac_reward_zero_std": 0.0, "grad_norm": 0.35000869631767273, "kl": 0.42236328125, "learning_rate": 5e-05, "loss": 0.0643, "num_tokens": 5541202.0, "reward": 2.6350574493408203, "reward_std": 0.5049299001693726, "rewards/helpfulness_reward/mean": 2.6350574493408203, "rewards/helpfulness_reward/std": 1.1313501596450806, "rewards/safety_reward/mean": 4.295239448547363, "rewards/safety_reward/std": 1.7027124166488647, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 108.078125, "completions/mean_terminated_length": 108.078125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.04156842197188018, "frac_reward_zero_std": 0.0, "grad_norm": 0.4070560932159424, "kl": 0.428955078125, "learning_rate": 5e-05, "loss": 0.065, "num_tokens": 5559748.0, "reward": 3.074188232421875, "reward_std": 0.48904672265052795, "rewards/helpfulness_reward/mean": 3.074188232421875, "rewards/helpfulness_reward/std": 0.7460724711418152, "rewards/safety_reward/mean": 4.97760009765625, "rewards/safety_reward/std": 1.0800251960754395, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 117.125, "completions/mean_terminated_length": 117.125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.041917736442232116, "frac_reward_zero_std": 0.0, "grad_norm": 0.40956297516822815, "kl": 0.5283203125, "learning_rate": 5e-05, "loss": 0.0716, "num_tokens": 5578572.0, "reward": 2.7552490234375, "reward_std": 0.5397356748580933, "rewards/helpfulness_reward/mean": 2.7552490234375, "rewards/helpfulness_reward/std": 0.8486740589141846, "rewards/safety_reward/mean": 4.7447509765625, "rewards/safety_reward/std": 1.157488465309143, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 152.0703125, "completions/mean_terminated_length": 152.0703125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.04226705091258405, "frac_reward_zero_std": 0.0, "grad_norm": 0.35181087255477905, "kl": 0.4111328125, "learning_rate": 5e-05, "loss": 0.0529, "num_tokens": 5602389.0, "reward": 2.7177791595458984, "reward_std": 0.4869447350502014, "rewards/helpfulness_reward/mean": 2.7177791595458984, "rewards/helpfulness_reward/std": 0.9399107098579407, "rewards/safety_reward/mean": 4.6191558837890625, "rewards/safety_reward/std": 1.393306851387024, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 130.65625, "completions/mean_terminated_length": 130.65625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.042616365382935985, "frac_reward_zero_std": 0.0, "grad_norm": 0.3613523542881012, "kl": 0.47802734375, "learning_rate": 5e-05, "loss": 0.0579, "num_tokens": 5623681.0, "reward": 2.58099365234375, "reward_std": 0.5264651775360107, "rewards/helpfulness_reward/mean": 2.58099365234375, "rewards/helpfulness_reward/std": 0.7731768488883972, "rewards/safety_reward/mean": 4.44354248046875, "rewards/safety_reward/std": 1.0907185077667236, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 146.859375, "completions/mean_terminated_length": 146.859375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.04296567985328792, "frac_reward_zero_std": 0.0, "grad_norm": 0.2905992567539215, "kl": 0.391357421875, "learning_rate": 5e-05, "loss": 0.1192, "num_tokens": 5646095.0, "reward": 2.8601531982421875, "reward_std": 0.448077917098999, "rewards/helpfulness_reward/mean": 2.8601531982421875, "rewards/helpfulness_reward/std": 0.8427213430404663, "rewards/safety_reward/mean": 4.69134521484375, "rewards/safety_reward/std": 1.1950147151947021, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1030.0, "completions/mean_length": 168.9296875, "completions/mean_terminated_length": 156.1496124267578, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.043314994323639854, "frac_reward_zero_std": 0.0, "grad_norm": 0.34463104605674744, "kl": 0.416259765625, "learning_rate": 5e-05, "loss": 0.0941, "num_tokens": 5672990.0, "reward": 2.5340843200683594, "reward_std": 0.5140625238418579, "rewards/helpfulness_reward/mean": 2.5340843200683594, "rewards/helpfulness_reward/std": 1.2632092237472534, "rewards/safety_reward/mean": 3.9722042083740234, "rewards/safety_reward/std": 1.894943118095398, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1139.0, "completions/mean_length": 144.8203125, "completions/mean_terminated_length": 131.8503875732422, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.04366430879399179, "frac_reward_zero_std": 0.0, "grad_norm": 0.3703119158744812, "kl": 0.4423828125, "learning_rate": 5e-05, "loss": 0.1407, "num_tokens": 5695175.0, "reward": 2.7723541259765625, "reward_std": 0.5283041000366211, "rewards/helpfulness_reward/mean": 2.7723541259765625, "rewards/helpfulness_reward/std": 0.8746535181999207, "rewards/safety_reward/mean": 4.3132476806640625, "rewards/safety_reward/std": 1.552513837814331, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 122.3671875, "completions/mean_terminated_length": 122.3671875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.04401362326434372, "frac_reward_zero_std": 0.0, "grad_norm": 0.38640010356903076, "kl": 0.48779296875, "learning_rate": 5e-05, "loss": 0.0866, "num_tokens": 5714758.0, "reward": 3.052825927734375, "reward_std": 0.4690234661102295, "rewards/helpfulness_reward/mean": 3.052825927734375, "rewards/helpfulness_reward/std": 0.7878543734550476, "rewards/safety_reward/mean": 5.115234375, "rewards/safety_reward/std": 0.9281273484230042, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 138.9140625, "completions/mean_terminated_length": 138.9140625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.04436293773469566, "frac_reward_zero_std": 0.0, "grad_norm": 0.34648650884628296, "kl": 0.4921875, "learning_rate": 5e-05, "loss": 0.0883, "num_tokens": 5737371.0, "reward": 2.963106155395508, "reward_std": 0.5237293839454651, "rewards/helpfulness_reward/mean": 2.963106155395508, "rewards/helpfulness_reward/std": 0.8630232810974121, "rewards/safety_reward/mean": 4.904754638671875, "rewards/safety_reward/std": 1.0487207174301147, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 119.765625, "completions/mean_terminated_length": 119.765625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.04471225220504759, "frac_reward_zero_std": 0.0, "grad_norm": 0.37079155445098877, "kl": 0.49755859375, "learning_rate": 5e-05, "loss": 0.0613, "num_tokens": 5756549.0, "reward": 3.0502548217773438, "reward_std": 0.4675557017326355, "rewards/helpfulness_reward/mean": 3.0502548217773438, "rewards/helpfulness_reward/std": 0.754848301410675, "rewards/safety_reward/mean": 5.1090087890625, "rewards/safety_reward/std": 1.166371464729309, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 126.140625, "completions/mean_terminated_length": 126.140625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.045061566675399525, "frac_reward_zero_std": 0.0, "grad_norm": 0.4323989152908325, "kl": 0.561279296875, "learning_rate": 5e-05, "loss": 0.0809, "num_tokens": 5776783.0, "reward": 2.88653564453125, "reward_std": 0.4823434352874756, "rewards/helpfulness_reward/mean": 2.88653564453125, "rewards/helpfulness_reward/std": 0.6674829721450806, "rewards/safety_reward/mean": 4.810546875, "rewards/safety_reward/std": 0.8000782132148743, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 126.28125, "completions/mean_terminated_length": 113.16535186767578, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.04541088114575146, "frac_reward_zero_std": 0.0, "grad_norm": 0.36124926805496216, "kl": 0.507568359375, "learning_rate": 5e-05, "loss": 0.1525, "num_tokens": 5796587.0, "reward": 2.8114013671875, "reward_std": 0.45976144075393677, "rewards/helpfulness_reward/mean": 2.8114013671875, "rewards/helpfulness_reward/std": 0.7812195420265198, "rewards/safety_reward/mean": 4.512237548828125, "rewards/safety_reward/std": 1.2441482543945312, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 111.765625, "completions/mean_terminated_length": 111.765625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.045760195616103394, "frac_reward_zero_std": 0.0, "grad_norm": 0.38139232993125916, "kl": 0.504150390625, "learning_rate": 5e-05, "loss": 0.0343, "num_tokens": 5814925.0, "reward": 3.2215576171875, "reward_std": 0.3841015100479126, "rewards/helpfulness_reward/mean": 3.2215576171875, "rewards/helpfulness_reward/std": 0.5884683132171631, "rewards/safety_reward/mean": 5.09130859375, "rewards/safety_reward/std": 0.9029101729393005, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 106.2578125, "completions/mean_terminated_length": 106.2578125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.04610951008645533, "frac_reward_zero_std": 0.0, "grad_norm": 0.3840186595916748, "kl": 0.462890625, "learning_rate": 5e-05, "loss": 0.0572, "num_tokens": 5833806.0, "reward": 3.176910400390625, "reward_std": 0.47729307413101196, "rewards/helpfulness_reward/mean": 3.176910400390625, "rewards/helpfulness_reward/std": 0.6079205870628357, "rewards/safety_reward/mean": 5.149169921875, "rewards/safety_reward/std": 0.921988308429718, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 100.4140625, "completions/mean_terminated_length": 100.4140625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.04645882455680726, "frac_reward_zero_std": 0.0, "grad_norm": 0.4136613607406616, "kl": 0.59130859375, "learning_rate": 5e-05, "loss": 0.0557, "num_tokens": 5851171.0, "reward": 2.7456817626953125, "reward_std": 0.3794376850128174, "rewards/helpfulness_reward/mean": 2.7456817626953125, "rewards/helpfulness_reward/std": 0.8176782131195068, "rewards/safety_reward/mean": 4.5372314453125, "rewards/safety_reward/std": 1.4379260540008545, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 102.6796875, "completions/mean_terminated_length": 102.6796875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.0468081390271592, "frac_reward_zero_std": 0.0, "grad_norm": 0.43321630358695984, "kl": 0.5419921875, "learning_rate": 5e-05, "loss": 0.0089, "num_tokens": 5868442.0, "reward": 3.50604248046875, "reward_std": 0.3618784248828888, "rewards/helpfulness_reward/mean": 3.50604248046875, "rewards/helpfulness_reward/std": 0.7761107683181763, "rewards/safety_reward/mean": 5.339111328125, "rewards/safety_reward/std": 1.073511004447937, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 100.71875, "completions/mean_terminated_length": 100.71875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.04715745349751113, "frac_reward_zero_std": 0.0, "grad_norm": 0.4148734509944916, "kl": 0.53125, "learning_rate": 5e-05, "loss": -0.0002, "num_tokens": 5885254.0, "reward": 2.99603271484375, "reward_std": 0.3803263306617737, "rewards/helpfulness_reward/mean": 2.99603271484375, "rewards/helpfulness_reward/std": 0.5718949437141418, "rewards/safety_reward/mean": 4.6925048828125, "rewards/safety_reward/std": 0.9950394034385681, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 100.34375, "completions/mean_terminated_length": 100.34375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.047506767967863066, "frac_reward_zero_std": 0.0, "grad_norm": 0.4486151337623596, "kl": 0.523193359375, "learning_rate": 5e-05, "loss": 0.0252, "num_tokens": 5902898.0, "reward": 3.02520751953125, "reward_std": 0.41483697295188904, "rewards/helpfulness_reward/mean": 3.02520751953125, "rewards/helpfulness_reward/std": 0.6268513202667236, "rewards/safety_reward/mean": 5.12786865234375, "rewards/safety_reward/std": 0.9967149496078491, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 97.7421875, "completions/mean_terminated_length": 97.7421875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.047856082438215, "frac_reward_zero_std": 0.0, "grad_norm": 2.248697280883789, "kl": 1.4755859375, "learning_rate": 5e-05, "loss": 0.0131, "num_tokens": 5919985.0, "reward": 2.94342041015625, "reward_std": 0.37480831146240234, "rewards/helpfulness_reward/mean": 2.94342041015625, "rewards/helpfulness_reward/std": 0.739574134349823, "rewards/safety_reward/mean": 4.64056396484375, "rewards/safety_reward/std": 1.1303998231887817, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 106.8046875, "completions/mean_terminated_length": 106.8046875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.048205396908566935, "frac_reward_zero_std": 0.0, "grad_norm": 0.3546243906021118, "kl": 0.558349609375, "learning_rate": 5e-05, "loss": 0.0278, "num_tokens": 5938112.0, "reward": 3.185821533203125, "reward_std": 0.37181925773620605, "rewards/helpfulness_reward/mean": 3.185821533203125, "rewards/helpfulness_reward/std": 0.6233065128326416, "rewards/safety_reward/mean": 5.2958984375, "rewards/safety_reward/std": 0.8053606152534485, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 116.890625, "completions/mean_terminated_length": 116.890625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.04855471137891887, "frac_reward_zero_std": 0.0, "grad_norm": 0.37253761291503906, "kl": 0.472900390625, "learning_rate": 5e-05, "loss": 0.0476, "num_tokens": 5959642.0, "reward": 2.941436767578125, "reward_std": 0.40119266510009766, "rewards/helpfulness_reward/mean": 2.941436767578125, "rewards/helpfulness_reward/std": 0.7550133466720581, "rewards/safety_reward/mean": 4.873138427734375, "rewards/safety_reward/std": 1.3447964191436768, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 113.4609375, "completions/mean_terminated_length": 113.4609375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.0489040258492708, "frac_reward_zero_std": 0.0, "grad_norm": 0.4585351347923279, "kl": 0.608154296875, "learning_rate": 5e-05, "loss": 0.0135, "num_tokens": 5979501.0, "reward": 3.19232177734375, "reward_std": 0.4057525396347046, "rewards/helpfulness_reward/mean": 3.19232177734375, "rewards/helpfulness_reward/std": 0.6790251135826111, "rewards/safety_reward/mean": 5.3111572265625, "rewards/safety_reward/std": 0.9495130181312561, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 102.6484375, "completions/mean_terminated_length": 102.6484375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.04925334031962274, "frac_reward_zero_std": 0.0, "grad_norm": 0.3629283607006073, "kl": 0.5009765625, "learning_rate": 5e-05, "loss": 0.0331, "num_tokens": 5996488.0, "reward": 3.221923828125, "reward_std": 0.36946311593055725, "rewards/helpfulness_reward/mean": 3.221923828125, "rewards/helpfulness_reward/std": 0.5476955771446228, "rewards/safety_reward/mean": 5.073486328125, "rewards/safety_reward/std": 1.065598964691162, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 109.875, "completions/mean_terminated_length": 109.875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.04960265478997467, "frac_reward_zero_std": 0.0, "grad_norm": 0.3578588366508484, "kl": 0.456298828125, "learning_rate": 5e-05, "loss": 0.0101, "num_tokens": 6015224.0, "reward": 3.14544677734375, "reward_std": 0.3802605867385864, "rewards/helpfulness_reward/mean": 3.14544677734375, "rewards/helpfulness_reward/std": 0.7241332530975342, "rewards/safety_reward/mean": 5.3482666015625, "rewards/safety_reward/std": 1.0750423669815063, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 119.3125, "completions/mean_terminated_length": 119.3125, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.049951969260326606, "frac_reward_zero_std": 0.0, "grad_norm": 0.43227648735046387, "kl": 0.5263671875, "learning_rate": 5e-05, "loss": 0.0398, "num_tokens": 6035592.0, "reward": 2.8641433715820312, "reward_std": 0.4103758931159973, "rewards/helpfulness_reward/mean": 2.8641433715820312, "rewards/helpfulness_reward/std": 1.0404949188232422, "rewards/safety_reward/mean": 4.54754638671875, "rewards/safety_reward/std": 1.5414960384368896, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 116.9375, "completions/mean_terminated_length": 116.9375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.05030128373067854, "frac_reward_zero_std": 0.0, "grad_norm": 0.3471103310585022, "kl": 0.55126953125, "learning_rate": 5e-05, "loss": 0.0285, "num_tokens": 6054696.0, "reward": 3.29327392578125, "reward_std": 0.3816724717617035, "rewards/helpfulness_reward/mean": 3.29327392578125, "rewards/helpfulness_reward/std": 0.5034358501434326, "rewards/safety_reward/mean": 5.0965576171875, "rewards/safety_reward/std": 0.6768633723258972, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1620.0, "completions/mean_length": 205.765625, "completions/mean_terminated_length": 141.2845458984375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.050650598201030475, "frac_reward_zero_std": 0.0, "grad_norm": 0.29287189245224, "kl": 0.375732421875, "learning_rate": 5e-05, "loss": 0.0254, "num_tokens": 6085906.0, "reward": 3.1946887969970703, "reward_std": 0.39407896995544434, "rewards/helpfulness_reward/mean": 3.1946887969970703, "rewards/helpfulness_reward/std": 0.9468740820884705, "rewards/safety_reward/mean": 5.107166290283203, "rewards/safety_reward/std": 1.5031861066818237, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 144.03125, "completions/mean_terminated_length": 131.05511474609375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.05099991267138241, "frac_reward_zero_std": 0.0, "grad_norm": 0.30655375123023987, "kl": 0.445556640625, "learning_rate": 5e-05, "loss": 0.0193, "num_tokens": 6107958.0, "reward": 2.9244384765625, "reward_std": 0.4149685502052307, "rewards/helpfulness_reward/mean": 2.9244384765625, "rewards/helpfulness_reward/std": 0.8023571968078613, "rewards/safety_reward/mean": 4.7891845703125, "rewards/safety_reward/std": 1.2621140480041504, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 177.796875, "completions/mean_terminated_length": 165.08660888671875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.051349227141734344, "frac_reward_zero_std": 0.0, "grad_norm": 0.2906220853328705, "kl": 0.496337890625, "learning_rate": 5e-05, "loss": 0.0152, "num_tokens": 6135580.0, "reward": 2.9847869873046875, "reward_std": 0.5016161203384399, "rewards/helpfulness_reward/mean": 2.9847869873046875, "rewards/helpfulness_reward/std": 0.9000236392021179, "rewards/safety_reward/mean": 4.7021355628967285, "rewards/safety_reward/std": 1.57038414478302, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 134.015625, "completions/mean_terminated_length": 134.015625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.05169854161208628, "frac_reward_zero_std": 0.0, "grad_norm": 0.3781241178512573, "kl": 0.551025390625, "learning_rate": 5e-05, "loss": 0.0845, "num_tokens": 6158366.0, "reward": 3.066375732421875, "reward_std": 0.49217861890792847, "rewards/helpfulness_reward/mean": 3.066375732421875, "rewards/helpfulness_reward/std": 0.8460262417793274, "rewards/safety_reward/mean": 4.9560546875, "rewards/safety_reward/std": 1.0344005823135376, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 118.5234375, "completions/mean_terminated_length": 118.5234375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.05204785608243821, "frac_reward_zero_std": 0.0, "grad_norm": 0.3343982398509979, "kl": 0.4453125, "learning_rate": 5e-05, "loss": 0.0349, "num_tokens": 6178353.0, "reward": 3.0040740966796875, "reward_std": 0.430868536233902, "rewards/helpfulness_reward/mean": 3.0040740966796875, "rewards/helpfulness_reward/std": 0.7288893461227417, "rewards/safety_reward/mean": 4.81298828125, "rewards/safety_reward/std": 1.0621755123138428, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 121.609375, "completions/mean_terminated_length": 121.609375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.05239717055279015, "frac_reward_zero_std": 0.0, "grad_norm": 0.349814236164093, "kl": 0.443115234375, "learning_rate": 5e-05, "loss": 0.053, "num_tokens": 6197975.0, "reward": 3.0826416015625, "reward_std": 0.4337521195411682, "rewards/helpfulness_reward/mean": 3.0826416015625, "rewards/helpfulness_reward/std": 0.6458524465560913, "rewards/safety_reward/mean": 5.1297607421875, "rewards/safety_reward/std": 0.8856381177902222, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 904.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 135.1484375, "completions/mean_terminated_length": 135.1484375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.05274648502314208, "frac_reward_zero_std": 0.0, "grad_norm": 0.29926225543022156, "kl": 0.456298828125, "learning_rate": 5e-05, "loss": 0.0912, "num_tokens": 6219642.0, "reward": 3.286224365234375, "reward_std": 0.3771795630455017, "rewards/helpfulness_reward/mean": 3.286224365234375, "rewards/helpfulness_reward/std": 0.7243449687957764, "rewards/safety_reward/mean": 5.30828857421875, "rewards/safety_reward/std": 1.0578144788742065, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 125.6328125, "completions/mean_terminated_length": 125.6328125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.053095799493494016, "frac_reward_zero_std": 0.0, "grad_norm": 0.5032610893249512, "kl": 0.4912109375, "learning_rate": 5e-05, "loss": 0.0394, "num_tokens": 6242323.0, "reward": 3.220672607421875, "reward_std": 0.4339712858200073, "rewards/helpfulness_reward/mean": 3.220672607421875, "rewards/helpfulness_reward/std": 0.9654765129089355, "rewards/safety_reward/mean": 4.973785400390625, "rewards/safety_reward/std": 1.6190775632858276, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 127.6484375, "completions/mean_terminated_length": 127.6484375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.05344511396384595, "frac_reward_zero_std": 0.0, "grad_norm": 0.31613731384277344, "kl": 0.434326171875, "learning_rate": 5e-05, "loss": 0.0565, "num_tokens": 6263798.0, "reward": 2.864959716796875, "reward_std": 0.4055347442626953, "rewards/helpfulness_reward/mean": 2.864959716796875, "rewards/helpfulness_reward/std": 0.9823663234710693, "rewards/safety_reward/mean": 4.6044921875, "rewards/safety_reward/std": 1.5200496912002563, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 233.46875, "completions/mean_terminated_length": 156.8196563720703, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.053794428434197884, "frac_reward_zero_std": 0.0, "grad_norm": 0.25627899169921875, "kl": 0.3641357421875, "learning_rate": 5e-05, "loss": 0.0265, "num_tokens": 6300906.0, "reward": 3.127532958984375, "reward_std": 0.47568783164024353, "rewards/helpfulness_reward/mean": 3.127532958984375, "rewards/helpfulness_reward/std": 1.0297307968139648, "rewards/safety_reward/mean": 4.898227691650391, "rewards/safety_reward/std": 1.7904866933822632, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 115.265625, "completions/mean_terminated_length": 115.265625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.05414374290454982, "frac_reward_zero_std": 0.0, "grad_norm": 0.3588011562824249, "kl": 0.551513671875, "learning_rate": 5e-05, "loss": 0.0443, "num_tokens": 6320756.0, "reward": 3.0023193359375, "reward_std": 0.33374452590942383, "rewards/helpfulness_reward/mean": 3.0023193359375, "rewards/helpfulness_reward/std": 0.7464399337768555, "rewards/safety_reward/mean": 4.93072509765625, "rewards/safety_reward/std": 1.1864012479782104, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 120.7109375, "completions/mean_terminated_length": 120.7109375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.05449305737490175, "frac_reward_zero_std": 0.0, "grad_norm": 0.3290193974971771, "kl": 0.532470703125, "learning_rate": 5e-05, "loss": 0.049, "num_tokens": 6340223.0, "reward": 3.320556640625, "reward_std": 0.40038228034973145, "rewards/helpfulness_reward/mean": 3.320556640625, "rewards/helpfulness_reward/std": 0.6000397801399231, "rewards/safety_reward/mean": 5.1136474609375, "rewards/safety_reward/std": 0.8096409440040588, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 121.2734375, "completions/mean_terminated_length": 121.2734375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.05484237184525369, "frac_reward_zero_std": 0.0, "grad_norm": 0.35013601183891296, "kl": 0.6171875, "learning_rate": 5e-05, "loss": 0.0446, "num_tokens": 6359338.0, "reward": 3.3839111328125, "reward_std": 0.3411881625652313, "rewards/helpfulness_reward/mean": 3.3839111328125, "rewards/helpfulness_reward/std": 0.558998703956604, "rewards/safety_reward/mean": 5.2108154296875, "rewards/safety_reward/std": 0.8616079092025757, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 116.515625, "completions/mean_terminated_length": 116.515625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.05519168631560562, "frac_reward_zero_std": 0.0, "grad_norm": 0.32975178956985474, "kl": 0.582275390625, "learning_rate": 5e-05, "loss": 0.0151, "num_tokens": 6378244.0, "reward": 3.2466583251953125, "reward_std": 0.37535226345062256, "rewards/helpfulness_reward/mean": 3.2466583251953125, "rewards/helpfulness_reward/std": 0.6051344275474548, "rewards/safety_reward/mean": 5.30126953125, "rewards/safety_reward/std": 1.009495496749878, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 140.515625, "completions/mean_terminated_length": 140.515625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.055541000785957556, "frac_reward_zero_std": 0.0, "grad_norm": 0.30190664529800415, "kl": 0.4296875, "learning_rate": 5e-05, "loss": 0.0301, "num_tokens": 6400598.0, "reward": 3.307647705078125, "reward_std": 0.417278915643692, "rewards/helpfulness_reward/mean": 3.307647705078125, "rewards/helpfulness_reward/std": 0.8298696875572205, "rewards/safety_reward/mean": 5.01019287109375, "rewards/safety_reward/std": 1.223357915878296, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 122.71875, "completions/mean_terminated_length": 122.71875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.05589031525630949, "frac_reward_zero_std": 0.0, "grad_norm": 0.34254583716392517, "kl": 0.5927734375, "learning_rate": 5e-05, "loss": 0.046, "num_tokens": 6420074.0, "reward": 2.96087646484375, "reward_std": 0.41171449422836304, "rewards/helpfulness_reward/mean": 2.96087646484375, "rewards/helpfulness_reward/std": 0.7050331234931946, "rewards/safety_reward/mean": 4.8316650390625, "rewards/safety_reward/std": 1.0300698280334473, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 118.90625, "completions/mean_terminated_length": 118.90625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.056239629726661425, "frac_reward_zero_std": 0.0, "grad_norm": 0.33351898193359375, "kl": 0.518798828125, "learning_rate": 5e-05, "loss": 0.0699, "num_tokens": 6439510.0, "reward": 3.3695831298828125, "reward_std": 0.3856828808784485, "rewards/helpfulness_reward/mean": 3.3695831298828125, "rewards/helpfulness_reward/std": 0.8754667043685913, "rewards/safety_reward/mean": 5.16552734375, "rewards/safety_reward/std": 1.171418309211731, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 124.875, "completions/mean_terminated_length": 124.875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.05658894419701336, "frac_reward_zero_std": 0.0, "grad_norm": 0.33640480041503906, "kl": 0.58544921875, "learning_rate": 5e-05, "loss": 0.0512, "num_tokens": 6460406.0, "reward": 3.242767333984375, "reward_std": 0.39905649423599243, "rewards/helpfulness_reward/mean": 3.242767333984375, "rewards/helpfulness_reward/std": 0.6496968269348145, "rewards/safety_reward/mean": 5.2557373046875, "rewards/safety_reward/std": 0.9513896107673645, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 117.75, "completions/mean_terminated_length": 117.75, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.056938258667365294, "frac_reward_zero_std": 0.0, "grad_norm": 0.35125887393951416, "kl": 0.6259765625, "learning_rate": 5e-05, "loss": 0.0485, "num_tokens": 6479142.0, "reward": 3.15240478515625, "reward_std": 0.4677489101886749, "rewards/helpfulness_reward/mean": 3.15240478515625, "rewards/helpfulness_reward/std": 0.6044344902038574, "rewards/safety_reward/mean": 5.025634765625, "rewards/safety_reward/std": 0.857698380947113, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 120.7265625, "completions/mean_terminated_length": 120.7265625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.05728757313771723, "frac_reward_zero_std": 0.0, "grad_norm": 0.48335331678390503, "kl": 0.550048828125, "learning_rate": 5e-05, "loss": 0.0883, "num_tokens": 6501251.0, "reward": 2.9143524169921875, "reward_std": 0.4007682204246521, "rewards/helpfulness_reward/mean": 2.9143524169921875, "rewards/helpfulness_reward/std": 1.1309822797775269, "rewards/safety_reward/mean": 4.8069610595703125, "rewards/safety_reward/std": 1.4763890504837036, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 120.6875, "completions/mean_terminated_length": 120.6875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.05763688760806916, "frac_reward_zero_std": 0.0, "grad_norm": 0.3309517204761505, "kl": 0.556640625, "learning_rate": 5e-05, "loss": 0.0217, "num_tokens": 6521771.0, "reward": 3.316650390625, "reward_std": 0.35366353392601013, "rewards/helpfulness_reward/mean": 3.316650390625, "rewards/helpfulness_reward/std": 0.6038585305213928, "rewards/safety_reward/mean": 5.458740234375, "rewards/safety_reward/std": 0.8181015849113464, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 118.96875, "completions/mean_terminated_length": 118.96875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.0579862020784211, "frac_reward_zero_std": 0.0, "grad_norm": 0.38029909133911133, "kl": 0.58349609375, "learning_rate": 5e-05, "loss": 0.0494, "num_tokens": 6542247.0, "reward": 3.161376953125, "reward_std": 0.3535292148590088, "rewards/helpfulness_reward/mean": 3.161376953125, "rewards/helpfulness_reward/std": 0.5406772494316101, "rewards/safety_reward/mean": 5.1439208984375, "rewards/safety_reward/std": 0.7410152554512024, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 121.1796875, "completions/mean_terminated_length": 121.1796875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.05833551654877303, "frac_reward_zero_std": 0.0, "grad_norm": 0.3408011794090271, "kl": 0.6201171875, "learning_rate": 5e-05, "loss": 0.0073, "num_tokens": 6563118.0, "reward": 3.3812255859375, "reward_std": 0.3471606969833374, "rewards/helpfulness_reward/mean": 3.3812255859375, "rewards/helpfulness_reward/std": 0.456269770860672, "rewards/safety_reward/mean": 5.415771484375, "rewards/safety_reward/std": 0.7805735468864441, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 124.1171875, "completions/mean_terminated_length": 124.1171875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.058684831019124965, "frac_reward_zero_std": 0.0, "grad_norm": 0.30641961097717285, "kl": 0.53173828125, "learning_rate": 5e-05, "loss": 0.0313, "num_tokens": 6584629.0, "reward": 3.4512939453125, "reward_std": 0.2862676680088043, "rewards/helpfulness_reward/mean": 3.4512939453125, "rewards/helpfulness_reward/std": 0.5359106063842773, "rewards/safety_reward/mean": 5.453857421875, "rewards/safety_reward/std": 0.6180838346481323, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 117.046875, "completions/mean_terminated_length": 117.046875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.0590341454894769, "frac_reward_zero_std": 0.0, "grad_norm": 0.33524057269096375, "kl": 0.59765625, "learning_rate": 5e-05, "loss": 0.0156, "num_tokens": 6605691.0, "reward": 3.3431320190429688, "reward_std": 0.35675618052482605, "rewards/helpfulness_reward/mean": 3.3431320190429688, "rewards/helpfulness_reward/std": 0.77076655626297, "rewards/safety_reward/mean": 5.318641662597656, "rewards/safety_reward/std": 1.3435713052749634, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 113.9765625, "completions/mean_terminated_length": 113.9765625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.059383459959828834, "frac_reward_zero_std": 0.0, "grad_norm": 0.3733478784561157, "kl": 0.576416015625, "learning_rate": 5e-05, "loss": 0.0056, "num_tokens": 6624448.0, "reward": 3.3887939453125, "reward_std": 0.275166392326355, "rewards/helpfulness_reward/mean": 3.3887939453125, "rewards/helpfulness_reward/std": 0.5418927669525146, "rewards/safety_reward/mean": 5.51220703125, "rewards/safety_reward/std": 0.8235169053077698, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 114.9453125, "completions/mean_terminated_length": 114.9453125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.05973277443018077, "frac_reward_zero_std": 0.0, "grad_norm": 0.3118334114551544, "kl": 0.6875, "learning_rate": 5e-05, "loss": 0.0076, "num_tokens": 6642569.0, "reward": 3.22357177734375, "reward_std": 0.3277865946292877, "rewards/helpfulness_reward/mean": 3.22357177734375, "rewards/helpfulness_reward/std": 0.5178961753845215, "rewards/safety_reward/mean": 5.154541015625, "rewards/safety_reward/std": 0.8820006251335144, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 111.8828125, "completions/mean_terminated_length": 111.8828125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.0600820889005327, "frac_reward_zero_std": 0.0, "grad_norm": 0.3417350947856903, "kl": 0.6572265625, "learning_rate": 5e-05, "loss": 0.0048, "num_tokens": 6660978.0, "reward": 3.3046875, "reward_std": 0.3550550639629364, "rewards/helpfulness_reward/mean": 3.3046875, "rewards/helpfulness_reward/std": 0.5954191088676453, "rewards/safety_reward/mean": 5.258056640625, "rewards/safety_reward/std": 0.9366424083709717, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 112.7578125, "completions/mean_terminated_length": 112.7578125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.06043140337088464, "frac_reward_zero_std": 0.0, "grad_norm": 0.353735089302063, "kl": 0.6357421875, "learning_rate": 5e-05, "loss": -0.0077, "num_tokens": 6679035.0, "reward": 3.5244140625, "reward_std": 0.301990807056427, "rewards/helpfulness_reward/mean": 3.5244140625, "rewards/helpfulness_reward/std": 0.6218130588531494, "rewards/safety_reward/mean": 5.43701171875, "rewards/safety_reward/std": 1.0246671438217163, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 115.2421875, "completions/mean_terminated_length": 115.2421875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.06078071784123657, "frac_reward_zero_std": 0.0, "grad_norm": 0.40687963366508484, "kl": 0.66015625, "learning_rate": 5e-05, "loss": 0.0197, "num_tokens": 6697818.0, "reward": 3.2823486328125, "reward_std": 0.32613927125930786, "rewards/helpfulness_reward/mean": 3.2823486328125, "rewards/helpfulness_reward/std": 0.5377389192581177, "rewards/safety_reward/mean": 5.17041015625, "rewards/safety_reward/std": 0.9359514713287354, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 113.734375, "completions/mean_terminated_length": 113.734375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.061130032311588506, "frac_reward_zero_std": 0.0, "grad_norm": 0.3582474887371063, "kl": 0.6845703125, "learning_rate": 5e-05, "loss": 0.009, "num_tokens": 6716200.0, "reward": 3.3631591796875, "reward_std": 0.3411431908607483, "rewards/helpfulness_reward/mean": 3.3631591796875, "rewards/helpfulness_reward/std": 0.5047315955162048, "rewards/safety_reward/mean": 5.365234375, "rewards/safety_reward/std": 0.788794755935669, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 120.2109375, "completions/mean_terminated_length": 120.2109375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.06147934678194044, "frac_reward_zero_std": 0.0, "grad_norm": 25887.298828125, "kl": 3921.96435546875, "learning_rate": 5e-05, "loss": 39.251, "num_tokens": 6738027.0, "reward": 2.801349639892578, "reward_std": 0.40732988715171814, "rewards/helpfulness_reward/mean": 2.801349639892578, "rewards/helpfulness_reward/std": 1.4607735872268677, "rewards/safety_reward/mean": 4.414961814880371, "rewards/safety_reward/std": 2.1795289516448975, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 117.5390625, "completions/mean_terminated_length": 117.5390625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.061828661252292375, "frac_reward_zero_std": 0.0, "grad_norm": 0.3910330832004547, "kl": 0.64013671875, "learning_rate": 5e-05, "loss": 0.04, "num_tokens": 6758656.0, "reward": 3.16802978515625, "reward_std": 0.38604384660720825, "rewards/helpfulness_reward/mean": 3.16802978515625, "rewards/helpfulness_reward/std": 0.5971020460128784, "rewards/safety_reward/mean": 5.269287109375, "rewards/safety_reward/std": 0.8552401065826416, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 118.4375, "completions/mean_terminated_length": 118.4375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.06217797572264431, "frac_reward_zero_std": 0.0, "grad_norm": 0.34691357612609863, "kl": 0.6259765625, "learning_rate": 5e-05, "loss": 0.0156, "num_tokens": 6778648.0, "reward": 3.29852294921875, "reward_std": 0.33166271448135376, "rewards/helpfulness_reward/mean": 3.29852294921875, "rewards/helpfulness_reward/std": 0.636647641658783, "rewards/safety_reward/mean": 5.3173828125, "rewards/safety_reward/std": 0.7142338156700134, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 114.046875, "completions/mean_terminated_length": 114.046875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.06252729019299624, "frac_reward_zero_std": 0.0, "grad_norm": 0.3560873568058014, "kl": 0.68603515625, "learning_rate": 5e-05, "loss": 0.0035, "num_tokens": 6797206.0, "reward": 3.472412109375, "reward_std": 0.2902885675430298, "rewards/helpfulness_reward/mean": 3.472412109375, "rewards/helpfulness_reward/std": 0.5407701134681702, "rewards/safety_reward/mean": 5.5264892578125, "rewards/safety_reward/std": 0.6595184803009033, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 113.40625, "completions/mean_terminated_length": 113.40625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.06287660466334818, "frac_reward_zero_std": 0.0, "grad_norm": 0.35111361742019653, "kl": 0.69091796875, "learning_rate": 5e-05, "loss": 0.0164, "num_tokens": 6815610.0, "reward": 3.51025390625, "reward_std": 0.27491503953933716, "rewards/helpfulness_reward/mean": 3.51025390625, "rewards/helpfulness_reward/std": 0.4452846944332123, "rewards/safety_reward/mean": 5.534423828125, "rewards/safety_reward/std": 0.7122058272361755, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 111.078125, "completions/mean_terminated_length": 111.078125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.06322591913370011, "frac_reward_zero_std": 0.0, "grad_norm": 1.0958670377731323, "kl": 1.314453125, "learning_rate": 5e-05, "loss": 0.0096, "num_tokens": 6837340.0, "reward": 3.297105073928833, "reward_std": 0.37157517671585083, "rewards/helpfulness_reward/mean": 3.297105073928833, "rewards/helpfulness_reward/std": 0.7242274284362793, "rewards/safety_reward/mean": 5.3551025390625, "rewards/safety_reward/std": 0.9373456239700317, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 114.7578125, "completions/mean_terminated_length": 114.7578125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.06357523360405205, "frac_reward_zero_std": 0.0, "grad_norm": 0.4235418438911438, "kl": 0.86328125, "learning_rate": 5e-05, "loss": -0.0002, "num_tokens": 6857533.0, "reward": 3.37744140625, "reward_std": 0.36089491844177246, "rewards/helpfulness_reward/mean": 3.37744140625, "rewards/helpfulness_reward/std": 0.5182344317436218, "rewards/safety_reward/mean": 5.307861328125, "rewards/safety_reward/std": 0.9351396560668945, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 116.4453125, "completions/mean_terminated_length": 116.4453125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.06392454807440398, "frac_reward_zero_std": 0.0, "grad_norm": 0.36802271008491516, "kl": 0.751953125, "learning_rate": 5e-05, "loss": 0.0261, "num_tokens": 6876846.0, "reward": 3.30108642578125, "reward_std": 0.34256184101104736, "rewards/helpfulness_reward/mean": 3.30108642578125, "rewards/helpfulness_reward/std": 0.5142108798027039, "rewards/safety_reward/mean": 5.106201171875, "rewards/safety_reward/std": 0.7367632389068604, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 121.8984375, "completions/mean_terminated_length": 121.8984375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.06427386254475592, "frac_reward_zero_std": 0.0, "grad_norm": 0.36620473861694336, "kl": 0.6533203125, "learning_rate": 5e-05, "loss": 0.0014, "num_tokens": 6897833.0, "reward": 3.436279296875, "reward_std": 0.26801830530166626, "rewards/helpfulness_reward/mean": 3.436279296875, "rewards/helpfulness_reward/std": 0.41984742879867554, "rewards/safety_reward/mean": 5.53076171875, "rewards/safety_reward/std": 0.6058836579322815, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 115.546875, "completions/mean_terminated_length": 115.546875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.06462317701510785, "frac_reward_zero_std": 0.0, "grad_norm": 0.39073193073272705, "kl": 0.6728515625, "learning_rate": 5e-05, "loss": 0.0067, "num_tokens": 6916639.0, "reward": 3.53314208984375, "reward_std": 0.3297998011112213, "rewards/helpfulness_reward/mean": 3.53314208984375, "rewards/helpfulness_reward/std": 0.52826327085495, "rewards/safety_reward/mean": 5.646240234375, "rewards/safety_reward/std": 0.7703714370727539, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 117.8984375, "completions/mean_terminated_length": 117.8984375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.06497249148545979, "frac_reward_zero_std": 0.0, "grad_norm": 0.35636770725250244, "kl": 0.6943359375, "learning_rate": 5e-05, "loss": 0.0517, "num_tokens": 6935466.0, "reward": 3.43853759765625, "reward_std": 0.3524636924266815, "rewards/helpfulness_reward/mean": 3.43853759765625, "rewards/helpfulness_reward/std": 0.6555549502372742, "rewards/safety_reward/mean": 5.4769287109375, "rewards/safety_reward/std": 1.1033167839050293, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 120.2421875, "completions/mean_terminated_length": 120.2421875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.06532180595581172, "frac_reward_zero_std": 0.0, "grad_norm": 0.3375677764415741, "kl": 0.759765625, "learning_rate": 5e-05, "loss": 0.0206, "num_tokens": 6955329.0, "reward": 3.525390625, "reward_std": 0.3065587282180786, "rewards/helpfulness_reward/mean": 3.525390625, "rewards/helpfulness_reward/std": 0.45049339532852173, "rewards/safety_reward/mean": 5.658935546875, "rewards/safety_reward/std": 0.8012956976890564, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 117.484375, "completions/mean_terminated_length": 117.484375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.06567112042616366, "frac_reward_zero_std": 0.0, "grad_norm": 0.354374498128891, "kl": 0.7607421875, "learning_rate": 5e-05, "loss": 0.0195, "num_tokens": 6974991.0, "reward": 3.34539794921875, "reward_std": 0.3704839050769806, "rewards/helpfulness_reward/mean": 3.34539794921875, "rewards/helpfulness_reward/std": 0.4934573471546173, "rewards/safety_reward/mean": 5.2882080078125, "rewards/safety_reward/std": 0.8904098272323608, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 118.671875, "completions/mean_terminated_length": 118.671875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.06602043489651559, "frac_reward_zero_std": 0.0, "grad_norm": 0.34194931387901306, "kl": 0.6533203125, "learning_rate": 5e-05, "loss": 0.0435, "num_tokens": 6994837.0, "reward": 3.53973388671875, "reward_std": 0.34386664628982544, "rewards/helpfulness_reward/mean": 3.53973388671875, "rewards/helpfulness_reward/std": 0.4785063862800598, "rewards/safety_reward/mean": 5.4368896484375, "rewards/safety_reward/std": 0.6642752885818481, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 140.140625, "completions/mean_terminated_length": 140.140625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.06636974936686753, "frac_reward_zero_std": 0.0, "grad_norm": 0.47411125898361206, "kl": 0.673583984375, "learning_rate": 5e-05, "loss": 0.0362, "num_tokens": 7019639.0, "reward": 3.2312026023864746, "reward_std": 0.42077985405921936, "rewards/helpfulness_reward/mean": 3.2312026023864746, "rewards/helpfulness_reward/std": 1.2265671491622925, "rewards/safety_reward/mean": 4.9861907958984375, "rewards/safety_reward/std": 1.9317975044250488, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 117.1953125, "completions/mean_terminated_length": 117.1953125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.06671906383721946, "frac_reward_zero_std": 0.0, "grad_norm": 0.34272465109825134, "kl": 0.68798828125, "learning_rate": 5e-05, "loss": 0.0213, "num_tokens": 7038536.0, "reward": 3.380126953125, "reward_std": 0.3155898451805115, "rewards/helpfulness_reward/mean": 3.380126953125, "rewards/helpfulness_reward/std": 0.5406301617622375, "rewards/safety_reward/mean": 5.4637451171875, "rewards/safety_reward/std": 0.7061870694160461, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 117.6328125, "completions/mean_terminated_length": 117.6328125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.0670683783075714, "frac_reward_zero_std": 0.0, "grad_norm": 0.36047786474227905, "kl": 0.740234375, "learning_rate": 5e-05, "loss": -0.0029, "num_tokens": 7058953.0, "reward": 3.5262451171875, "reward_std": 0.2684659957885742, "rewards/helpfulness_reward/mean": 3.5262451171875, "rewards/helpfulness_reward/std": 0.41824328899383545, "rewards/safety_reward/mean": 5.6466064453125, "rewards/safety_reward/std": 0.7037497758865356, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 116.8359375, "completions/mean_terminated_length": 116.8359375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.06741769277792332, "frac_reward_zero_std": 0.0, "grad_norm": 0.36334919929504395, "kl": 0.7529296875, "learning_rate": 5e-05, "loss": 0.0278, "num_tokens": 7078036.0, "reward": 3.45361328125, "reward_std": 0.3198753893375397, "rewards/helpfulness_reward/mean": 3.45361328125, "rewards/helpfulness_reward/std": 0.5286134481430054, "rewards/safety_reward/mean": 5.243408203125, "rewards/safety_reward/std": 0.7622103691101074, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 117.359375, "completions/mean_terminated_length": 117.359375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.06776700724827527, "frac_reward_zero_std": 0.0, "grad_norm": 0.37165072560310364, "kl": 0.7294921875, "learning_rate": 5e-05, "loss": 0.0354, "num_tokens": 7097626.0, "reward": 3.4609375, "reward_std": 0.32285991311073303, "rewards/helpfulness_reward/mean": 3.4609375, "rewards/helpfulness_reward/std": 0.43462568521499634, "rewards/safety_reward/mean": 5.5050048828125, "rewards/safety_reward/std": 0.7664090394973755, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 121.328125, "completions/mean_terminated_length": 121.328125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.06811632171862719, "frac_reward_zero_std": 0.0, "grad_norm": 0.32393836975097656, "kl": 0.77294921875, "learning_rate": 5e-05, "loss": 0.0383, "num_tokens": 7117196.0, "reward": 3.5029296875, "reward_std": 0.3064306676387787, "rewards/helpfulness_reward/mean": 3.5029296875, "rewards/helpfulness_reward/std": 0.5315456986427307, "rewards/safety_reward/mean": 5.5467529296875, "rewards/safety_reward/std": 0.9063316583633423, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 121.6875, "completions/mean_terminated_length": 121.6875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.06846563618897913, "frac_reward_zero_std": 0.0, "grad_norm": 0.3525346517562866, "kl": 0.66552734375, "learning_rate": 5e-05, "loss": 0.0401, "num_tokens": 7137332.0, "reward": 3.51824951171875, "reward_std": 0.4010746479034424, "rewards/helpfulness_reward/mean": 3.51824951171875, "rewards/helpfulness_reward/std": 0.549378514289856, "rewards/safety_reward/mean": 5.6473388671875, "rewards/safety_reward/std": 0.7353382110595703, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 130.609375, "completions/mean_terminated_length": 130.609375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.06881495065933106, "frac_reward_zero_std": 0.0, "grad_norm": 0.32735970616340637, "kl": 0.6591796875, "learning_rate": 5e-05, "loss": 0.0924, "num_tokens": 7159378.0, "reward": 3.375741958618164, "reward_std": 0.37279003858566284, "rewards/helpfulness_reward/mean": 3.375741958618164, "rewards/helpfulness_reward/std": 0.7782469987869263, "rewards/safety_reward/mean": 5.38616943359375, "rewards/safety_reward/std": 1.0000964403152466, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 118.6640625, "completions/mean_terminated_length": 118.6640625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.069164265129683, "frac_reward_zero_std": 0.0, "grad_norm": 0.3311704695224762, "kl": 0.6767578125, "learning_rate": 5e-05, "loss": 0.0236, "num_tokens": 7179207.0, "reward": 3.45404052734375, "reward_std": 0.2817445993423462, "rewards/helpfulness_reward/mean": 3.45404052734375, "rewards/helpfulness_reward/std": 0.44659507274627686, "rewards/safety_reward/mean": 5.5875244140625, "rewards/safety_reward/std": 0.5988897085189819, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 121.15625, "completions/mean_terminated_length": 121.15625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.06951357960003493, "frac_reward_zero_std": 0.0, "grad_norm": 0.33939841389656067, "kl": 0.71630859375, "learning_rate": 5e-05, "loss": 0.0548, "num_tokens": 7199107.0, "reward": 3.471923828125, "reward_std": 0.2936936020851135, "rewards/helpfulness_reward/mean": 3.471923828125, "rewards/helpfulness_reward/std": 0.48687228560447693, "rewards/safety_reward/mean": 5.5213623046875, "rewards/safety_reward/std": 0.7480876445770264, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 118.3203125, "completions/mean_terminated_length": 118.3203125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.06986289407038687, "frac_reward_zero_std": 0.0, "grad_norm": 0.3377454876899719, "kl": 0.6669921875, "learning_rate": 5e-05, "loss": 0.0109, "num_tokens": 7218420.0, "reward": 3.50604248046875, "reward_std": 0.3054916560649872, "rewards/helpfulness_reward/mean": 3.50604248046875, "rewards/helpfulness_reward/std": 0.5109267234802246, "rewards/safety_reward/mean": 5.5048828125, "rewards/safety_reward/std": 0.6159430742263794, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 115.921875, "completions/mean_terminated_length": 115.921875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.0702122085407388, "frac_reward_zero_std": 0.0, "grad_norm": 0.36451220512390137, "kl": 0.77392578125, "learning_rate": 5e-05, "loss": 0.0587, "num_tokens": 7236922.0, "reward": 3.51611328125, "reward_std": 0.4017969071865082, "rewards/helpfulness_reward/mean": 3.51611328125, "rewards/helpfulness_reward/std": 0.6047347784042358, "rewards/safety_reward/mean": 5.36083984375, "rewards/safety_reward/std": 0.7847471833229065, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 116.53125, "completions/mean_terminated_length": 116.53125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.07056152301109074, "frac_reward_zero_std": 0.0, "grad_norm": 0.3560038208961487, "kl": 0.63037109375, "learning_rate": 5e-05, "loss": 0.0186, "num_tokens": 7257366.0, "reward": 3.469970703125, "reward_std": 0.28963080048561096, "rewards/helpfulness_reward/mean": 3.469970703125, "rewards/helpfulness_reward/std": 0.43590474128723145, "rewards/safety_reward/mean": 5.711181640625, "rewards/safety_reward/std": 0.6711177229881287, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 116.046875, "completions/mean_terminated_length": 116.046875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.07091083748144267, "frac_reward_zero_std": 0.0, "grad_norm": 0.33029499650001526, "kl": 0.66943359375, "learning_rate": 5e-05, "loss": 0.0204, "num_tokens": 7276844.0, "reward": 3.5665283203125, "reward_std": 0.27352240681648254, "rewards/helpfulness_reward/mean": 3.5665283203125, "rewards/helpfulness_reward/std": 0.547271192073822, "rewards/safety_reward/mean": 5.700439453125, "rewards/safety_reward/std": 0.5988568663597107, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 115.265625, "completions/mean_terminated_length": 115.265625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.07126015195179461, "frac_reward_zero_std": 0.0, "grad_norm": 0.36105048656463623, "kl": 0.76025390625, "learning_rate": 5e-05, "loss": -0.0285, "num_tokens": 7295926.0, "reward": 3.2488250732421875, "reward_std": 0.28652894496917725, "rewards/helpfulness_reward/mean": 3.2488250732421875, "rewards/helpfulness_reward/std": 0.8199940323829651, "rewards/safety_reward/mean": 5.156929016113281, "rewards/safety_reward/std": 1.311998963356018, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 121.59375, "completions/mean_terminated_length": 121.59375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.07160946642214654, "frac_reward_zero_std": 0.0, "grad_norm": 0.326132208108902, "kl": 0.74169921875, "learning_rate": 5e-05, "loss": 0.0312, "num_tokens": 7317618.0, "reward": 3.47125244140625, "reward_std": 0.30278149247169495, "rewards/helpfulness_reward/mean": 3.47125244140625, "rewards/helpfulness_reward/std": 0.6501564979553223, "rewards/safety_reward/mean": 5.160247802734375, "rewards/safety_reward/std": 1.1032695770263672, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 115.40625, "completions/mean_terminated_length": 115.40625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.07195878089249848, "frac_reward_zero_std": 0.0, "grad_norm": 0.3238060474395752, "kl": 0.6875, "learning_rate": 5e-05, "loss": 0.0076, "num_tokens": 7336606.0, "reward": 3.37445068359375, "reward_std": 0.3382745385169983, "rewards/helpfulness_reward/mean": 3.37445068359375, "rewards/helpfulness_reward/std": 0.6003947257995605, "rewards/safety_reward/mean": 5.5303955078125, "rewards/safety_reward/std": 0.8070834875106812, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 120.828125, "completions/mean_terminated_length": 120.828125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.0723080953628504, "frac_reward_zero_std": 0.0, "grad_norm": 0.34248775243759155, "kl": 0.7802734375, "learning_rate": 5e-05, "loss": 0.1024, "num_tokens": 7355808.0, "reward": 3.38067626953125, "reward_std": 0.4261370897293091, "rewards/helpfulness_reward/mean": 3.38067626953125, "rewards/helpfulness_reward/std": 0.5950151681900024, "rewards/safety_reward/mean": 5.32568359375, "rewards/safety_reward/std": 0.7762944102287292, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 863.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 139.8671875, "completions/mean_terminated_length": 139.8671875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.07265740983320235, "frac_reward_zero_std": 0.0, "grad_norm": 0.34683504700660706, "kl": 0.68798828125, "learning_rate": 5e-05, "loss": 0.0348, "num_tokens": 7377647.0, "reward": 3.345550537109375, "reward_std": 0.35007888078689575, "rewards/helpfulness_reward/mean": 3.345550537109375, "rewards/helpfulness_reward/std": 0.735223114490509, "rewards/safety_reward/mean": 5.162689208984375, "rewards/safety_reward/std": 1.0967739820480347, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 113.5625, "completions/mean_terminated_length": 113.5625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.07300672430355427, "frac_reward_zero_std": 0.0, "grad_norm": 0.34278902411460876, "kl": 0.8173828125, "learning_rate": 5e-05, "loss": 0.0018, "num_tokens": 7395847.0, "reward": 3.5333251953125, "reward_std": 0.30576980113983154, "rewards/helpfulness_reward/mean": 3.5333251953125, "rewards/helpfulness_reward/std": 0.5201345682144165, "rewards/safety_reward/mean": 5.418212890625, "rewards/safety_reward/std": 0.8391727805137634, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 117.8046875, "completions/mean_terminated_length": 117.8046875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.07335603877390622, "frac_reward_zero_std": 0.0, "grad_norm": 0.33292892575263977, "kl": 0.703125, "learning_rate": 5e-05, "loss": 0.0345, "num_tokens": 7415278.0, "reward": 3.5501708984375, "reward_std": 0.3151901364326477, "rewards/helpfulness_reward/mean": 3.5501708984375, "rewards/helpfulness_reward/std": 0.413778156042099, "rewards/safety_reward/mean": 5.766845703125, "rewards/safety_reward/std": 0.7202291488647461, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 114.625, "completions/mean_terminated_length": 114.625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.07370535324425814, "frac_reward_zero_std": 0.0, "grad_norm": 0.3677665889263153, "kl": 0.80078125, "learning_rate": 5e-05, "loss": -0.0123, "num_tokens": 7434262.0, "reward": 3.519287109375, "reward_std": 0.30332818627357483, "rewards/helpfulness_reward/mean": 3.519287109375, "rewards/helpfulness_reward/std": 0.5027280449867249, "rewards/safety_reward/mean": 5.4715576171875, "rewards/safety_reward/std": 0.6494315266609192, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 127.46875, "completions/mean_terminated_length": 114.3622055053711, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.07405466771461008, "frac_reward_zero_std": 0.0, "grad_norm": 0.36950427293777466, "kl": 0.79052734375, "learning_rate": 5e-05, "loss": 0.0569, "num_tokens": 7455586.0, "reward": 3.2714290618896484, "reward_std": 0.3709031343460083, "rewards/helpfulness_reward/mean": 3.2714290618896484, "rewards/helpfulness_reward/std": 0.7743660807609558, "rewards/safety_reward/mean": 4.998193740844727, "rewards/safety_reward/std": 1.3582944869995117, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 114.0859375, "completions/mean_terminated_length": 114.0859375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.07440398218496201, "frac_reward_zero_std": 0.0, "grad_norm": 0.34321150183677673, "kl": 0.80078125, "learning_rate": 5e-05, "loss": 0.0156, "num_tokens": 7474485.0, "reward": 3.5771484375, "reward_std": 0.31114161014556885, "rewards/helpfulness_reward/mean": 3.5771484375, "rewards/helpfulness_reward/std": 0.5501877069473267, "rewards/safety_reward/mean": 5.725341796875, "rewards/safety_reward/std": 0.6702796816825867, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 118.8125, "completions/mean_terminated_length": 118.8125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.07475329665531395, "frac_reward_zero_std": 0.0, "grad_norm": 0.33447548747062683, "kl": 0.72216796875, "learning_rate": 5e-05, "loss": 0.0096, "num_tokens": 7494021.0, "reward": 3.656982421875, "reward_std": 0.29385530948638916, "rewards/helpfulness_reward/mean": 3.656982421875, "rewards/helpfulness_reward/std": 0.44126638770103455, "rewards/safety_reward/mean": 5.789794921875, "rewards/safety_reward/std": 0.6634657979011536, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 119.484375, "completions/mean_terminated_length": 119.484375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.07510261112566588, "frac_reward_zero_std": 0.0, "grad_norm": 0.34847164154052734, "kl": 0.75, "learning_rate": 5e-05, "loss": 0.0237, "num_tokens": 7513451.0, "reward": 3.765625, "reward_std": 0.3221212327480316, "rewards/helpfulness_reward/mean": 3.765625, "rewards/helpfulness_reward/std": 0.6182389855384827, "rewards/safety_reward/mean": 5.888916015625, "rewards/safety_reward/std": 0.8298032879829407, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 118.90625, "completions/mean_terminated_length": 118.90625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.07545192559601782, "frac_reward_zero_std": 0.0, "grad_norm": 0.35330477356910706, "kl": 0.7119140625, "learning_rate": 5e-05, "loss": 0.0299, "num_tokens": 7532975.0, "reward": 3.52313232421875, "reward_std": 0.33343198895454407, "rewards/helpfulness_reward/mean": 3.52313232421875, "rewards/helpfulness_reward/std": 0.48640018701553345, "rewards/safety_reward/mean": 5.7257080078125, "rewards/safety_reward/std": 0.5746574401855469, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1267.0, "completions/mean_length": 155.8203125, "completions/mean_terminated_length": 142.93701171875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.07580124006636975, "frac_reward_zero_std": 0.0, "grad_norm": 0.2906865179538727, "kl": 0.64990234375, "learning_rate": 5e-05, "loss": 0.0316, "num_tokens": 7559616.0, "reward": 3.4567642211914062, "reward_std": 0.32800978422164917, "rewards/helpfulness_reward/mean": 3.4567642211914062, "rewards/helpfulness_reward/std": 1.2349660396575928, "rewards/safety_reward/mean": 5.099882125854492, "rewards/safety_reward/std": 1.8133639097213745, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 115.7421875, "completions/mean_terminated_length": 115.7421875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.07615055453672169, "frac_reward_zero_std": 0.0, "grad_norm": 0.4373776614665985, "kl": 0.97265625, "learning_rate": 5e-05, "loss": 0.052, "num_tokens": 7578855.0, "reward": 3.579345703125, "reward_std": 0.36815905570983887, "rewards/helpfulness_reward/mean": 3.579345703125, "rewards/helpfulness_reward/std": 0.499541699886322, "rewards/safety_reward/mean": 5.7767333984375, "rewards/safety_reward/std": 0.7208629250526428, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 115.7265625, "completions/mean_terminated_length": 115.7265625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.07649986900707362, "frac_reward_zero_std": 0.0, "grad_norm": 0.35397085547447205, "kl": 0.7783203125, "learning_rate": 5e-05, "loss": 0.0283, "num_tokens": 7597292.0, "reward": 3.52239990234375, "reward_std": 0.2971906065940857, "rewards/helpfulness_reward/mean": 3.52239990234375, "rewards/helpfulness_reward/std": 0.5222691297531128, "rewards/safety_reward/mean": 5.443603515625, "rewards/safety_reward/std": 0.6128877401351929, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 116.3671875, "completions/mean_terminated_length": 116.3671875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.07684918347742556, "frac_reward_zero_std": 0.0, "grad_norm": 0.37541961669921875, "kl": 0.78564453125, "learning_rate": 5e-05, "loss": 0.012, "num_tokens": 7616339.0, "reward": 3.65472412109375, "reward_std": 0.294582337141037, "rewards/helpfulness_reward/mean": 3.65472412109375, "rewards/helpfulness_reward/std": 0.5481933951377869, "rewards/safety_reward/mean": 5.6854248046875, "rewards/safety_reward/std": 0.8552106618881226, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 112.765625, "completions/mean_terminated_length": 112.765625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.07719849794777749, "frac_reward_zero_std": 0.0, "grad_norm": 0.38987237215042114, "kl": 0.84912109375, "learning_rate": 5e-05, "loss": 0.0122, "num_tokens": 7635341.0, "reward": 3.43560791015625, "reward_std": 0.3150341212749481, "rewards/helpfulness_reward/mean": 3.43560791015625, "rewards/helpfulness_reward/std": 0.5333129167556763, "rewards/safety_reward/mean": 5.4892578125, "rewards/safety_reward/std": 0.930497407913208, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 119.40625, "completions/mean_terminated_length": 119.40625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.07754781241812943, "frac_reward_zero_std": 0.0, "grad_norm": 0.36755895614624023, "kl": 0.72509765625, "learning_rate": 5e-05, "loss": 0.0168, "num_tokens": 7655545.0, "reward": 3.55621337890625, "reward_std": 0.31200841069221497, "rewards/helpfulness_reward/mean": 3.55621337890625, "rewards/helpfulness_reward/std": 0.4559723436832428, "rewards/safety_reward/mean": 5.61474609375, "rewards/safety_reward/std": 0.5915781855583191, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 114.8125, "completions/mean_terminated_length": 114.8125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.07789712688848136, "frac_reward_zero_std": 0.0, "grad_norm": 0.3549712896347046, "kl": 0.88671875, "learning_rate": 5e-05, "loss": 0.009, "num_tokens": 7675793.0, "reward": 3.630126953125, "reward_std": 0.3261728882789612, "rewards/helpfulness_reward/mean": 3.630126953125, "rewards/helpfulness_reward/std": 0.5029445290565491, "rewards/safety_reward/mean": 5.748779296875, "rewards/safety_reward/std": 0.656518280506134, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 116.203125, "completions/mean_terminated_length": 116.203125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.0782464413588333, "frac_reward_zero_std": 0.0, "grad_norm": 0.4091455936431885, "kl": 0.97607421875, "learning_rate": 5e-05, "loss": 0.031, "num_tokens": 7695251.0, "reward": 3.37835693359375, "reward_std": 0.3462868332862854, "rewards/helpfulness_reward/mean": 3.37835693359375, "rewards/helpfulness_reward/std": 0.4746052920818329, "rewards/safety_reward/mean": 5.383544921875, "rewards/safety_reward/std": 0.738653838634491, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 112.6796875, "completions/mean_terminated_length": 112.6796875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.07859575582918522, "frac_reward_zero_std": 0.0, "grad_norm": 0.39560750126838684, "kl": 0.904296875, "learning_rate": 5e-05, "loss": 0.0206, "num_tokens": 7714666.0, "reward": 3.65289306640625, "reward_std": 0.2874792814254761, "rewards/helpfulness_reward/mean": 3.65289306640625, "rewards/helpfulness_reward/std": 0.5729198455810547, "rewards/safety_reward/mean": 5.719970703125, "rewards/safety_reward/std": 0.713347315788269, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1168.0, "completions/mean_length": 183.4140625, "completions/mean_terminated_length": 157.88096618652344, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.07894507029953717, "frac_reward_zero_std": 0.0, "grad_norm": 0.2913733720779419, "kl": 0.635009765625, "learning_rate": 5e-05, "loss": 0.0094, "num_tokens": 7743911.0, "reward": 3.5282440185546875, "reward_std": 0.4013863503932953, "rewards/helpfulness_reward/mean": 3.5282440185546875, "rewards/helpfulness_reward/std": 0.7545205950737, "rewards/safety_reward/mean": 5.346210479736328, "rewards/safety_reward/std": 1.2279672622680664, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 115.8046875, "completions/mean_terminated_length": 115.8046875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.07929438476988909, "frac_reward_zero_std": 0.0, "grad_norm": 0.3865932822227478, "kl": 0.77392578125, "learning_rate": 5e-05, "loss": 0.0209, "num_tokens": 7764678.0, "reward": 3.4567413330078125, "reward_std": 0.35364460945129395, "rewards/helpfulness_reward/mean": 3.4567413330078125, "rewards/helpfulness_reward/std": 0.9851171970367432, "rewards/safety_reward/mean": 5.660736083984375, "rewards/safety_reward/std": 1.2679719924926758, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 113.09375, "completions/mean_terminated_length": 113.09375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.07964369924024103, "frac_reward_zero_std": 0.0, "grad_norm": 0.3790788948535919, "kl": 0.91748046875, "learning_rate": 5e-05, "loss": 0.0099, "num_tokens": 7783010.0, "reward": 3.73089599609375, "reward_std": 0.3504467308521271, "rewards/helpfulness_reward/mean": 3.73089599609375, "rewards/helpfulness_reward/std": 0.5744337439537048, "rewards/safety_reward/mean": 5.8277587890625, "rewards/safety_reward/std": 0.6988962888717651, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 112.421875, "completions/mean_terminated_length": 112.421875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.07999301371059296, "frac_reward_zero_std": 0.0, "grad_norm": 0.35168617963790894, "kl": 0.8583984375, "learning_rate": 5e-05, "loss": 0.0086, "num_tokens": 7803400.0, "reward": 3.54901123046875, "reward_std": 0.3297176957130432, "rewards/helpfulness_reward/mean": 3.54901123046875, "rewards/helpfulness_reward/std": 0.50837641954422, "rewards/safety_reward/mean": 5.6497802734375, "rewards/safety_reward/std": 0.6888618469238281, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 125.90625, "completions/mean_terminated_length": 112.78739929199219, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.0803423281809449, "frac_reward_zero_std": 0.0, "grad_norm": 0.3924004137516022, "kl": 0.81103515625, "learning_rate": 5e-05, "loss": 0.0357, "num_tokens": 7824852.0, "reward": 3.381591796875, "reward_std": 0.29250457882881165, "rewards/helpfulness_reward/mean": 3.381591796875, "rewards/helpfulness_reward/std": 0.5938094258308411, "rewards/safety_reward/mean": 5.206684112548828, "rewards/safety_reward/std": 1.0961326360702515, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 107.28125, "completions/mean_terminated_length": 107.28125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.08069164265129683, "frac_reward_zero_std": 0.0, "grad_norm": 0.38603511452674866, "kl": 1.04541015625, "learning_rate": 5e-05, "loss": 0.0082, "num_tokens": 7843664.0, "reward": 3.5526123046875, "reward_std": 0.29585984349250793, "rewards/helpfulness_reward/mean": 3.5526123046875, "rewards/helpfulness_reward/std": 0.49009737372398376, "rewards/safety_reward/mean": 5.771728515625, "rewards/safety_reward/std": 0.7703976035118103, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 111.3984375, "completions/mean_terminated_length": 111.3984375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.08104095712164877, "frac_reward_zero_std": 0.0, "grad_norm": 0.3786093592643738, "kl": 0.818359375, "learning_rate": 5e-05, "loss": 0.0041, "num_tokens": 7863763.0, "reward": 3.44757080078125, "reward_std": 0.26943373680114746, "rewards/helpfulness_reward/mean": 3.44757080078125, "rewards/helpfulness_reward/std": 0.6333588361740112, "rewards/safety_reward/mean": 5.6031494140625, "rewards/safety_reward/std": 0.6651687622070312, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 111.8203125, "completions/mean_terminated_length": 111.8203125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.0813902715920007, "frac_reward_zero_std": 0.0, "grad_norm": 0.6673322319984436, "kl": 1.09619140625, "learning_rate": 5e-05, "loss": 0.0188, "num_tokens": 7881532.0, "reward": 3.780029296875, "reward_std": 0.2746846377849579, "rewards/helpfulness_reward/mean": 3.780029296875, "rewards/helpfulness_reward/std": 0.47054532170295715, "rewards/safety_reward/mean": 5.638671875, "rewards/safety_reward/std": 0.5675049424171448, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 111.34375, "completions/mean_terminated_length": 111.34375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.08173958606235263, "frac_reward_zero_std": 0.0, "grad_norm": 0.3603697717189789, "kl": 0.8623046875, "learning_rate": 5e-05, "loss": 0.0123, "num_tokens": 7901632.0, "reward": 3.6859130859375, "reward_std": 0.2503563165664673, "rewards/helpfulness_reward/mean": 3.6859130859375, "rewards/helpfulness_reward/std": 0.4859454035758972, "rewards/safety_reward/mean": 5.916748046875, "rewards/safety_reward/std": 0.7561191916465759, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 112.8984375, "completions/mean_terminated_length": 112.8984375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.08208890053270457, "frac_reward_zero_std": 0.0, "grad_norm": 0.367197185754776, "kl": 0.96875, "learning_rate": 5e-05, "loss": 0.0121, "num_tokens": 7921827.0, "reward": 3.3927001953125, "reward_std": 0.3469676375389099, "rewards/helpfulness_reward/mean": 3.3927001953125, "rewards/helpfulness_reward/std": 0.5402620434761047, "rewards/safety_reward/mean": 5.4971923828125, "rewards/safety_reward/std": 0.7526901960372925, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 113.0625, "completions/mean_terminated_length": 113.0625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0824382150030565, "frac_reward_zero_std": 0.0, "grad_norm": 1.068939208984375, "kl": 1.328125, "learning_rate": 5e-05, "loss": 0.0488, "num_tokens": 7941739.0, "reward": 3.4959869384765625, "reward_std": 0.35948461294174194, "rewards/helpfulness_reward/mean": 3.4959869384765625, "rewards/helpfulness_reward/std": 0.9394286274909973, "rewards/safety_reward/mean": 5.30267333984375, "rewards/safety_reward/std": 1.6159509420394897, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 111.8203125, "completions/mean_terminated_length": 111.8203125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.08278752947340844, "frac_reward_zero_std": 0.0, "grad_norm": 0.37183699011802673, "kl": 1.01513671875, "learning_rate": 5e-05, "loss": 0.0202, "num_tokens": 7959940.0, "reward": 3.55316162109375, "reward_std": 0.3143201768398285, "rewards/helpfulness_reward/mean": 3.55316162109375, "rewards/helpfulness_reward/std": 0.5905596613883972, "rewards/safety_reward/mean": 5.5687255859375, "rewards/safety_reward/std": 0.8814994692802429, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 113.8515625, "completions/mean_terminated_length": 113.8515625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.08313684394376036, "frac_reward_zero_std": 0.0, "grad_norm": 0.36876529455184937, "kl": 0.87939453125, "learning_rate": 5e-05, "loss": 0.0135, "num_tokens": 7978689.0, "reward": 3.7279052734375, "reward_std": 0.36241668462753296, "rewards/helpfulness_reward/mean": 3.7279052734375, "rewards/helpfulness_reward/std": 0.5136378407478333, "rewards/safety_reward/mean": 5.9874267578125, "rewards/safety_reward/std": 0.7880927324295044, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 114.5859375, "completions/mean_terminated_length": 114.5859375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.0834861584141123, "frac_reward_zero_std": 0.0, "grad_norm": 0.43694448471069336, "kl": 0.9970703125, "learning_rate": 5e-05, "loss": 0.0264, "num_tokens": 7997756.0, "reward": 3.80517578125, "reward_std": 0.3431246280670166, "rewards/helpfulness_reward/mean": 3.80517578125, "rewards/helpfulness_reward/std": 0.43883785605430603, "rewards/safety_reward/mean": 6.0728759765625, "rewards/safety_reward/std": 0.7371152639389038, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 117.3828125, "completions/mean_terminated_length": 117.3828125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.08383547288446423, "frac_reward_zero_std": 0.0, "grad_norm": 0.36563727259635925, "kl": 0.88134765625, "learning_rate": 5e-05, "loss": 0.0137, "num_tokens": 8017725.0, "reward": 3.6517333984375, "reward_std": 0.30371445417404175, "rewards/helpfulness_reward/mean": 3.6517333984375, "rewards/helpfulness_reward/std": 0.48629069328308105, "rewards/safety_reward/mean": 5.847900390625, "rewards/safety_reward/std": 0.8636791706085205, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 116.4140625, "completions/mean_terminated_length": 116.4140625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.08418478735481617, "frac_reward_zero_std": 0.0, "grad_norm": 0.39804989099502563, "kl": 0.9775390625, "learning_rate": 5e-05, "loss": -0.0033, "num_tokens": 8037234.0, "reward": 3.62652587890625, "reward_std": 0.304945170879364, "rewards/helpfulness_reward/mean": 3.62652587890625, "rewards/helpfulness_reward/std": 0.5103920102119446, "rewards/safety_reward/mean": 5.64892578125, "rewards/safety_reward/std": 0.8817362785339355, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 117.75, "completions/mean_terminated_length": 117.75, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.0845341018251681, "frac_reward_zero_std": 0.0, "grad_norm": 0.36526408791542053, "kl": 0.95751953125, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 8056194.0, "reward": 3.548583984375, "reward_std": 0.30452805757522583, "rewards/helpfulness_reward/mean": 3.548583984375, "rewards/helpfulness_reward/std": 0.4269378185272217, "rewards/safety_reward/mean": 5.73388671875, "rewards/safety_reward/std": 0.6443842649459839, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 113.1015625, "completions/mean_terminated_length": 113.1015625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.08488341629552004, "frac_reward_zero_std": 0.0, "grad_norm": 0.3996671736240387, "kl": 0.93701171875, "learning_rate": 5e-05, "loss": -0.0061, "num_tokens": 8075295.0, "reward": 3.491943359375, "reward_std": 0.36530405282974243, "rewards/helpfulness_reward/mean": 3.491943359375, "rewards/helpfulness_reward/std": 0.7023945450782776, "rewards/safety_reward/mean": 5.48974609375, "rewards/safety_reward/std": 0.8936222195625305, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 122.4140625, "completions/mean_terminated_length": 122.4140625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.08523273076587197, "frac_reward_zero_std": 0.0, "grad_norm": 0.35803020000457764, "kl": 0.8916015625, "learning_rate": 5e-05, "loss": 0.0264, "num_tokens": 8096228.0, "reward": 3.71026611328125, "reward_std": 0.360344260931015, "rewards/helpfulness_reward/mean": 3.71026611328125, "rewards/helpfulness_reward/std": 0.5508808493614197, "rewards/safety_reward/mean": 5.9566650390625, "rewards/safety_reward/std": 0.8726773262023926, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 114.53125, "completions/mean_terminated_length": 114.53125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.08558204523622391, "frac_reward_zero_std": 0.0, "grad_norm": 0.3552563190460205, "kl": 0.9775390625, "learning_rate": 5e-05, "loss": 0.0305, "num_tokens": 8114880.0, "reward": 3.7496337890625, "reward_std": 0.3425962030887604, "rewards/helpfulness_reward/mean": 3.7496337890625, "rewards/helpfulness_reward/std": 0.5710084438323975, "rewards/safety_reward/mean": 5.771484375, "rewards/safety_reward/std": 0.7684107422828674, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1792.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 175.1796875, "completions/mean_terminated_length": 123.0241928100586, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.08593135970657584, "frac_reward_zero_std": 0.0, "grad_norm": 1.329323649406433, "kl": 1.357421875, "learning_rate": 5e-05, "loss": 0.0593, "num_tokens": 8142783.0, "reward": 3.46893310546875, "reward_std": 0.4573532044887543, "rewards/helpfulness_reward/mean": 3.46893310546875, "rewards/helpfulness_reward/std": 1.0928771495819092, "rewards/safety_reward/mean": 5.30584716796875, "rewards/safety_reward/std": 2.025034189224243, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 117.8828125, "completions/mean_terminated_length": 117.8828125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.08628067417692778, "frac_reward_zero_std": 0.0, "grad_norm": 0.34996530413627625, "kl": 0.916015625, "learning_rate": 5e-05, "loss": 0.0117, "num_tokens": 8161976.0, "reward": 3.83306884765625, "reward_std": 0.418049693107605, "rewards/helpfulness_reward/mean": 3.83306884765625, "rewards/helpfulness_reward/std": 0.6664712429046631, "rewards/safety_reward/mean": 5.755615234375, "rewards/safety_reward/std": 0.892451822757721, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 115.078125, "completions/mean_terminated_length": 115.078125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.08662998864727971, "frac_reward_zero_std": 0.0, "grad_norm": 0.33983278274536133, "kl": 1.01123046875, "learning_rate": 5e-05, "loss": 0.0343, "num_tokens": 8180642.0, "reward": 3.765716552734375, "reward_std": 0.3882954716682434, "rewards/helpfulness_reward/mean": 3.765716552734375, "rewards/helpfulness_reward/std": 0.6076027750968933, "rewards/safety_reward/mean": 5.87548828125, "rewards/safety_reward/std": 0.7108051180839539, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1792.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 162.8203125, "completions/mean_terminated_length": 123.72000885009766, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.08697930311763165, "frac_reward_zero_std": 0.0, "grad_norm": 0.30231600999832153, "kl": 0.728515625, "learning_rate": 5e-05, "loss": -0.0029, "num_tokens": 8209379.0, "reward": 3.353130340576172, "reward_std": 0.40486669540405273, "rewards/helpfulness_reward/mean": 3.353130340576172, "rewards/helpfulness_reward/std": 1.341231346130371, "rewards/safety_reward/mean": 5.33807373046875, "rewards/safety_reward/std": 2.239114284515381, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 115.4140625, "completions/mean_terminated_length": 115.4140625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.08732861758798358, "frac_reward_zero_std": 0.0, "grad_norm": 0.3990197777748108, "kl": 0.88720703125, "learning_rate": 5e-05, "loss": 0.0326, "num_tokens": 8229776.0, "reward": 3.39459228515625, "reward_std": 0.33115214109420776, "rewards/helpfulness_reward/mean": 3.39459228515625, "rewards/helpfulness_reward/std": 0.470979779958725, "rewards/safety_reward/mean": 5.6209716796875, "rewards/safety_reward/std": 0.5759086608886719, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 124.203125, "completions/mean_terminated_length": 124.203125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.08767793205833552, "frac_reward_zero_std": 0.0, "grad_norm": 0.32158875465393066, "kl": 0.84326171875, "learning_rate": 5e-05, "loss": 0.0686, "num_tokens": 8250034.0, "reward": 3.50685977935791, "reward_std": 0.3603965640068054, "rewards/helpfulness_reward/mean": 3.50685977935791, "rewards/helpfulness_reward/std": 1.0237455368041992, "rewards/safety_reward/mean": 5.6876983642578125, "rewards/safety_reward/std": 1.5476824045181274, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 116.5703125, "completions/mean_terminated_length": 116.5703125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.08802724652868744, "frac_reward_zero_std": 0.0, "grad_norm": 0.34763145446777344, "kl": 0.939453125, "learning_rate": 5e-05, "loss": 0.0157, "num_tokens": 8268755.0, "reward": 3.682373046875, "reward_std": 0.3486410975456238, "rewards/helpfulness_reward/mean": 3.682373046875, "rewards/helpfulness_reward/std": 0.5022139549255371, "rewards/safety_reward/mean": 5.59326171875, "rewards/safety_reward/std": 0.7506889700889587, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 114.234375, "completions/mean_terminated_length": 114.234375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.08837656099903939, "frac_reward_zero_std": 0.0, "grad_norm": 0.4636784493923187, "kl": 1.1533203125, "learning_rate": 5e-05, "loss": 0.0045, "num_tokens": 8287097.0, "reward": 3.5963134765625, "reward_std": 0.40053391456604004, "rewards/helpfulness_reward/mean": 3.5963134765625, "rewards/helpfulness_reward/std": 0.46845269203186035, "rewards/safety_reward/mean": 5.568359375, "rewards/safety_reward/std": 0.9376373887062073, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 118.125, "completions/mean_terminated_length": 118.125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.08872587546939131, "frac_reward_zero_std": 0.0, "grad_norm": 0.34311407804489136, "kl": 0.87451171875, "learning_rate": 5e-05, "loss": 0.0074, "num_tokens": 8306409.0, "reward": 3.6500244140625, "reward_std": 0.2910344898700714, "rewards/helpfulness_reward/mean": 3.6500244140625, "rewards/helpfulness_reward/std": 0.42943447828292847, "rewards/safety_reward/mean": 5.681396484375, "rewards/safety_reward/std": 0.6320597529411316, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 121.9453125, "completions/mean_terminated_length": 121.9453125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.08907518993974325, "frac_reward_zero_std": 0.0, "grad_norm": 0.33331871032714844, "kl": 0.873046875, "learning_rate": 5e-05, "loss": 0.0079, "num_tokens": 8326138.0, "reward": 3.757568359375, "reward_std": 0.26054686307907104, "rewards/helpfulness_reward/mean": 3.757568359375, "rewards/helpfulness_reward/std": 0.5328361392021179, "rewards/safety_reward/mean": 5.784423828125, "rewards/safety_reward/std": 0.8205010890960693, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 122.609375, "completions/mean_terminated_length": 122.609375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.08942450441009518, "frac_reward_zero_std": 0.0, "grad_norm": 0.3122079074382782, "kl": 0.94140625, "learning_rate": 5e-05, "loss": 0.0471, "num_tokens": 8346296.0, "reward": 3.565338134765625, "reward_std": 0.34720999002456665, "rewards/helpfulness_reward/mean": 3.565338134765625, "rewards/helpfulness_reward/std": 0.661865770816803, "rewards/safety_reward/mean": 5.5599365234375, "rewards/safety_reward/std": 0.8138777017593384, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 118.203125, "completions/mean_terminated_length": 118.203125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.08977381888044712, "frac_reward_zero_std": 0.0, "grad_norm": 0.3398616909980774, "kl": 0.9853515625, "learning_rate": 5e-05, "loss": 0.0348, "num_tokens": 8365378.0, "reward": 3.87353515625, "reward_std": 0.31054532527923584, "rewards/helpfulness_reward/mean": 3.87353515625, "rewards/helpfulness_reward/std": 0.4808255732059479, "rewards/safety_reward/mean": 5.8712158203125, "rewards/safety_reward/std": 0.6869930028915405, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 115.640625, "completions/mean_terminated_length": 115.640625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.09012313335079905, "frac_reward_zero_std": 0.0, "grad_norm": 0.3782918453216553, "kl": 0.92138671875, "learning_rate": 5e-05, "loss": 0.0134, "num_tokens": 8385972.0, "reward": 3.60809326171875, "reward_std": 0.35132282972335815, "rewards/helpfulness_reward/mean": 3.60809326171875, "rewards/helpfulness_reward/std": 0.5453391671180725, "rewards/safety_reward/mean": 5.8087158203125, "rewards/safety_reward/std": 0.7931057810783386, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 118.921875, "completions/mean_terminated_length": 118.921875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.09047244782115099, "frac_reward_zero_std": 0.0, "grad_norm": 0.5587125420570374, "kl": 1.11669921875, "learning_rate": 5e-05, "loss": 0.0266, "num_tokens": 8405650.0, "reward": 3.6141357421875, "reward_std": 0.3259870409965515, "rewards/helpfulness_reward/mean": 3.6141357421875, "rewards/helpfulness_reward/std": 0.48033803701400757, "rewards/safety_reward/mean": 5.8077392578125, "rewards/safety_reward/std": 0.6713055372238159, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 115.9296875, "completions/mean_terminated_length": 115.9296875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.09082176229150292, "frac_reward_zero_std": 0.0, "grad_norm": 0.3806913495063782, "kl": 0.94384765625, "learning_rate": 5e-05, "loss": 0.0284, "num_tokens": 8424041.0, "reward": 3.916748046875, "reward_std": 0.3473796248435974, "rewards/helpfulness_reward/mean": 3.916748046875, "rewards/helpfulness_reward/std": 0.48296645283699036, "rewards/safety_reward/mean": 6.021484375, "rewards/safety_reward/std": 0.7424442172050476, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1055.0, "completions/max_terminated_length": 1055.0, "completions/mean_length": 143.1015625, "completions/mean_terminated_length": 143.1015625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.09117107676185486, "frac_reward_zero_std": 0.0, "grad_norm": 0.3198004961013794, "kl": 0.74365234375, "learning_rate": 5e-05, "loss": 0.0658, "num_tokens": 8450726.0, "reward": 3.38372802734375, "reward_std": 0.4036116600036621, "rewards/helpfulness_reward/mean": 3.38372802734375, "rewards/helpfulness_reward/std": 1.2416349649429321, "rewards/safety_reward/mean": 5.3551025390625, "rewards/safety_reward/std": 2.151470899581909, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 112.84375, "completions/mean_terminated_length": 112.84375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.09152039123220679, "frac_reward_zero_std": 0.0, "grad_norm": 0.3276878893375397, "kl": 0.9921875, "learning_rate": 5e-05, "loss": 0.0186, "num_tokens": 8468770.0, "reward": 4.060546875, "reward_std": 0.2954387664794922, "rewards/helpfulness_reward/mean": 4.060546875, "rewards/helpfulness_reward/std": 0.47293490171432495, "rewards/safety_reward/mean": 6.0899658203125, "rewards/safety_reward/std": 0.7555027604103088, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 115.0078125, "completions/mean_terminated_length": 115.0078125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.09186970570255873, "frac_reward_zero_std": 0.0, "grad_norm": 0.34454232454299927, "kl": 0.943359375, "learning_rate": 5e-05, "loss": 0.0536, "num_tokens": 8487731.0, "reward": 3.6749267578125, "reward_std": 0.4269813001155853, "rewards/helpfulness_reward/mean": 3.6749267578125, "rewards/helpfulness_reward/std": 0.5812003016471863, "rewards/safety_reward/mean": 5.8953857421875, "rewards/safety_reward/std": 0.9163849353790283, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 114.0234375, "completions/mean_terminated_length": 114.0234375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.09221902017291066, "frac_reward_zero_std": 0.0, "grad_norm": 0.3933789134025574, "kl": 1.1064453125, "learning_rate": 5e-05, "loss": 0.0166, "num_tokens": 8505662.0, "reward": 3.62939453125, "reward_std": 0.3390616476535797, "rewards/helpfulness_reward/mean": 3.62939453125, "rewards/helpfulness_reward/std": 0.6383566856384277, "rewards/safety_reward/mean": 5.4957275390625, "rewards/safety_reward/std": 0.9127510190010071, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 117.203125, "completions/mean_terminated_length": 117.203125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.0925683346432626, "frac_reward_zero_std": 0.0, "grad_norm": 0.35991597175598145, "kl": 0.98193359375, "learning_rate": 5e-05, "loss": -0.0061, "num_tokens": 8524440.0, "reward": 3.4871826171875, "reward_std": 0.3115389943122864, "rewards/helpfulness_reward/mean": 3.4871826171875, "rewards/helpfulness_reward/std": 0.6129323244094849, "rewards/safety_reward/mean": 5.5826416015625, "rewards/safety_reward/std": 1.145691156387329, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 112.0703125, "completions/mean_terminated_length": 112.0703125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.09291764911361453, "frac_reward_zero_std": 0.0, "grad_norm": 0.3651387691497803, "kl": 1.00146484375, "learning_rate": 5e-05, "loss": 0.015, "num_tokens": 8542345.0, "reward": 3.77337646484375, "reward_std": 0.3057919144630432, "rewards/helpfulness_reward/mean": 3.77337646484375, "rewards/helpfulness_reward/std": 0.48642003536224365, "rewards/safety_reward/mean": 5.8768310546875, "rewards/safety_reward/std": 0.7070459127426147, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1792.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 145.203125, "completions/mean_terminated_length": 119.0634994506836, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.09326696358396647, "frac_reward_zero_std": 0.0, "grad_norm": 0.320706307888031, "kl": 0.84765625, "learning_rate": 5e-05, "loss": 0.0262, "num_tokens": 8567475.0, "reward": 3.3924570083618164, "reward_std": 0.28034913539886475, "rewards/helpfulness_reward/mean": 3.3924570083618164, "rewards/helpfulness_reward/std": 1.0459363460540771, "rewards/safety_reward/mean": 5.2232818603515625, "rewards/safety_reward/std": 1.979003667831421, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 112.8515625, "completions/mean_terminated_length": 112.8515625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.0936162780543184, "frac_reward_zero_std": 0.0, "grad_norm": 0.38817429542541504, "kl": 1.1396484375, "learning_rate": 5e-05, "loss": 0.018, "num_tokens": 8585488.0, "reward": 3.7412109375, "reward_std": 0.32675600051879883, "rewards/helpfulness_reward/mean": 3.7412109375, "rewards/helpfulness_reward/std": 0.5502296090126038, "rewards/safety_reward/mean": 5.8497314453125, "rewards/safety_reward/std": 0.654886782169342, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 115.8828125, "completions/mean_terminated_length": 115.8828125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.09396559252467034, "frac_reward_zero_std": 0.0, "grad_norm": 0.41290295124053955, "kl": 1.1318359375, "learning_rate": 5e-05, "loss": 0.0453, "num_tokens": 8604313.0, "reward": 3.71734619140625, "reward_std": 0.36002248525619507, "rewards/helpfulness_reward/mean": 3.71734619140625, "rewards/helpfulness_reward/std": 0.5182619690895081, "rewards/safety_reward/mean": 5.9365234375, "rewards/safety_reward/std": 0.7900409698486328, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 114.953125, "completions/mean_terminated_length": 114.953125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.09431490699502226, "frac_reward_zero_std": 0.0, "grad_norm": 0.3492775857448578, "kl": 1.0048828125, "learning_rate": 5e-05, "loss": 0.0287, "num_tokens": 8625411.0, "reward": 3.61370849609375, "reward_std": 0.34210044145584106, "rewards/helpfulness_reward/mean": 3.61370849609375, "rewards/helpfulness_reward/std": 0.5689164400100708, "rewards/safety_reward/mean": 5.7200927734375, "rewards/safety_reward/std": 0.8273598551750183, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 114.1171875, "completions/mean_terminated_length": 114.1171875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.0946642214653742, "frac_reward_zero_std": 0.0, "grad_norm": 0.36706292629241943, "kl": 1.0283203125, "learning_rate": 5e-05, "loss": 0.0167, "num_tokens": 8644626.0, "reward": 3.9388427734375, "reward_std": 0.295706570148468, "rewards/helpfulness_reward/mean": 3.9388427734375, "rewards/helpfulness_reward/std": 0.4568158686161041, "rewards/safety_reward/mean": 6.082763671875, "rewards/safety_reward/std": 0.6987364292144775, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 119.8359375, "completions/mean_terminated_length": 119.8359375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.09501353593572613, "frac_reward_zero_std": 0.0, "grad_norm": 0.368210107088089, "kl": 0.96533203125, "learning_rate": 5e-05, "loss": 0.0075, "num_tokens": 8664957.0, "reward": 3.5017242431640625, "reward_std": 0.3274557292461395, "rewards/helpfulness_reward/mean": 3.5017242431640625, "rewards/helpfulness_reward/std": 0.845766007900238, "rewards/safety_reward/mean": 5.5938720703125, "rewards/safety_reward/std": 1.2665469646453857, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 112.59375, "completions/mean_terminated_length": 112.59375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.09536285040607807, "frac_reward_zero_std": 0.0, "grad_norm": 0.39219895005226135, "kl": 1.1083984375, "learning_rate": 5e-05, "loss": 0.0287, "num_tokens": 8682593.0, "reward": 3.822509765625, "reward_std": 0.31022733449935913, "rewards/helpfulness_reward/mean": 3.822509765625, "rewards/helpfulness_reward/std": 0.5052160024642944, "rewards/safety_reward/mean": 5.968505859375, "rewards/safety_reward/std": 0.7955675721168518, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 112.1484375, "completions/mean_terminated_length": 112.1484375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.09571216487643, "frac_reward_zero_std": 0.0, "grad_norm": 0.36135098338127136, "kl": 0.99658203125, "learning_rate": 5e-05, "loss": -0.0001, "num_tokens": 8702004.0, "reward": 3.822998046875, "reward_std": 0.24474090337753296, "rewards/helpfulness_reward/mean": 3.822998046875, "rewards/helpfulness_reward/std": 0.47659167647361755, "rewards/safety_reward/mean": 6.2236328125, "rewards/safety_reward/std": 0.6990510821342468, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 112.671875, "completions/mean_terminated_length": 112.671875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.09606147934678194, "frac_reward_zero_std": 0.0, "grad_norm": 0.3827451765537262, "kl": 1.0283203125, "learning_rate": 5e-05, "loss": 0.0212, "num_tokens": 8720442.0, "reward": 3.6478271484375, "reward_std": 0.31735745072364807, "rewards/helpfulness_reward/mean": 3.6478271484375, "rewards/helpfulness_reward/std": 0.56821209192276, "rewards/safety_reward/mean": 5.7264404296875, "rewards/safety_reward/std": 0.8641143441200256, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 112.0234375, "completions/mean_terminated_length": 112.0234375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.09641079381713387, "frac_reward_zero_std": 0.0, "grad_norm": 0.3793887197971344, "kl": 1.1201171875, "learning_rate": 5e-05, "loss": -0.0065, "num_tokens": 8739749.0, "reward": 3.64306640625, "reward_std": 0.3013889491558075, "rewards/helpfulness_reward/mean": 3.64306640625, "rewards/helpfulness_reward/std": 0.48835495114326477, "rewards/safety_reward/mean": 5.936767578125, "rewards/safety_reward/std": 0.7618755102157593, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 112.375, "completions/mean_terminated_length": 112.375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.09676010828748581, "frac_reward_zero_std": 0.0, "grad_norm": 0.41704103350639343, "kl": 1.1103515625, "learning_rate": 5e-05, "loss": -0.002, "num_tokens": 8757573.0, "reward": 3.808349609375, "reward_std": 0.32497602701187134, "rewards/helpfulness_reward/mean": 3.808349609375, "rewards/helpfulness_reward/std": 0.5240673422813416, "rewards/safety_reward/mean": 5.7564697265625, "rewards/safety_reward/std": 0.7411238551139832, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 109.515625, "completions/mean_terminated_length": 109.515625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.09710942275783774, "frac_reward_zero_std": 0.0, "grad_norm": 0.3483309745788574, "kl": 1.1015625, "learning_rate": 5e-05, "loss": -0.0121, "num_tokens": 8776135.0, "reward": 3.88983154296875, "reward_std": 0.32566893100738525, "rewards/helpfulness_reward/mean": 3.88983154296875, "rewards/helpfulness_reward/std": 0.5488362908363342, "rewards/safety_reward/mean": 6.109130859375, "rewards/safety_reward/std": 0.7090288996696472, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 135.4921875, "completions/mean_terminated_length": 135.4921875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.09745873722818968, "frac_reward_zero_std": 0.0, "grad_norm": 0.35553237795829773, "kl": 0.95703125, "learning_rate": 5e-05, "loss": 0.0735, "num_tokens": 8799390.0, "reward": 3.8031959533691406, "reward_std": 0.37616783380508423, "rewards/helpfulness_reward/mean": 3.8031959533691406, "rewards/helpfulness_reward/std": 0.8266212344169617, "rewards/safety_reward/mean": 5.799781799316406, "rewards/safety_reward/std": 1.246313214302063, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 116.734375, "completions/mean_terminated_length": 116.734375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.0978080516985416, "frac_reward_zero_std": 0.0, "grad_norm": 0.3697638213634491, "kl": 0.97802734375, "learning_rate": 5e-05, "loss": 0.0365, "num_tokens": 8819100.0, "reward": 3.731201171875, "reward_std": 0.3514450788497925, "rewards/helpfulness_reward/mean": 3.731201171875, "rewards/helpfulness_reward/std": 0.5284503698348999, "rewards/safety_reward/mean": 5.923095703125, "rewards/safety_reward/std": 0.7901476621627808, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 115.671875, "completions/mean_terminated_length": 115.671875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.09815736616889355, "frac_reward_zero_std": 0.0, "grad_norm": 0.3511950969696045, "kl": 1.125, "learning_rate": 5e-05, "loss": 0.0267, "num_tokens": 8838258.0, "reward": 3.9229736328125, "reward_std": 0.29830023646354675, "rewards/helpfulness_reward/mean": 3.9229736328125, "rewards/helpfulness_reward/std": 0.45622700452804565, "rewards/safety_reward/mean": 6.0692138671875, "rewards/safety_reward/std": 0.7191154956817627, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 112.3828125, "completions/mean_terminated_length": 112.3828125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.09850668063924548, "frac_reward_zero_std": 0.0, "grad_norm": 0.4050510823726654, "kl": 1.119140625, "learning_rate": 5e-05, "loss": 0.0298, "num_tokens": 8857435.0, "reward": 3.876220703125, "reward_std": 0.34962034225463867, "rewards/helpfulness_reward/mean": 3.876220703125, "rewards/helpfulness_reward/std": 0.5916502475738525, "rewards/safety_reward/mean": 6.2569580078125, "rewards/safety_reward/std": 0.7519654631614685, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 112.6171875, "completions/mean_terminated_length": 112.6171875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.09885599510959742, "frac_reward_zero_std": 0.0, "grad_norm": 0.39177653193473816, "kl": 1.099609375, "learning_rate": 5e-05, "loss": 0.0309, "num_tokens": 8875994.0, "reward": 3.87890625, "reward_std": 0.3812812566757202, "rewards/helpfulness_reward/mean": 3.87890625, "rewards/helpfulness_reward/std": 0.628251850605011, "rewards/safety_reward/mean": 5.962890625, "rewards/safety_reward/std": 0.8604291081428528, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 114.75, "completions/mean_terminated_length": 114.75, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.09920530957994934, "frac_reward_zero_std": 0.0, "grad_norm": 0.49472495913505554, "kl": 1.154296875, "learning_rate": 5e-05, "loss": 0.0124, "num_tokens": 8895106.0, "reward": 3.865234375, "reward_std": 0.2681770324707031, "rewards/helpfulness_reward/mean": 3.865234375, "rewards/helpfulness_reward/std": 0.4957256019115448, "rewards/safety_reward/mean": 5.99755859375, "rewards/safety_reward/std": 0.7353349328041077, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 117.0625, "completions/mean_terminated_length": 117.0625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.09955462405030129, "frac_reward_zero_std": 0.0, "grad_norm": 0.3544132709503174, "kl": 1.02978515625, "learning_rate": 5e-05, "loss": 0.0226, "num_tokens": 8915578.0, "reward": 4.046875, "reward_std": 0.2966936230659485, "rewards/helpfulness_reward/mean": 4.046875, "rewards/helpfulness_reward/std": 0.5241096019744873, "rewards/safety_reward/mean": 6.1871337890625, "rewards/safety_reward/std": 0.7267307043075562, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 135.9296875, "completions/mean_terminated_length": 135.9296875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.09990393852065321, "frac_reward_zero_std": 0.0, "grad_norm": 0.3665153682231903, "kl": 0.904296875, "learning_rate": 5e-05, "loss": 0.0298, "num_tokens": 8939089.0, "reward": 3.592498779296875, "reward_std": 0.3486262261867523, "rewards/helpfulness_reward/mean": 3.592498779296875, "rewards/helpfulness_reward/std": 0.8938090801239014, "rewards/safety_reward/mean": 5.507877349853516, "rewards/safety_reward/std": 1.5482945442199707, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 114.9609375, "completions/mean_terminated_length": 114.9609375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.10025325299100515, "frac_reward_zero_std": 0.0, "grad_norm": 0.37267255783081055, "kl": 0.97509765625, "learning_rate": 5e-05, "loss": 0.0263, "num_tokens": 8958228.0, "reward": 4.04052734375, "reward_std": 0.2543908357620239, "rewards/helpfulness_reward/mean": 4.04052734375, "rewards/helpfulness_reward/std": 0.43346723914146423, "rewards/safety_reward/mean": 6.30908203125, "rewards/safety_reward/std": 0.7083045244216919, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 114.34375, "completions/mean_terminated_length": 114.34375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.10060256746135708, "frac_reward_zero_std": 0.0, "grad_norm": 0.38588446378707886, "kl": 1.111328125, "learning_rate": 5e-05, "loss": 0.0194, "num_tokens": 8976928.0, "reward": 3.878662109375, "reward_std": 0.2808074951171875, "rewards/helpfulness_reward/mean": 3.878662109375, "rewards/helpfulness_reward/std": 0.5245476961135864, "rewards/safety_reward/mean": 6.05419921875, "rewards/safety_reward/std": 0.7922040224075317, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 118.1953125, "completions/mean_terminated_length": 118.1953125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.10095188193170902, "frac_reward_zero_std": 0.0, "grad_norm": 0.41850799322128296, "kl": 1.11328125, "learning_rate": 5e-05, "loss": 0.0421, "num_tokens": 8996393.0, "reward": 3.73828125, "reward_std": 0.3362189829349518, "rewards/helpfulness_reward/mean": 3.73828125, "rewards/helpfulness_reward/std": 0.537709653377533, "rewards/safety_reward/mean": 5.750732421875, "rewards/safety_reward/std": 0.7573328614234924, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 115.3125, "completions/mean_terminated_length": 115.3125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.10130119640206095, "frac_reward_zero_std": 0.0, "grad_norm": 0.41806739568710327, "kl": 1.279296875, "learning_rate": 5e-05, "loss": 0.0267, "num_tokens": 9014865.0, "reward": 3.62640380859375, "reward_std": 0.2696187496185303, "rewards/helpfulness_reward/mean": 3.62640380859375, "rewards/helpfulness_reward/std": 0.5464903712272644, "rewards/safety_reward/mean": 5.3408203125, "rewards/safety_reward/std": 0.9356008172035217, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 114.1640625, "completions/mean_terminated_length": 114.1640625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.10165051087241289, "frac_reward_zero_std": 0.0, "grad_norm": 0.3954828679561615, "kl": 1.05712890625, "learning_rate": 5e-05, "loss": 0.0325, "num_tokens": 9034230.0, "reward": 3.9923095703125, "reward_std": 0.3250218629837036, "rewards/helpfulness_reward/mean": 3.9923095703125, "rewards/helpfulness_reward/std": 0.658492922782898, "rewards/safety_reward/mean": 6.3525390625, "rewards/safety_reward/std": 0.6598891019821167, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 115.703125, "completions/mean_terminated_length": 115.703125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.10199982534276482, "frac_reward_zero_std": 0.0, "grad_norm": 0.46355533599853516, "kl": 1.2421875, "learning_rate": 5e-05, "loss": 0.0423, "num_tokens": 9053592.0, "reward": 3.981109619140625, "reward_std": 0.32631948590278625, "rewards/helpfulness_reward/mean": 3.981109619140625, "rewards/helpfulness_reward/std": 0.6739892959594727, "rewards/safety_reward/mean": 6.10748291015625, "rewards/safety_reward/std": 0.8852378129959106, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 115.671875, "completions/mean_terminated_length": 115.671875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.10234913981311676, "frac_reward_zero_std": 0.0, "grad_norm": 0.39126259088516235, "kl": 1.1259765625, "learning_rate": 5e-05, "loss": 0.0282, "num_tokens": 9073262.0, "reward": 3.96533203125, "reward_std": 0.2620248794555664, "rewards/helpfulness_reward/mean": 3.96533203125, "rewards/helpfulness_reward/std": 0.41267693042755127, "rewards/safety_reward/mean": 6.115478515625, "rewards/safety_reward/std": 0.6697851419448853, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1516.0, "completions/max_terminated_length": 1516.0, "completions/mean_length": 134.1796875, "completions/mean_terminated_length": 134.1796875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.10269845428346869, "frac_reward_zero_std": 0.0, "grad_norm": 0.35901254415512085, "kl": 0.990234375, "learning_rate": 5e-05, "loss": 0.0502, "num_tokens": 9095821.0, "reward": 3.4432945251464844, "reward_std": 0.33525216579437256, "rewards/helpfulness_reward/mean": 3.4432945251464844, "rewards/helpfulness_reward/std": 1.053454875946045, "rewards/safety_reward/mean": 5.2413177490234375, "rewards/safety_reward/std": 1.7198625802993774, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 113.6484375, "completions/mean_terminated_length": 113.6484375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.10304776875382063, "frac_reward_zero_std": 0.0, "grad_norm": 0.41889920830726624, "kl": 1.1220703125, "learning_rate": 5e-05, "loss": 0.0178, "num_tokens": 9114552.0, "reward": 4.0841064453125, "reward_std": 0.2851594090461731, "rewards/helpfulness_reward/mean": 4.0841064453125, "rewards/helpfulness_reward/std": 0.49981585144996643, "rewards/safety_reward/mean": 6.1246337890625, "rewards/safety_reward/std": 0.6408619284629822, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 111.78125, "completions/mean_terminated_length": 111.78125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.10339708322417256, "frac_reward_zero_std": 0.0, "grad_norm": 0.40055590867996216, "kl": 1.255859375, "learning_rate": 5e-05, "loss": 0.0278, "num_tokens": 9132876.0, "reward": 3.7952880859375, "reward_std": 0.2804757356643677, "rewards/helpfulness_reward/mean": 3.7952880859375, "rewards/helpfulness_reward/std": 0.3745323717594147, "rewards/safety_reward/mean": 5.857421875, "rewards/safety_reward/std": 0.6002192497253418, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 110.015625, "completions/mean_terminated_length": 110.015625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.1037463976945245, "frac_reward_zero_std": 0.0, "grad_norm": 0.40987521409988403, "kl": 1.2177734375, "learning_rate": 5e-05, "loss": 0.0015, "num_tokens": 9151542.0, "reward": 3.9136962890625, "reward_std": 0.32197949290275574, "rewards/helpfulness_reward/mean": 3.9136962890625, "rewards/helpfulness_reward/std": 0.5813727378845215, "rewards/safety_reward/mean": 6.12548828125, "rewards/safety_reward/std": 0.6454454064369202, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1628.0, "completions/mean_length": 192.9296875, "completions/mean_terminated_length": 154.552001953125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.10409571216487642, "frac_reward_zero_std": 0.0, "grad_norm": 0.367777019739151, "kl": 0.99658203125, "learning_rate": 5e-05, "loss": 0.0912, "num_tokens": 9182213.0, "reward": 3.6741409301757812, "reward_std": 0.34336310625076294, "rewards/helpfulness_reward/mean": 3.6741409301757812, "rewards/helpfulness_reward/std": 0.8992522358894348, "rewards/safety_reward/mean": 5.646919250488281, "rewards/safety_reward/std": 1.447012186050415, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 119.609375, "completions/mean_terminated_length": 119.609375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.10444502663522837, "frac_reward_zero_std": 0.0, "grad_norm": 0.4356423616409302, "kl": 1.09716796875, "learning_rate": 5e-05, "loss": 0.0578, "num_tokens": 9201691.0, "reward": 3.7140064239501953, "reward_std": 0.3680320978164673, "rewards/helpfulness_reward/mean": 3.7140064239501953, "rewards/helpfulness_reward/std": 0.786153256893158, "rewards/safety_reward/mean": 5.750457763671875, "rewards/safety_reward/std": 1.0984095335006714, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 111.3828125, "completions/mean_terminated_length": 111.3828125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.1047943411055803, "frac_reward_zero_std": 0.0, "grad_norm": 0.4909352958202362, "kl": 1.3701171875, "learning_rate": 5e-05, "loss": 0.0225, "num_tokens": 9219580.0, "reward": 3.9920654296875, "reward_std": 0.22919607162475586, "rewards/helpfulness_reward/mean": 3.9920654296875, "rewards/helpfulness_reward/std": 0.383254736661911, "rewards/safety_reward/mean": 6.26123046875, "rewards/safety_reward/std": 0.7625440955162048, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 111.0546875, "completions/mean_terminated_length": 111.0546875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.10514365557593223, "frac_reward_zero_std": 0.0, "grad_norm": 0.4088371992111206, "kl": 1.228515625, "learning_rate": 5e-05, "loss": 0.0204, "num_tokens": 9237251.0, "reward": 3.96435546875, "reward_std": 0.33143913745880127, "rewards/helpfulness_reward/mean": 3.96435546875, "rewards/helpfulness_reward/std": 0.48499545454978943, "rewards/safety_reward/mean": 6.001708984375, "rewards/safety_reward/std": 0.7948211431503296, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 109.125, "completions/mean_terminated_length": 109.125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.10549297004628416, "frac_reward_zero_std": 0.0, "grad_norm": 0.43975314497947693, "kl": 1.345703125, "learning_rate": 5e-05, "loss": 0.0086, "num_tokens": 9255843.0, "reward": 3.97705078125, "reward_std": 0.31886619329452515, "rewards/helpfulness_reward/mean": 3.97705078125, "rewards/helpfulness_reward/std": 0.4833039939403534, "rewards/safety_reward/mean": 6.05810546875, "rewards/safety_reward/std": 0.6659532189369202, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 108.59375, "completions/mean_terminated_length": 108.59375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.1058422845166361, "frac_reward_zero_std": 0.0, "grad_norm": 0.385571151971817, "kl": 1.2490234375, "learning_rate": 5e-05, "loss": 0.0212, "num_tokens": 9274911.0, "reward": 3.8408203125, "reward_std": 0.3128911852836609, "rewards/helpfulness_reward/mean": 3.8408203125, "rewards/helpfulness_reward/std": 0.6331499814987183, "rewards/safety_reward/mean": 5.9873046875, "rewards/safety_reward/std": 0.9260636568069458, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 109.046875, "completions/mean_terminated_length": 109.046875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.10619159898698803, "frac_reward_zero_std": 0.0, "grad_norm": 0.4493292272090912, "kl": 1.31640625, "learning_rate": 5e-05, "loss": 0.015, "num_tokens": 9293885.0, "reward": 4.0028076171875, "reward_std": 0.3251521587371826, "rewards/helpfulness_reward/mean": 4.0028076171875, "rewards/helpfulness_reward/std": 0.4565734267234802, "rewards/safety_reward/mean": 6.219482421875, "rewards/safety_reward/std": 0.6872590780258179, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 110.2734375, "completions/mean_terminated_length": 110.2734375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.10654091345733997, "frac_reward_zero_std": 0.0, "grad_norm": 0.3903247117996216, "kl": 1.357421875, "learning_rate": 5e-05, "loss": 0.0215, "num_tokens": 9312352.0, "reward": 4.07098388671875, "reward_std": 0.30927473306655884, "rewards/helpfulness_reward/mean": 4.07098388671875, "rewards/helpfulness_reward/std": 0.46423041820526123, "rewards/safety_reward/mean": 6.1595458984375, "rewards/safety_reward/std": 0.7623845934867859, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 108.0546875, "completions/mean_terminated_length": 108.0546875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.1068902279276919, "frac_reward_zero_std": 0.0, "grad_norm": 0.5227620601654053, "kl": 1.53515625, "learning_rate": 5e-05, "loss": 0.0098, "num_tokens": 9331919.0, "reward": 4.102783203125, "reward_std": 0.25021350383758545, "rewards/helpfulness_reward/mean": 4.102783203125, "rewards/helpfulness_reward/std": 0.49799907207489014, "rewards/safety_reward/mean": 6.1458740234375, "rewards/safety_reward/std": 0.918835461139679, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1168.0, "completions/max_terminated_length": 1168.0, "completions/mean_length": 150.828125, "completions/mean_terminated_length": 150.828125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.10723954239804384, "frac_reward_zero_std": 0.0, "grad_norm": 0.373879075050354, "kl": 1.10693359375, "learning_rate": 5e-05, "loss": 0.0458, "num_tokens": 9356721.0, "reward": 3.9327392578125, "reward_std": 0.3394475281238556, "rewards/helpfulness_reward/mean": 3.9327392578125, "rewards/helpfulness_reward/std": 0.746309757232666, "rewards/safety_reward/mean": 5.943023681640625, "rewards/safety_reward/std": 1.2361204624176025, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 106.796875, "completions/mean_terminated_length": 106.796875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.10758885686839577, "frac_reward_zero_std": 0.0, "grad_norm": 0.4276440739631653, "kl": 1.3486328125, "learning_rate": 5e-05, "loss": 0.0029, "num_tokens": 9376127.0, "reward": 4.1361083984375, "reward_std": 0.28817301988601685, "rewards/helpfulness_reward/mean": 4.1361083984375, "rewards/helpfulness_reward/std": 0.4948995113372803, "rewards/safety_reward/mean": 6.119384765625, "rewards/safety_reward/std": 0.7626821994781494, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 107.9296875, "completions/mean_terminated_length": 107.9296875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.10793817133874771, "frac_reward_zero_std": 0.0, "grad_norm": 0.47635403275489807, "kl": 1.5791015625, "learning_rate": 5e-05, "loss": 0.0258, "num_tokens": 9394102.0, "reward": 3.987060546875, "reward_std": 0.3077614903450012, "rewards/helpfulness_reward/mean": 3.987060546875, "rewards/helpfulness_reward/std": 0.4666336178779602, "rewards/safety_reward/mean": 5.96435546875, "rewards/safety_reward/std": 0.7972884774208069, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1321.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 153.4140625, "completions/mean_terminated_length": 153.4140625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.10828748580909964, "frac_reward_zero_std": 0.0, "grad_norm": 0.372529000043869, "kl": 1.220703125, "learning_rate": 5e-05, "loss": 0.0477, "num_tokens": 9420763.0, "reward": 3.95318603515625, "reward_std": 0.44215071201324463, "rewards/helpfulness_reward/mean": 3.95318603515625, "rewards/helpfulness_reward/std": 0.8589210510253906, "rewards/safety_reward/mean": 5.920379638671875, "rewards/safety_reward/std": 1.5470565557479858, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 107.6015625, "completions/mean_terminated_length": 107.6015625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.10863680027945158, "frac_reward_zero_std": 0.0, "grad_norm": 0.43437400460243225, "kl": 1.3037109375, "learning_rate": 5e-05, "loss": 0.0197, "num_tokens": 9439384.0, "reward": 4.0728759765625, "reward_std": 0.2803826928138733, "rewards/helpfulness_reward/mean": 4.0728759765625, "rewards/helpfulness_reward/std": 0.5476610064506531, "rewards/safety_reward/mean": 6.256591796875, "rewards/safety_reward/std": 0.642815113067627, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 103.2578125, "completions/mean_terminated_length": 103.2578125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.1089861147498035, "frac_reward_zero_std": 0.0, "grad_norm": 0.46736761927604675, "kl": 1.4716796875, "learning_rate": 5e-05, "loss": -0.0014, "num_tokens": 9456129.0, "reward": 3.9757080078125, "reward_std": 0.30341047048568726, "rewards/helpfulness_reward/mean": 3.9757080078125, "rewards/helpfulness_reward/std": 0.5380663871765137, "rewards/safety_reward/mean": 5.7646484375, "rewards/safety_reward/std": 0.8498380184173584, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 105.796875, "completions/mean_terminated_length": 105.796875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.10933542922015545, "frac_reward_zero_std": 0.0, "grad_norm": 0.4371229112148285, "kl": 1.48046875, "learning_rate": 5e-05, "loss": 0.0098, "num_tokens": 9473447.0, "reward": 4.0316162109375, "reward_std": 0.2549213767051697, "rewards/helpfulness_reward/mean": 4.0316162109375, "rewards/helpfulness_reward/std": 0.5161800384521484, "rewards/safety_reward/mean": 5.918701171875, "rewards/safety_reward/std": 0.707762598991394, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 106.6796875, "completions/mean_terminated_length": 106.6796875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.10968474369050737, "frac_reward_zero_std": 0.0, "grad_norm": 0.4271160960197449, "kl": 1.421875, "learning_rate": 5e-05, "loss": 0.015, "num_tokens": 9490926.0, "reward": 4.1610107421875, "reward_std": 0.2646925449371338, "rewards/helpfulness_reward/mean": 4.1610107421875, "rewards/helpfulness_reward/std": 0.4429912269115448, "rewards/safety_reward/mean": 6.1876220703125, "rewards/safety_reward/std": 0.8137778043746948, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 107.21875, "completions/mean_terminated_length": 107.21875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.11003405816085932, "frac_reward_zero_std": 0.0, "grad_norm": 0.41337278485298157, "kl": 1.4521484375, "learning_rate": 5e-05, "loss": 0.0179, "num_tokens": 9508938.0, "reward": 4.205078125, "reward_std": 0.27515387535095215, "rewards/helpfulness_reward/mean": 4.205078125, "rewards/helpfulness_reward/std": 0.42584121227264404, "rewards/safety_reward/mean": 6.458984375, "rewards/safety_reward/std": 0.7918304204940796, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 110.3125, "completions/mean_terminated_length": 110.3125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.11038337263121124, "frac_reward_zero_std": 0.0, "grad_norm": 0.9019113183021545, "kl": 1.796875, "learning_rate": 5e-05, "loss": 0.0141, "num_tokens": 9528122.0, "reward": 4.019287109375, "reward_std": 0.29116904735565186, "rewards/helpfulness_reward/mean": 4.019287109375, "rewards/helpfulness_reward/std": 0.3934982120990753, "rewards/safety_reward/mean": 6.119384765625, "rewards/safety_reward/std": 0.787933349609375, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 106.171875, "completions/mean_terminated_length": 106.171875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.11073268710156318, "frac_reward_zero_std": 0.0, "grad_norm": 0.628661572933197, "kl": 1.64453125, "learning_rate": 5e-05, "loss": 0.022, "num_tokens": 9545832.0, "reward": 4.1512451171875, "reward_std": 0.3296404182910919, "rewards/helpfulness_reward/mean": 4.1512451171875, "rewards/helpfulness_reward/std": 0.4900936782360077, "rewards/safety_reward/mean": 6.35986328125, "rewards/safety_reward/std": 0.7245835661888123, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 108.390625, "completions/mean_terminated_length": 108.390625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.11108200157191511, "frac_reward_zero_std": 0.0, "grad_norm": 0.42507466673851013, "kl": 1.4931640625, "learning_rate": 5e-05, "loss": 0.0218, "num_tokens": 9563434.0, "reward": 4.278564453125, "reward_std": 0.3311140537261963, "rewards/helpfulness_reward/mean": 4.278564453125, "rewards/helpfulness_reward/std": 0.46121782064437866, "rewards/safety_reward/mean": 6.410400390625, "rewards/safety_reward/std": 0.7456233501434326, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 106.734375, "completions/mean_terminated_length": 106.734375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.11143131604226705, "frac_reward_zero_std": 0.0, "grad_norm": 0.43109819293022156, "kl": 1.5859375, "learning_rate": 5e-05, "loss": 0.0153, "num_tokens": 9580888.0, "reward": 3.9736328125, "reward_std": 0.2924015522003174, "rewards/helpfulness_reward/mean": 3.9736328125, "rewards/helpfulness_reward/std": 0.4199717044830322, "rewards/safety_reward/mean": 6.01806640625, "rewards/safety_reward/std": 0.7123787999153137, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 108.359375, "completions/mean_terminated_length": 108.359375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.11178063051261898, "frac_reward_zero_std": 0.0, "grad_norm": 0.43054723739624023, "kl": 1.6796875, "learning_rate": 5e-05, "loss": 0.0252, "num_tokens": 9598454.0, "reward": 4.110595703125, "reward_std": 0.3191021680831909, "rewards/helpfulness_reward/mean": 4.110595703125, "rewards/helpfulness_reward/std": 0.4849227964878082, "rewards/safety_reward/mean": 6.162841796875, "rewards/safety_reward/std": 0.6793387532234192, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 111.453125, "completions/mean_terminated_length": 111.453125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.11212994498297092, "frac_reward_zero_std": 0.0, "grad_norm": 0.43980690836906433, "kl": 1.52734375, "learning_rate": 5e-05, "loss": 0.0208, "num_tokens": 9617096.0, "reward": 4.17999267578125, "reward_std": 0.4007267951965332, "rewards/helpfulness_reward/mean": 4.17999267578125, "rewards/helpfulness_reward/std": 0.5875265002250671, "rewards/safety_reward/mean": 6.24945068359375, "rewards/safety_reward/std": 0.9395115375518799, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 111.7890625, "completions/mean_terminated_length": 111.7890625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.11247925945332285, "frac_reward_zero_std": 0.0, "grad_norm": 0.6016884446144104, "kl": 1.880859375, "learning_rate": 5e-05, "loss": 0.0344, "num_tokens": 9636581.0, "reward": 4.05975341796875, "reward_std": 0.35905593633651733, "rewards/helpfulness_reward/mean": 4.05975341796875, "rewards/helpfulness_reward/std": 0.5965315699577332, "rewards/safety_reward/mean": 6.1094970703125, "rewards/safety_reward/std": 0.912752628326416, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 109.0390625, "completions/mean_terminated_length": 109.0390625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.11282857392367479, "frac_reward_zero_std": 0.0, "grad_norm": 2.2988810539245605, "kl": 2.5087890625, "learning_rate": 5e-05, "loss": 0.0504, "num_tokens": 9654370.0, "reward": 4.094482421875, "reward_std": 0.3646177053451538, "rewards/helpfulness_reward/mean": 4.094482421875, "rewards/helpfulness_reward/std": 0.4883408546447754, "rewards/safety_reward/mean": 6.3740234375, "rewards/safety_reward/std": 0.7057018876075745, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 108.2109375, "completions/mean_terminated_length": 108.2109375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.11317788839402672, "frac_reward_zero_std": 0.0, "grad_norm": 0.4373651146888733, "kl": 1.595703125, "learning_rate": 5e-05, "loss": 0.0148, "num_tokens": 9671813.0, "reward": 4.037841796875, "reward_std": 0.42311322689056396, "rewards/helpfulness_reward/mean": 4.037841796875, "rewards/helpfulness_reward/std": 0.5898246765136719, "rewards/safety_reward/mean": 6.217529296875, "rewards/safety_reward/std": 0.790215790271759, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 107.765625, "completions/mean_terminated_length": 107.765625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.11352720286437866, "frac_reward_zero_std": 0.0, "grad_norm": 0.4153093099594116, "kl": 1.6572265625, "learning_rate": 5e-05, "loss": -0.0093, "num_tokens": 9690423.0, "reward": 3.8837966918945312, "reward_std": 0.5323872566223145, "rewards/helpfulness_reward/mean": 3.8837966918945312, "rewards/helpfulness_reward/std": 0.8980302214622498, "rewards/safety_reward/mean": 5.8475341796875, "rewards/safety_reward/std": 0.8836110234260559, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 110.25, "completions/mean_terminated_length": 110.25, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.11387651733473059, "frac_reward_zero_std": 0.0, "grad_norm": 0.4557141661643982, "kl": 1.5029296875, "learning_rate": 5e-05, "loss": 0.0022, "num_tokens": 9708519.0, "reward": 4.00262451171875, "reward_std": 0.4985986351966858, "rewards/helpfulness_reward/mean": 4.00262451171875, "rewards/helpfulness_reward/std": 0.7185595631599426, "rewards/safety_reward/mean": 6.092315673828125, "rewards/safety_reward/std": 0.9997062683105469, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 112.890625, "completions/mean_terminated_length": 112.890625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.11422583180508253, "frac_reward_zero_std": 0.0, "grad_norm": 0.42338523268699646, "kl": 1.5556640625, "learning_rate": 5e-05, "loss": 0.0244, "num_tokens": 9727641.0, "reward": 3.879150390625, "reward_std": 0.4458491802215576, "rewards/helpfulness_reward/mean": 3.879150390625, "rewards/helpfulness_reward/std": 0.6667603850364685, "rewards/safety_reward/mean": 6.03662109375, "rewards/safety_reward/std": 0.9242594242095947, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 111.828125, "completions/mean_terminated_length": 111.828125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.11457514627543446, "frac_reward_zero_std": 0.0, "grad_norm": 0.43305087089538574, "kl": 1.75390625, "learning_rate": 5e-05, "loss": 0.024, "num_tokens": 9745547.0, "reward": 4.0416259765625, "reward_std": 0.3775418698787689, "rewards/helpfulness_reward/mean": 4.0416259765625, "rewards/helpfulness_reward/std": 0.5026316046714783, "rewards/safety_reward/mean": 6.018310546875, "rewards/safety_reward/std": 0.8438390493392944, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 111.0625, "completions/mean_terminated_length": 111.0625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.1149244607457864, "frac_reward_zero_std": 0.0, "grad_norm": 0.4907342791557312, "kl": 1.529296875, "learning_rate": 5e-05, "loss": 0.0173, "num_tokens": 9764115.0, "reward": 4.222869873046875, "reward_std": 0.3766711950302124, "rewards/helpfulness_reward/mean": 4.222869873046875, "rewards/helpfulness_reward/std": 0.6356297135353088, "rewards/safety_reward/mean": 6.442626953125, "rewards/safety_reward/std": 0.71257483959198, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 112.953125, "completions/mean_terminated_length": 112.953125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.11527377521613832, "frac_reward_zero_std": 0.0, "grad_norm": 0.4212454855442047, "kl": 1.6005859375, "learning_rate": 5e-05, "loss": -0.0022, "num_tokens": 9784461.0, "reward": 3.8851318359375, "reward_std": 0.4019268751144409, "rewards/helpfulness_reward/mean": 3.8851318359375, "rewards/helpfulness_reward/std": 0.5910454988479614, "rewards/safety_reward/mean": 5.9556884765625, "rewards/safety_reward/std": 0.8846373558044434, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 111.5390625, "completions/mean_terminated_length": 111.5390625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.11562308968649027, "frac_reward_zero_std": 0.0, "grad_norm": 0.458936482667923, "kl": 1.548828125, "learning_rate": 5e-05, "loss": 0.011, "num_tokens": 9803690.0, "reward": 3.776275634765625, "reward_std": 0.43174493312835693, "rewards/helpfulness_reward/mean": 3.776275634765625, "rewards/helpfulness_reward/std": 0.8288151621818542, "rewards/safety_reward/mean": 5.774658203125, "rewards/safety_reward/std": 1.271161675453186, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1792.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 146.078125, "completions/mean_terminated_length": 119.95238494873047, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.1159724041568422, "frac_reward_zero_std": 0.0, "grad_norm": 0.7795074582099915, "kl": 1.6845703125, "learning_rate": 5e-05, "loss": 0.1729, "num_tokens": 9827412.0, "reward": 3.952545166015625, "reward_std": 0.5435460209846497, "rewards/helpfulness_reward/mean": 3.952545166015625, "rewards/helpfulness_reward/std": 0.9042646884918213, "rewards/safety_reward/mean": 6.018951416015625, "rewards/safety_reward/std": 1.3185662031173706, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 114.8671875, "completions/mean_terminated_length": 114.8671875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.11632171862719413, "frac_reward_zero_std": 0.0, "grad_norm": 0.46157997846603394, "kl": 1.6162109375, "learning_rate": 5e-05, "loss": 0.0528, "num_tokens": 9845771.0, "reward": 4.274658203125, "reward_std": 0.4089217185974121, "rewards/helpfulness_reward/mean": 4.274658203125, "rewards/helpfulness_reward/std": 0.5117596387863159, "rewards/safety_reward/mean": 6.41943359375, "rewards/safety_reward/std": 0.7028377056121826, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 111.03125, "completions/mean_terminated_length": 111.03125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.11667103309754606, "frac_reward_zero_std": 0.0, "grad_norm": 0.4046136438846588, "kl": 1.552734375, "learning_rate": 5e-05, "loss": 0.0285, "num_tokens": 9864311.0, "reward": 4.0731201171875, "reward_std": 0.315887451171875, "rewards/helpfulness_reward/mean": 4.0731201171875, "rewards/helpfulness_reward/std": 0.6078634262084961, "rewards/safety_reward/mean": 6.2470703125, "rewards/safety_reward/std": 0.8564760684967041, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1075.0, "completions/max_terminated_length": 1075.0, "completions/mean_length": 134.3515625, "completions/mean_terminated_length": 134.3515625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.117020347567898, "frac_reward_zero_std": 0.0, "grad_norm": 0.42076176404953003, "kl": 1.3720703125, "learning_rate": 5e-05, "loss": 0.0758, "num_tokens": 9886460.0, "reward": 3.8952178955078125, "reward_std": 0.42855626344680786, "rewards/helpfulness_reward/mean": 3.8952178955078125, "rewards/helpfulness_reward/std": 0.8601832985877991, "rewards/safety_reward/mean": 5.76568603515625, "rewards/safety_reward/std": 1.4085227251052856, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 112.5546875, "completions/mean_terminated_length": 112.5546875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.11736966203824993, "frac_reward_zero_std": 0.0, "grad_norm": 0.44047483801841736, "kl": 1.671875, "learning_rate": 5e-05, "loss": 0.02, "num_tokens": 9904947.0, "reward": 4.0750732421875, "reward_std": 0.4173586964607239, "rewards/helpfulness_reward/mean": 4.0750732421875, "rewards/helpfulness_reward/std": 0.5474952459335327, "rewards/safety_reward/mean": 6.11865234375, "rewards/safety_reward/std": 1.0113438367843628, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 115.9296875, "completions/mean_terminated_length": 115.9296875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.11771897650860187, "frac_reward_zero_std": 0.0, "grad_norm": 0.3753667175769806, "kl": 1.5595703125, "learning_rate": 5e-05, "loss": 0.0247, "num_tokens": 9923530.0, "reward": 3.88128662109375, "reward_std": 0.40180012583732605, "rewards/helpfulness_reward/mean": 3.88128662109375, "rewards/helpfulness_reward/std": 0.8362138867378235, "rewards/safety_reward/mean": 5.816009521484375, "rewards/safety_reward/std": 1.2217949628829956, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 115.6328125, "completions/mean_terminated_length": 115.6328125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.1180682909789538, "frac_reward_zero_std": 0.0, "grad_norm": 0.376441091299057, "kl": 1.6865234375, "learning_rate": 5e-05, "loss": 0.0187, "num_tokens": 9943003.0, "reward": 4.10205078125, "reward_std": 0.3907288908958435, "rewards/helpfulness_reward/mean": 4.10205078125, "rewards/helpfulness_reward/std": 0.5033297538757324, "rewards/safety_reward/mean": 6.097412109375, "rewards/safety_reward/std": 0.6541153788566589, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 111.359375, "completions/mean_terminated_length": 111.359375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.11841760544930574, "frac_reward_zero_std": 0.0, "grad_norm": 0.44957801699638367, "kl": 1.8251953125, "learning_rate": 5e-05, "loss": 0.0402, "num_tokens": 9962873.0, "reward": 3.786041259765625, "reward_std": 0.5093427300453186, "rewards/helpfulness_reward/mean": 3.786041259765625, "rewards/helpfulness_reward/std": 0.7572989463806152, "rewards/safety_reward/mean": 5.6873779296875, "rewards/safety_reward/std": 0.9246941208839417, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 113.421875, "completions/mean_terminated_length": 113.421875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.11876691991965767, "frac_reward_zero_std": 0.0, "grad_norm": 0.4595985412597656, "kl": 1.6962890625, "learning_rate": 5e-05, "loss": 0.0235, "num_tokens": 9982631.0, "reward": 4.024757385253906, "reward_std": 0.415099561214447, "rewards/helpfulness_reward/mean": 4.024757385253906, "rewards/helpfulness_reward/std": 0.687984824180603, "rewards/safety_reward/mean": 6.0821533203125, "rewards/safety_reward/std": 0.9678670167922974, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 113.3828125, "completions/mean_terminated_length": 113.3828125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.11911623439000961, "frac_reward_zero_std": 0.0, "grad_norm": 0.4188237488269806, "kl": 1.712890625, "learning_rate": 5e-05, "loss": 0.0292, "num_tokens": 10000720.0, "reward": 4.0242919921875, "reward_std": 0.48657044768333435, "rewards/helpfulness_reward/mean": 4.0242919921875, "rewards/helpfulness_reward/std": 0.6698126792907715, "rewards/safety_reward/mean": 6.0828857421875, "rewards/safety_reward/std": 0.8422439098358154, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 115.6875, "completions/mean_terminated_length": 115.6875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.11946554886036154, "frac_reward_zero_std": 0.0, "grad_norm": 0.4032737910747528, "kl": 1.6396484375, "learning_rate": 5e-05, "loss": 0.0389, "num_tokens": 10019608.0, "reward": 4.04736328125, "reward_std": 0.3278951644897461, "rewards/helpfulness_reward/mean": 4.04736328125, "rewards/helpfulness_reward/std": 0.4659733176231384, "rewards/safety_reward/mean": 6.1336669921875, "rewards/safety_reward/std": 0.7593151330947876, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 121.7578125, "completions/mean_terminated_length": 121.7578125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.11981486333071348, "frac_reward_zero_std": 0.0, "grad_norm": 0.44113820791244507, "kl": 1.5791015625, "learning_rate": 5e-05, "loss": 0.0624, "num_tokens": 10040057.0, "reward": 4.0131378173828125, "reward_std": 0.4099041819572449, "rewards/helpfulness_reward/mean": 4.0131378173828125, "rewards/helpfulness_reward/std": 0.7391282916069031, "rewards/safety_reward/mean": 6.01629638671875, "rewards/safety_reward/std": 1.2447385787963867, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 112.421875, "completions/mean_terminated_length": 112.421875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.1201641778010654, "frac_reward_zero_std": 0.0, "grad_norm": 0.4507156014442444, "kl": 1.701171875, "learning_rate": 5e-05, "loss": 0.0254, "num_tokens": 10058695.0, "reward": 4.2257080078125, "reward_std": 0.2649673819541931, "rewards/helpfulness_reward/mean": 4.2257080078125, "rewards/helpfulness_reward/std": 0.4157070517539978, "rewards/safety_reward/mean": 6.27197265625, "rewards/safety_reward/std": 0.5986363887786865, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 111.7890625, "completions/mean_terminated_length": 111.7890625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.12051349227141735, "frac_reward_zero_std": 0.0, "grad_norm": 0.4339616000652313, "kl": 1.6259765625, "learning_rate": 5e-05, "loss": 0.0165, "num_tokens": 10077844.0, "reward": 4.0623779296875, "reward_std": 0.3892003297805786, "rewards/helpfulness_reward/mean": 4.0623779296875, "rewards/helpfulness_reward/std": 0.5523314476013184, "rewards/safety_reward/mean": 6.0670166015625, "rewards/safety_reward/std": 0.6815721392631531, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 112.3828125, "completions/mean_terminated_length": 112.3828125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.12086280674176927, "frac_reward_zero_std": 0.0, "grad_norm": 0.46301206946372986, "kl": 1.73046875, "learning_rate": 5e-05, "loss": 0.0367, "num_tokens": 10097189.0, "reward": 3.97674560546875, "reward_std": 0.4500780701637268, "rewards/helpfulness_reward/mean": 3.97674560546875, "rewards/helpfulness_reward/std": 0.641715943813324, "rewards/safety_reward/mean": 6.1123046875, "rewards/safety_reward/std": 0.8370333909988403, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 114.0390625, "completions/mean_terminated_length": 114.0390625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.12121212121212122, "frac_reward_zero_std": 0.0, "grad_norm": 0.4458199143409729, "kl": 1.662109375, "learning_rate": 5e-05, "loss": 0.0179, "num_tokens": 10116450.0, "reward": 4.2481689453125, "reward_std": 0.353423535823822, "rewards/helpfulness_reward/mean": 4.2481689453125, "rewards/helpfulness_reward/std": 0.5094555616378784, "rewards/safety_reward/mean": 6.376220703125, "rewards/safety_reward/std": 0.5666459202766418, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 111.25, "completions/mean_terminated_length": 111.25, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.12156143568247314, "frac_reward_zero_std": 0.0, "grad_norm": 0.4343351125717163, "kl": 1.705078125, "learning_rate": 5e-05, "loss": 0.0046, "num_tokens": 10135178.0, "reward": 4.337158203125, "reward_std": 0.28935009241104126, "rewards/helpfulness_reward/mean": 4.337158203125, "rewards/helpfulness_reward/std": 0.41474318504333496, "rewards/safety_reward/mean": 6.505615234375, "rewards/safety_reward/std": 0.6936290860176086, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 112.3828125, "completions/mean_terminated_length": 112.3828125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.12191075015282508, "frac_reward_zero_std": 0.0, "grad_norm": 0.4614581763744354, "kl": 1.705078125, "learning_rate": 5e-05, "loss": 0.0113, "num_tokens": 10153827.0, "reward": 4.345947265625, "reward_std": 0.306785523891449, "rewards/helpfulness_reward/mean": 4.345947265625, "rewards/helpfulness_reward/std": 0.4655225872993469, "rewards/safety_reward/mean": 6.4013671875, "rewards/safety_reward/std": 0.7313171029090881, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 111.125, "completions/mean_terminated_length": 111.125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.12226006462317701, "frac_reward_zero_std": 0.0, "grad_norm": 0.42343345284461975, "kl": 1.76953125, "learning_rate": 5e-05, "loss": 0.0158, "num_tokens": 10172179.0, "reward": 4.4010009765625, "reward_std": 0.30150729417800903, "rewards/helpfulness_reward/mean": 4.4010009765625, "rewards/helpfulness_reward/std": 0.402127742767334, "rewards/safety_reward/mean": 6.4091796875, "rewards/safety_reward/std": 0.6563781499862671, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 116.0625, "completions/mean_terminated_length": 116.0625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.12260937909352895, "frac_reward_zero_std": 0.0, "grad_norm": 0.4303966164588928, "kl": 1.7646484375, "learning_rate": 5e-05, "loss": 0.02, "num_tokens": 10192579.0, "reward": 4.35247802734375, "reward_std": 0.3747626841068268, "rewards/helpfulness_reward/mean": 4.35247802734375, "rewards/helpfulness_reward/std": 0.6417227983474731, "rewards/safety_reward/mean": 6.358154296875, "rewards/safety_reward/std": 0.786042332649231, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 111.796875, "completions/mean_terminated_length": 111.796875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.12295869356388088, "frac_reward_zero_std": 0.0, "grad_norm": 0.4517495632171631, "kl": 1.8505859375, "learning_rate": 5e-05, "loss": 0.0162, "num_tokens": 10210689.0, "reward": 4.2503662109375, "reward_std": 0.31925517320632935, "rewards/helpfulness_reward/mean": 4.2503662109375, "rewards/helpfulness_reward/std": 0.4336225390434265, "rewards/safety_reward/mean": 6.156982421875, "rewards/safety_reward/std": 0.5748783946037292, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 110.078125, "completions/mean_terminated_length": 110.078125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.12330800803423282, "frac_reward_zero_std": 0.0, "grad_norm": 1.0845913887023926, "kl": 2.1748046875, "learning_rate": 5e-05, "loss": 0.0168, "num_tokens": 10230259.0, "reward": 4.19281005859375, "reward_std": 0.36382752656936646, "rewards/helpfulness_reward/mean": 4.19281005859375, "rewards/helpfulness_reward/std": 0.6479892730712891, "rewards/safety_reward/mean": 6.0157470703125, "rewards/safety_reward/std": 0.9090911746025085, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 107.25, "completions/mean_terminated_length": 107.25, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.12365732250458475, "frac_reward_zero_std": 0.0, "grad_norm": 0.4120347797870636, "kl": 1.744140625, "learning_rate": 5e-05, "loss": 0.0047, "num_tokens": 10248579.0, "reward": 4.232421875, "reward_std": 0.24980154633522034, "rewards/helpfulness_reward/mean": 4.232421875, "rewards/helpfulness_reward/std": 0.3486199975013733, "rewards/safety_reward/mean": 6.573974609375, "rewards/safety_reward/std": 0.5116878151893616, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 111.2890625, "completions/mean_terminated_length": 111.2890625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.12400663697493669, "frac_reward_zero_std": 0.0, "grad_norm": 0.5476062893867493, "kl": 1.69921875, "learning_rate": 5e-05, "loss": 0.0003, "num_tokens": 10266824.0, "reward": 4.357421875, "reward_std": 0.29638001322746277, "rewards/helpfulness_reward/mean": 4.357421875, "rewards/helpfulness_reward/std": 0.419817179441452, "rewards/safety_reward/mean": 6.537841796875, "rewards/safety_reward/std": 0.6100532412528992, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 109.21875, "completions/mean_terminated_length": 109.21875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.12435595144528862, "frac_reward_zero_std": 0.0, "grad_norm": 0.46934932470321655, "kl": 1.7158203125, "learning_rate": 5e-05, "loss": 0.007, "num_tokens": 10288356.0, "reward": 3.94354248046875, "reward_std": 0.3888246715068817, "rewards/helpfulness_reward/mean": 3.94354248046875, "rewards/helpfulness_reward/std": 0.8479625582695007, "rewards/safety_reward/mean": 5.915740966796875, "rewards/safety_reward/std": 1.3724870681762695, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 111.8359375, "completions/mean_terminated_length": 111.8359375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.12470526591564056, "frac_reward_zero_std": 0.0, "grad_norm": 0.5089754462242126, "kl": 1.87109375, "learning_rate": 5e-05, "loss": 0.0013, "num_tokens": 10307255.0, "reward": 4.3184814453125, "reward_std": 0.43145042657852173, "rewards/helpfulness_reward/mean": 4.3184814453125, "rewards/helpfulness_reward/std": 0.5989274978637695, "rewards/safety_reward/mean": 6.43994140625, "rewards/safety_reward/std": 0.7906336784362793, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 110.234375, "completions/mean_terminated_length": 110.234375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.1250545803859925, "frac_reward_zero_std": 0.0, "grad_norm": 0.7643000483512878, "kl": 1.767578125, "learning_rate": 5e-05, "loss": -0.0227, "num_tokens": 10326005.0, "reward": 4.20013427734375, "reward_std": 0.5718082189559937, "rewards/helpfulness_reward/mean": 4.20013427734375, "rewards/helpfulness_reward/std": 0.6444911360740662, "rewards/safety_reward/mean": 6.2330322265625, "rewards/safety_reward/std": 0.7717259526252747, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 114.9453125, "completions/mean_terminated_length": 114.9453125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.12540389485634443, "frac_reward_zero_std": 0.0, "grad_norm": 0.6768120527267456, "kl": 1.970703125, "learning_rate": 5e-05, "loss": 0.0183, "num_tokens": 10344502.0, "reward": 4.412109375, "reward_std": 0.4090580940246582, "rewards/helpfulness_reward/mean": 4.412109375, "rewards/helpfulness_reward/std": 0.6094680428504944, "rewards/safety_reward/mean": 6.56591796875, "rewards/safety_reward/std": 0.8042933940887451, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 113.8046875, "completions/mean_terminated_length": 113.8046875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.12575320932669637, "frac_reward_zero_std": 0.0, "grad_norm": 4059.3828125, "kl": 1445.412109375, "learning_rate": 5e-05, "loss": 14.4572, "num_tokens": 10364461.0, "reward": 4.228904724121094, "reward_std": 0.379755437374115, "rewards/helpfulness_reward/mean": 4.228904724121094, "rewards/helpfulness_reward/std": 1.0990617275238037, "rewards/safety_reward/mean": 6.10516357421875, "rewards/safety_reward/std": 1.3030049800872803, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 114.484375, "completions/mean_terminated_length": 114.484375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.12610252379704828, "frac_reward_zero_std": 0.0, "grad_norm": 0.45230740308761597, "kl": 1.859375, "learning_rate": 5e-05, "loss": 0.0207, "num_tokens": 10382899.0, "reward": 4.4703369140625, "reward_std": 0.27883970737457275, "rewards/helpfulness_reward/mean": 4.4703369140625, "rewards/helpfulness_reward/std": 0.5995233058929443, "rewards/safety_reward/mean": 6.336181640625, "rewards/safety_reward/std": 0.8333166241645813, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 116.828125, "completions/mean_terminated_length": 116.828125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.12645183826740022, "frac_reward_zero_std": 0.0, "grad_norm": 0.44868022203445435, "kl": 1.8525390625, "learning_rate": 5e-05, "loss": 0.0183, "num_tokens": 10402205.0, "reward": 4.4471435546875, "reward_std": 0.23160088062286377, "rewards/helpfulness_reward/mean": 4.4471435546875, "rewards/helpfulness_reward/std": 0.39119356870651245, "rewards/safety_reward/mean": 6.497314453125, "rewards/safety_reward/std": 0.4970543682575226, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 117.0546875, "completions/mean_terminated_length": 117.0546875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.12680115273775217, "frac_reward_zero_std": 0.0, "grad_norm": 0.6754857897758484, "kl": 2.0849609375, "learning_rate": 5e-05, "loss": 0.024, "num_tokens": 10421388.0, "reward": 4.29571533203125, "reward_std": 0.29677605628967285, "rewards/helpfulness_reward/mean": 4.29571533203125, "rewards/helpfulness_reward/std": 0.6515824198722839, "rewards/safety_reward/mean": 6.11376953125, "rewards/safety_reward/std": 0.7981656193733215, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 119.7265625, "completions/mean_terminated_length": 119.7265625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.1271504672081041, "frac_reward_zero_std": 0.0, "grad_norm": 0.4765739440917969, "kl": 1.828125, "learning_rate": 5e-05, "loss": 0.0321, "num_tokens": 10441129.0, "reward": 4.3975830078125, "reward_std": 0.26250654458999634, "rewards/helpfulness_reward/mean": 4.3975830078125, "rewards/helpfulness_reward/std": 0.4868898391723633, "rewards/safety_reward/mean": 6.522216796875, "rewards/safety_reward/std": 0.6297574043273926, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 117.0859375, "completions/mean_terminated_length": 117.0859375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.12749978167845602, "frac_reward_zero_std": 0.0, "grad_norm": 0.4819450080394745, "kl": 1.669921875, "learning_rate": 5e-05, "loss": 0.0118, "num_tokens": 10460964.0, "reward": 4.4251708984375, "reward_std": 0.2654111385345459, "rewards/helpfulness_reward/mean": 4.4251708984375, "rewards/helpfulness_reward/std": 0.37539783120155334, "rewards/safety_reward/mean": 6.542724609375, "rewards/safety_reward/std": 0.6302299499511719, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 119.4609375, "completions/mean_terminated_length": 119.4609375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.12784909614880796, "frac_reward_zero_std": 0.0, "grad_norm": 0.4423789083957672, "kl": 1.712890625, "learning_rate": 5e-05, "loss": 0.0264, "num_tokens": 10480471.0, "reward": 4.4443359375, "reward_std": 0.2835114598274231, "rewards/helpfulness_reward/mean": 4.4443359375, "rewards/helpfulness_reward/std": 0.4816257953643799, "rewards/safety_reward/mean": 6.41015625, "rewards/safety_reward/std": 0.4902312159538269, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 117.625, "completions/mean_terminated_length": 117.625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.1281984106191599, "frac_reward_zero_std": 0.0, "grad_norm": 0.44393491744995117, "kl": 1.697265625, "learning_rate": 5e-05, "loss": 0.0242, "num_tokens": 10499751.0, "reward": 4.43695068359375, "reward_std": 0.3050681948661804, "rewards/helpfulness_reward/mean": 4.43695068359375, "rewards/helpfulness_reward/std": 0.4868374764919281, "rewards/safety_reward/mean": 6.555419921875, "rewards/safety_reward/std": 0.7843708395957947, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 122.1484375, "completions/mean_terminated_length": 122.1484375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.12854772508951184, "frac_reward_zero_std": 0.0, "grad_norm": 0.4482097029685974, "kl": 1.7119140625, "learning_rate": 5e-05, "loss": 0.034, "num_tokens": 10519450.0, "reward": 4.490234375, "reward_std": 0.3206648528575897, "rewards/helpfulness_reward/mean": 4.490234375, "rewards/helpfulness_reward/std": 0.42777344584465027, "rewards/safety_reward/mean": 6.501220703125, "rewards/safety_reward/std": 0.565260112285614, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 118.8046875, "completions/mean_terminated_length": 118.8046875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.12889703955986376, "frac_reward_zero_std": 0.0, "grad_norm": 0.4592204689979553, "kl": 1.75390625, "learning_rate": 5e-05, "loss": 0.0353, "num_tokens": 10538185.0, "reward": 4.4349365234375, "reward_std": 0.2915343940258026, "rewards/helpfulness_reward/mean": 4.4349365234375, "rewards/helpfulness_reward/std": 0.46013981103897095, "rewards/safety_reward/mean": 6.40673828125, "rewards/safety_reward/std": 0.5155520439147949, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 117.9765625, "completions/mean_terminated_length": 117.9765625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.1292463540302157, "frac_reward_zero_std": 0.0, "grad_norm": 1421.6368408203125, "kl": 144.40234375, "learning_rate": 5e-05, "loss": 1.4734, "num_tokens": 10560510.0, "reward": 4.223876953125, "reward_std": 0.4165439009666443, "rewards/helpfulness_reward/mean": 4.223876953125, "rewards/helpfulness_reward/std": 0.9832175374031067, "rewards/safety_reward/mean": 6.130973815917969, "rewards/safety_reward/std": 1.4182353019714355, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 116.7890625, "completions/mean_terminated_length": 116.7890625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.12959566850056764, "frac_reward_zero_std": 0.0, "grad_norm": 0.48371294140815735, "kl": 1.8837890625, "learning_rate": 5e-05, "loss": 0.0216, "num_tokens": 10580107.0, "reward": 4.2353515625, "reward_std": 0.36693882942199707, "rewards/helpfulness_reward/mean": 4.2353515625, "rewards/helpfulness_reward/std": 0.6791297197341919, "rewards/safety_reward/mean": 6.2567138671875, "rewards/safety_reward/std": 0.9234067797660828, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 119.828125, "completions/mean_terminated_length": 119.828125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.12994498297091958, "frac_reward_zero_std": 0.0, "grad_norm": 0.40997952222824097, "kl": 1.751953125, "learning_rate": 5e-05, "loss": 0.0345, "num_tokens": 10600773.0, "reward": 4.58935546875, "reward_std": 0.3042997121810913, "rewards/helpfulness_reward/mean": 4.58935546875, "rewards/helpfulness_reward/std": 0.5074756741523743, "rewards/safety_reward/mean": 6.57666015625, "rewards/safety_reward/std": 0.745948076248169, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 118.625, "completions/mean_terminated_length": 118.625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.1302942974412715, "frac_reward_zero_std": 0.0, "grad_norm": 0.3869610130786896, "kl": 1.7158203125, "learning_rate": 5e-05, "loss": 0.0285, "num_tokens": 10619941.0, "reward": 4.3782958984375, "reward_std": 0.28530117869377136, "rewards/helpfulness_reward/mean": 4.3782958984375, "rewards/helpfulness_reward/std": 0.5275047421455383, "rewards/safety_reward/mean": 6.43505859375, "rewards/safety_reward/std": 0.5541901588439941, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 117.0546875, "completions/mean_terminated_length": 117.0546875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.13064361191162344, "frac_reward_zero_std": 0.0, "grad_norm": 0.4691081643104553, "kl": 1.6953125, "learning_rate": 5e-05, "loss": 0.0385, "num_tokens": 10638924.0, "reward": 4.4849853515625, "reward_std": 0.2887815833091736, "rewards/helpfulness_reward/mean": 4.4849853515625, "rewards/helpfulness_reward/std": 0.41430163383483887, "rewards/safety_reward/mean": 6.552001953125, "rewards/safety_reward/std": 0.6967906951904297, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 115.5859375, "completions/mean_terminated_length": 115.5859375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.13099292638197538, "frac_reward_zero_std": 0.0, "grad_norm": 0.4495319724082947, "kl": 1.7373046875, "learning_rate": 5e-05, "loss": 0.0357, "num_tokens": 10658871.0, "reward": 4.191162109375, "reward_std": 0.3131342828273773, "rewards/helpfulness_reward/mean": 4.191162109375, "rewards/helpfulness_reward/std": 0.5714224576950073, "rewards/safety_reward/mean": 6.421630859375, "rewards/safety_reward/std": 0.763201892375946, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 124.6875, "completions/mean_terminated_length": 124.6875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.13134224085232732, "frac_reward_zero_std": 0.0, "grad_norm": 0.43148306012153625, "kl": 1.724609375, "learning_rate": 5e-05, "loss": 0.0066, "num_tokens": 10682407.0, "reward": 3.962973117828369, "reward_std": 0.35788822174072266, "rewards/helpfulness_reward/mean": 3.962973117828369, "rewards/helpfulness_reward/std": 1.3685803413391113, "rewards/safety_reward/mean": 5.8488006591796875, "rewards/safety_reward/std": 1.7087007761001587, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 114.984375, "completions/mean_terminated_length": 114.984375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.13169155532267923, "frac_reward_zero_std": 0.0, "grad_norm": 0.47372308373451233, "kl": 1.8662109375, "learning_rate": 5e-05, "loss": 0.0246, "num_tokens": 10701133.0, "reward": 4.306640625, "reward_std": 0.3154146671295166, "rewards/helpfulness_reward/mean": 4.306640625, "rewards/helpfulness_reward/std": 0.5016236901283264, "rewards/safety_reward/mean": 6.353271484375, "rewards/safety_reward/std": 0.6274381279945374, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 114.453125, "completions/mean_terminated_length": 114.453125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.13204086979303117, "frac_reward_zero_std": 0.0, "grad_norm": 0.48817697167396545, "kl": 1.8388671875, "learning_rate": 5e-05, "loss": 0.0174, "num_tokens": 10721471.0, "reward": 4.352783203125, "reward_std": 0.33604103326797485, "rewards/helpfulness_reward/mean": 4.352783203125, "rewards/helpfulness_reward/std": 0.5214568972587585, "rewards/safety_reward/mean": 6.40283203125, "rewards/safety_reward/std": 0.662265419960022, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 138.171875, "completions/mean_terminated_length": 125.14960479736328, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.13239018426338311, "frac_reward_zero_std": 0.0, "grad_norm": 0.4513660669326782, "kl": 1.5205078125, "learning_rate": 5e-05, "loss": 0.0183, "num_tokens": 10746341.0, "reward": 3.9185447692871094, "reward_std": 0.34087371826171875, "rewards/helpfulness_reward/mean": 3.9185447692871094, "rewards/helpfulness_reward/std": 1.3710689544677734, "rewards/safety_reward/mean": 5.89227294921875, "rewards/safety_reward/std": 2.110008955001831, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 113.1484375, "completions/mean_terminated_length": 113.1484375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.13273949873373506, "frac_reward_zero_std": 0.0, "grad_norm": 0.4811563789844513, "kl": 1.84375, "learning_rate": 5e-05, "loss": 0.0137, "num_tokens": 10765736.0, "reward": 4.238067626953125, "reward_std": 0.4117184281349182, "rewards/helpfulness_reward/mean": 4.238067626953125, "rewards/helpfulness_reward/std": 0.6517507433891296, "rewards/safety_reward/mean": 6.38214111328125, "rewards/safety_reward/std": 1.0447874069213867, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 105.5234375, "completions/mean_terminated_length": 105.5234375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.13308881320408697, "frac_reward_zero_std": 0.0, "grad_norm": 0.49737799167633057, "kl": 1.93359375, "learning_rate": 5e-05, "loss": 0.0052, "num_tokens": 10783395.0, "reward": 4.1555328369140625, "reward_std": 0.3576328456401825, "rewards/helpfulness_reward/mean": 4.1555328369140625, "rewards/helpfulness_reward/std": 0.9545426964759827, "rewards/safety_reward/mean": 6.040283203125, "rewards/safety_reward/std": 1.4605706930160522, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 111.1484375, "completions/mean_terminated_length": 111.1484375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.1334381276744389, "frac_reward_zero_std": 0.0, "grad_norm": 0.44490474462509155, "kl": 1.9091796875, "learning_rate": 5e-05, "loss": 0.0135, "num_tokens": 10801934.0, "reward": 4.5029296875, "reward_std": 0.27163583040237427, "rewards/helpfulness_reward/mean": 4.5029296875, "rewards/helpfulness_reward/std": 0.40179046988487244, "rewards/safety_reward/mean": 6.57421875, "rewards/safety_reward/std": 0.5896499156951904, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 111.7578125, "completions/mean_terminated_length": 111.7578125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.13378744214479085, "frac_reward_zero_std": 0.0, "grad_norm": 0.4579738676548004, "kl": 1.798828125, "learning_rate": 5e-05, "loss": -0.0071, "num_tokens": 10821887.0, "reward": 4.095436096191406, "reward_std": 0.3987562656402588, "rewards/helpfulness_reward/mean": 4.095436096191406, "rewards/helpfulness_reward/std": 1.3439409732818604, "rewards/safety_reward/mean": 6.049468994140625, "rewards/safety_reward/std": 1.7639716863632202, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 111.2421875, "completions/mean_terminated_length": 111.2421875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.1341367566151428, "frac_reward_zero_std": 0.0, "grad_norm": 0.44025009870529175, "kl": 1.9638671875, "learning_rate": 5e-05, "loss": 0.0178, "num_tokens": 10840246.0, "reward": 4.401123046875, "reward_std": 0.26798582077026367, "rewards/helpfulness_reward/mean": 4.401123046875, "rewards/helpfulness_reward/std": 0.43625080585479736, "rewards/safety_reward/mean": 6.450927734375, "rewards/safety_reward/std": 0.5284954309463501, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 111.5234375, "completions/mean_terminated_length": 111.5234375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.1344860710854947, "frac_reward_zero_std": 0.0, "grad_norm": 0.45911601185798645, "kl": 1.998046875, "learning_rate": 5e-05, "loss": 0.038, "num_tokens": 10858793.0, "reward": 4.431640625, "reward_std": 0.3186788856983185, "rewards/helpfulness_reward/mean": 4.431640625, "rewards/helpfulness_reward/std": 0.5034559965133667, "rewards/safety_reward/mean": 6.42724609375, "rewards/safety_reward/std": 0.6829901933670044, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 110.7578125, "completions/mean_terminated_length": 110.7578125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.13483538555584665, "frac_reward_zero_std": 0.0, "grad_norm": 0.43633800745010376, "kl": 1.921875, "learning_rate": 5e-05, "loss": 0.0129, "num_tokens": 10876754.0, "reward": 4.7156982421875, "reward_std": 0.25484588742256165, "rewards/helpfulness_reward/mean": 4.7156982421875, "rewards/helpfulness_reward/std": 0.32912272214889526, "rewards/safety_reward/mean": 6.615234375, "rewards/safety_reward/std": 0.495717853307724, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 113.34375, "completions/mean_terminated_length": 113.34375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.1351847000261986, "frac_reward_zero_std": 0.0, "grad_norm": 0.4744437634944916, "kl": 1.947265625, "learning_rate": 5e-05, "loss": 0.0235, "num_tokens": 10894998.0, "reward": 4.592529296875, "reward_std": 0.33694878220558167, "rewards/helpfulness_reward/mean": 4.592529296875, "rewards/helpfulness_reward/std": 0.570850670337677, "rewards/safety_reward/mean": 6.757568359375, "rewards/safety_reward/std": 0.8221315741539001, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 115.953125, "completions/mean_terminated_length": 115.953125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.13553401449655053, "frac_reward_zero_std": 0.0, "grad_norm": 0.4695790708065033, "kl": 2.0146484375, "learning_rate": 5e-05, "loss": 0.0409, "num_tokens": 10915992.0, "reward": 4.459716796875, "reward_std": 0.3445417582988739, "rewards/helpfulness_reward/mean": 4.459716796875, "rewards/helpfulness_reward/std": 0.5770436525344849, "rewards/safety_reward/mean": 6.5361328125, "rewards/safety_reward/std": 0.6054450273513794, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 112.1015625, "completions/mean_terminated_length": 112.1015625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.13588332896690244, "frac_reward_zero_std": 0.0, "grad_norm": 0.5059937238693237, "kl": 2.23046875, "learning_rate": 5e-05, "loss": 0.0101, "num_tokens": 10933949.0, "reward": 4.3975830078125, "reward_std": 0.28264832496643066, "rewards/helpfulness_reward/mean": 4.3975830078125, "rewards/helpfulness_reward/std": 0.5239951610565186, "rewards/safety_reward/mean": 6.44140625, "rewards/safety_reward/std": 0.6646271347999573, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 113.8359375, "completions/mean_terminated_length": 113.8359375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.13623264343725439, "frac_reward_zero_std": 0.0, "grad_norm": 0.4441373944282532, "kl": 1.9189453125, "learning_rate": 5e-05, "loss": 0.0233, "num_tokens": 10954832.0, "reward": 4.41168212890625, "reward_std": 0.37028467655181885, "rewards/helpfulness_reward/mean": 4.41168212890625, "rewards/helpfulness_reward/std": 0.47654440999031067, "rewards/safety_reward/mean": 6.7117919921875, "rewards/safety_reward/std": 0.7048570513725281, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 108.59375, "completions/mean_terminated_length": 108.59375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.13658195790760633, "frac_reward_zero_std": 0.0, "grad_norm": 0.4793104827404022, "kl": 1.99609375, "learning_rate": 5e-05, "loss": 0.041, "num_tokens": 10972996.0, "reward": 4.226654052734375, "reward_std": 0.34760719537734985, "rewards/helpfulness_reward/mean": 4.226654052734375, "rewards/helpfulness_reward/std": 0.9661064743995667, "rewards/safety_reward/mean": 6.09326171875, "rewards/safety_reward/std": 1.259463906288147, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 115.1015625, "completions/mean_terminated_length": 115.1015625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.13693127237795827, "frac_reward_zero_std": 0.0, "grad_norm": 0.4230120778083801, "kl": 1.892578125, "learning_rate": 5e-05, "loss": 0.0316, "num_tokens": 10993553.0, "reward": 4.47021484375, "reward_std": 0.33149272203445435, "rewards/helpfulness_reward/mean": 4.47021484375, "rewards/helpfulness_reward/std": 0.5785768628120422, "rewards/safety_reward/mean": 6.391357421875, "rewards/safety_reward/std": 0.7226599454879761, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 115.171875, "completions/mean_terminated_length": 115.171875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.13728058684831018, "frac_reward_zero_std": 0.0, "grad_norm": 0.5135923027992249, "kl": 2.0224609375, "learning_rate": 5e-05, "loss": 0.0203, "num_tokens": 11011879.0, "reward": 4.551513671875, "reward_std": 0.2760311961174011, "rewards/helpfulness_reward/mean": 4.551513671875, "rewards/helpfulness_reward/std": 0.4650086760520935, "rewards/safety_reward/mean": 6.470703125, "rewards/safety_reward/std": 0.7288874387741089, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 116.7578125, "completions/mean_terminated_length": 116.7578125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.13762990131866212, "frac_reward_zero_std": 0.0, "grad_norm": 0.43559765815734863, "kl": 2.0849609375, "learning_rate": 5e-05, "loss": 0.0302, "num_tokens": 11030640.0, "reward": 4.6624755859375, "reward_std": 0.31675082445144653, "rewards/helpfulness_reward/mean": 4.6624755859375, "rewards/helpfulness_reward/std": 0.5960163474082947, "rewards/safety_reward/mean": 6.74462890625, "rewards/safety_reward/std": 0.6531093716621399, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 117.2421875, "completions/mean_terminated_length": 117.2421875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.13797921578901406, "frac_reward_zero_std": 0.0, "grad_norm": 0.40992796421051025, "kl": 2.005859375, "learning_rate": 5e-05, "loss": 0.0237, "num_tokens": 11049551.0, "reward": 4.719970703125, "reward_std": 0.3111457824707031, "rewards/helpfulness_reward/mean": 4.719970703125, "rewards/helpfulness_reward/std": 0.42244014143943787, "rewards/safety_reward/mean": 6.76904296875, "rewards/safety_reward/std": 0.6325604915618896, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 116.46875, "completions/mean_terminated_length": 116.46875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.138328530259366, "frac_reward_zero_std": 0.0, "grad_norm": 0.4140653908252716, "kl": 1.986328125, "learning_rate": 5e-05, "loss": 0.025, "num_tokens": 11068259.0, "reward": 4.7484130859375, "reward_std": 0.3087310791015625, "rewards/helpfulness_reward/mean": 4.7484130859375, "rewards/helpfulness_reward/std": 0.46039485931396484, "rewards/safety_reward/mean": 6.79931640625, "rewards/safety_reward/std": 0.5527591705322266, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 115.3828125, "completions/mean_terminated_length": 115.3828125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.13867784472971792, "frac_reward_zero_std": 0.0, "grad_norm": 0.4474315643310547, "kl": 2.08203125, "learning_rate": 5e-05, "loss": 0.0387, "num_tokens": 11087948.0, "reward": 4.7108154296875, "reward_std": 0.29466021060943604, "rewards/helpfulness_reward/mean": 4.7108154296875, "rewards/helpfulness_reward/std": 0.5571362376213074, "rewards/safety_reward/mean": 6.704345703125, "rewards/safety_reward/std": 0.7366125583648682, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 114.8515625, "completions/mean_terminated_length": 114.8515625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.13902715920006986, "frac_reward_zero_std": 0.0, "grad_norm": 0.48757418990135193, "kl": 1.9453125, "learning_rate": 5e-05, "loss": 0.026, "num_tokens": 11107929.0, "reward": 4.4271392822265625, "reward_std": 0.36771392822265625, "rewards/helpfulness_reward/mean": 4.4271392822265625, "rewards/helpfulness_reward/std": 0.7984017729759216, "rewards/safety_reward/mean": 6.6358642578125, "rewards/safety_reward/std": 0.8906510472297668, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 115.546875, "completions/mean_terminated_length": 115.546875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.1393764736704218, "frac_reward_zero_std": 0.0, "grad_norm": 0.4741869866847992, "kl": 2.0703125, "learning_rate": 5e-05, "loss": 0.0399, "num_tokens": 11126079.0, "reward": 4.7271728515625, "reward_std": 0.2897363305091858, "rewards/helpfulness_reward/mean": 4.7271728515625, "rewards/helpfulness_reward/std": 0.4892730712890625, "rewards/safety_reward/mean": 6.728759765625, "rewards/safety_reward/std": 0.628850519657135, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 116.0546875, "completions/mean_terminated_length": 116.0546875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.13972578814077374, "frac_reward_zero_std": 0.0, "grad_norm": 0.48057639598846436, "kl": 2.041015625, "learning_rate": 5e-05, "loss": 0.0176, "num_tokens": 11145222.0, "reward": 4.8648681640625, "reward_std": 0.300783634185791, "rewards/helpfulness_reward/mean": 4.8648681640625, "rewards/helpfulness_reward/std": 0.4205884337425232, "rewards/safety_reward/mean": 6.951171875, "rewards/safety_reward/std": 0.6448228359222412, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 116.1875, "completions/mean_terminated_length": 116.1875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.14007510261112566, "frac_reward_zero_std": 0.0, "grad_norm": 0.5055373907089233, "kl": 2.357421875, "learning_rate": 5e-05, "loss": 0.0279, "num_tokens": 11163614.0, "reward": 4.668701171875, "reward_std": 0.3595391511917114, "rewards/helpfulness_reward/mean": 4.668701171875, "rewards/helpfulness_reward/std": 0.690110445022583, "rewards/safety_reward/mean": 6.576171875, "rewards/safety_reward/std": 0.8028342127799988, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 112.4453125, "completions/mean_terminated_length": 112.4453125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.1404244170814776, "frac_reward_zero_std": 0.0, "grad_norm": 0.49991291761398315, "kl": 2.103515625, "learning_rate": 5e-05, "loss": 0.0334, "num_tokens": 11182303.0, "reward": 4.5157470703125, "reward_std": 0.3027053773403168, "rewards/helpfulness_reward/mean": 4.5157470703125, "rewards/helpfulness_reward/std": 0.6639397740364075, "rewards/safety_reward/mean": 6.505126953125, "rewards/safety_reward/std": 1.2007029056549072, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 116.71875, "completions/mean_terminated_length": 116.71875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.14077373155182954, "frac_reward_zero_std": 0.0, "grad_norm": 0.4740345776081085, "kl": 2.162109375, "learning_rate": 5e-05, "loss": 0.0474, "num_tokens": 11201083.0, "reward": 4.65283203125, "reward_std": 0.34309709072113037, "rewards/helpfulness_reward/mean": 4.65283203125, "rewards/helpfulness_reward/std": 0.47268661856651306, "rewards/safety_reward/mean": 6.709716796875, "rewards/safety_reward/std": 0.6466851830482483, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 112.875, "completions/mean_terminated_length": 112.875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.14112304602218148, "frac_reward_zero_std": 0.0, "grad_norm": 0.467180460691452, "kl": 2.16015625, "learning_rate": 5e-05, "loss": 0.0193, "num_tokens": 11219571.0, "reward": 4.8175048828125, "reward_std": 0.269267737865448, "rewards/helpfulness_reward/mean": 4.8175048828125, "rewards/helpfulness_reward/std": 0.39742812514305115, "rewards/safety_reward/mean": 6.794921875, "rewards/safety_reward/std": 0.5642572641372681, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 112.734375, "completions/mean_terminated_length": 112.734375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.1414723604925334, "frac_reward_zero_std": 0.0, "grad_norm": 0.5588943362236023, "kl": 2.275390625, "learning_rate": 5e-05, "loss": 0.0256, "num_tokens": 11238809.0, "reward": 4.59375, "reward_std": 0.3514319360256195, "rewards/helpfulness_reward/mean": 4.59375, "rewards/helpfulness_reward/std": 0.7243183255195618, "rewards/safety_reward/mean": 6.6787109375, "rewards/safety_reward/std": 0.8338257670402527, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 114.9453125, "completions/mean_terminated_length": 114.9453125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.14182167496288534, "frac_reward_zero_std": 0.0, "grad_norm": 0.528264582157135, "kl": 2.208984375, "learning_rate": 5e-05, "loss": 0.0041, "num_tokens": 11258562.0, "reward": 4.62939453125, "reward_std": 0.3188265860080719, "rewards/helpfulness_reward/mean": 4.62939453125, "rewards/helpfulness_reward/std": 0.5033526420593262, "rewards/safety_reward/mean": 6.717041015625, "rewards/safety_reward/std": 0.5859911441802979, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 113.75, "completions/mean_terminated_length": 113.75, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.14217098943323728, "frac_reward_zero_std": 0.0, "grad_norm": 0.48630470037460327, "kl": 2.32421875, "learning_rate": 5e-05, "loss": 0.0251, "num_tokens": 11276634.0, "reward": 4.699462890625, "reward_std": 0.2617917060852051, "rewards/helpfulness_reward/mean": 4.699462890625, "rewards/helpfulness_reward/std": 0.4376954138278961, "rewards/safety_reward/mean": 6.803955078125, "rewards/safety_reward/std": 0.7344779968261719, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 115.4765625, "completions/mean_terminated_length": 115.4765625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.14252030390358922, "frac_reward_zero_std": 0.0, "grad_norm": 0.583850622177124, "kl": 2.431640625, "learning_rate": 5e-05, "loss": 0.0303, "num_tokens": 11295631.0, "reward": 4.9039306640625, "reward_std": 0.30324265360832214, "rewards/helpfulness_reward/mean": 4.9039306640625, "rewards/helpfulness_reward/std": 0.46601104736328125, "rewards/safety_reward/mean": 6.960205078125, "rewards/safety_reward/std": 0.5402522087097168, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 131.2109375, "completions/mean_terminated_length": 118.13385772705078, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.14286961837394113, "frac_reward_zero_std": 0.0, "grad_norm": 0.45237287878990173, "kl": 2.1513671875, "learning_rate": 5e-05, "loss": 0.1368, "num_tokens": 11318050.0, "reward": 4.555206298828125, "reward_std": 0.4651895761489868, "rewards/helpfulness_reward/mean": 4.555206298828125, "rewards/helpfulness_reward/std": 0.8493986129760742, "rewards/safety_reward/mean": 6.44390869140625, "rewards/safety_reward/std": 1.1035640239715576, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 114.9765625, "completions/mean_terminated_length": 114.9765625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.14321893284429307, "frac_reward_zero_std": 0.0, "grad_norm": 0.5297466516494751, "kl": 2.1396484375, "learning_rate": 5e-05, "loss": 0.0351, "num_tokens": 11337679.0, "reward": 4.751953125, "reward_std": 0.2548072636127472, "rewards/helpfulness_reward/mean": 4.751953125, "rewards/helpfulness_reward/std": 0.36472976207733154, "rewards/safety_reward/mean": 6.743896484375, "rewards/safety_reward/std": 0.559581458568573, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 118.9921875, "completions/mean_terminated_length": 118.9921875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.14356824731464501, "frac_reward_zero_std": 0.0, "grad_norm": 0.4610542953014374, "kl": 2.185546875, "learning_rate": 5e-05, "loss": 0.0269, "num_tokens": 11357398.0, "reward": 4.69195556640625, "reward_std": 0.3239000141620636, "rewards/helpfulness_reward/mean": 4.69195556640625, "rewards/helpfulness_reward/std": 0.9146366715431213, "rewards/safety_reward/mean": 6.66357421875, "rewards/safety_reward/std": 1.377652645111084, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 115.828125, "completions/mean_terminated_length": 115.828125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.14391756178499696, "frac_reward_zero_std": 0.0, "grad_norm": 0.5043153166770935, "kl": 2.349609375, "learning_rate": 5e-05, "loss": 0.0303, "num_tokens": 11376032.0, "reward": 4.793701171875, "reward_std": 0.2655443549156189, "rewards/helpfulness_reward/mean": 4.793701171875, "rewards/helpfulness_reward/std": 0.4482061564922333, "rewards/safety_reward/mean": 6.805419921875, "rewards/safety_reward/std": 0.6695975065231323, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 113.0390625, "completions/mean_terminated_length": 113.0390625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.14426687625534887, "frac_reward_zero_std": 0.0, "grad_norm": 0.45833620429039, "kl": 2.33203125, "learning_rate": 5e-05, "loss": 0.024, "num_tokens": 11394165.0, "reward": 4.84912109375, "reward_std": 0.34486693143844604, "rewards/helpfulness_reward/mean": 4.84912109375, "rewards/helpfulness_reward/std": 0.7020851373672485, "rewards/safety_reward/mean": 6.7996826171875, "rewards/safety_reward/std": 1.2626093626022339, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 112.5546875, "completions/mean_terminated_length": 112.5546875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.1446161907257008, "frac_reward_zero_std": 0.0, "grad_norm": 0.5244905948638916, "kl": 2.41796875, "learning_rate": 5e-05, "loss": 0.0174, "num_tokens": 11412372.0, "reward": 4.850830078125, "reward_std": 0.2897571921348572, "rewards/helpfulness_reward/mean": 4.850830078125, "rewards/helpfulness_reward/std": 0.5628582835197449, "rewards/safety_reward/mean": 6.85888671875, "rewards/safety_reward/std": 0.6228060126304626, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 113.4375, "completions/mean_terminated_length": 113.4375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.14496550519605275, "frac_reward_zero_std": 0.0, "grad_norm": 0.5035123825073242, "kl": 2.46484375, "learning_rate": 5e-05, "loss": 0.0246, "num_tokens": 11430484.0, "reward": 5.1397705078125, "reward_std": 0.3065575659275055, "rewards/helpfulness_reward/mean": 5.1397705078125, "rewards/helpfulness_reward/std": 0.40251970291137695, "rewards/safety_reward/mean": 6.99072265625, "rewards/safety_reward/std": 0.5240781903266907, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 116.03125, "completions/mean_terminated_length": 116.03125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.1453148196664047, "frac_reward_zero_std": 0.0, "grad_norm": 0.5258997678756714, "kl": 2.578125, "learning_rate": 5e-05, "loss": 0.0303, "num_tokens": 11451160.0, "reward": 4.9423828125, "reward_std": 0.3209795355796814, "rewards/helpfulness_reward/mean": 4.9423828125, "rewards/helpfulness_reward/std": 0.5430555939674377, "rewards/safety_reward/mean": 6.8720703125, "rewards/safety_reward/std": 0.5597378015518188, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 114.9140625, "completions/mean_terminated_length": 114.9140625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.1456641341367566, "frac_reward_zero_std": 0.0, "grad_norm": 0.5398166179656982, "kl": 2.421875, "learning_rate": 5e-05, "loss": 0.0258, "num_tokens": 11470469.0, "reward": 4.9317626953125, "reward_std": 0.24530532956123352, "rewards/helpfulness_reward/mean": 4.9317626953125, "rewards/helpfulness_reward/std": 0.5363776087760925, "rewards/safety_reward/mean": 6.830810546875, "rewards/safety_reward/std": 0.5475445985794067, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 114.53125, "completions/mean_terminated_length": 114.53125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.14601344860710855, "frac_reward_zero_std": 0.0, "grad_norm": 0.4826239049434662, "kl": 2.46484375, "learning_rate": 5e-05, "loss": 0.0178, "num_tokens": 11490145.0, "reward": 5.01513671875, "reward_std": 0.27363938093185425, "rewards/helpfulness_reward/mean": 5.01513671875, "rewards/helpfulness_reward/std": 0.5477089881896973, "rewards/safety_reward/mean": 6.976806640625, "rewards/safety_reward/std": 0.6833293437957764, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 115.6484375, "completions/mean_terminated_length": 115.6484375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.1463627630774605, "frac_reward_zero_std": 0.0, "grad_norm": 0.555164098739624, "kl": 2.390625, "learning_rate": 5e-05, "loss": 0.0255, "num_tokens": 11509172.0, "reward": 4.960205078125, "reward_std": 0.30433785915374756, "rewards/helpfulness_reward/mean": 4.960205078125, "rewards/helpfulness_reward/std": 0.4249819815158844, "rewards/safety_reward/mean": 6.865478515625, "rewards/safety_reward/std": 0.47146618366241455, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 116.1328125, "completions/mean_terminated_length": 116.1328125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.14671207754781243, "frac_reward_zero_std": 0.0, "grad_norm": 0.5326452255249023, "kl": 2.384765625, "learning_rate": 5e-05, "loss": 0.025, "num_tokens": 11528661.0, "reward": 5.043212890625, "reward_std": 0.3027797341346741, "rewards/helpfulness_reward/mean": 5.043212890625, "rewards/helpfulness_reward/std": 0.4709327518939972, "rewards/safety_reward/mean": 7.099365234375, "rewards/safety_reward/std": 0.5331850647926331, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 116.1796875, "completions/mean_terminated_length": 116.1796875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.14706139201816434, "frac_reward_zero_std": 0.0, "grad_norm": 0.5058929920196533, "kl": 2.591796875, "learning_rate": 5e-05, "loss": 0.0377, "num_tokens": 11547540.0, "reward": 5.0611572265625, "reward_std": 0.3078056573867798, "rewards/helpfulness_reward/mean": 5.0611572265625, "rewards/helpfulness_reward/std": 0.48742738366127014, "rewards/safety_reward/mean": 7.052490234375, "rewards/safety_reward/std": 0.6811844110488892, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 113.921875, "completions/mean_terminated_length": 113.921875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.14741070648851629, "frac_reward_zero_std": 0.0, "grad_norm": 0.6708123683929443, "kl": 2.69921875, "learning_rate": 5e-05, "loss": 0.0247, "num_tokens": 11566058.0, "reward": 5.214111328125, "reward_std": 0.30319976806640625, "rewards/helpfulness_reward/mean": 5.214111328125, "rewards/helpfulness_reward/std": 0.5351055860519409, "rewards/safety_reward/mean": 6.972412109375, "rewards/safety_reward/std": 0.6309951543807983, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 115.015625, "completions/mean_terminated_length": 115.015625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.14776002095886823, "frac_reward_zero_std": 0.0, "grad_norm": 0.545078456401825, "kl": 2.470703125, "learning_rate": 5e-05, "loss": 0.0356, "num_tokens": 11586596.0, "reward": 5.064697265625, "reward_std": 0.3389340043067932, "rewards/helpfulness_reward/mean": 5.064697265625, "rewards/helpfulness_reward/std": 0.6460402011871338, "rewards/safety_reward/mean": 7.0888671875, "rewards/safety_reward/std": 0.6310538053512573, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 114.265625, "completions/mean_terminated_length": 114.265625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.14810933542922017, "frac_reward_zero_std": 0.0, "grad_norm": 0.5413931608200073, "kl": 2.3671875, "learning_rate": 5e-05, "loss": 0.0304, "num_tokens": 11606006.0, "reward": 5.302490234375, "reward_std": 0.3117883801460266, "rewards/helpfulness_reward/mean": 5.302490234375, "rewards/helpfulness_reward/std": 0.49497556686401367, "rewards/safety_reward/mean": 7.3857421875, "rewards/safety_reward/std": 0.5837447047233582, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 114.8125, "completions/mean_terminated_length": 114.8125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.14845864989957208, "frac_reward_zero_std": 0.0, "grad_norm": 31.422588348388672, "kl": 11.80859375, "learning_rate": 5e-05, "loss": 0.1155, "num_tokens": 11625462.0, "reward": 5.0863037109375, "reward_std": 0.2887643575668335, "rewards/helpfulness_reward/mean": 5.0863037109375, "rewards/helpfulness_reward/std": 0.39861640334129333, "rewards/safety_reward/mean": 6.98828125, "rewards/safety_reward/std": 0.6212856769561768, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 104.3125, "completions/mean_terminated_length": 104.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.14880796436992402, "frac_reward_zero_std": 0.0, "grad_norm": 0.6333743929862976, "kl": 2.62109375, "learning_rate": 5e-05, "loss": 0.0098, "num_tokens": 11644846.0, "reward": 4.77294921875, "reward_std": 0.3146044909954071, "rewards/helpfulness_reward/mean": 4.77294921875, "rewards/helpfulness_reward/std": 0.6887094974517822, "rewards/safety_reward/mean": 6.6939697265625, "rewards/safety_reward/std": 1.4145541191101074, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 115.953125, "completions/mean_terminated_length": 115.953125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.14915727884027596, "frac_reward_zero_std": 0.0, "grad_norm": 0.5060918927192688, "kl": 2.5, "learning_rate": 5e-05, "loss": 0.0383, "num_tokens": 11664752.0, "reward": 5.089599609375, "reward_std": 0.33937373757362366, "rewards/helpfulness_reward/mean": 5.089599609375, "rewards/helpfulness_reward/std": 0.5874270796775818, "rewards/safety_reward/mean": 6.820556640625, "rewards/safety_reward/std": 0.6453889608383179, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 116.5390625, "completions/mean_terminated_length": 116.5390625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.1495065933106279, "frac_reward_zero_std": 0.0, "grad_norm": 0.611225426197052, "kl": 2.541015625, "learning_rate": 5e-05, "loss": 0.0283, "num_tokens": 11684493.0, "reward": 5.1602783203125, "reward_std": 0.28886616230010986, "rewards/helpfulness_reward/mean": 5.1602783203125, "rewards/helpfulness_reward/std": 0.5355109572410583, "rewards/safety_reward/mean": 6.926513671875, "rewards/safety_reward/std": 0.6211444735527039, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 116.5859375, "completions/mean_terminated_length": 116.5859375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.14985590778097982, "frac_reward_zero_std": 0.0, "grad_norm": 2.1953492164611816, "kl": 3.19140625, "learning_rate": 5e-05, "loss": 0.0328, "num_tokens": 11704432.0, "reward": 4.9134521484375, "reward_std": 0.28982043266296387, "rewards/helpfulness_reward/mean": 4.9134521484375, "rewards/helpfulness_reward/std": 0.5805982947349548, "rewards/safety_reward/mean": 6.823974609375, "rewards/safety_reward/std": 0.630315363407135, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 114.671875, "completions/mean_terminated_length": 114.671875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.15020522225133176, "frac_reward_zero_std": 0.0, "grad_norm": 0.53038489818573, "kl": 2.5859375, "learning_rate": 5e-05, "loss": 0.0319, "num_tokens": 11723118.0, "reward": 5.112548828125, "reward_std": 0.27204686403274536, "rewards/helpfulness_reward/mean": 5.112548828125, "rewards/helpfulness_reward/std": 0.4064362049102783, "rewards/safety_reward/mean": 6.941650390625, "rewards/safety_reward/std": 0.6874817609786987, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 109.0546875, "completions/mean_terminated_length": 109.0546875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1505545367216837, "frac_reward_zero_std": 0.0, "grad_norm": 0.5287937521934509, "kl": 2.55078125, "learning_rate": 5e-05, "loss": 0.0449, "num_tokens": 11743549.0, "reward": 4.93853759765625, "reward_std": 0.3478892147541046, "rewards/helpfulness_reward/mean": 4.93853759765625, "rewards/helpfulness_reward/std": 0.9087218046188354, "rewards/safety_reward/mean": 6.610107421875, "rewards/safety_reward/std": 1.3049559593200684, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 116.71875, "completions/mean_terminated_length": 116.71875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.15090385119203564, "frac_reward_zero_std": 0.0, "grad_norm": 0.5189686417579651, "kl": 2.552734375, "learning_rate": 5e-05, "loss": 0.0184, "num_tokens": 11762345.0, "reward": 5.128662109375, "reward_std": 0.2764774560928345, "rewards/helpfulness_reward/mean": 5.128662109375, "rewards/helpfulness_reward/std": 0.4476917088031769, "rewards/safety_reward/mean": 7.0810546875, "rewards/safety_reward/std": 0.6674491763114929, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 113.8984375, "completions/mean_terminated_length": 113.8984375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.15125316566238756, "frac_reward_zero_std": 0.0, "grad_norm": 0.5297585129737854, "kl": 2.505859375, "learning_rate": 5e-05, "loss": 0.0226, "num_tokens": 11781892.0, "reward": 4.963287353515625, "reward_std": 0.35892438888549805, "rewards/helpfulness_reward/mean": 4.963287353515625, "rewards/helpfulness_reward/std": 0.9476075768470764, "rewards/safety_reward/mean": 6.98583984375, "rewards/safety_reward/std": 0.8994972109794617, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 117.4609375, "completions/mean_terminated_length": 117.4609375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.1516024801327395, "frac_reward_zero_std": 0.0, "grad_norm": 0.6351417303085327, "kl": 2.552734375, "learning_rate": 5e-05, "loss": 0.0452, "num_tokens": 11802255.0, "reward": 5.2188720703125, "reward_std": 0.3103923201560974, "rewards/helpfulness_reward/mean": 5.2188720703125, "rewards/helpfulness_reward/std": 0.5404501557350159, "rewards/safety_reward/mean": 7.22216796875, "rewards/safety_reward/std": 0.4603702127933502, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 116.0859375, "completions/mean_terminated_length": 116.0859375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.15195179460309144, "frac_reward_zero_std": 0.0, "grad_norm": 0.5172598958015442, "kl": 2.4609375, "learning_rate": 5e-05, "loss": 0.0246, "num_tokens": 11821098.0, "reward": 5.2161865234375, "reward_std": 0.3545547127723694, "rewards/helpfulness_reward/mean": 5.2161865234375, "rewards/helpfulness_reward/std": 0.42896613478660583, "rewards/safety_reward/mean": 7.193359375, "rewards/safety_reward/std": 0.5637801289558411, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 115.140625, "completions/mean_terminated_length": 115.140625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.15230110907344338, "frac_reward_zero_std": 0.0, "grad_norm": 0.5416061878204346, "kl": 2.43359375, "learning_rate": 5e-05, "loss": 0.0293, "num_tokens": 11840284.0, "reward": 5.06005859375, "reward_std": 0.34949204325675964, "rewards/helpfulness_reward/mean": 5.06005859375, "rewards/helpfulness_reward/std": 0.44576364755630493, "rewards/safety_reward/mean": 6.827392578125, "rewards/safety_reward/std": 0.6619138717651367, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 116.8515625, "completions/mean_terminated_length": 116.8515625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.1526504235437953, "frac_reward_zero_std": 0.0, "grad_norm": 0.6031486988067627, "kl": 2.775390625, "learning_rate": 5e-05, "loss": 0.0275, "num_tokens": 11859761.0, "reward": 5.122802734375, "reward_std": 0.30376946926116943, "rewards/helpfulness_reward/mean": 5.122802734375, "rewards/helpfulness_reward/std": 0.5098257064819336, "rewards/safety_reward/mean": 6.996337890625, "rewards/safety_reward/std": 0.5755889415740967, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 116.921875, "completions/mean_terminated_length": 116.921875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.15299973801414724, "frac_reward_zero_std": 0.0, "grad_norm": 0.5610032081604004, "kl": 2.537109375, "learning_rate": 5e-05, "loss": 0.0303, "num_tokens": 11878991.0, "reward": 5.0460205078125, "reward_std": 0.26361432671546936, "rewards/helpfulness_reward/mean": 5.0460205078125, "rewards/helpfulness_reward/std": 0.5498610734939575, "rewards/safety_reward/mean": 6.81005859375, "rewards/safety_reward/std": 0.5692482590675354, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 115.703125, "completions/mean_terminated_length": 115.703125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.15334905248449918, "frac_reward_zero_std": 0.0, "grad_norm": 0.5480182766914368, "kl": 2.669921875, "learning_rate": 5e-05, "loss": 0.0262, "num_tokens": 11898065.0, "reward": 5.2850341796875, "reward_std": 0.29669973254203796, "rewards/helpfulness_reward/mean": 5.2850341796875, "rewards/helpfulness_reward/std": 0.5114821195602417, "rewards/safety_reward/mean": 7.23876953125, "rewards/safety_reward/std": 0.6427323818206787, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 114.4453125, "completions/mean_terminated_length": 114.4453125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.15369836695485112, "frac_reward_zero_std": 0.0, "grad_norm": 0.49920108914375305, "kl": 2.529296875, "learning_rate": 5e-05, "loss": 0.0342, "num_tokens": 11916554.0, "reward": 5.4393310546875, "reward_std": 0.3332670331001282, "rewards/helpfulness_reward/mean": 5.4393310546875, "rewards/helpfulness_reward/std": 0.5624029636383057, "rewards/safety_reward/mean": 7.3486328125, "rewards/safety_reward/std": 0.6580629348754883, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 116.2734375, "completions/mean_terminated_length": 116.2734375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.15404768142520303, "frac_reward_zero_std": 0.0, "grad_norm": 0.5793245434761047, "kl": 2.62890625, "learning_rate": 5e-05, "loss": 0.0414, "num_tokens": 11935853.0, "reward": 5.28759765625, "reward_std": 0.3278382420539856, "rewards/helpfulness_reward/mean": 5.28759765625, "rewards/helpfulness_reward/std": 0.5097378492355347, "rewards/safety_reward/mean": 7.1669921875, "rewards/safety_reward/std": 0.593464195728302, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 114.2109375, "completions/mean_terminated_length": 114.2109375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.15439699589555497, "frac_reward_zero_std": 0.0, "grad_norm": 0.5300320386886597, "kl": 2.740234375, "learning_rate": 5e-05, "loss": 0.0335, "num_tokens": 11954064.0, "reward": 5.46875, "reward_std": 0.3149494528770447, "rewards/helpfulness_reward/mean": 5.46875, "rewards/helpfulness_reward/std": 0.599273681640625, "rewards/safety_reward/mean": 7.372802734375, "rewards/safety_reward/std": 0.6578496098518372, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 113.40625, "completions/mean_terminated_length": 113.40625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.15474631036590691, "frac_reward_zero_std": 0.0, "grad_norm": 0.5467286705970764, "kl": 2.8203125, "learning_rate": 5e-05, "loss": 0.0281, "num_tokens": 11973476.0, "reward": 5.493408203125, "reward_std": 0.3017593026161194, "rewards/helpfulness_reward/mean": 5.493408203125, "rewards/helpfulness_reward/std": 0.4672573208808899, "rewards/safety_reward/mean": 7.391357421875, "rewards/safety_reward/std": 0.7249970436096191, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 113.6640625, "completions/mean_terminated_length": 113.6640625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.15509562483625886, "frac_reward_zero_std": 0.0, "grad_norm": 0.5184593200683594, "kl": 2.658203125, "learning_rate": 5e-05, "loss": 0.0308, "num_tokens": 11991953.0, "reward": 5.429443359375, "reward_std": 0.2963535785675049, "rewards/helpfulness_reward/mean": 5.429443359375, "rewards/helpfulness_reward/std": 0.37853577733039856, "rewards/safety_reward/mean": 7.0615234375, "rewards/safety_reward/std": 0.5438868403434753, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 113.6953125, "completions/mean_terminated_length": 113.6953125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.15544493930661077, "frac_reward_zero_std": 0.0, "grad_norm": 0.5315566658973694, "kl": 2.796875, "learning_rate": 5e-05, "loss": 0.0241, "num_tokens": 12010450.0, "reward": 5.567626953125, "reward_std": 0.2928920090198517, "rewards/helpfulness_reward/mean": 5.567626953125, "rewards/helpfulness_reward/std": 0.40574386715888977, "rewards/safety_reward/mean": 7.31689453125, "rewards/safety_reward/std": 0.544816255569458, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 113.171875, "completions/mean_terminated_length": 113.171875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.1557942537769627, "frac_reward_zero_std": 0.0, "grad_norm": 0.5232986807823181, "kl": 2.810546875, "learning_rate": 5e-05, "loss": 0.0223, "num_tokens": 12028640.0, "reward": 5.3970947265625, "reward_std": 0.3183405101299286, "rewards/helpfulness_reward/mean": 5.3970947265625, "rewards/helpfulness_reward/std": 0.5219282507896423, "rewards/safety_reward/mean": 7.1552734375, "rewards/safety_reward/std": 0.7378541827201843, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 112.5390625, "completions/mean_terminated_length": 112.5390625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.15614356824731465, "frac_reward_zero_std": 0.0, "grad_norm": 0.5623731017112732, "kl": 2.58984375, "learning_rate": 5e-05, "loss": 0.0194, "num_tokens": 12047157.0, "reward": 5.4200439453125, "reward_std": 0.38604414463043213, "rewards/helpfulness_reward/mean": 5.4200439453125, "rewards/helpfulness_reward/std": 0.5408772826194763, "rewards/safety_reward/mean": 7.391845703125, "rewards/safety_reward/std": 0.5953072309494019, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 115.2734375, "completions/mean_terminated_length": 115.2734375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.1564928827176666, "frac_reward_zero_std": 0.0, "grad_norm": 1.028691291809082, "kl": 3.130859375, "learning_rate": 5e-05, "loss": 0.0434, "num_tokens": 12066856.0, "reward": 5.27734375, "reward_std": 0.3796066641807556, "rewards/helpfulness_reward/mean": 5.27734375, "rewards/helpfulness_reward/std": 0.771608829498291, "rewards/safety_reward/mean": 7.1455078125, "rewards/safety_reward/std": 0.7427102327346802, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 113.6640625, "completions/mean_terminated_length": 113.6640625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.1568421971880185, "frac_reward_zero_std": 0.0, "grad_norm": 0.5362759828567505, "kl": 2.74609375, "learning_rate": 5e-05, "loss": 0.0322, "num_tokens": 12085269.0, "reward": 5.478759765625, "reward_std": 0.3276798129081726, "rewards/helpfulness_reward/mean": 5.478759765625, "rewards/helpfulness_reward/std": 0.5311694145202637, "rewards/safety_reward/mean": 7.43603515625, "rewards/safety_reward/std": 0.5732762813568115, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 114.828125, "completions/mean_terminated_length": 114.828125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.15719151165837045, "frac_reward_zero_std": 0.0, "grad_norm": 0.45234259963035583, "kl": 2.791015625, "learning_rate": 5e-05, "loss": 0.0307, "num_tokens": 12104087.0, "reward": 5.718017578125, "reward_std": 0.29279348254203796, "rewards/helpfulness_reward/mean": 5.718017578125, "rewards/helpfulness_reward/std": 0.5007447600364685, "rewards/safety_reward/mean": 7.430419921875, "rewards/safety_reward/std": 0.5918394923210144, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 114.109375, "completions/mean_terminated_length": 114.109375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.1575408261287224, "frac_reward_zero_std": 0.0, "grad_norm": 0.5351918339729309, "kl": 2.85546875, "learning_rate": 5e-05, "loss": 0.0207, "num_tokens": 12123237.0, "reward": 5.49365234375, "reward_std": 0.37895768880844116, "rewards/helpfulness_reward/mean": 5.49365234375, "rewards/helpfulness_reward/std": 0.565650463104248, "rewards/safety_reward/mean": 7.356201171875, "rewards/safety_reward/std": 0.5957002639770508, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 114.2734375, "completions/mean_terminated_length": 114.2734375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.15789014059907433, "frac_reward_zero_std": 0.0, "grad_norm": 0.5164166688919067, "kl": 2.853515625, "learning_rate": 5e-05, "loss": 0.0307, "num_tokens": 12142200.0, "reward": 5.47705078125, "reward_std": 0.32389312982559204, "rewards/helpfulness_reward/mean": 5.47705078125, "rewards/helpfulness_reward/std": 0.5748217701911926, "rewards/safety_reward/mean": 7.376953125, "rewards/safety_reward/std": 0.64994215965271, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 114.84375, "completions/mean_terminated_length": 114.84375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.15823945506942624, "frac_reward_zero_std": 0.0, "grad_norm": 0.4745802879333496, "kl": 2.861328125, "learning_rate": 5e-05, "loss": 0.0209, "num_tokens": 12162164.0, "reward": 5.450927734375, "reward_std": 0.2868085503578186, "rewards/helpfulness_reward/mean": 5.450927734375, "rewards/helpfulness_reward/std": 0.4762714207172394, "rewards/safety_reward/mean": 7.1240234375, "rewards/safety_reward/std": 0.6168256998062134, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 115.078125, "completions/mean_terminated_length": 115.078125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.15858876953977818, "frac_reward_zero_std": 0.0, "grad_norm": 0.5276048183441162, "kl": 3.04296875, "learning_rate": 5e-05, "loss": 0.0325, "num_tokens": 12182214.0, "reward": 5.328857421875, "reward_std": 0.33498892188072205, "rewards/helpfulness_reward/mean": 5.328857421875, "rewards/helpfulness_reward/std": 0.5656824707984924, "rewards/safety_reward/mean": 7.35107421875, "rewards/safety_reward/std": 0.6716530323028564, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 116.40625, "completions/mean_terminated_length": 116.40625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.15893808401013013, "frac_reward_zero_std": 0.0, "grad_norm": 0.4670431315898895, "kl": 2.90625, "learning_rate": 5e-05, "loss": 0.0207, "num_tokens": 12201258.0, "reward": 5.5927734375, "reward_std": 0.32578080892562866, "rewards/helpfulness_reward/mean": 5.5927734375, "rewards/helpfulness_reward/std": 0.4954569935798645, "rewards/safety_reward/mean": 7.50244140625, "rewards/safety_reward/std": 0.5315554738044739, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 117.953125, "completions/mean_terminated_length": 117.953125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.15928739848048207, "frac_reward_zero_std": 0.0, "grad_norm": 0.5675003528594971, "kl": 2.853515625, "learning_rate": 5e-05, "loss": 0.0369, "num_tokens": 12220860.0, "reward": 5.44189453125, "reward_std": 0.33227404952049255, "rewards/helpfulness_reward/mean": 5.44189453125, "rewards/helpfulness_reward/std": 0.5880107879638672, "rewards/safety_reward/mean": 7.26611328125, "rewards/safety_reward/std": 0.5655943751335144, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 116.28125, "completions/mean_terminated_length": 116.28125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.15963671295083398, "frac_reward_zero_std": 0.0, "grad_norm": 0.5059581995010376, "kl": 2.91015625, "learning_rate": 5e-05, "loss": 0.029, "num_tokens": 12240000.0, "reward": 5.39501953125, "reward_std": 0.3385767638683319, "rewards/helpfulness_reward/mean": 5.39501953125, "rewards/helpfulness_reward/std": 0.4478825330734253, "rewards/safety_reward/mean": 7.31298828125, "rewards/safety_reward/std": 0.5418336391448975, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 116.53125, "completions/mean_terminated_length": 116.53125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.15998602742118592, "frac_reward_zero_std": 0.0, "grad_norm": 0.5892375707626343, "kl": 3.03125, "learning_rate": 5e-05, "loss": 0.039, "num_tokens": 12259972.0, "reward": 5.4781494140625, "reward_std": 0.3237709105014801, "rewards/helpfulness_reward/mean": 5.4781494140625, "rewards/helpfulness_reward/std": 0.7758829593658447, "rewards/safety_reward/mean": 7.13623046875, "rewards/safety_reward/std": 0.8213548064231873, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 118.4765625, "completions/mean_terminated_length": 118.4765625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.16033534189153786, "frac_reward_zero_std": 0.0, "grad_norm": 0.6068413853645325, "kl": 2.84375, "learning_rate": 5e-05, "loss": 0.0245, "num_tokens": 12279409.0, "reward": 5.6181640625, "reward_std": 0.26519975066185, "rewards/helpfulness_reward/mean": 5.6181640625, "rewards/helpfulness_reward/std": 0.534819483757019, "rewards/safety_reward/mean": 7.308349609375, "rewards/safety_reward/std": 0.6797089576721191, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 117.953125, "completions/mean_terminated_length": 117.953125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.1606846563618898, "frac_reward_zero_std": 0.0, "grad_norm": 0.5002425909042358, "kl": 2.865234375, "learning_rate": 5e-05, "loss": 0.0263, "num_tokens": 12298171.0, "reward": 5.692138671875, "reward_std": 0.28405332565307617, "rewards/helpfulness_reward/mean": 5.692138671875, "rewards/helpfulness_reward/std": 0.47474172711372375, "rewards/safety_reward/mean": 7.519287109375, "rewards/safety_reward/std": 0.5766900181770325, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 117.78125, "completions/mean_terminated_length": 117.78125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.16103397083224172, "frac_reward_zero_std": 0.0, "grad_norm": 0.5421258807182312, "kl": 2.83984375, "learning_rate": 5e-05, "loss": 0.0236, "num_tokens": 12317239.0, "reward": 5.497802734375, "reward_std": 0.2567894458770752, "rewards/helpfulness_reward/mean": 5.497802734375, "rewards/helpfulness_reward/std": 0.4920658469200134, "rewards/safety_reward/mean": 7.351806640625, "rewards/safety_reward/std": 0.5560136437416077, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 117.8671875, "completions/mean_terminated_length": 117.8671875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.16138328530259366, "frac_reward_zero_std": 0.0, "grad_norm": 0.5092614889144897, "kl": 3.033203125, "learning_rate": 5e-05, "loss": 0.0359, "num_tokens": 12336334.0, "reward": 5.744873046875, "reward_std": 0.3086920380592346, "rewards/helpfulness_reward/mean": 5.744873046875, "rewards/helpfulness_reward/std": 0.5068547129631042, "rewards/safety_reward/mean": 7.6806640625, "rewards/safety_reward/std": 0.6094136238098145, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 119.515625, "completions/mean_terminated_length": 119.515625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.1617325997729456, "frac_reward_zero_std": 0.0, "grad_norm": 0.7047668099403381, "kl": 3.06640625, "learning_rate": 5e-05, "loss": 0.0376, "num_tokens": 12356936.0, "reward": 5.4765625, "reward_std": 0.2784239649772644, "rewards/helpfulness_reward/mean": 5.4765625, "rewards/helpfulness_reward/std": 0.43437790870666504, "rewards/safety_reward/mean": 7.43505859375, "rewards/safety_reward/std": 0.5404214859008789, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 118.6953125, "completions/mean_terminated_length": 118.6953125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.16208191424329754, "frac_reward_zero_std": 0.0, "grad_norm": 0.8151106834411621, "kl": 3.26953125, "learning_rate": 5e-05, "loss": 0.025, "num_tokens": 12376977.0, "reward": 5.581298828125, "reward_std": 0.324218213558197, "rewards/helpfulness_reward/mean": 5.581298828125, "rewards/helpfulness_reward/std": 0.5577961206436157, "rewards/safety_reward/mean": 7.291015625, "rewards/safety_reward/std": 0.7185707688331604, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 119.796875, "completions/mean_terminated_length": 119.796875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.16243122871364946, "frac_reward_zero_std": 0.0, "grad_norm": 0.4769313633441925, "kl": 2.98046875, "learning_rate": 5e-05, "loss": 0.0274, "num_tokens": 12397343.0, "reward": 5.52490234375, "reward_std": 0.3032853603363037, "rewards/helpfulness_reward/mean": 5.52490234375, "rewards/helpfulness_reward/std": 0.5098283290863037, "rewards/safety_reward/mean": 7.293212890625, "rewards/safety_reward/std": 0.6936145424842834, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 120.84375, "completions/mean_terminated_length": 120.84375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.1627805431840014, "frac_reward_zero_std": 0.0, "grad_norm": 0.5642624497413635, "kl": 3.162109375, "learning_rate": 5e-05, "loss": 0.0438, "num_tokens": 12418211.0, "reward": 5.60302734375, "reward_std": 0.24768193066120148, "rewards/helpfulness_reward/mean": 5.60302734375, "rewards/helpfulness_reward/std": 0.5325057506561279, "rewards/safety_reward/mean": 7.71728515625, "rewards/safety_reward/std": 0.4237479269504547, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 120.328125, "completions/mean_terminated_length": 120.328125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.16312985765435334, "frac_reward_zero_std": 0.0, "grad_norm": 0.5080615282058716, "kl": 2.92578125, "learning_rate": 5e-05, "loss": 0.0282, "num_tokens": 12437125.0, "reward": 5.603515625, "reward_std": 0.2511560320854187, "rewards/helpfulness_reward/mean": 5.603515625, "rewards/helpfulness_reward/std": 0.3844917118549347, "rewards/safety_reward/mean": 7.318115234375, "rewards/safety_reward/std": 0.43537577986717224, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 119.90625, "completions/mean_terminated_length": 119.90625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.16347917212470525, "frac_reward_zero_std": 0.0, "grad_norm": 0.6007146239280701, "kl": 2.9375, "learning_rate": 5e-05, "loss": 0.0426, "num_tokens": 12457177.0, "reward": 5.5562744140625, "reward_std": 0.24508662521839142, "rewards/helpfulness_reward/mean": 5.5562744140625, "rewards/helpfulness_reward/std": 0.5633548498153687, "rewards/safety_reward/mean": 7.51171875, "rewards/safety_reward/std": 0.6037606596946716, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 119.8203125, "completions/mean_terminated_length": 119.8203125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.1638284865950572, "frac_reward_zero_std": 0.0, "grad_norm": 0.5099050998687744, "kl": 2.89453125, "learning_rate": 5e-05, "loss": 0.0306, "num_tokens": 12476890.0, "reward": 5.740478515625, "reward_std": 0.23161587119102478, "rewards/helpfulness_reward/mean": 5.740478515625, "rewards/helpfulness_reward/std": 0.40105050802230835, "rewards/safety_reward/mean": 7.578369140625, "rewards/safety_reward/std": 0.4206176996231079, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 119.6484375, "completions/mean_terminated_length": 119.6484375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.16417780106540913, "frac_reward_zero_std": 0.0, "grad_norm": 0.5146843194961548, "kl": 2.87890625, "learning_rate": 5e-05, "loss": 0.0324, "num_tokens": 12496069.0, "reward": 5.4716796875, "reward_std": 0.24943280220031738, "rewards/helpfulness_reward/mean": 5.4716796875, "rewards/helpfulness_reward/std": 0.5231247544288635, "rewards/safety_reward/mean": 7.3662109375, "rewards/safety_reward/std": 0.6012967824935913, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 119.828125, "completions/mean_terminated_length": 119.828125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.16452711553576108, "frac_reward_zero_std": 0.0, "grad_norm": 0.5802769660949707, "kl": 3.12109375, "learning_rate": 5e-05, "loss": 0.0349, "num_tokens": 12515527.0, "reward": 5.512939453125, "reward_std": 0.2390677034854889, "rewards/helpfulness_reward/mean": 5.512939453125, "rewards/helpfulness_reward/std": 0.5127935409545898, "rewards/safety_reward/mean": 7.323974609375, "rewards/safety_reward/std": 0.5761130452156067, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 117.765625, "completions/mean_terminated_length": 117.765625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.164876430006113, "frac_reward_zero_std": 0.0, "grad_norm": 0.9392755627632141, "kl": 3.298828125, "learning_rate": 5e-05, "loss": 0.0385, "num_tokens": 12534537.0, "reward": 5.96923828125, "reward_std": 0.23107020556926727, "rewards/helpfulness_reward/mean": 5.96923828125, "rewards/helpfulness_reward/std": 0.43886852264404297, "rewards/safety_reward/mean": 7.759765625, "rewards/safety_reward/std": 0.4656103849411011, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 117.4609375, "completions/mean_terminated_length": 117.4609375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.16522574447646493, "frac_reward_zero_std": 0.0, "grad_norm": 0.47745177149772644, "kl": 3.111328125, "learning_rate": 5e-05, "loss": 0.0267, "num_tokens": 12553380.0, "reward": 5.71044921875, "reward_std": 0.30686795711517334, "rewards/helpfulness_reward/mean": 5.71044921875, "rewards/helpfulness_reward/std": 0.4924181401729584, "rewards/safety_reward/mean": 7.61865234375, "rewards/safety_reward/std": 0.6680355668067932, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 119.296875, "completions/mean_terminated_length": 119.296875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.16557505894681687, "frac_reward_zero_std": 0.0, "grad_norm": 1.3966270685195923, "kl": 3.720703125, "learning_rate": 5e-05, "loss": 0.0399, "num_tokens": 12572690.0, "reward": 5.7119140625, "reward_std": 0.2301138937473297, "rewards/helpfulness_reward/mean": 5.7119140625, "rewards/helpfulness_reward/std": 0.44516515731811523, "rewards/safety_reward/mean": 7.5439453125, "rewards/safety_reward/std": 0.5110647678375244, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 116.0546875, "completions/mean_terminated_length": 116.0546875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.1659243734171688, "frac_reward_zero_std": 0.0, "grad_norm": 0.4938373863697052, "kl": 3.080078125, "learning_rate": 5e-05, "loss": 0.0313, "num_tokens": 12592881.0, "reward": 5.572021484375, "reward_std": 0.2876244783401489, "rewards/helpfulness_reward/mean": 5.572021484375, "rewards/helpfulness_reward/std": 0.6220811009407043, "rewards/safety_reward/mean": 7.314697265625, "rewards/safety_reward/std": 0.9001344442367554, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 115.9921875, "completions/mean_terminated_length": 115.9921875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.16627368788752073, "frac_reward_zero_std": 0.0, "grad_norm": 0.4795069992542267, "kl": 2.91015625, "learning_rate": 5e-05, "loss": 0.0239, "num_tokens": 12611680.0, "reward": 5.6759033203125, "reward_std": 0.3607368469238281, "rewards/helpfulness_reward/mean": 5.6759033203125, "rewards/helpfulness_reward/std": 0.6315060257911682, "rewards/safety_reward/mean": 7.40380859375, "rewards/safety_reward/std": 0.6622233390808105, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 116.3828125, "completions/mean_terminated_length": 116.3828125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.16662300235787267, "frac_reward_zero_std": 0.0, "grad_norm": 0.45287200808525085, "kl": 3.130859375, "learning_rate": 5e-05, "loss": 0.0251, "num_tokens": 12629953.0, "reward": 5.6182861328125, "reward_std": 0.3868042230606079, "rewards/helpfulness_reward/mean": 5.6182861328125, "rewards/helpfulness_reward/std": 0.5751351714134216, "rewards/safety_reward/mean": 7.1717529296875, "rewards/safety_reward/std": 0.8708896636962891, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 114.3359375, "completions/mean_terminated_length": 114.3359375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.1669723168282246, "frac_reward_zero_std": 0.0, "grad_norm": 1.0176756381988525, "kl": 3.259765625, "learning_rate": 5e-05, "loss": 0.0318, "num_tokens": 12648524.0, "reward": 5.756591796875, "reward_std": 0.23419034481048584, "rewards/helpfulness_reward/mean": 5.756591796875, "rewards/helpfulness_reward/std": 0.43509697914123535, "rewards/safety_reward/mean": 7.461669921875, "rewards/safety_reward/std": 0.5674434900283813, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 115.4453125, "completions/mean_terminated_length": 115.4453125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.16732163129857655, "frac_reward_zero_std": 0.0, "grad_norm": 0.6400517821311951, "kl": 3.0078125, "learning_rate": 5e-05, "loss": 0.0217, "num_tokens": 12669253.0, "reward": 5.530029296875, "reward_std": 0.30074524879455566, "rewards/helpfulness_reward/mean": 5.530029296875, "rewards/helpfulness_reward/std": 0.8402147889137268, "rewards/safety_reward/mean": 7.26220703125, "rewards/safety_reward/std": 0.7811686396598816, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 115.3515625, "completions/mean_terminated_length": 115.3515625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.16767094576892846, "frac_reward_zero_std": 0.0, "grad_norm": 0.47996586561203003, "kl": 2.962890625, "learning_rate": 5e-05, "loss": 0.0256, "num_tokens": 12688690.0, "reward": 5.652099609375, "reward_std": 0.31540945172309875, "rewards/helpfulness_reward/mean": 5.652099609375, "rewards/helpfulness_reward/std": 0.6044336557388306, "rewards/safety_reward/mean": 7.3677978515625, "rewards/safety_reward/std": 0.6990762948989868, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 116.03125, "completions/mean_terminated_length": 116.03125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.1680202602392804, "frac_reward_zero_std": 0.0, "grad_norm": 0.44434061646461487, "kl": 3.046875, "learning_rate": 5e-05, "loss": 0.0288, "num_tokens": 12707110.0, "reward": 5.4510498046875, "reward_std": 0.296672523021698, "rewards/helpfulness_reward/mean": 5.4510498046875, "rewards/helpfulness_reward/std": 0.44989287853240967, "rewards/safety_reward/mean": 7.3203125, "rewards/safety_reward/std": 0.49600085616111755, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 114.5703125, "completions/mean_terminated_length": 114.5703125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.16836957470963235, "frac_reward_zero_std": 0.0, "grad_norm": 0.46322962641716003, "kl": 2.87109375, "learning_rate": 5e-05, "loss": 0.0267, "num_tokens": 12727503.0, "reward": 5.6826171875, "reward_std": 0.2714744806289673, "rewards/helpfulness_reward/mean": 5.6826171875, "rewards/helpfulness_reward/std": 0.44912901520729065, "rewards/safety_reward/mean": 7.391845703125, "rewards/safety_reward/std": 0.6408533453941345, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 116.8828125, "completions/mean_terminated_length": 116.8828125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.1687188891799843, "frac_reward_zero_std": 0.0, "grad_norm": 0.5425177812576294, "kl": 3.203125, "learning_rate": 5e-05, "loss": 0.0322, "num_tokens": 12746760.0, "reward": 5.49853515625, "reward_std": 0.2756671905517578, "rewards/helpfulness_reward/mean": 5.49853515625, "rewards/helpfulness_reward/std": 0.41379573941230774, "rewards/safety_reward/mean": 7.34814453125, "rewards/safety_reward/std": 0.6333197951316833, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 116.3828125, "completions/mean_terminated_length": 116.3828125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.1690682036503362, "frac_reward_zero_std": 0.0, "grad_norm": 0.45758911967277527, "kl": 2.9296875, "learning_rate": 5e-05, "loss": 0.0314, "num_tokens": 12767081.0, "reward": 5.7293701171875, "reward_std": 0.33044201135635376, "rewards/helpfulness_reward/mean": 5.7293701171875, "rewards/helpfulness_reward/std": 0.5936882495880127, "rewards/safety_reward/mean": 7.52978515625, "rewards/safety_reward/std": 0.5905081629753113, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 117.2578125, "completions/mean_terminated_length": 117.2578125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.16941751812068814, "frac_reward_zero_std": 0.0, "grad_norm": 0.4641317129135132, "kl": 2.92578125, "learning_rate": 5e-05, "loss": 0.0288, "num_tokens": 12785650.0, "reward": 5.89111328125, "reward_std": 0.32306987047195435, "rewards/helpfulness_reward/mean": 5.89111328125, "rewards/helpfulness_reward/std": 0.4758724570274353, "rewards/safety_reward/mean": 7.56982421875, "rewards/safety_reward/std": 0.6458801031112671, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 114.75, "completions/mean_terminated_length": 114.75, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.16976683259104008, "frac_reward_zero_std": 0.0, "grad_norm": 1.2082185745239258, "kl": 2.9296875, "learning_rate": 5e-05, "loss": 0.0193, "num_tokens": 12807698.0, "reward": 5.6474609375, "reward_std": 0.2943423390388489, "rewards/helpfulness_reward/mean": 5.6474609375, "rewards/helpfulness_reward/std": 0.5467212200164795, "rewards/safety_reward/mean": 7.424072265625, "rewards/safety_reward/std": 0.6409826874732971, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 114.84375, "completions/mean_terminated_length": 114.84375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.17011614706139203, "frac_reward_zero_std": 0.0, "grad_norm": 0.4742806851863861, "kl": 3.03125, "learning_rate": 5e-05, "loss": 0.0225, "num_tokens": 12826294.0, "reward": 5.3992919921875, "reward_std": 0.40318572521209717, "rewards/helpfulness_reward/mean": 5.3992919921875, "rewards/helpfulness_reward/std": 0.5546526312828064, "rewards/safety_reward/mean": 7.161376953125, "rewards/safety_reward/std": 0.6139622926712036, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 116.8515625, "completions/mean_terminated_length": 116.8515625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.17046546153174394, "frac_reward_zero_std": 0.0, "grad_norm": 0.5001850128173828, "kl": 2.935546875, "learning_rate": 5e-05, "loss": 0.0156, "num_tokens": 12845571.0, "reward": 5.435302734375, "reward_std": 0.2950740456581116, "rewards/helpfulness_reward/mean": 5.435302734375, "rewards/helpfulness_reward/std": 0.5576422810554504, "rewards/safety_reward/mean": 7.277587890625, "rewards/safety_reward/std": 0.6279656291007996, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 115.96875, "completions/mean_terminated_length": 115.96875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.17081477600209588, "frac_reward_zero_std": 0.0, "grad_norm": 0.5047060251235962, "kl": 2.98046875, "learning_rate": 5e-05, "loss": 0.0252, "num_tokens": 12864415.0, "reward": 5.541015625, "reward_std": 0.46503961086273193, "rewards/helpfulness_reward/mean": 5.541015625, "rewards/helpfulness_reward/std": 0.6834447979927063, "rewards/safety_reward/mean": 7.0726318359375, "rewards/safety_reward/std": 0.7242695093154907, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 115.7890625, "completions/mean_terminated_length": 115.7890625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.17116409047244782, "frac_reward_zero_std": 0.0, "grad_norm": 0.5435116291046143, "kl": 2.888671875, "learning_rate": 5e-05, "loss": 0.0161, "num_tokens": 12888332.0, "reward": 5.308837890625, "reward_std": 0.4306432604789734, "rewards/helpfulness_reward/mean": 5.308837890625, "rewards/helpfulness_reward/std": 1.0183998346328735, "rewards/safety_reward/mean": 6.9630126953125, "rewards/safety_reward/std": 1.273132562637329, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 116.6171875, "completions/mean_terminated_length": 116.6171875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.17151340494279976, "frac_reward_zero_std": 0.0, "grad_norm": 0.48340725898742676, "kl": 2.919921875, "learning_rate": 5e-05, "loss": 0.023, "num_tokens": 12908371.0, "reward": 5.5850830078125, "reward_std": 0.35862743854522705, "rewards/helpfulness_reward/mean": 5.5850830078125, "rewards/helpfulness_reward/std": 0.657686173915863, "rewards/safety_reward/mean": 7.47119140625, "rewards/safety_reward/std": 0.6750119924545288, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 117.1953125, "completions/mean_terminated_length": 117.1953125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.17186271941315168, "frac_reward_zero_std": 0.0, "grad_norm": 0.4149394929409027, "kl": 2.833984375, "learning_rate": 5e-05, "loss": 0.0221, "num_tokens": 12927516.0, "reward": 5.723388671875, "reward_std": 0.43330878019332886, "rewards/helpfulness_reward/mean": 5.723388671875, "rewards/helpfulness_reward/std": 0.6378368139266968, "rewards/safety_reward/mean": 7.384521484375, "rewards/safety_reward/std": 0.6188620924949646, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 119.0078125, "completions/mean_terminated_length": 119.0078125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.17221203388350362, "frac_reward_zero_std": 0.0, "grad_norm": 0.5293523073196411, "kl": 3.005859375, "learning_rate": 5e-05, "loss": 0.0356, "num_tokens": 12947029.0, "reward": 5.4903564453125, "reward_std": 0.36430954933166504, "rewards/helpfulness_reward/mean": 5.4903564453125, "rewards/helpfulness_reward/std": 0.6536439061164856, "rewards/safety_reward/mean": 7.189453125, "rewards/safety_reward/std": 0.8835767507553101, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 119.9375, "completions/mean_terminated_length": 119.9375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.17256134835385556, "frac_reward_zero_std": 0.0, "grad_norm": 0.48060622811317444, "kl": 2.8203125, "learning_rate": 5e-05, "loss": 0.0142, "num_tokens": 12966085.0, "reward": 5.46136474609375, "reward_std": 0.6528122425079346, "rewards/helpfulness_reward/mean": 5.46136474609375, "rewards/helpfulness_reward/std": 0.9553718566894531, "rewards/safety_reward/mean": 7.147216796875, "rewards/safety_reward/std": 0.7392827272415161, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 118.9140625, "completions/mean_terminated_length": 118.9140625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.1729106628242075, "frac_reward_zero_std": 0.0, "grad_norm": 0.4854952096939087, "kl": 3.03515625, "learning_rate": 5e-05, "loss": 0.0171, "num_tokens": 12984722.0, "reward": 5.524169921875, "reward_std": 0.41093000769615173, "rewards/helpfulness_reward/mean": 5.524169921875, "rewards/helpfulness_reward/std": 0.6066697239875793, "rewards/safety_reward/mean": 7.17431640625, "rewards/safety_reward/std": 0.7227944731712341, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 120.765625, "completions/mean_terminated_length": 120.765625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.17325997729455941, "frac_reward_zero_std": 0.0, "grad_norm": 0.4846707880496979, "kl": 2.814453125, "learning_rate": 5e-05, "loss": 0.0329, "num_tokens": 13004444.0, "reward": 5.46484375, "reward_std": 0.40298473834991455, "rewards/helpfulness_reward/mean": 5.46484375, "rewards/helpfulness_reward/std": 0.5443258285522461, "rewards/safety_reward/mean": 7.233642578125, "rewards/safety_reward/std": 0.5883439183235168, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 138.640625, "completions/mean_terminated_length": 138.640625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.17360929176491136, "frac_reward_zero_std": 0.0, "grad_norm": 0.46144211292266846, "kl": 2.61328125, "learning_rate": 5e-05, "loss": -0.0037, "num_tokens": 13027846.0, "reward": 5.355619430541992, "reward_std": 0.40292733907699585, "rewards/helpfulness_reward/mean": 5.355619430541992, "rewards/helpfulness_reward/std": 1.5237892866134644, "rewards/safety_reward/mean": 6.923370361328125, "rewards/safety_reward/std": 2.0903096199035645, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 118.921875, "completions/mean_terminated_length": 118.921875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.1739586062352633, "frac_reward_zero_std": 0.0, "grad_norm": 0.4688951075077057, "kl": 2.625, "learning_rate": 5e-05, "loss": 0.032, "num_tokens": 13049804.0, "reward": 5.138031005859375, "reward_std": 0.4977003335952759, "rewards/helpfulness_reward/mean": 5.138031005859375, "rewards/helpfulness_reward/std": 1.3097997903823853, "rewards/safety_reward/mean": 6.7650146484375, "rewards/safety_reward/std": 1.3056938648223877, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 121.1484375, "completions/mean_terminated_length": 121.1484375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.17430792070561524, "frac_reward_zero_std": 0.0, "grad_norm": 0.5027647614479065, "kl": 2.833984375, "learning_rate": 5e-05, "loss": 0.0388, "num_tokens": 13070487.0, "reward": 5.572265625, "reward_std": 0.2834615111351013, "rewards/helpfulness_reward/mean": 5.572265625, "rewards/helpfulness_reward/std": 0.4179355502128601, "rewards/safety_reward/mean": 7.509521484375, "rewards/safety_reward/std": 0.45942258834838867, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 120.7578125, "completions/mean_terminated_length": 120.7578125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.17465723517596715, "frac_reward_zero_std": 0.0, "grad_norm": 0.44765493273735046, "kl": 2.9140625, "learning_rate": 5e-05, "loss": 0.0322, "num_tokens": 13089584.0, "reward": 5.693115234375, "reward_std": 0.28153741359710693, "rewards/helpfulness_reward/mean": 5.693115234375, "rewards/helpfulness_reward/std": 0.4339959919452667, "rewards/safety_reward/mean": 7.364990234375, "rewards/safety_reward/std": 0.5683062672615051, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 117.9921875, "completions/mean_terminated_length": 117.9921875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.1750065496463191, "frac_reward_zero_std": 0.0, "grad_norm": 0.4331670105457306, "kl": 2.93359375, "learning_rate": 5e-05, "loss": 0.0329, "num_tokens": 13108863.0, "reward": 5.5831298828125, "reward_std": 0.29356619715690613, "rewards/helpfulness_reward/mean": 5.5831298828125, "rewards/helpfulness_reward/std": 0.5967693328857422, "rewards/safety_reward/mean": 7.34326171875, "rewards/safety_reward/std": 0.6033272743225098, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 118.9140625, "completions/mean_terminated_length": 118.9140625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.17535586411667103, "frac_reward_zero_std": 0.0, "grad_norm": 0.4777930676937103, "kl": 2.904296875, "learning_rate": 5e-05, "loss": 0.0423, "num_tokens": 13129420.0, "reward": 5.4017333984375, "reward_std": 0.3382788896560669, "rewards/helpfulness_reward/mean": 5.4017333984375, "rewards/helpfulness_reward/std": 0.795014500617981, "rewards/safety_reward/mean": 7.259765625, "rewards/safety_reward/std": 0.7877314686775208, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 117.4375, "completions/mean_terminated_length": 117.4375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.17570517858702298, "frac_reward_zero_std": 0.0, "grad_norm": 0.46497949957847595, "kl": 3.07421875, "learning_rate": 5e-05, "loss": 0.0334, "num_tokens": 13148628.0, "reward": 5.55224609375, "reward_std": 0.36570945382118225, "rewards/helpfulness_reward/mean": 5.55224609375, "rewards/helpfulness_reward/std": 0.6897122263908386, "rewards/safety_reward/mean": 7.702392578125, "rewards/safety_reward/std": 0.6106996536254883, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 117.0625, "completions/mean_terminated_length": 117.0625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.1760544930573749, "frac_reward_zero_std": 0.0, "grad_norm": 0.43440473079681396, "kl": 2.732421875, "learning_rate": 5e-05, "loss": 0.0204, "num_tokens": 13167868.0, "reward": 5.67578125, "reward_std": 0.2624276876449585, "rewards/helpfulness_reward/mean": 5.67578125, "rewards/helpfulness_reward/std": 0.4629147946834564, "rewards/safety_reward/mean": 7.37255859375, "rewards/safety_reward/std": 0.536293625831604, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 118.2265625, "completions/mean_terminated_length": 118.2265625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.17640380752772683, "frac_reward_zero_std": 0.0, "grad_norm": 0.40691548585891724, "kl": 2.869140625, "learning_rate": 5e-05, "loss": 0.0266, "num_tokens": 13186529.0, "reward": 5.711669921875, "reward_std": 0.2939239740371704, "rewards/helpfulness_reward/mean": 5.711669921875, "rewards/helpfulness_reward/std": 0.413709431886673, "rewards/safety_reward/mean": 7.454345703125, "rewards/safety_reward/std": 0.6601216793060303, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 116.53125, "completions/mean_terminated_length": 116.53125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.17675312199807877, "frac_reward_zero_std": 0.0, "grad_norm": 0.41282233595848083, "kl": 2.900390625, "learning_rate": 5e-05, "loss": 0.0332, "num_tokens": 13205717.0, "reward": 5.639892578125, "reward_std": 0.25291967391967773, "rewards/helpfulness_reward/mean": 5.639892578125, "rewards/helpfulness_reward/std": 0.5043939352035522, "rewards/safety_reward/mean": 7.308837890625, "rewards/safety_reward/std": 0.53334641456604, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 115.5546875, "completions/mean_terminated_length": 115.5546875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.1771024364684307, "frac_reward_zero_std": 0.0, "grad_norm": 0.46867436170578003, "kl": 3.064453125, "learning_rate": 5e-05, "loss": 0.0319, "num_tokens": 13224348.0, "reward": 5.548583984375, "reward_std": 0.30874645709991455, "rewards/helpfulness_reward/mean": 5.548583984375, "rewards/helpfulness_reward/std": 0.4972056448459625, "rewards/safety_reward/mean": 7.268310546875, "rewards/safety_reward/std": 0.5668376088142395, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 116.4765625, "completions/mean_terminated_length": 116.4765625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.17745175093878263, "frac_reward_zero_std": 0.0, "grad_norm": 0.3961874544620514, "kl": 2.884765625, "learning_rate": 5e-05, "loss": 0.0274, "num_tokens": 13243081.0, "reward": 5.7491455078125, "reward_std": 0.2628723084926605, "rewards/helpfulness_reward/mean": 5.7491455078125, "rewards/helpfulness_reward/std": 0.5601723194122314, "rewards/safety_reward/mean": 7.387451171875, "rewards/safety_reward/std": 0.6456640362739563, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 115.734375, "completions/mean_terminated_length": 115.734375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.17780106540913457, "frac_reward_zero_std": 0.0, "grad_norm": 0.4334738254547119, "kl": 2.978515625, "learning_rate": 5e-05, "loss": 0.0233, "num_tokens": 13262207.0, "reward": 5.7939453125, "reward_std": 0.31964725255966187, "rewards/helpfulness_reward/mean": 5.7939453125, "rewards/helpfulness_reward/std": 0.5041266083717346, "rewards/safety_reward/mean": 7.55126953125, "rewards/safety_reward/std": 0.6619693040847778, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 115.015625, "completions/mean_terminated_length": 115.015625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.1781503798794865, "frac_reward_zero_std": 0.0, "grad_norm": 0.5175660848617554, "kl": 3.080078125, "learning_rate": 5e-05, "loss": 0.0249, "num_tokens": 13281401.0, "reward": 5.669677734375, "reward_std": 0.2651035189628601, "rewards/helpfulness_reward/mean": 5.669677734375, "rewards/helpfulness_reward/std": 0.47231554985046387, "rewards/safety_reward/mean": 7.47216796875, "rewards/safety_reward/std": 0.593940794467926, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 112.875, "completions/mean_terminated_length": 112.875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.17849969434983845, "frac_reward_zero_std": 0.0, "grad_norm": 0.448213666677475, "kl": 3.033203125, "learning_rate": 5e-05, "loss": 0.0249, "num_tokens": 13300921.0, "reward": 5.38568115234375, "reward_std": 0.3770291209220886, "rewards/helpfulness_reward/mean": 5.38568115234375, "rewards/helpfulness_reward/std": 1.4873838424682617, "rewards/safety_reward/mean": 7.4263916015625, "rewards/safety_reward/std": 1.48210871219635, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 116.578125, "completions/mean_terminated_length": 116.578125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.17884900882019036, "frac_reward_zero_std": 0.0, "grad_norm": 0.39197248220443726, "kl": 3.052734375, "learning_rate": 5e-05, "loss": 0.0315, "num_tokens": 13319979.0, "reward": 5.766845703125, "reward_std": 0.270682156085968, "rewards/helpfulness_reward/mean": 5.766845703125, "rewards/helpfulness_reward/std": 0.44284388422966003, "rewards/safety_reward/mean": 7.4208984375, "rewards/safety_reward/std": 0.6547888517379761, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 114.15625, "completions/mean_terminated_length": 114.15625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.1791983232905423, "frac_reward_zero_std": 0.0, "grad_norm": 0.4614051878452301, "kl": 3.208984375, "learning_rate": 5e-05, "loss": 0.0322, "num_tokens": 13340503.0, "reward": 5.6259765625, "reward_std": 0.29119011759757996, "rewards/helpfulness_reward/mean": 5.6259765625, "rewards/helpfulness_reward/std": 0.6176728010177612, "rewards/safety_reward/mean": 7.452880859375, "rewards/safety_reward/std": 0.474448561668396, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 114.140625, "completions/mean_terminated_length": 114.140625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.17954763776089425, "frac_reward_zero_std": 0.0, "grad_norm": 0.4769397974014282, "kl": 3.12890625, "learning_rate": 5e-05, "loss": 0.0187, "num_tokens": 13359633.0, "reward": 5.842041015625, "reward_std": 0.24745029211044312, "rewards/helpfulness_reward/mean": 5.842041015625, "rewards/helpfulness_reward/std": 0.5287954211235046, "rewards/safety_reward/mean": 7.590576171875, "rewards/safety_reward/std": 0.5838289856910706, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 113.125, "completions/mean_terminated_length": 113.125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.1798969522312462, "frac_reward_zero_std": 0.0, "grad_norm": 0.4916512072086334, "kl": 2.93359375, "learning_rate": 5e-05, "loss": 0.0252, "num_tokens": 13378913.0, "reward": 5.41925048828125, "reward_std": 0.325447678565979, "rewards/helpfulness_reward/mean": 5.41925048828125, "rewards/helpfulness_reward/std": 1.2447155714035034, "rewards/safety_reward/mean": 7.3029937744140625, "rewards/safety_reward/std": 1.0718798637390137, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 114.2578125, "completions/mean_terminated_length": 114.2578125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.1802462667015981, "frac_reward_zero_std": 0.0, "grad_norm": 0.5167246460914612, "kl": 3.025390625, "learning_rate": 5e-05, "loss": 0.0243, "num_tokens": 13398178.0, "reward": 5.766845703125, "reward_std": 0.30233097076416016, "rewards/helpfulness_reward/mean": 5.766845703125, "rewards/helpfulness_reward/std": 0.5938199758529663, "rewards/safety_reward/mean": 7.758544921875, "rewards/safety_reward/std": 0.6024438142776489, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 116.0546875, "completions/mean_terminated_length": 116.0546875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.18059558117195004, "frac_reward_zero_std": 0.0, "grad_norm": 0.5414826273918152, "kl": 3.021484375, "learning_rate": 5e-05, "loss": 0.0393, "num_tokens": 13416849.0, "reward": 5.74658203125, "reward_std": 0.28169697523117065, "rewards/helpfulness_reward/mean": 5.74658203125, "rewards/helpfulness_reward/std": 0.4711672365665436, "rewards/safety_reward/mean": 7.559326171875, "rewards/safety_reward/std": 0.5539713501930237, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 114.9765625, "completions/mean_terminated_length": 114.9765625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.18094489564230198, "frac_reward_zero_std": 0.0, "grad_norm": 0.47362253069877625, "kl": 3.0390625, "learning_rate": 5e-05, "loss": 0.0217, "num_tokens": 13435350.0, "reward": 5.907958984375, "reward_std": 0.2164742648601532, "rewards/helpfulness_reward/mean": 5.907958984375, "rewards/helpfulness_reward/std": 0.4511425495147705, "rewards/safety_reward/mean": 7.737548828125, "rewards/safety_reward/std": 0.551906168460846, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 116.5859375, "completions/mean_terminated_length": 116.5859375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.18129421011265393, "frac_reward_zero_std": 0.0, "grad_norm": 0.4599391520023346, "kl": 2.919921875, "learning_rate": 5e-05, "loss": 0.0151, "num_tokens": 13453809.0, "reward": 5.8203125, "reward_std": 0.2334023118019104, "rewards/helpfulness_reward/mean": 5.8203125, "rewards/helpfulness_reward/std": 0.542670488357544, "rewards/safety_reward/mean": 7.517578125, "rewards/safety_reward/std": 0.604343056678772, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 109.203125, "completions/mean_terminated_length": 109.203125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.18164352458300584, "frac_reward_zero_std": 0.0, "grad_norm": 3.4553306102752686, "kl": 4.169921875, "learning_rate": 5e-05, "loss": 0.0499, "num_tokens": 13473363.0, "reward": 5.76806640625, "reward_std": 0.301630437374115, "rewards/helpfulness_reward/mean": 5.76806640625, "rewards/helpfulness_reward/std": 0.9096512198448181, "rewards/safety_reward/mean": 7.352294921875, "rewards/safety_reward/std": 1.5138651132583618, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 116.078125, "completions/mean_terminated_length": 116.078125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.18199283905335778, "frac_reward_zero_std": 0.0, "grad_norm": 0.4709230661392212, "kl": 3.119140625, "learning_rate": 5e-05, "loss": 0.033, "num_tokens": 13493589.0, "reward": 5.649169921875, "reward_std": 0.2374982237815857, "rewards/helpfulness_reward/mean": 5.649169921875, "rewards/helpfulness_reward/std": 0.5054485201835632, "rewards/safety_reward/mean": 7.7255859375, "rewards/safety_reward/std": 0.40773195028305054, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 116.484375, "completions/mean_terminated_length": 116.484375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.18234215352370972, "frac_reward_zero_std": 0.0, "grad_norm": 0.45268115401268005, "kl": 2.896484375, "learning_rate": 5e-05, "loss": 0.0274, "num_tokens": 13512355.0, "reward": 5.712646484375, "reward_std": 0.24039064347743988, "rewards/helpfulness_reward/mean": 5.712646484375, "rewards/helpfulness_reward/std": 0.4620678424835205, "rewards/safety_reward/mean": 7.66845703125, "rewards/safety_reward/std": 0.4493993818759918, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 116.9765625, "completions/mean_terminated_length": 116.9765625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.18269146799406166, "frac_reward_zero_std": 0.0, "grad_norm": 0.4189705550670624, "kl": 2.90625, "learning_rate": 5e-05, "loss": 0.0275, "num_tokens": 13531288.0, "reward": 5.8533935546875, "reward_std": 0.29561734199523926, "rewards/helpfulness_reward/mean": 5.8533935546875, "rewards/helpfulness_reward/std": 0.47579237818717957, "rewards/safety_reward/mean": 7.638916015625, "rewards/safety_reward/std": 0.723689615726471, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 118.109375, "completions/mean_terminated_length": 118.109375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.18304078246441358, "frac_reward_zero_std": 0.0, "grad_norm": 0.43883660435676575, "kl": 2.919921875, "learning_rate": 5e-05, "loss": 0.028, "num_tokens": 13550078.0, "reward": 6.00244140625, "reward_std": 0.26015615463256836, "rewards/helpfulness_reward/mean": 6.00244140625, "rewards/helpfulness_reward/std": 0.4346541464328766, "rewards/safety_reward/mean": 7.79150390625, "rewards/safety_reward/std": 0.4380113184452057, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 119.2578125, "completions/mean_terminated_length": 119.2578125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.18339009693476552, "frac_reward_zero_std": 0.0, "grad_norm": 0.4870316684246063, "kl": 2.947265625, "learning_rate": 5e-05, "loss": 0.0292, "num_tokens": 13568823.0, "reward": 6.05517578125, "reward_std": 0.23098644614219666, "rewards/helpfulness_reward/mean": 6.05517578125, "rewards/helpfulness_reward/std": 0.49352169036865234, "rewards/safety_reward/mean": 7.62451171875, "rewards/safety_reward/std": 0.6714068651199341, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 116.6015625, "completions/mean_terminated_length": 116.6015625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.18373941140511746, "frac_reward_zero_std": 0.0, "grad_norm": 0.47698888182640076, "kl": 2.962890625, "learning_rate": 5e-05, "loss": 0.0339, "num_tokens": 13587964.0, "reward": 5.83642578125, "reward_std": 0.2811238765716553, "rewards/helpfulness_reward/mean": 5.83642578125, "rewards/helpfulness_reward/std": 0.5077900290489197, "rewards/safety_reward/mean": 7.603271484375, "rewards/safety_reward/std": 0.44531404972076416, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 117.5, "completions/mean_terminated_length": 117.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.1840887258754694, "frac_reward_zero_std": 0.0, "grad_norm": 0.4554954171180725, "kl": 2.888671875, "learning_rate": 5e-05, "loss": 0.0241, "num_tokens": 13608508.0, "reward": 5.804443359375, "reward_std": 0.23650194704532623, "rewards/helpfulness_reward/mean": 5.804443359375, "rewards/helpfulness_reward/std": 0.4409357011318207, "rewards/safety_reward/mean": 7.517333984375, "rewards/safety_reward/std": 0.49326181411743164, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 115.84375, "completions/mean_terminated_length": 115.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.1844380403458213, "frac_reward_zero_std": 0.0, "grad_norm": 0.45553702116012573, "kl": 2.984375, "learning_rate": 5e-05, "loss": 0.0212, "num_tokens": 13628936.0, "reward": 5.7279052734375, "reward_std": 0.2663421034812927, "rewards/helpfulness_reward/mean": 5.7279052734375, "rewards/helpfulness_reward/std": 0.7143247127532959, "rewards/safety_reward/mean": 7.739990234375, "rewards/safety_reward/std": 0.8056659698486328, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 119.28125, "completions/mean_terminated_length": 119.28125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.18478735481617325, "frac_reward_zero_std": 0.0, "grad_norm": 0.7031991481781006, "kl": 3.1484375, "learning_rate": 5e-05, "loss": 0.04, "num_tokens": 13649540.0, "reward": 5.99169921875, "reward_std": 0.2635633945465088, "rewards/helpfulness_reward/mean": 5.99169921875, "rewards/helpfulness_reward/std": 0.5067896246910095, "rewards/safety_reward/mean": 7.858154296875, "rewards/safety_reward/std": 0.43682989478111267, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 120.1171875, "completions/mean_terminated_length": 120.1171875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.1851366692865252, "frac_reward_zero_std": 0.0, "grad_norm": 0.4285707175731659, "kl": 3.0078125, "learning_rate": 5e-05, "loss": 0.036, "num_tokens": 13668395.0, "reward": 5.953369140625, "reward_std": 0.269710898399353, "rewards/helpfulness_reward/mean": 5.953369140625, "rewards/helpfulness_reward/std": 0.4031814932823181, "rewards/safety_reward/mean": 7.73681640625, "rewards/safety_reward/std": 0.5464211106300354, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 118.9296875, "completions/mean_terminated_length": 118.9296875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.18548598375687714, "frac_reward_zero_std": 0.0, "grad_norm": 0.4745829999446869, "kl": 3.181640625, "learning_rate": 5e-05, "loss": 0.0267, "num_tokens": 13688002.0, "reward": 5.90234375, "reward_std": 0.24511462450027466, "rewards/helpfulness_reward/mean": 5.90234375, "rewards/helpfulness_reward/std": 0.5789772868156433, "rewards/safety_reward/mean": 7.69775390625, "rewards/safety_reward/std": 0.6378852725028992, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 118.21875, "completions/mean_terminated_length": 118.21875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.18583529822722905, "frac_reward_zero_std": 0.0, "grad_norm": 0.9596479535102844, "kl": 3.283203125, "learning_rate": 5e-05, "loss": 0.0397, "num_tokens": 13706998.0, "reward": 5.9912109375, "reward_std": 0.2206529676914215, "rewards/helpfulness_reward/mean": 5.9912109375, "rewards/helpfulness_reward/std": 0.5062651634216309, "rewards/safety_reward/mean": 7.8095703125, "rewards/safety_reward/std": 0.47568583488464355, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 118.171875, "completions/mean_terminated_length": 118.171875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.186184612697581, "frac_reward_zero_std": 0.0, "grad_norm": 0.42870157957077026, "kl": 2.96875, "learning_rate": 5e-05, "loss": 0.0315, "num_tokens": 13726956.0, "reward": 5.976318359375, "reward_std": 0.2891596555709839, "rewards/helpfulness_reward/mean": 5.976318359375, "rewards/helpfulness_reward/std": 0.5238050222396851, "rewards/safety_reward/mean": 7.874267578125, "rewards/safety_reward/std": 0.4932252764701843, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 117.5859375, "completions/mean_terminated_length": 117.5859375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.18653392716793293, "frac_reward_zero_std": 0.0, "grad_norm": 0.45813053846359253, "kl": 3.0546875, "learning_rate": 5e-05, "loss": 0.031, "num_tokens": 13746023.0, "reward": 5.9783935546875, "reward_std": 0.3713321089744568, "rewards/helpfulness_reward/mean": 5.9783935546875, "rewards/helpfulness_reward/std": 0.7174010872840881, "rewards/safety_reward/mean": 7.8232421875, "rewards/safety_reward/std": 0.6290767192840576, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 114.3203125, "completions/mean_terminated_length": 114.3203125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.18688324163828487, "frac_reward_zero_std": 0.0, "grad_norm": 0.41658106446266174, "kl": 3.06640625, "learning_rate": 5e-05, "loss": 0.0314, "num_tokens": 13766256.0, "reward": 6.04150390625, "reward_std": 0.30480971932411194, "rewards/helpfulness_reward/mean": 6.04150390625, "rewards/helpfulness_reward/std": 0.48634323477745056, "rewards/safety_reward/mean": 7.856201171875, "rewards/safety_reward/std": 0.5098280310630798, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 113.1328125, "completions/mean_terminated_length": 113.1328125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.1872325561086368, "frac_reward_zero_std": 0.0, "grad_norm": 0.5507491827011108, "kl": 3.255859375, "learning_rate": 5e-05, "loss": 0.0263, "num_tokens": 13784713.0, "reward": 5.927734375, "reward_std": 0.28575849533081055, "rewards/helpfulness_reward/mean": 5.927734375, "rewards/helpfulness_reward/std": 0.48032382130622864, "rewards/safety_reward/mean": 7.787353515625, "rewards/safety_reward/std": 0.5338426828384399, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 113.6171875, "completions/mean_terminated_length": 113.6171875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.18758187057898873, "frac_reward_zero_std": 0.0, "grad_norm": 0.5119368433952332, "kl": 2.998046875, "learning_rate": 5e-05, "loss": 0.0359, "num_tokens": 13804816.0, "reward": 5.7611083984375, "reward_std": 0.2969673275947571, "rewards/helpfulness_reward/mean": 5.7611083984375, "rewards/helpfulness_reward/std": 0.6716593503952026, "rewards/safety_reward/mean": 7.651611328125, "rewards/safety_reward/std": 0.6248780488967896, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 112.453125, "completions/mean_terminated_length": 112.453125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.18793118504934067, "frac_reward_zero_std": 0.0, "grad_norm": 0.4862171709537506, "kl": 3.11328125, "learning_rate": 5e-05, "loss": 0.0215, "num_tokens": 13823626.0, "reward": 5.8746337890625, "reward_std": 0.2870679497718811, "rewards/helpfulness_reward/mean": 5.8746337890625, "rewards/helpfulness_reward/std": 0.7758808135986328, "rewards/safety_reward/mean": 7.561279296875, "rewards/safety_reward/std": 0.7336471676826477, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 111.296875, "completions/mean_terminated_length": 111.296875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.1882804995196926, "frac_reward_zero_std": 0.0, "grad_norm": 0.4966675341129303, "kl": 3.08203125, "learning_rate": 5e-05, "loss": 0.0143, "num_tokens": 13841960.0, "reward": 6.00146484375, "reward_std": 0.28078997135162354, "rewards/helpfulness_reward/mean": 6.00146484375, "rewards/helpfulness_reward/std": 0.4903544783592224, "rewards/safety_reward/mean": 7.662841796875, "rewards/safety_reward/std": 0.6156242489814758, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 113.4140625, "completions/mean_terminated_length": 113.4140625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.18862981399004453, "frac_reward_zero_std": 0.0, "grad_norm": 0.45178499817848206, "kl": 2.98046875, "learning_rate": 5e-05, "loss": 0.0179, "num_tokens": 13860717.0, "reward": 5.955810546875, "reward_std": 0.26137810945510864, "rewards/helpfulness_reward/mean": 5.955810546875, "rewards/helpfulness_reward/std": 0.48300227522850037, "rewards/safety_reward/mean": 7.77734375, "rewards/safety_reward/std": 0.4522608518600464, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 111.8984375, "completions/mean_terminated_length": 111.8984375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.18897912846039647, "frac_reward_zero_std": 0.0, "grad_norm": 8.731077194213867, "kl": 6.2109375, "learning_rate": 5e-05, "loss": 0.0521, "num_tokens": 13878816.0, "reward": 5.9873046875, "reward_std": 0.3155186176300049, "rewards/helpfulness_reward/mean": 5.9873046875, "rewards/helpfulness_reward/std": 0.42060747742652893, "rewards/safety_reward/mean": 7.63037109375, "rewards/safety_reward/std": 0.5045180916786194, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 112.84375, "completions/mean_terminated_length": 112.84375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.1893284429307484, "frac_reward_zero_std": 0.0, "grad_norm": 0.6591163277626038, "kl": 3.515625, "learning_rate": 5e-05, "loss": 0.0273, "num_tokens": 13897372.0, "reward": 5.846435546875, "reward_std": 0.29283565282821655, "rewards/helpfulness_reward/mean": 5.846435546875, "rewards/helpfulness_reward/std": 0.4611677825450897, "rewards/safety_reward/mean": 7.681640625, "rewards/safety_reward/std": 0.4633755087852478, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 113.40625, "completions/mean_terminated_length": 113.40625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.18967775740110035, "frac_reward_zero_std": 0.0, "grad_norm": 0.5195531845092773, "kl": 3.408203125, "learning_rate": 5e-05, "loss": 0.025, "num_tokens": 13918376.0, "reward": 5.9810791015625, "reward_std": 0.24439787864685059, "rewards/helpfulness_reward/mean": 5.9810791015625, "rewards/helpfulness_reward/std": 0.6802155375480652, "rewards/safety_reward/mean": 7.48486328125, "rewards/safety_reward/std": 0.850245475769043, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 120.765625, "completions/mean_terminated_length": 107.6063003540039, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.19002707187145226, "frac_reward_zero_std": 0.0, "grad_norm": 0.47191107273101807, "kl": 3.068359375, "learning_rate": 5e-05, "loss": -0.0082, "num_tokens": 13939034.0, "reward": 5.640380859375, "reward_std": 0.26165974140167236, "rewards/helpfulness_reward/mean": 5.640380859375, "rewards/helpfulness_reward/std": 1.299704909324646, "rewards/safety_reward/mean": 7.392730712890625, "rewards/safety_reward/std": 1.835518717765808, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 115.5390625, "completions/mean_terminated_length": 115.5390625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.1903763863418042, "frac_reward_zero_std": 0.0, "grad_norm": 0.4657008945941925, "kl": 3.20703125, "learning_rate": 5e-05, "loss": 0.0292, "num_tokens": 13957487.0, "reward": 6.013916015625, "reward_std": 0.2711719870567322, "rewards/helpfulness_reward/mean": 6.013916015625, "rewards/helpfulness_reward/std": 0.5157880187034607, "rewards/safety_reward/mean": 7.76708984375, "rewards/safety_reward/std": 0.5673302412033081, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 115.7890625, "completions/mean_terminated_length": 115.7890625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.19072570081215615, "frac_reward_zero_std": 0.0, "grad_norm": 0.4511070251464844, "kl": 3.0625, "learning_rate": 5e-05, "loss": 0.026, "num_tokens": 13976052.0, "reward": 5.985107421875, "reward_std": 0.2208770215511322, "rewards/helpfulness_reward/mean": 5.985107421875, "rewards/helpfulness_reward/std": 0.5336251854896545, "rewards/safety_reward/mean": 7.617919921875, "rewards/safety_reward/std": 0.4962366819381714, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 116.7265625, "completions/mean_terminated_length": 116.7265625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.1910750152825081, "frac_reward_zero_std": 0.0, "grad_norm": 0.4208590090274811, "kl": 3.08984375, "learning_rate": 5e-05, "loss": 0.0286, "num_tokens": 13995601.0, "reward": 5.80078125, "reward_std": 0.3338882327079773, "rewards/helpfulness_reward/mean": 5.80078125, "rewards/helpfulness_reward/std": 0.7023029327392578, "rewards/safety_reward/mean": 7.397216796875, "rewards/safety_reward/std": 0.725698709487915, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 116.84375, "completions/mean_terminated_length": 116.84375, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.19142432975286, "frac_reward_zero_std": 0.0, "grad_norm": 0.48561468720436096, "kl": 3.099609375, "learning_rate": 5e-05, "loss": 0.0233, "num_tokens": 14016373.0, "reward": 5.661308288574219, "reward_std": 0.25800806283950806, "rewards/helpfulness_reward/mean": 5.661308288574219, "rewards/helpfulness_reward/std": 1.3408745527267456, "rewards/safety_reward/mean": 7.287750244140625, "rewards/safety_reward/std": 1.7422142028808594, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 116.375, "completions/mean_terminated_length": 116.375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.19177364422321194, "frac_reward_zero_std": 0.0, "grad_norm": 0.38609328866004944, "kl": 3.091796875, "learning_rate": 5e-05, "loss": 0.0206, "num_tokens": 14035277.0, "reward": 5.837646484375, "reward_std": 0.22968222200870514, "rewards/helpfulness_reward/mean": 5.837646484375, "rewards/helpfulness_reward/std": 0.5387462973594666, "rewards/safety_reward/mean": 7.714599609375, "rewards/safety_reward/std": 0.4471030533313751, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 113.3046875, "completions/mean_terminated_length": 113.3046875, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.19212295869356388, "frac_reward_zero_std": 0.0, "grad_norm": 0.487111359834671, "kl": 3.26953125, "learning_rate": 5e-05, "loss": 0.0343, "num_tokens": 14054460.0, "reward": 5.67193603515625, "reward_std": 0.30938348174095154, "rewards/helpfulness_reward/mean": 5.67193603515625, "rewards/helpfulness_reward/std": 1.133800983428955, "rewards/safety_reward/mean": 7.281982421875, "rewards/safety_reward/std": 1.4585946798324585, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 112.2109375, "completions/mean_terminated_length": 112.2109375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.19247227316391582, "frac_reward_zero_std": 0.0, "grad_norm": 0.6190647482872009, "kl": 3.310546875, "learning_rate": 5e-05, "loss": 0.0288, "num_tokens": 14073183.0, "reward": 5.862602233886719, "reward_std": 0.3033367991447449, "rewards/helpfulness_reward/mean": 5.862602233886719, "rewards/helpfulness_reward/std": 1.382896900177002, "rewards/safety_reward/mean": 7.422027587890625, "rewards/safety_reward/std": 1.987120509147644, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 116.171875, "completions/mean_terminated_length": 116.171875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.19282158763426774, "frac_reward_zero_std": 0.0, "grad_norm": 0.4009423851966858, "kl": 3.001953125, "learning_rate": 5e-05, "loss": 0.0245, "num_tokens": 14094693.0, "reward": 6.0224609375, "reward_std": 0.2660723924636841, "rewards/helpfulness_reward/mean": 6.0224609375, "rewards/helpfulness_reward/std": 0.48294514417648315, "rewards/safety_reward/mean": 7.787353515625, "rewards/safety_reward/std": 0.47010236978530884, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 116.0546875, "completions/mean_terminated_length": 116.0546875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.19317090210461968, "frac_reward_zero_std": 0.0, "grad_norm": 0.47365278005599976, "kl": 2.98828125, "learning_rate": 5e-05, "loss": 0.0266, "num_tokens": 14113516.0, "reward": 6.0634765625, "reward_std": 0.22869673371315002, "rewards/helpfulness_reward/mean": 6.0634765625, "rewards/helpfulness_reward/std": 0.40618255734443665, "rewards/safety_reward/mean": 7.81640625, "rewards/safety_reward/std": 0.5295610427856445, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 117.921875, "completions/mean_terminated_length": 117.921875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.19352021657497162, "frac_reward_zero_std": 0.0, "grad_norm": 0.4483865797519684, "kl": 2.95703125, "learning_rate": 5e-05, "loss": 0.0301, "num_tokens": 14132122.0, "reward": 6.037841796875, "reward_std": 0.22686466574668884, "rewards/helpfulness_reward/mean": 6.037841796875, "rewards/helpfulness_reward/std": 0.4297623634338379, "rewards/safety_reward/mean": 7.606201171875, "rewards/safety_reward/std": 0.551738977432251, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 118.5703125, "completions/mean_terminated_length": 118.5703125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.19386953104532356, "frac_reward_zero_std": 0.0, "grad_norm": 0.5120593905448914, "kl": 3.08203125, "learning_rate": 5e-05, "loss": 0.0274, "num_tokens": 14152483.0, "reward": 6.187744140625, "reward_std": 0.2343885749578476, "rewards/helpfulness_reward/mean": 6.187744140625, "rewards/helpfulness_reward/std": 0.5669592618942261, "rewards/safety_reward/mean": 7.904052734375, "rewards/safety_reward/std": 0.5853198766708374, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 117.5703125, "completions/mean_terminated_length": 117.5703125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.19421884551567548, "frac_reward_zero_std": 0.0, "grad_norm": 0.5225670337677002, "kl": 3.05859375, "learning_rate": 5e-05, "loss": 0.0306, "num_tokens": 14172444.0, "reward": 6.05029296875, "reward_std": 0.34236663579940796, "rewards/helpfulness_reward/mean": 6.05029296875, "rewards/helpfulness_reward/std": 0.5102636814117432, "rewards/safety_reward/mean": 7.693359375, "rewards/safety_reward/std": 0.6418466567993164, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 116.9296875, "completions/mean_terminated_length": 116.9296875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.19456815998602742, "frac_reward_zero_std": 0.0, "grad_norm": 0.4177369475364685, "kl": 2.91796875, "learning_rate": 5e-05, "loss": 0.0214, "num_tokens": 14191619.0, "reward": 6.036865234375, "reward_std": 0.22365909814834595, "rewards/helpfulness_reward/mean": 6.036865234375, "rewards/helpfulness_reward/std": 0.4616911709308624, "rewards/safety_reward/mean": 7.781005859375, "rewards/safety_reward/std": 0.43321681022644043, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 120.0546875, "completions/mean_terminated_length": 120.0546875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.19491747445637936, "frac_reward_zero_std": 0.0, "grad_norm": 0.393671452999115, "kl": 2.89453125, "learning_rate": 5e-05, "loss": 0.0344, "num_tokens": 14211610.0, "reward": 6.012451171875, "reward_std": 0.2686452269554138, "rewards/helpfulness_reward/mean": 6.012451171875, "rewards/helpfulness_reward/std": 0.44065824151039124, "rewards/safety_reward/mean": 7.842041015625, "rewards/safety_reward/std": 0.4403434991836548, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 119.6328125, "completions/mean_terminated_length": 119.6328125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.1952667889267313, "frac_reward_zero_std": 0.0, "grad_norm": 0.41906172037124634, "kl": 3.05859375, "learning_rate": 5e-05, "loss": 0.0342, "num_tokens": 14230347.0, "reward": 6.073974609375, "reward_std": 0.27076494693756104, "rewards/helpfulness_reward/mean": 6.073974609375, "rewards/helpfulness_reward/std": 0.45569702982902527, "rewards/safety_reward/mean": 7.81982421875, "rewards/safety_reward/std": 0.5604112148284912, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 120.734375, "completions/mean_terminated_length": 120.734375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.1956161033970832, "frac_reward_zero_std": 0.0, "grad_norm": 0.4047130048274994, "kl": 2.947265625, "learning_rate": 5e-05, "loss": 0.0342, "num_tokens": 14249009.0, "reward": 6.274169921875, "reward_std": 0.24834847450256348, "rewards/helpfulness_reward/mean": 6.274169921875, "rewards/helpfulness_reward/std": 0.40257489681243896, "rewards/safety_reward/mean": 7.767822265625, "rewards/safety_reward/std": 0.40160855650901794, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 119.6875, "completions/mean_terminated_length": 119.6875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.19596541786743515, "frac_reward_zero_std": 0.0, "grad_norm": 0.5014277696609497, "kl": 3.171875, "learning_rate": 5e-05, "loss": 0.0314, "num_tokens": 14267969.0, "reward": 6.035400390625, "reward_std": 0.29250815510749817, "rewards/helpfulness_reward/mean": 6.035400390625, "rewards/helpfulness_reward/std": 0.5051008462905884, "rewards/safety_reward/mean": 7.625732421875, "rewards/safety_reward/std": 0.6671959161758423, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 119.359375, "completions/mean_terminated_length": 119.359375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.1963147323377871, "frac_reward_zero_std": 0.0, "grad_norm": 0.4028889536857605, "kl": 2.931640625, "learning_rate": 5e-05, "loss": 0.03, "num_tokens": 14287119.0, "reward": 6.163330078125, "reward_std": 0.2544204592704773, "rewards/helpfulness_reward/mean": 6.163330078125, "rewards/helpfulness_reward/std": 0.47632840275764465, "rewards/safety_reward/mean": 7.851806640625, "rewards/safety_reward/std": 0.5440467000007629, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 118.59375, "completions/mean_terminated_length": 118.59375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.19666404680813904, "frac_reward_zero_std": 0.0, "grad_norm": 0.4726410508155823, "kl": 3.111328125, "learning_rate": 5e-05, "loss": 0.0278, "num_tokens": 14306531.0, "reward": 6.010009765625, "reward_std": 0.25275352597236633, "rewards/helpfulness_reward/mean": 6.010009765625, "rewards/helpfulness_reward/std": 0.5118991136550903, "rewards/safety_reward/mean": 7.664306640625, "rewards/safety_reward/std": 0.47036704421043396, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 119.3515625, "completions/mean_terminated_length": 119.3515625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.19701336127849095, "frac_reward_zero_std": 0.0, "grad_norm": 0.46090126037597656, "kl": 3.2734375, "learning_rate": 5e-05, "loss": 0.0334, "num_tokens": 14325264.0, "reward": 6.130126953125, "reward_std": 0.22748123109340668, "rewards/helpfulness_reward/mean": 6.130126953125, "rewards/helpfulness_reward/std": 0.49652424454689026, "rewards/safety_reward/mean": 7.813720703125, "rewards/safety_reward/std": 0.5530622601509094, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 118.5703125, "completions/mean_terminated_length": 118.5703125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.1973626757488429, "frac_reward_zero_std": 0.0, "grad_norm": 0.3701933026313782, "kl": 3.1015625, "learning_rate": 5e-05, "loss": 0.0261, "num_tokens": 14344089.0, "reward": 6.223388671875, "reward_std": 0.2712697386741638, "rewards/helpfulness_reward/mean": 6.223388671875, "rewards/helpfulness_reward/std": 0.45137736201286316, "rewards/safety_reward/mean": 7.844970703125, "rewards/safety_reward/std": 0.6997433304786682, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 118.109375, "completions/mean_terminated_length": 118.109375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.19771199021919483, "frac_reward_zero_std": 0.0, "grad_norm": 0.5640650987625122, "kl": 3.16015625, "learning_rate": 5e-05, "loss": 0.0202, "num_tokens": 14363087.0, "reward": 6.283203125, "reward_std": 0.24287907779216766, "rewards/helpfulness_reward/mean": 6.283203125, "rewards/helpfulness_reward/std": 0.5520790815353394, "rewards/safety_reward/mean": 7.950927734375, "rewards/safety_reward/std": 0.5034282803535461, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 116.6484375, "completions/mean_terminated_length": 116.6484375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.19806130468954677, "frac_reward_zero_std": 0.0, "grad_norm": 0.4572945237159729, "kl": 2.96484375, "learning_rate": 5e-05, "loss": 0.0184, "num_tokens": 14384562.0, "reward": 5.98046875, "reward_std": 0.269226998090744, "rewards/helpfulness_reward/mean": 5.98046875, "rewards/helpfulness_reward/std": 0.648926854133606, "rewards/safety_reward/mean": 7.830322265625, "rewards/safety_reward/std": 0.6339381337165833, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 116.59375, "completions/mean_terminated_length": 116.59375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.1984106191598987, "frac_reward_zero_std": 0.0, "grad_norm": 0.4959539473056793, "kl": 3.18359375, "learning_rate": 5e-05, "loss": 0.0239, "num_tokens": 14403334.0, "reward": 6.264892578125, "reward_std": 0.23557306826114655, "rewards/helpfulness_reward/mean": 6.264892578125, "rewards/helpfulness_reward/std": 0.5494713187217712, "rewards/safety_reward/mean": 7.906005859375, "rewards/safety_reward/std": 0.4467438757419586, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 118.5234375, "completions/mean_terminated_length": 118.5234375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.19875993363025063, "frac_reward_zero_std": 0.0, "grad_norm": 0.5292760133743286, "kl": 3.169921875, "learning_rate": 5e-05, "loss": 0.0289, "num_tokens": 14422361.0, "reward": 6.177001953125, "reward_std": 0.23566538095474243, "rewards/helpfulness_reward/mean": 6.177001953125, "rewards/helpfulness_reward/std": 0.419487863779068, "rewards/safety_reward/mean": 7.7197265625, "rewards/safety_reward/std": 0.5165646076202393, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 837.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 123.8515625, "completions/mean_terminated_length": 123.8515625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.19910924810060257, "frac_reward_zero_std": 0.0, "grad_norm": 0.3905145227909088, "kl": 2.9296875, "learning_rate": 5e-05, "loss": 0.0075, "num_tokens": 14445526.0, "reward": 5.7202911376953125, "reward_std": 0.25406527519226074, "rewards/helpfulness_reward/mean": 5.7202911376953125, "rewards/helpfulness_reward/std": 1.4808140993118286, "rewards/safety_reward/mean": 7.356231689453125, "rewards/safety_reward/std": 1.6835740804672241, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 117.6484375, "completions/mean_terminated_length": 117.6484375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.1994585625709545, "frac_reward_zero_std": 0.0, "grad_norm": 0.651841938495636, "kl": 3.400390625, "learning_rate": 5e-05, "loss": 0.0263, "num_tokens": 14464065.0, "reward": 6.1160888671875, "reward_std": 0.28253674507141113, "rewards/helpfulness_reward/mean": 6.1160888671875, "rewards/helpfulness_reward/std": 0.5057634115219116, "rewards/safety_reward/mean": 7.970458984375, "rewards/safety_reward/std": 0.49302011728286743, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 117.7265625, "completions/mean_terminated_length": 117.7265625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.19980787704130643, "frac_reward_zero_std": 0.0, "grad_norm": 0.3773845434188843, "kl": 3.15625, "learning_rate": 5e-05, "loss": 0.0319, "num_tokens": 14482966.0, "reward": 6.076904296875, "reward_std": 0.2242600917816162, "rewards/helpfulness_reward/mean": 6.076904296875, "rewards/helpfulness_reward/std": 0.5554831027984619, "rewards/safety_reward/mean": 7.93994140625, "rewards/safety_reward/std": 0.46040987968444824, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 118.9140625, "completions/mean_terminated_length": 118.9140625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.20015719151165837, "frac_reward_zero_std": 0.0, "grad_norm": 0.46856945753097534, "kl": 3.2734375, "learning_rate": 5e-05, "loss": 0.0385, "num_tokens": 14502027.0, "reward": 6.264892578125, "reward_std": 0.26982441544532776, "rewards/helpfulness_reward/mean": 6.264892578125, "rewards/helpfulness_reward/std": 0.4233322739601135, "rewards/safety_reward/mean": 7.90869140625, "rewards/safety_reward/std": 0.5490463972091675, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 119.0703125, "completions/mean_terminated_length": 119.0703125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.2005065059820103, "frac_reward_zero_std": 0.0, "grad_norm": 0.699648916721344, "kl": 3.4296875, "learning_rate": 5e-05, "loss": 0.0394, "num_tokens": 14521580.0, "reward": 6.12890625, "reward_std": 0.23937514424324036, "rewards/helpfulness_reward/mean": 6.12890625, "rewards/helpfulness_reward/std": 0.5431945323944092, "rewards/safety_reward/mean": 7.87548828125, "rewards/safety_reward/std": 0.4532116949558258, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 118.5390625, "completions/mean_terminated_length": 118.5390625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.20085582045236225, "frac_reward_zero_std": 0.0, "grad_norm": 0.588413655757904, "kl": 3.47265625, "learning_rate": 5e-05, "loss": 0.0338, "num_tokens": 14542521.0, "reward": 6.043241500854492, "reward_std": 0.38931718468666077, "rewards/helpfulness_reward/mean": 6.043241500854492, "rewards/helpfulness_reward/std": 1.366448163986206, "rewards/safety_reward/mean": 7.4976959228515625, "rewards/safety_reward/std": 1.5678980350494385, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 118.9296875, "completions/mean_terminated_length": 118.9296875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.20120513492271416, "frac_reward_zero_std": 0.0, "grad_norm": 0.44245803356170654, "kl": 3.171875, "learning_rate": 5e-05, "loss": 0.0253, "num_tokens": 14562408.0, "reward": 5.965087890625, "reward_std": 0.21449780464172363, "rewards/helpfulness_reward/mean": 5.965087890625, "rewards/helpfulness_reward/std": 0.4089876115322113, "rewards/safety_reward/mean": 7.80810546875, "rewards/safety_reward/std": 0.47460636496543884, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 118.2734375, "completions/mean_terminated_length": 118.2734375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.2015544493930661, "frac_reward_zero_std": 0.0, "grad_norm": 0.4120824337005615, "kl": 3.3125, "learning_rate": 5e-05, "loss": 0.0342, "num_tokens": 14581915.0, "reward": 6.0791015625, "reward_std": 0.21361960470676422, "rewards/helpfulness_reward/mean": 6.0791015625, "rewards/helpfulness_reward/std": 0.5339417457580566, "rewards/safety_reward/mean": 7.6552734375, "rewards/safety_reward/std": 0.5231174230575562, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 116.8984375, "completions/mean_terminated_length": 116.8984375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.20190376386341805, "frac_reward_zero_std": 0.0, "grad_norm": 0.3898095488548279, "kl": 3.2578125, "learning_rate": 5e-05, "loss": 0.0229, "num_tokens": 14603654.0, "reward": 6.21142578125, "reward_std": 0.24348512291908264, "rewards/helpfulness_reward/mean": 6.21142578125, "rewards/helpfulness_reward/std": 0.6916535496711731, "rewards/safety_reward/mean": 7.79443359375, "rewards/safety_reward/std": 0.8249221444129944, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 117.7109375, "completions/mean_terminated_length": 117.7109375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.20225307833377, "frac_reward_zero_std": 0.0, "grad_norm": 0.3882938325405121, "kl": 3.232421875, "learning_rate": 5e-05, "loss": 0.0311, "num_tokens": 14622817.0, "reward": 5.9560546875, "reward_std": 0.24021267890930176, "rewards/helpfulness_reward/mean": 5.9560546875, "rewards/helpfulness_reward/std": 0.47424498200416565, "rewards/safety_reward/mean": 7.88037109375, "rewards/safety_reward/std": 0.4852193593978882, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 117.90625, "completions/mean_terminated_length": 117.90625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.2026023928041219, "frac_reward_zero_std": 0.0, "grad_norm": 0.42653265595436096, "kl": 3.419921875, "learning_rate": 5e-05, "loss": 0.0312, "num_tokens": 14641397.0, "reward": 6.117431640625, "reward_std": 0.2492954283952713, "rewards/helpfulness_reward/mean": 6.117431640625, "rewards/helpfulness_reward/std": 0.5559790730476379, "rewards/safety_reward/mean": 7.60546875, "rewards/safety_reward/std": 0.5896499156951904, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 119.53125, "completions/mean_terminated_length": 119.53125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.20295170727447384, "frac_reward_zero_std": 0.0, "grad_norm": 0.39955583214759827, "kl": 3.0859375, "learning_rate": 5e-05, "loss": 0.0318, "num_tokens": 14660833.0, "reward": 6.103515625, "reward_std": 0.20720377564430237, "rewards/helpfulness_reward/mean": 6.103515625, "rewards/helpfulness_reward/std": 0.558270275592804, "rewards/safety_reward/mean": 7.659912109375, "rewards/safety_reward/std": 0.640597939491272, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 118.1015625, "completions/mean_terminated_length": 118.1015625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.20330102174482578, "frac_reward_zero_std": 0.0, "grad_norm": 0.4247634708881378, "kl": 3.173828125, "learning_rate": 5e-05, "loss": 0.0255, "num_tokens": 14679550.0, "reward": 6.37939453125, "reward_std": 0.23161275684833527, "rewards/helpfulness_reward/mean": 6.37939453125, "rewards/helpfulness_reward/std": 0.4786717891693115, "rewards/safety_reward/mean": 8.05908203125, "rewards/safety_reward/std": 0.3733196258544922, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 110.8125, "completions/mean_terminated_length": 110.8125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.20365033621517772, "frac_reward_zero_std": 0.0, "grad_norm": 0.5328911542892456, "kl": 3.4296875, "learning_rate": 5e-05, "loss": 0.0335, "num_tokens": 14699470.0, "reward": 5.607147216796875, "reward_std": 0.2929902672767639, "rewards/helpfulness_reward/mean": 5.607147216796875, "rewards/helpfulness_reward/std": 2.0534939765930176, "rewards/safety_reward/mean": 7.2357330322265625, "rewards/safety_reward/std": 2.2360763549804688, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 117.0078125, "completions/mean_terminated_length": 117.0078125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.20399965068552964, "frac_reward_zero_std": 0.0, "grad_norm": 0.5530322790145874, "kl": 3.48828125, "learning_rate": 5e-05, "loss": 0.0321, "num_tokens": 14718191.0, "reward": 6.215087890625, "reward_std": 0.25776612758636475, "rewards/helpfulness_reward/mean": 6.215087890625, "rewards/helpfulness_reward/std": 0.46993160247802734, "rewards/safety_reward/mean": 7.680419921875, "rewards/safety_reward/std": 0.5235329270362854, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 117.890625, "completions/mean_terminated_length": 117.890625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.20434896515588158, "frac_reward_zero_std": 0.0, "grad_norm": 0.4328213036060333, "kl": 3.19140625, "learning_rate": 5e-05, "loss": 0.0104, "num_tokens": 14736809.0, "reward": 6.1353759765625, "reward_std": 0.27969294786453247, "rewards/helpfulness_reward/mean": 6.1353759765625, "rewards/helpfulness_reward/std": 0.6508326530456543, "rewards/safety_reward/mean": 7.708740234375, "rewards/safety_reward/std": 0.7254420518875122, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 118.875, "completions/mean_terminated_length": 118.875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.20469827962623352, "frac_reward_zero_std": 0.0, "grad_norm": 0.6061602234840393, "kl": 3.52734375, "learning_rate": 5e-05, "loss": 0.0316, "num_tokens": 14755745.0, "reward": 6.15478515625, "reward_std": 0.2629871964454651, "rewards/helpfulness_reward/mean": 6.15478515625, "rewards/helpfulness_reward/std": 0.4065595269203186, "rewards/safety_reward/mean": 7.5, "rewards/safety_reward/std": 0.5996328592300415, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 117.890625, "completions/mean_terminated_length": 117.890625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.20504759409658546, "frac_reward_zero_std": 0.0, "grad_norm": 2.036149263381958, "kl": 3.693359375, "learning_rate": 5e-05, "loss": 0.0332, "num_tokens": 14775459.0, "reward": 6.133544921875, "reward_std": 0.2626320719718933, "rewards/helpfulness_reward/mean": 6.133544921875, "rewards/helpfulness_reward/std": 0.5577048063278198, "rewards/safety_reward/mean": 7.95166015625, "rewards/safety_reward/std": 0.5650349855422974, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 118.5, "completions/mean_terminated_length": 118.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.20539690856693738, "frac_reward_zero_std": 0.0, "grad_norm": 0.4120984971523285, "kl": 3.25, "learning_rate": 5e-05, "loss": 0.0307, "num_tokens": 14795027.0, "reward": 6.1416015625, "reward_std": 0.2348957508802414, "rewards/helpfulness_reward/mean": 6.1416015625, "rewards/helpfulness_reward/std": 0.4926866888999939, "rewards/safety_reward/mean": 7.96826171875, "rewards/safety_reward/std": 0.6065685749053955, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 118.1484375, "completions/mean_terminated_length": 118.1484375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.20574622303728932, "frac_reward_zero_std": 0.0, "grad_norm": 0.6733253002166748, "kl": 3.416015625, "learning_rate": 5e-05, "loss": 0.0321, "num_tokens": 14813974.0, "reward": 6.117919921875, "reward_std": 0.2306673675775528, "rewards/helpfulness_reward/mean": 6.117919921875, "rewards/helpfulness_reward/std": 0.540017306804657, "rewards/safety_reward/mean": 7.798583984375, "rewards/safety_reward/std": 0.5722687244415283, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 117.359375, "completions/mean_terminated_length": 117.359375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.20609553750764126, "frac_reward_zero_std": 0.0, "grad_norm": 0.43816572427749634, "kl": 3.2890625, "learning_rate": 5e-05, "loss": 0.0263, "num_tokens": 14833532.0, "reward": 6.22265625, "reward_std": 0.28401654958724976, "rewards/helpfulness_reward/mean": 6.22265625, "rewards/helpfulness_reward/std": 0.5193564891815186, "rewards/safety_reward/mean": 8.095947265625, "rewards/safety_reward/std": 0.5711574554443359, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 118.109375, "completions/mean_terminated_length": 118.109375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.2064448519779932, "frac_reward_zero_std": 0.0, "grad_norm": 0.43169909715652466, "kl": 3.216796875, "learning_rate": 5e-05, "loss": 0.0316, "num_tokens": 14852650.0, "reward": 6.312744140625, "reward_std": 0.24104243516921997, "rewards/helpfulness_reward/mean": 6.312744140625, "rewards/helpfulness_reward/std": 0.49934589862823486, "rewards/safety_reward/mean": 8.095703125, "rewards/safety_reward/std": 0.5001652836799622, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 110.9609375, "completions/mean_terminated_length": 110.9609375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.2067941664483451, "frac_reward_zero_std": 0.0, "grad_norm": 0.6389086246490479, "kl": 3.201171875, "learning_rate": 5e-05, "loss": 0.0106, "num_tokens": 14872541.0, "reward": 6.04791259765625, "reward_std": 0.24054138362407684, "rewards/helpfulness_reward/mean": 6.04791259765625, "rewards/helpfulness_reward/std": 1.0158687829971313, "rewards/safety_reward/mean": 7.7369384765625, "rewards/safety_reward/std": 1.5638198852539062, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 117.4453125, "completions/mean_terminated_length": 117.4453125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.20714348091869705, "frac_reward_zero_std": 0.0, "grad_norm": 0.5299161672592163, "kl": 3.19140625, "learning_rate": 5e-05, "loss": 0.0192, "num_tokens": 14891758.0, "reward": 6.43017578125, "reward_std": 0.30376362800598145, "rewards/helpfulness_reward/mean": 6.43017578125, "rewards/helpfulness_reward/std": 0.5252232551574707, "rewards/safety_reward/mean": 8.010986328125, "rewards/safety_reward/std": 0.7398591041564941, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 118.296875, "completions/mean_terminated_length": 118.296875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.207492795389049, "frac_reward_zero_std": 0.0, "grad_norm": 18.051015853881836, "kl": 7.75, "learning_rate": 5e-05, "loss": 0.0752, "num_tokens": 14911108.0, "reward": 5.999755859375, "reward_std": 0.3053683340549469, "rewards/helpfulness_reward/mean": 5.999755859375, "rewards/helpfulness_reward/std": 0.5661448836326599, "rewards/safety_reward/mean": 7.83642578125, "rewards/safety_reward/std": 0.5876837968826294, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 117.5625, "completions/mean_terminated_length": 117.5625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.20784210985940094, "frac_reward_zero_std": 0.0, "grad_norm": 0.48149794340133667, "kl": 3.265625, "learning_rate": 5e-05, "loss": 0.0269, "num_tokens": 14932060.0, "reward": 5.981658935546875, "reward_std": 0.3737162947654724, "rewards/helpfulness_reward/mean": 5.981658935546875, "rewards/helpfulness_reward/std": 1.0886223316192627, "rewards/safety_reward/mean": 7.51123046875, "rewards/safety_reward/std": 1.5302207469940186, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 118.359375, "completions/mean_terminated_length": 118.359375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.20819142432975285, "frac_reward_zero_std": 0.0, "grad_norm": 0.413831889629364, "kl": 3.146484375, "learning_rate": 5e-05, "loss": 0.0239, "num_tokens": 14951610.0, "reward": 6.221923828125, "reward_std": 0.3012981116771698, "rewards/helpfulness_reward/mean": 6.221923828125, "rewards/helpfulness_reward/std": 0.5831568837165833, "rewards/safety_reward/mean": 7.853271484375, "rewards/safety_reward/std": 0.6214163303375244, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 118.7265625, "completions/mean_terminated_length": 118.7265625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.2085407388001048, "frac_reward_zero_std": 0.0, "grad_norm": 0.9601730108261108, "kl": 3.513671875, "learning_rate": 5e-05, "loss": 0.0328, "num_tokens": 14970831.0, "reward": 6.11328125, "reward_std": 0.24721659719944, "rewards/helpfulness_reward/mean": 6.11328125, "rewards/helpfulness_reward/std": 0.46239957213401794, "rewards/safety_reward/mean": 7.66455078125, "rewards/safety_reward/std": 0.4579964876174927, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 119.265625, "completions/mean_terminated_length": 119.265625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.20889005327045673, "frac_reward_zero_std": 0.0, "grad_norm": 0.41411536931991577, "kl": 3.53515625, "learning_rate": 5e-05, "loss": 0.035, "num_tokens": 14990353.0, "reward": 6.0888671875, "reward_std": 0.34778642654418945, "rewards/helpfulness_reward/mean": 6.0888671875, "rewards/helpfulness_reward/std": 0.57149738073349, "rewards/safety_reward/mean": 7.890869140625, "rewards/safety_reward/std": 0.596534252166748, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 121.875, "completions/mean_terminated_length": 121.875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.20923936774080867, "frac_reward_zero_std": 0.0, "grad_norm": 1.14131498336792, "kl": 3.533203125, "learning_rate": 5e-05, "loss": 0.035, "num_tokens": 15011585.0, "reward": 5.85498046875, "reward_std": 0.32161980867385864, "rewards/helpfulness_reward/mean": 5.85498046875, "rewards/helpfulness_reward/std": 1.1272242069244385, "rewards/safety_reward/mean": 7.5697021484375, "rewards/safety_reward/std": 1.2452926635742188, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 120.421875, "completions/mean_terminated_length": 120.421875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.2095886822111606, "frac_reward_zero_std": 0.0, "grad_norm": 0.4207112193107605, "kl": 3.263671875, "learning_rate": 5e-05, "loss": 0.0347, "num_tokens": 15030471.0, "reward": 6.296630859375, "reward_std": 0.32845833897590637, "rewards/helpfulness_reward/mean": 6.296630859375, "rewards/helpfulness_reward/std": 0.7148289680480957, "rewards/safety_reward/mean": 8.0498046875, "rewards/safety_reward/std": 0.6649444699287415, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 120.21875, "completions/mean_terminated_length": 120.21875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.20993799668151253, "frac_reward_zero_std": 0.0, "grad_norm": 0.49950188398361206, "kl": 3.251953125, "learning_rate": 5e-05, "loss": 0.0343, "num_tokens": 15049691.0, "reward": 6.11279296875, "reward_std": 0.363620787858963, "rewards/helpfulness_reward/mean": 6.11279296875, "rewards/helpfulness_reward/std": 0.7100908160209656, "rewards/safety_reward/mean": 7.79443359375, "rewards/safety_reward/std": 0.7886666655540466, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 120.7421875, "completions/mean_terminated_length": 120.7421875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.21028731115186447, "frac_reward_zero_std": 0.0, "grad_norm": 0.46379354596138, "kl": 3.208984375, "learning_rate": 5e-05, "loss": 0.0342, "num_tokens": 15069018.0, "reward": 6.192626953125, "reward_std": 0.25702813267707825, "rewards/helpfulness_reward/mean": 6.192626953125, "rewards/helpfulness_reward/std": 0.5762985944747925, "rewards/safety_reward/mean": 7.929931640625, "rewards/safety_reward/std": 0.5217015743255615, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 121.3125, "completions/mean_terminated_length": 121.3125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.2106366256222164, "frac_reward_zero_std": 0.0, "grad_norm": 0.4046405553817749, "kl": 3.17578125, "learning_rate": 5e-05, "loss": 0.0344, "num_tokens": 15089202.0, "reward": 6.049072265625, "reward_std": 0.2991634011268616, "rewards/helpfulness_reward/mean": 6.049072265625, "rewards/helpfulness_reward/std": 0.43622711300849915, "rewards/safety_reward/mean": 7.7646484375, "rewards/safety_reward/std": 0.4937001168727875, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 120.4921875, "completions/mean_terminated_length": 120.4921875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.21098594009256832, "frac_reward_zero_std": 0.0, "grad_norm": 0.39065271615982056, "kl": 3.224609375, "learning_rate": 5e-05, "loss": 0.0306, "num_tokens": 15109393.0, "reward": 5.994384765625, "reward_std": 0.36217379570007324, "rewards/helpfulness_reward/mean": 5.994384765625, "rewards/helpfulness_reward/std": 0.6434103846549988, "rewards/safety_reward/mean": 8.023193359375, "rewards/safety_reward/std": 0.4968411326408386, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 142.0625, "completions/mean_terminated_length": 142.0625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.21133525456292027, "frac_reward_zero_std": 0.0, "grad_norm": 0.3581761419773102, "kl": 2.724609375, "learning_rate": 5e-05, "loss": 0.0455, "num_tokens": 15132985.0, "reward": 5.767202377319336, "reward_std": 0.27135229110717773, "rewards/helpfulness_reward/mean": 5.767202377319336, "rewards/helpfulness_reward/std": 1.6717233657836914, "rewards/safety_reward/mean": 7.327789306640625, "rewards/safety_reward/std": 2.280963659286499, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 121.8203125, "completions/mean_terminated_length": 121.8203125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.2116845690332722, "frac_reward_zero_std": 0.0, "grad_norm": 0.43263980746269226, "kl": 3.10546875, "learning_rate": 5e-05, "loss": 0.0313, "num_tokens": 15157050.0, "reward": 5.773151397705078, "reward_std": 0.37341994047164917, "rewards/helpfulness_reward/mean": 5.773151397705078, "rewards/helpfulness_reward/std": 1.607913613319397, "rewards/safety_reward/mean": 7.5699462890625, "rewards/safety_reward/std": 1.235331654548645, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 122.875, "completions/mean_terminated_length": 122.875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.21203388350362415, "frac_reward_zero_std": 0.0, "grad_norm": 0.42943862080574036, "kl": 3.29296875, "learning_rate": 5e-05, "loss": 0.0382, "num_tokens": 15176658.0, "reward": 6.093505859375, "reward_std": 0.35356605052948, "rewards/helpfulness_reward/mean": 6.093505859375, "rewards/helpfulness_reward/std": 0.5371516346931458, "rewards/safety_reward/mean": 7.750732421875, "rewards/safety_reward/std": 0.5904378890991211, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 120.1015625, "completions/mean_terminated_length": 120.1015625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.21238319797397606, "frac_reward_zero_std": 0.0, "grad_norm": 0.51362144947052, "kl": 3.205078125, "learning_rate": 5e-05, "loss": 0.0103, "num_tokens": 15196967.0, "reward": 5.630195617675781, "reward_std": 0.44920802116394043, "rewards/helpfulness_reward/mean": 5.630195617675781, "rewards/helpfulness_reward/std": 1.605789065361023, "rewards/safety_reward/mean": 7.3353271484375, "rewards/safety_reward/std": 1.9328359365463257, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 122.3671875, "completions/mean_terminated_length": 122.3671875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.212732512444328, "frac_reward_zero_std": 0.0, "grad_norm": 0.3990735709667206, "kl": 3.130859375, "learning_rate": 5e-05, "loss": 0.0316, "num_tokens": 15216726.0, "reward": 6.115966796875, "reward_std": 0.2708530128002167, "rewards/helpfulness_reward/mean": 6.115966796875, "rewards/helpfulness_reward/std": 0.43648210167884827, "rewards/safety_reward/mean": 7.79345703125, "rewards/safety_reward/std": 0.3661234974861145, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 123.109375, "completions/mean_terminated_length": 123.109375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.21308182691467994, "frac_reward_zero_std": 0.0, "grad_norm": 0.4218122065067291, "kl": 2.923828125, "learning_rate": 5e-05, "loss": 0.0327, "num_tokens": 15235892.0, "reward": 6.24951171875, "reward_std": 0.2822079062461853, "rewards/helpfulness_reward/mean": 6.24951171875, "rewards/helpfulness_reward/std": 0.4725320637226105, "rewards/safety_reward/mean": 7.805908203125, "rewards/safety_reward/std": 0.6779677867889404, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 122.390625, "completions/mean_terminated_length": 122.390625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.2134311413850319, "frac_reward_zero_std": 0.0, "grad_norm": 0.3728272020816803, "kl": 3.208984375, "learning_rate": 5e-05, "loss": 0.0326, "num_tokens": 15255142.0, "reward": 6.275146484375, "reward_std": 0.340798944234848, "rewards/helpfulness_reward/mean": 6.275146484375, "rewards/helpfulness_reward/std": 0.5521112084388733, "rewards/safety_reward/mean": 7.953857421875, "rewards/safety_reward/std": 0.5084177851676941, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 131.4921875, "completions/mean_terminated_length": 131.4921875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.2137804558553838, "frac_reward_zero_std": 0.0, "grad_norm": 0.3822488784790039, "kl": 2.982421875, "learning_rate": 5e-05, "loss": 0.0208, "num_tokens": 15278165.0, "reward": 5.546905517578125, "reward_std": 0.40317922830581665, "rewards/helpfulness_reward/mean": 5.546905517578125, "rewards/helpfulness_reward/std": 1.472669005393982, "rewards/safety_reward/mean": 7.35198974609375, "rewards/safety_reward/std": 2.286212921142578, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 115.0078125, "completions/mean_terminated_length": 115.0078125, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.21412977032573574, "frac_reward_zero_std": 0.0, "grad_norm": 0.34808629751205444, "kl": 2.984375, "learning_rate": 5e-05, "loss": 0.0312, "num_tokens": 15300902.0, "reward": 5.91455078125, "reward_std": 0.2882629334926605, "rewards/helpfulness_reward/mean": 5.91455078125, "rewards/helpfulness_reward/std": 1.1704398393630981, "rewards/safety_reward/mean": 7.69775390625, "rewards/safety_reward/std": 1.499848484992981, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 121.65625, "completions/mean_terminated_length": 121.65625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.21447908479608768, "frac_reward_zero_std": 0.0, "grad_norm": 0.5642153024673462, "kl": 3.376953125, "learning_rate": 5e-05, "loss": 0.0253, "num_tokens": 15320338.0, "reward": 6.166259765625, "reward_std": 0.29613780975341797, "rewards/helpfulness_reward/mean": 6.166259765625, "rewards/helpfulness_reward/std": 0.5077211260795593, "rewards/safety_reward/mean": 7.822998046875, "rewards/safety_reward/std": 0.4973984658718109, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 121.75, "completions/mean_terminated_length": 121.75, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.21482839926643962, "frac_reward_zero_std": 0.0, "grad_norm": 0.3847460448741913, "kl": 3.24609375, "learning_rate": 5e-05, "loss": 0.0358, "num_tokens": 15340666.0, "reward": 6.30078125, "reward_std": 0.24370978772640228, "rewards/helpfulness_reward/mean": 6.30078125, "rewards/helpfulness_reward/std": 0.4995845854282379, "rewards/safety_reward/mean": 7.86865234375, "rewards/safety_reward/std": 0.49895867705345154, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 121.5078125, "completions/mean_terminated_length": 121.5078125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.21517771373679154, "frac_reward_zero_std": 0.0, "grad_norm": 0.3438880145549774, "kl": 3.134765625, "learning_rate": 5e-05, "loss": 0.0288, "num_tokens": 15360155.0, "reward": 6.281005859375, "reward_std": 0.23847195506095886, "rewards/helpfulness_reward/mean": 6.281005859375, "rewards/helpfulness_reward/std": 0.4451402723789215, "rewards/safety_reward/mean": 8.104248046875, "rewards/safety_reward/std": 0.41895592212677, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 121.453125, "completions/mean_terminated_length": 121.453125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.21552702820714348, "frac_reward_zero_std": 0.0, "grad_norm": 0.5125246047973633, "kl": 3.298828125, "learning_rate": 5e-05, "loss": 0.0304, "num_tokens": 15380997.0, "reward": 6.152587890625, "reward_std": 0.2230842560529709, "rewards/helpfulness_reward/mean": 6.152587890625, "rewards/helpfulness_reward/std": 0.4696696996688843, "rewards/safety_reward/mean": 7.9365234375, "rewards/safety_reward/std": 0.49210888147354126, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 121.5546875, "completions/mean_terminated_length": 121.5546875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.21587634267749542, "frac_reward_zero_std": 0.0, "grad_norm": 0.4506770372390747, "kl": 3.37109375, "learning_rate": 5e-05, "loss": 0.0363, "num_tokens": 15401252.0, "reward": 6.28125, "reward_std": 0.26476019620895386, "rewards/helpfulness_reward/mean": 6.28125, "rewards/helpfulness_reward/std": 0.6550361514091492, "rewards/safety_reward/mean": 8.1162109375, "rewards/safety_reward/std": 0.5226173996925354, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 121.640625, "completions/mean_terminated_length": 121.640625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.21622565714784736, "frac_reward_zero_std": 0.0, "grad_norm": 0.4438520669937134, "kl": 3.21484375, "learning_rate": 5e-05, "loss": 0.0407, "num_tokens": 15420622.0, "reward": 6.4013671875, "reward_std": 0.35561996698379517, "rewards/helpfulness_reward/mean": 6.4013671875, "rewards/helpfulness_reward/std": 0.5219327807426453, "rewards/safety_reward/mean": 8.00732421875, "rewards/safety_reward/std": 0.6950139999389648, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 121.1796875, "completions/mean_terminated_length": 121.1796875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.21657497161819927, "frac_reward_zero_std": 0.0, "grad_norm": 0.35497134923934937, "kl": 3.140625, "learning_rate": 5e-05, "loss": 0.0268, "num_tokens": 15441805.0, "reward": 5.9971923828125, "reward_std": 0.29930514097213745, "rewards/helpfulness_reward/mean": 5.9971923828125, "rewards/helpfulness_reward/std": 0.6405557990074158, "rewards/safety_reward/mean": 7.800048828125, "rewards/safety_reward/std": 0.660021960735321, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 121.3203125, "completions/mean_terminated_length": 121.3203125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.21692428608855122, "frac_reward_zero_std": 0.0, "grad_norm": 0.48554113507270813, "kl": 3.208984375, "learning_rate": 5e-05, "loss": 0.0482, "num_tokens": 15462126.0, "reward": 6.2685546875, "reward_std": 0.26218539476394653, "rewards/helpfulness_reward/mean": 6.2685546875, "rewards/helpfulness_reward/std": 0.5755196809768677, "rewards/safety_reward/mean": 8.04296875, "rewards/safety_reward/std": 0.4993228614330292, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 120.203125, "completions/mean_terminated_length": 120.203125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.21727360055890316, "frac_reward_zero_std": 0.0, "grad_norm": 0.43801403045654297, "kl": 3.19921875, "learning_rate": 5e-05, "loss": 0.0355, "num_tokens": 15481184.0, "reward": 6.3519287109375, "reward_std": 0.33131837844848633, "rewards/helpfulness_reward/mean": 6.3519287109375, "rewards/helpfulness_reward/std": 0.6893846988677979, "rewards/safety_reward/mean": 8.160400390625, "rewards/safety_reward/std": 0.7274155616760254, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 119.5390625, "completions/mean_terminated_length": 119.5390625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.2176229150292551, "frac_reward_zero_std": 0.0, "grad_norm": 0.39591360092163086, "kl": 3.322265625, "learning_rate": 5e-05, "loss": 0.0337, "num_tokens": 15500565.0, "reward": 6.208740234375, "reward_std": 0.2681586742401123, "rewards/helpfulness_reward/mean": 6.208740234375, "rewards/helpfulness_reward/std": 0.468822717666626, "rewards/safety_reward/mean": 8.066650390625, "rewards/safety_reward/std": 0.5849018096923828, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 115.078125, "completions/mean_terminated_length": 115.078125, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.217972229499607, "frac_reward_zero_std": 0.0, "grad_norm": 0.3894875645637512, "kl": 3.298828125, "learning_rate": 5e-05, "loss": 0.0167, "num_tokens": 15520287.0, "reward": 6.0015082359313965, "reward_std": 0.34023961424827576, "rewards/helpfulness_reward/mean": 6.0015082359313965, "rewards/helpfulness_reward/std": 1.176798939704895, "rewards/safety_reward/mean": 7.665313720703125, "rewards/safety_reward/std": 1.205680012702942, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 117.109375, "completions/mean_terminated_length": 117.109375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.21832154396995895, "frac_reward_zero_std": 0.0, "grad_norm": 0.44892749190330505, "kl": 3.189453125, "learning_rate": 5e-05, "loss": 0.0247, "num_tokens": 15539517.0, "reward": 6.382080078125, "reward_std": 0.2913985848426819, "rewards/helpfulness_reward/mean": 6.382080078125, "rewards/helpfulness_reward/std": 0.5441027879714966, "rewards/safety_reward/mean": 8.14013671875, "rewards/safety_reward/std": 0.5754067301750183, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 117.453125, "completions/mean_terminated_length": 117.453125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.2186708584403109, "frac_reward_zero_std": 0.0, "grad_norm": 0.4197944104671478, "kl": 3.23046875, "learning_rate": 5e-05, "loss": 0.0181, "num_tokens": 15560047.0, "reward": 6.37109375, "reward_std": 0.22553718090057373, "rewards/helpfulness_reward/mean": 6.37109375, "rewards/helpfulness_reward/std": 0.513519823551178, "rewards/safety_reward/mean": 8.037353515625, "rewards/safety_reward/std": 0.5199640393257141, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 116.8828125, "completions/mean_terminated_length": 116.8828125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.21902017291066284, "frac_reward_zero_std": 0.0, "grad_norm": 0.44599905610084534, "kl": 3.140625, "learning_rate": 5e-05, "loss": 0.022, "num_tokens": 15579344.0, "reward": 6.27783203125, "reward_std": 0.23917387425899506, "rewards/helpfulness_reward/mean": 6.27783203125, "rewards/helpfulness_reward/std": 0.5033450126647949, "rewards/safety_reward/mean": 7.945068359375, "rewards/safety_reward/std": 0.7330006957054138, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 117.5546875, "completions/mean_terminated_length": 117.5546875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.21936948738101475, "frac_reward_zero_std": 0.0, "grad_norm": 0.37930288910865784, "kl": 3.142578125, "learning_rate": 5e-05, "loss": 0.0254, "num_tokens": 15599951.0, "reward": 6.31494140625, "reward_std": 0.20359288156032562, "rewards/helpfulness_reward/mean": 6.31494140625, "rewards/helpfulness_reward/std": 0.38666266202926636, "rewards/safety_reward/mean": 7.94580078125, "rewards/safety_reward/std": 0.43390828371047974, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 116.4609375, "completions/mean_terminated_length": 116.4609375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.2197188018513667, "frac_reward_zero_std": 0.0, "grad_norm": 0.6429547071456909, "kl": 3.673828125, "learning_rate": 5e-05, "loss": 0.0245, "num_tokens": 15618994.0, "reward": 6.2818603515625, "reward_std": 0.266573429107666, "rewards/helpfulness_reward/mean": 6.2818603515625, "rewards/helpfulness_reward/std": 0.49247848987579346, "rewards/safety_reward/mean": 7.978271484375, "rewards/safety_reward/std": 0.5155457258224487, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 116.5, "completions/mean_terminated_length": 116.5, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.22006811632171863, "frac_reward_zero_std": 0.0, "grad_norm": 0.3675682842731476, "kl": 3.375, "learning_rate": 5e-05, "loss": 0.0302, "num_tokens": 15637930.0, "reward": 6.43896484375, "reward_std": 0.2016790807247162, "rewards/helpfulness_reward/mean": 6.43896484375, "rewards/helpfulness_reward/std": 0.55086749792099, "rewards/safety_reward/mean": 8.13720703125, "rewards/safety_reward/std": 0.49589595198631287, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 116.9375, "completions/mean_terminated_length": 116.9375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.22041743079207057, "frac_reward_zero_std": 0.0, "grad_norm": 0.38953056931495667, "kl": 3.408203125, "learning_rate": 5e-05, "loss": 0.0292, "num_tokens": 15656674.0, "reward": 6.318603515625, "reward_std": 0.18113230168819427, "rewards/helpfulness_reward/mean": 6.318603515625, "rewards/helpfulness_reward/std": 0.5990434288978577, "rewards/safety_reward/mean": 8.1767578125, "rewards/safety_reward/std": 0.5172563195228577, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 116.6640625, "completions/mean_terminated_length": 116.6640625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.2207667452624225, "frac_reward_zero_std": 0.0, "grad_norm": 0.4030478298664093, "kl": 3.5078125, "learning_rate": 5e-05, "loss": 0.0281, "num_tokens": 15675215.0, "reward": 6.3447265625, "reward_std": 0.2755507230758667, "rewards/helpfulness_reward/mean": 6.3447265625, "rewards/helpfulness_reward/std": 0.4621989130973816, "rewards/safety_reward/mean": 8.013916015625, "rewards/safety_reward/std": 0.48150795698165894, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 111.21875, "completions/mean_terminated_length": 111.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.22111605973277443, "frac_reward_zero_std": 0.0, "grad_norm": 0.47698283195495605, "kl": 3.34765625, "learning_rate": 5e-05, "loss": 0.0061, "num_tokens": 15695611.0, "reward": 6.04949951171875, "reward_std": 0.3724903464317322, "rewards/helpfulness_reward/mean": 6.04949951171875, "rewards/helpfulness_reward/std": 1.196193814277649, "rewards/safety_reward/mean": 7.7667236328125, "rewards/safety_reward/std": 1.369847059249878, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 117.5234375, "completions/mean_terminated_length": 117.5234375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.22146537420312637, "frac_reward_zero_std": 0.0, "grad_norm": 0.3776949346065521, "kl": 3.279296875, "learning_rate": 5e-05, "loss": 0.0322, "num_tokens": 15715798.0, "reward": 6.356689453125, "reward_std": 0.23028096556663513, "rewards/helpfulness_reward/mean": 6.356689453125, "rewards/helpfulness_reward/std": 0.4232153296470642, "rewards/safety_reward/mean": 8.13330078125, "rewards/safety_reward/std": 0.41938164830207825, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 116.6015625, "completions/mean_terminated_length": 116.6015625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.2218146886734783, "frac_reward_zero_std": 0.0, "grad_norm": 0.4502240717411041, "kl": 3.453125, "learning_rate": 5e-05, "loss": 0.0275, "num_tokens": 15735611.0, "reward": 6.182373046875, "reward_std": 0.25974157452583313, "rewards/helpfulness_reward/mean": 6.182373046875, "rewards/helpfulness_reward/std": 0.617185115814209, "rewards/safety_reward/mean": 8.04296875, "rewards/safety_reward/std": 0.48253196477890015, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 117.9453125, "completions/mean_terminated_length": 117.9453125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.22216400314383022, "frac_reward_zero_std": 0.0, "grad_norm": 0.334327757358551, "kl": 3.322265625, "learning_rate": 5e-05, "loss": 0.0311, "num_tokens": 15754364.0, "reward": 6.3450927734375, "reward_std": 0.27647146582603455, "rewards/helpfulness_reward/mean": 6.3450927734375, "rewards/helpfulness_reward/std": 0.6918983459472656, "rewards/safety_reward/mean": 8.1171875, "rewards/safety_reward/std": 0.6526487469673157, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 118.3671875, "completions/mean_terminated_length": 118.3671875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.22251331761418217, "frac_reward_zero_std": 0.0, "grad_norm": 1.055790662765503, "kl": 3.740234375, "learning_rate": 5e-05, "loss": 0.0301, "num_tokens": 15773579.0, "reward": 6.3759765625, "reward_std": 0.2596825063228607, "rewards/helpfulness_reward/mean": 6.3759765625, "rewards/helpfulness_reward/std": 0.5589610934257507, "rewards/safety_reward/mean": 7.990966796875, "rewards/safety_reward/std": 0.6265378594398499, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 120.5546875, "completions/mean_terminated_length": 120.5546875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.2228626320845341, "frac_reward_zero_std": 0.0, "grad_norm": 0.396619975566864, "kl": 3.115234375, "learning_rate": 5e-05, "loss": 0.0349, "num_tokens": 15794274.0, "reward": 6.411376953125, "reward_std": 0.21974094212055206, "rewards/helpfulness_reward/mean": 6.411376953125, "rewards/helpfulness_reward/std": 0.4701468348503113, "rewards/safety_reward/mean": 8.202880859375, "rewards/safety_reward/std": 0.4419938623905182, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 118.78125, "completions/mean_terminated_length": 118.78125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.22321194655488605, "frac_reward_zero_std": 0.0, "grad_norm": 0.34840983152389526, "kl": 3.2109375, "learning_rate": 5e-05, "loss": 0.0252, "num_tokens": 15814750.0, "reward": 6.3526611328125, "reward_std": 0.2538660168647766, "rewards/helpfulness_reward/mean": 6.3526611328125, "rewards/helpfulness_reward/std": 0.5893879532814026, "rewards/safety_reward/mean": 8.022705078125, "rewards/safety_reward/std": 0.5749486088752747, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 120.3125, "completions/mean_terminated_length": 120.3125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.22356126102523796, "frac_reward_zero_std": 0.0, "grad_norm": 0.4096473157405853, "kl": 3.36328125, "learning_rate": 5e-05, "loss": 0.038, "num_tokens": 15834574.0, "reward": 6.330810546875, "reward_std": 0.2524451017379761, "rewards/helpfulness_reward/mean": 6.330810546875, "rewards/helpfulness_reward/std": 0.6250802874565125, "rewards/safety_reward/mean": 7.98779296875, "rewards/safety_reward/std": 0.6168543100357056, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 120.9765625, "completions/mean_terminated_length": 120.9765625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.2239105754955899, "frac_reward_zero_std": 0.0, "grad_norm": 0.3983217477798462, "kl": 3.265625, "learning_rate": 5e-05, "loss": 0.0323, "num_tokens": 15854027.0, "reward": 6.486328125, "reward_std": 0.24596086144447327, "rewards/helpfulness_reward/mean": 6.486328125, "rewards/helpfulness_reward/std": 0.5137556195259094, "rewards/safety_reward/mean": 8.135986328125, "rewards/safety_reward/std": 0.44036585092544556, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 120.8125, "completions/mean_terminated_length": 120.8125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.22425988996594184, "frac_reward_zero_std": 0.0, "grad_norm": 0.3584713935852051, "kl": 3.34765625, "learning_rate": 5e-05, "loss": 0.0393, "num_tokens": 15874451.0, "reward": 6.1851806640625, "reward_std": 0.25840356945991516, "rewards/helpfulness_reward/mean": 6.1851806640625, "rewards/helpfulness_reward/std": 0.6284815669059753, "rewards/safety_reward/mean": 7.758056640625, "rewards/safety_reward/std": 0.659729540348053, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 121.0078125, "completions/mean_terminated_length": 121.0078125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.22460920443629379, "frac_reward_zero_std": 0.0, "grad_norm": 0.3767482340335846, "kl": 3.29296875, "learning_rate": 5e-05, "loss": 0.028, "num_tokens": 15894004.0, "reward": 6.303955078125, "reward_std": 0.31169912219047546, "rewards/helpfulness_reward/mean": 6.303955078125, "rewards/helpfulness_reward/std": 0.5911830067634583, "rewards/safety_reward/mean": 8.1328125, "rewards/safety_reward/std": 0.564505934715271, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 121.9375, "completions/mean_terminated_length": 121.9375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.2249585189066457, "frac_reward_zero_std": 0.0, "grad_norm": 0.30190354585647583, "kl": 3.37890625, "learning_rate": 5e-05, "loss": 0.0321, "num_tokens": 15914292.0, "reward": 6.20849609375, "reward_std": 0.2380363941192627, "rewards/helpfulness_reward/mean": 6.20849609375, "rewards/helpfulness_reward/std": 0.40576910972595215, "rewards/safety_reward/mean": 8.19677734375, "rewards/safety_reward/std": 0.5519343018531799, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 122.265625, "completions/mean_terminated_length": 122.265625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.22530783337699764, "frac_reward_zero_std": 0.0, "grad_norm": 0.4230567514896393, "kl": 3.263671875, "learning_rate": 5e-05, "loss": 0.0303, "num_tokens": 15933838.0, "reward": 6.445068359375, "reward_std": 0.29249563813209534, "rewards/helpfulness_reward/mean": 6.445068359375, "rewards/helpfulness_reward/std": 0.5452256202697754, "rewards/safety_reward/mean": 8.030029296875, "rewards/safety_reward/std": 0.5240054726600647, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 120.96875, "completions/mean_terminated_length": 120.96875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.22565714784734958, "frac_reward_zero_std": 0.0, "grad_norm": 0.42950719594955444, "kl": 3.126953125, "learning_rate": 5e-05, "loss": 0.0361, "num_tokens": 15953754.0, "reward": 6.44580078125, "reward_std": 0.23225875198841095, "rewards/helpfulness_reward/mean": 6.44580078125, "rewards/helpfulness_reward/std": 0.44015443325042725, "rewards/safety_reward/mean": 8.21435546875, "rewards/safety_reward/std": 0.48747435212135315, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 120.640625, "completions/mean_terminated_length": 120.640625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.22600646231770152, "frac_reward_zero_std": 0.0, "grad_norm": 0.3028284013271332, "kl": 3.240234375, "learning_rate": 5e-05, "loss": 0.0316, "num_tokens": 15973500.0, "reward": 6.514404296875, "reward_std": 0.22717413306236267, "rewards/helpfulness_reward/mean": 6.514404296875, "rewards/helpfulness_reward/std": 0.5553308129310608, "rewards/safety_reward/mean": 8.1884765625, "rewards/safety_reward/std": 0.4620325267314911, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 120.8515625, "completions/mean_terminated_length": 120.8515625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.22635577678805344, "frac_reward_zero_std": 0.0, "grad_norm": 0.39228320121765137, "kl": 3.1875, "learning_rate": 5e-05, "loss": 0.0282, "num_tokens": 15995041.0, "reward": 6.219482421875, "reward_std": 0.22848501801490784, "rewards/helpfulness_reward/mean": 6.219482421875, "rewards/helpfulness_reward/std": 0.6790309548377991, "rewards/safety_reward/mean": 7.922607421875, "rewards/safety_reward/std": 0.5296331644058228, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 120.859375, "completions/mean_terminated_length": 120.859375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.22670509125840538, "frac_reward_zero_std": 0.0, "grad_norm": 0.49615976214408875, "kl": 3.4609375, "learning_rate": 5e-05, "loss": 0.0327, "num_tokens": 16014095.0, "reward": 6.529296875, "reward_std": 0.23784174025058746, "rewards/helpfulness_reward/mean": 6.529296875, "rewards/helpfulness_reward/std": 0.4895837604999542, "rewards/safety_reward/mean": 8.275390625, "rewards/safety_reward/std": 0.5363243222236633, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 121.2421875, "completions/mean_terminated_length": 121.2421875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.22705440572875732, "frac_reward_zero_std": 0.0, "grad_norm": 0.3843204379081726, "kl": 3.3671875, "learning_rate": 5e-05, "loss": 0.036, "num_tokens": 16034566.0, "reward": 6.312744140625, "reward_std": 0.3270202875137329, "rewards/helpfulness_reward/mean": 6.312744140625, "rewards/helpfulness_reward/std": 0.6124539971351624, "rewards/safety_reward/mean": 8.013916015625, "rewards/safety_reward/std": 0.5971245169639587, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 119.9921875, "completions/mean_terminated_length": 119.9921875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.22740372019910926, "frac_reward_zero_std": 0.0, "grad_norm": 0.4105948209762573, "kl": 3.259765625, "learning_rate": 5e-05, "loss": 0.0224, "num_tokens": 16055661.0, "reward": 6.5335693359375, "reward_std": 0.27834269404411316, "rewards/helpfulness_reward/mean": 6.5335693359375, "rewards/helpfulness_reward/std": 0.6398671269416809, "rewards/safety_reward/mean": 8.017578125, "rewards/safety_reward/std": 0.6630256772041321, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 120.53125, "completions/mean_terminated_length": 120.53125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.22775303466946117, "frac_reward_zero_std": 0.0, "grad_norm": 0.41737234592437744, "kl": 3.505859375, "learning_rate": 5e-05, "loss": 0.0355, "num_tokens": 16075713.0, "reward": 6.518798828125, "reward_std": 0.2063661813735962, "rewards/helpfulness_reward/mean": 6.518798828125, "rewards/helpfulness_reward/std": 0.576772928237915, "rewards/safety_reward/mean": 8.30126953125, "rewards/safety_reward/std": 0.5272725224494934, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 120.609375, "completions/mean_terminated_length": 120.609375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.22810234913981312, "frac_reward_zero_std": 0.0, "grad_norm": 0.3672561049461365, "kl": 3.470703125, "learning_rate": 5e-05, "loss": 0.0297, "num_tokens": 16095055.0, "reward": 6.4814453125, "reward_std": 0.2720222771167755, "rewards/helpfulness_reward/mean": 6.4814453125, "rewards/helpfulness_reward/std": 0.4379819631576538, "rewards/safety_reward/mean": 8.015869140625, "rewards/safety_reward/std": 0.44341814517974854, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 121.0390625, "completions/mean_terminated_length": 121.0390625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.22845166361016506, "frac_reward_zero_std": 0.0, "grad_norm": 0.42960453033447266, "kl": 3.3671875, "learning_rate": 5e-05, "loss": 0.0325, "num_tokens": 16114300.0, "reward": 6.355712890625, "reward_std": 0.20529377460479736, "rewards/helpfulness_reward/mean": 6.355712890625, "rewards/helpfulness_reward/std": 0.3679856061935425, "rewards/safety_reward/mean": 8.02490234375, "rewards/safety_reward/std": 0.5855192542076111, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 121.1953125, "completions/mean_terminated_length": 121.1953125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.228800978080517, "frac_reward_zero_std": 0.0, "grad_norm": 0.32849088311195374, "kl": 3.28515625, "learning_rate": 5e-05, "loss": 0.0359, "num_tokens": 16133373.0, "reward": 6.6494140625, "reward_std": 0.2002326101064682, "rewards/helpfulness_reward/mean": 6.6494140625, "rewards/helpfulness_reward/std": 0.44607970118522644, "rewards/safety_reward/mean": 8.195556640625, "rewards/safety_reward/std": 0.458814412355423, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 120.2734375, "completions/mean_terminated_length": 120.2734375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.2291502925508689, "frac_reward_zero_std": 0.0, "grad_norm": 0.3399338126182556, "kl": 3.365234375, "learning_rate": 5e-05, "loss": 0.0265, "num_tokens": 16153544.0, "reward": 6.348876953125, "reward_std": 0.19860443472862244, "rewards/helpfulness_reward/mean": 6.348876953125, "rewards/helpfulness_reward/std": 0.4853091835975647, "rewards/safety_reward/mean": 8.1044921875, "rewards/safety_reward/std": 0.42068058252334595, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 120.703125, "completions/mean_terminated_length": 120.703125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.22949960702122085, "frac_reward_zero_std": 0.0, "grad_norm": 0.3181075155735016, "kl": 3.259765625, "learning_rate": 5e-05, "loss": 0.0282, "num_tokens": 16172378.0, "reward": 6.44140625, "reward_std": 0.2138429433107376, "rewards/helpfulness_reward/mean": 6.44140625, "rewards/helpfulness_reward/std": 0.4841387867927551, "rewards/safety_reward/mean": 7.953125, "rewards/safety_reward/std": 0.6518235206604004, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 119.7421875, "completions/mean_terminated_length": 119.7421875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.2298489214915728, "frac_reward_zero_std": 0.0, "grad_norm": 0.4499879777431488, "kl": 3.505859375, "learning_rate": 5e-05, "loss": 0.0285, "num_tokens": 16191745.0, "reward": 6.438720703125, "reward_std": 0.21684005856513977, "rewards/helpfulness_reward/mean": 6.438720703125, "rewards/helpfulness_reward/std": 0.4418965280056, "rewards/safety_reward/mean": 7.978271484375, "rewards/safety_reward/std": 0.4330759048461914, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 119.6953125, "completions/mean_terminated_length": 119.6953125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.23019823596192474, "frac_reward_zero_std": 0.0, "grad_norm": 0.3529299795627594, "kl": 3.275390625, "learning_rate": 5e-05, "loss": 0.0267, "num_tokens": 16211106.0, "reward": 6.5224609375, "reward_std": 0.19469481706619263, "rewards/helpfulness_reward/mean": 6.5224609375, "rewards/helpfulness_reward/std": 0.3772679269313812, "rewards/safety_reward/mean": 8.0556640625, "rewards/safety_reward/std": 0.4698549509048462, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 120.015625, "completions/mean_terminated_length": 120.015625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.23054755043227665, "frac_reward_zero_std": 0.0, "grad_norm": 0.3493387997150421, "kl": 3.41796875, "learning_rate": 5e-05, "loss": 0.0287, "num_tokens": 16230876.0, "reward": 6.65576171875, "reward_std": 0.1792699098587036, "rewards/helpfulness_reward/mean": 6.65576171875, "rewards/helpfulness_reward/std": 0.44138434529304504, "rewards/safety_reward/mean": 8.259765625, "rewards/safety_reward/std": 0.4352753758430481, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 120.4453125, "completions/mean_terminated_length": 120.4453125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.2308968649026286, "frac_reward_zero_std": 0.0, "grad_norm": 0.3263210356235504, "kl": 3.376953125, "learning_rate": 5e-05, "loss": 0.027, "num_tokens": 16250093.0, "reward": 6.371337890625, "reward_std": 0.22170525789260864, "rewards/helpfulness_reward/mean": 6.371337890625, "rewards/helpfulness_reward/std": 0.7798029780387878, "rewards/safety_reward/mean": 7.93701171875, "rewards/safety_reward/std": 0.5345181226730347, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 120.578125, "completions/mean_terminated_length": 120.578125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.23124617937298053, "frac_reward_zero_std": 0.0, "grad_norm": 0.3115438222885132, "kl": 3.40234375, "learning_rate": 5e-05, "loss": 0.0314, "num_tokens": 16270455.0, "reward": 6.49658203125, "reward_std": 0.18059110641479492, "rewards/helpfulness_reward/mean": 6.49658203125, "rewards/helpfulness_reward/std": 0.49499577283859253, "rewards/safety_reward/mean": 8.22412109375, "rewards/safety_reward/std": 0.3581368625164032, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 121.1640625, "completions/mean_terminated_length": 121.1640625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.23159549384333247, "frac_reward_zero_std": 0.0, "grad_norm": 0.7314559817314148, "kl": 3.59765625, "learning_rate": 5e-05, "loss": 0.0315, "num_tokens": 16289684.0, "reward": 6.639404296875, "reward_std": 0.1873043030500412, "rewards/helpfulness_reward/mean": 6.639404296875, "rewards/helpfulness_reward/std": 0.5477885603904724, "rewards/safety_reward/mean": 8.214599609375, "rewards/safety_reward/std": 0.5297116041183472, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 121.53125, "completions/mean_terminated_length": 121.53125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.2319448083136844, "frac_reward_zero_std": 0.0, "grad_norm": 0.28068843483924866, "kl": 3.39453125, "learning_rate": 5e-05, "loss": 0.0319, "num_tokens": 16309280.0, "reward": 6.490478515625, "reward_std": 0.17507407069206238, "rewards/helpfulness_reward/mean": 6.490478515625, "rewards/helpfulness_reward/std": 0.6751936078071594, "rewards/safety_reward/mean": 7.845703125, "rewards/safety_reward/std": 1.1417124271392822, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 120.921875, "completions/mean_terminated_length": 120.921875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.23229412278403633, "frac_reward_zero_std": 0.0, "grad_norm": 0.30809393525123596, "kl": 3.33984375, "learning_rate": 5e-05, "loss": 0.0315, "num_tokens": 16330670.0, "reward": 6.2484130859375, "reward_std": 0.19134265184402466, "rewards/helpfulness_reward/mean": 6.2484130859375, "rewards/helpfulness_reward/std": 0.9433153867721558, "rewards/safety_reward/mean": 8.013427734375, "rewards/safety_reward/std": 0.7439747452735901, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 122.59375, "completions/mean_terminated_length": 122.59375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.23264343725438827, "frac_reward_zero_std": 0.0, "grad_norm": 0.943487286567688, "kl": 3.47265625, "learning_rate": 5e-05, "loss": 0.0222, "num_tokens": 16352586.0, "reward": 5.9297943115234375, "reward_std": 0.21669162809848785, "rewards/helpfulness_reward/mean": 5.9297943115234375, "rewards/helpfulness_reward/std": 2.0143373012542725, "rewards/safety_reward/mean": 7.630126953125, "rewards/safety_reward/std": 2.179171323776245, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 120.2421875, "completions/mean_terminated_length": 120.2421875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.2329927517247402, "frac_reward_zero_std": 0.0, "grad_norm": 0.3301786780357361, "kl": 3.349609375, "learning_rate": 5e-05, "loss": 0.0258, "num_tokens": 16372137.0, "reward": 6.71533203125, "reward_std": 0.21258218586444855, "rewards/helpfulness_reward/mean": 6.71533203125, "rewards/helpfulness_reward/std": 0.6246336698532104, "rewards/safety_reward/mean": 8.268798828125, "rewards/safety_reward/std": 0.5535616874694824, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 121.4609375, "completions/mean_terminated_length": 121.4609375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.23334206619509212, "frac_reward_zero_std": 0.0, "grad_norm": 0.5151230096817017, "kl": 3.662109375, "learning_rate": 5e-05, "loss": 0.0402, "num_tokens": 16392076.0, "reward": 6.498291015625, "reward_std": 0.219859316945076, "rewards/helpfulness_reward/mean": 6.498291015625, "rewards/helpfulness_reward/std": 0.6356052160263062, "rewards/safety_reward/mean": 8.21875, "rewards/safety_reward/std": 0.5071462988853455, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 121.5390625, "completions/mean_terminated_length": 121.5390625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.23369138066544407, "frac_reward_zero_std": 0.0, "grad_norm": 0.392708420753479, "kl": 3.533203125, "learning_rate": 5e-05, "loss": 0.0345, "num_tokens": 16411969.0, "reward": 6.6171875, "reward_std": 0.20838752388954163, "rewards/helpfulness_reward/mean": 6.6171875, "rewards/helpfulness_reward/std": 0.5501291751861572, "rewards/safety_reward/mean": 8.103271484375, "rewards/safety_reward/std": 0.6297745108604431, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 121.9765625, "completions/mean_terminated_length": 121.9765625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.234040695135796, "frac_reward_zero_std": 0.0, "grad_norm": 0.4247790575027466, "kl": 3.296875, "learning_rate": 5e-05, "loss": 0.0262, "num_tokens": 16432502.0, "reward": 6.626708984375, "reward_std": 0.1826355904340744, "rewards/helpfulness_reward/mean": 6.626708984375, "rewards/helpfulness_reward/std": 0.6246961355209351, "rewards/safety_reward/mean": 8.29296875, "rewards/safety_reward/std": 0.5252161622047424, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 122.359375, "completions/mean_terminated_length": 122.359375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.23439000960614795, "frac_reward_zero_std": 0.0, "grad_norm": 0.35473790764808655, "kl": 3.40234375, "learning_rate": 5e-05, "loss": 0.0322, "num_tokens": 16452436.0, "reward": 6.505859375, "reward_std": 0.20820550620555878, "rewards/helpfulness_reward/mean": 6.505859375, "rewards/helpfulness_reward/std": 0.46877050399780273, "rewards/safety_reward/mean": 8.126220703125, "rewards/safety_reward/std": 0.5001599788665771, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 121.703125, "completions/mean_terminated_length": 121.703125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.23473932407649986, "frac_reward_zero_std": 0.0, "grad_norm": 0.3428294360637665, "kl": 3.546875, "learning_rate": 5e-05, "loss": 0.0303, "num_tokens": 16472350.0, "reward": 6.72607421875, "reward_std": 0.19332027435302734, "rewards/helpfulness_reward/mean": 6.72607421875, "rewards/helpfulness_reward/std": 0.4465715289115906, "rewards/safety_reward/mean": 8.331298828125, "rewards/safety_reward/std": 0.42177408933639526, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 122.96875, "completions/mean_terminated_length": 122.96875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.2350886385468518, "frac_reward_zero_std": 0.0, "grad_norm": 0.3943884074687958, "kl": 3.587890625, "learning_rate": 5e-05, "loss": 0.038, "num_tokens": 16491714.0, "reward": 6.370849609375, "reward_std": 0.19317607581615448, "rewards/helpfulness_reward/mean": 6.370849609375, "rewards/helpfulness_reward/std": 0.5416829586029053, "rewards/safety_reward/mean": 7.98193359375, "rewards/safety_reward/std": 0.40613847970962524, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 122.09375, "completions/mean_terminated_length": 122.09375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.23543795301720374, "frac_reward_zero_std": 0.0, "grad_norm": 0.4225866496562958, "kl": 3.369140625, "learning_rate": 5e-05, "loss": 0.0308, "num_tokens": 16511350.0, "reward": 6.633544921875, "reward_std": 0.2188122272491455, "rewards/helpfulness_reward/mean": 6.633544921875, "rewards/helpfulness_reward/std": 0.4750109314918518, "rewards/safety_reward/mean": 8.151123046875, "rewards/safety_reward/std": 0.5412343144416809, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 123.1875, "completions/mean_terminated_length": 123.1875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.23578726748755569, "frac_reward_zero_std": 0.0, "grad_norm": 0.3295573890209198, "kl": 3.501953125, "learning_rate": 5e-05, "loss": 0.0322, "num_tokens": 16530950.0, "reward": 6.49560546875, "reward_std": 0.21812857687473297, "rewards/helpfulness_reward/mean": 6.49560546875, "rewards/helpfulness_reward/std": 0.6311671137809753, "rewards/safety_reward/mean": 8.14599609375, "rewards/safety_reward/std": 0.5142098069190979, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 122.2421875, "completions/mean_terminated_length": 122.2421875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.2361365819579076, "frac_reward_zero_std": 0.0, "grad_norm": 0.38947394490242004, "kl": 3.283203125, "learning_rate": 5e-05, "loss": 0.0365, "num_tokens": 16551285.0, "reward": 6.53369140625, "reward_std": 0.26566773653030396, "rewards/helpfulness_reward/mean": 6.53369140625, "rewards/helpfulness_reward/std": 0.4758179187774658, "rewards/safety_reward/mean": 8.10205078125, "rewards/safety_reward/std": 0.5894083976745605, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 122.765625, "completions/mean_terminated_length": 122.765625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.23648589642825954, "frac_reward_zero_std": 0.0, "grad_norm": 0.3516508936882019, "kl": 3.45703125, "learning_rate": 5e-05, "loss": 0.0357, "num_tokens": 16571023.0, "reward": 6.446533203125, "reward_std": 0.17413489520549774, "rewards/helpfulness_reward/mean": 6.446533203125, "rewards/helpfulness_reward/std": 0.4810660779476166, "rewards/safety_reward/mean": 7.971923828125, "rewards/safety_reward/std": 0.6965320110321045, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 123.515625, "completions/mean_terminated_length": 123.515625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.23683521089861148, "frac_reward_zero_std": 0.0, "grad_norm": 0.33089718222618103, "kl": 3.466796875, "learning_rate": 5e-05, "loss": 0.0386, "num_tokens": 16590561.0, "reward": 6.43603515625, "reward_std": 0.22719943523406982, "rewards/helpfulness_reward/mean": 6.43603515625, "rewards/helpfulness_reward/std": 0.47575733065605164, "rewards/safety_reward/mean": 8.23388671875, "rewards/safety_reward/std": 0.45356783270835876, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 123.0859375, "completions/mean_terminated_length": 123.0859375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.23718452536896342, "frac_reward_zero_std": 0.0, "grad_norm": 0.30210328102111816, "kl": 3.212890625, "learning_rate": 5e-05, "loss": 0.0343, "num_tokens": 16611004.0, "reward": 6.40869140625, "reward_std": 0.1861041784286499, "rewards/helpfulness_reward/mean": 6.40869140625, "rewards/helpfulness_reward/std": 0.7590837478637695, "rewards/safety_reward/mean": 8.06787109375, "rewards/safety_reward/std": 0.8991252183914185, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 122.6796875, "completions/mean_terminated_length": 122.6796875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.23753383983931534, "frac_reward_zero_std": 0.0, "grad_norm": 0.4160638749599457, "kl": 3.431640625, "learning_rate": 5e-05, "loss": 0.0323, "num_tokens": 16630915.0, "reward": 6.746826171875, "reward_std": 0.22188617289066315, "rewards/helpfulness_reward/mean": 6.746826171875, "rewards/helpfulness_reward/std": 0.42888811230659485, "rewards/safety_reward/mean": 8.157470703125, "rewards/safety_reward/std": 0.4223400354385376, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 122.7265625, "completions/mean_terminated_length": 122.7265625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.23788315430966728, "frac_reward_zero_std": 0.0, "grad_norm": 0.5441198945045471, "kl": 3.61328125, "learning_rate": 5e-05, "loss": 0.0382, "num_tokens": 16650496.0, "reward": 6.548828125, "reward_std": 0.20048919320106506, "rewards/helpfulness_reward/mean": 6.548828125, "rewards/helpfulness_reward/std": 0.40604647994041443, "rewards/safety_reward/mean": 8.050048828125, "rewards/safety_reward/std": 0.5055416822433472, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 122.140625, "completions/mean_terminated_length": 122.140625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.23823246878001922, "frac_reward_zero_std": 0.0, "grad_norm": 0.607631266117096, "kl": 3.48046875, "learning_rate": 5e-05, "loss": 0.031, "num_tokens": 16670714.0, "reward": 6.488037109375, "reward_std": 0.19451609253883362, "rewards/helpfulness_reward/mean": 6.488037109375, "rewards/helpfulness_reward/std": 0.5151810646057129, "rewards/safety_reward/mean": 8.114990234375, "rewards/safety_reward/std": 0.4441101551055908, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 121.421875, "completions/mean_terminated_length": 121.421875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.23858178325037116, "frac_reward_zero_std": 0.0, "grad_norm": 0.30777737498283386, "kl": 3.404296875, "learning_rate": 5e-05, "loss": 0.0288, "num_tokens": 16689736.0, "reward": 6.70263671875, "reward_std": 0.22711102664470673, "rewards/helpfulness_reward/mean": 6.70263671875, "rewards/helpfulness_reward/std": 0.5596216917037964, "rewards/safety_reward/mean": 8.31005859375, "rewards/safety_reward/std": 0.5556589961051941, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 121.8984375, "completions/mean_terminated_length": 121.8984375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.23893109772072307, "frac_reward_zero_std": 0.0, "grad_norm": 0.40564194321632385, "kl": 3.369140625, "learning_rate": 5e-05, "loss": 0.0307, "num_tokens": 16708835.0, "reward": 6.696533203125, "reward_std": 0.21660545468330383, "rewards/helpfulness_reward/mean": 6.696533203125, "rewards/helpfulness_reward/std": 0.4998639225959778, "rewards/safety_reward/mean": 8.447509765625, "rewards/safety_reward/std": 0.37120649218559265, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 120.7265625, "completions/mean_terminated_length": 120.7265625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.23928041219107501, "frac_reward_zero_std": 0.0, "grad_norm": 0.3661944568157196, "kl": 3.587890625, "learning_rate": 5e-05, "loss": 0.0308, "num_tokens": 16729064.0, "reward": 6.32861328125, "reward_std": 0.25346750020980835, "rewards/helpfulness_reward/mean": 6.32861328125, "rewards/helpfulness_reward/std": 0.5766215324401855, "rewards/safety_reward/mean": 8.11083984375, "rewards/safety_reward/std": 0.45239442586898804, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 119.75, "completions/mean_terminated_length": 119.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.23962972666142696, "frac_reward_zero_std": 0.0, "grad_norm": 0.42177125811576843, "kl": 3.5703125, "learning_rate": 5e-05, "loss": 0.0244, "num_tokens": 16751192.0, "reward": 6.4417724609375, "reward_std": 0.32709890604019165, "rewards/helpfulness_reward/mean": 6.4417724609375, "rewards/helpfulness_reward/std": 0.8414267897605896, "rewards/safety_reward/mean": 8.111572265625, "rewards/safety_reward/std": 0.9809382557868958, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 120.3359375, "completions/mean_terminated_length": 120.3359375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.2399790411317789, "frac_reward_zero_std": 0.0, "grad_norm": 0.3709355890750885, "kl": 3.42578125, "learning_rate": 5e-05, "loss": 0.03, "num_tokens": 16770955.0, "reward": 6.34716796875, "reward_std": 0.23096272349357605, "rewards/helpfulness_reward/mean": 6.34716796875, "rewards/helpfulness_reward/std": 0.6644562482833862, "rewards/safety_reward/mean": 8.150390625, "rewards/safety_reward/std": 0.4643866717815399, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 119.8828125, "completions/mean_terminated_length": 119.8828125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.2403283556021308, "frac_reward_zero_std": 0.0, "grad_norm": 0.35717853903770447, "kl": 3.412109375, "learning_rate": 5e-05, "loss": 0.0253, "num_tokens": 16790156.0, "reward": 6.35302734375, "reward_std": 0.21742799878120422, "rewards/helpfulness_reward/mean": 6.35302734375, "rewards/helpfulness_reward/std": 0.46018853783607483, "rewards/safety_reward/mean": 7.861572265625, "rewards/safety_reward/std": 0.4943498969078064, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 119.7109375, "completions/mean_terminated_length": 119.7109375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.24067767007248275, "frac_reward_zero_std": 0.0, "grad_norm": 0.4462888538837433, "kl": 3.48828125, "learning_rate": 5e-05, "loss": 0.0308, "num_tokens": 16809279.0, "reward": 6.538330078125, "reward_std": 0.21923264861106873, "rewards/helpfulness_reward/mean": 6.538330078125, "rewards/helpfulness_reward/std": 0.48916682600975037, "rewards/safety_reward/mean": 8.21044921875, "rewards/safety_reward/std": 0.36260443925857544, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 119.1015625, "completions/mean_terminated_length": 119.1015625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.2410269845428347, "frac_reward_zero_std": 0.0, "grad_norm": 0.36660581827163696, "kl": 3.46484375, "learning_rate": 5e-05, "loss": 0.0219, "num_tokens": 16829868.0, "reward": 6.340576171875, "reward_std": 0.21415013074874878, "rewards/helpfulness_reward/mean": 6.340576171875, "rewards/helpfulness_reward/std": 0.49104365706443787, "rewards/safety_reward/mean": 8.040771484375, "rewards/safety_reward/std": 0.5114275813102722, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 120.1328125, "completions/mean_terminated_length": 120.1328125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.24137629901318663, "frac_reward_zero_std": 0.0, "grad_norm": 0.32754072546958923, "kl": 3.283203125, "learning_rate": 5e-05, "loss": 0.0278, "num_tokens": 16849525.0, "reward": 6.523681640625, "reward_std": 0.1818857192993164, "rewards/helpfulness_reward/mean": 6.523681640625, "rewards/helpfulness_reward/std": 0.4320482015609741, "rewards/safety_reward/mean": 8.289306640625, "rewards/safety_reward/std": 0.4735441207885742, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 118.8984375, "completions/mean_terminated_length": 118.8984375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.24172561348353855, "frac_reward_zero_std": 0.0, "grad_norm": 0.36778244376182556, "kl": 3.498046875, "learning_rate": 5e-05, "loss": 0.0339, "num_tokens": 16870480.0, "reward": 6.324592590332031, "reward_std": 0.3876371681690216, "rewards/helpfulness_reward/mean": 6.324592590332031, "rewards/helpfulness_reward/std": 0.9059271216392517, "rewards/safety_reward/mean": 7.863777160644531, "rewards/safety_reward/std": 1.245850682258606, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 119.65625, "completions/mean_terminated_length": 119.65625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.2420749279538905, "frac_reward_zero_std": 0.0, "grad_norm": 0.4180377721786499, "kl": 3.486328125, "learning_rate": 5e-05, "loss": 0.0219, "num_tokens": 16889556.0, "reward": 6.490234375, "reward_std": 0.22947876155376434, "rewards/helpfulness_reward/mean": 6.490234375, "rewards/helpfulness_reward/std": 0.620799720287323, "rewards/safety_reward/mean": 8.2158203125, "rewards/safety_reward/std": 0.4048457145690918, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 120.6171875, "completions/mean_terminated_length": 120.6171875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.24242424242424243, "frac_reward_zero_std": 0.0, "grad_norm": 0.33132031559944153, "kl": 3.44921875, "learning_rate": 5e-05, "loss": 0.033, "num_tokens": 16909219.0, "reward": 6.533447265625, "reward_std": 0.25851869583129883, "rewards/helpfulness_reward/mean": 6.533447265625, "rewards/helpfulness_reward/std": 0.5960724949836731, "rewards/safety_reward/mean": 8.13671875, "rewards/safety_reward/std": 0.48629406094551086, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 120.0546875, "completions/mean_terminated_length": 120.0546875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.24277355689459437, "frac_reward_zero_std": 0.0, "grad_norm": 0.7784700989723206, "kl": 3.6015625, "learning_rate": 5e-05, "loss": 0.027, "num_tokens": 16928866.0, "reward": 6.3740234375, "reward_std": 0.2270260602235794, "rewards/helpfulness_reward/mean": 6.3740234375, "rewards/helpfulness_reward/std": 0.6651699542999268, "rewards/safety_reward/mean": 8.03564453125, "rewards/safety_reward/std": 0.47981101274490356, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 120.3515625, "completions/mean_terminated_length": 120.3515625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.24312287136494629, "frac_reward_zero_std": 0.0, "grad_norm": 0.34745773673057556, "kl": 3.396484375, "learning_rate": 5e-05, "loss": 0.0291, "num_tokens": 16948263.0, "reward": 6.4765625, "reward_std": 0.2748950719833374, "rewards/helpfulness_reward/mean": 6.4765625, "rewards/helpfulness_reward/std": 0.6346814036369324, "rewards/safety_reward/mean": 8.13232421875, "rewards/safety_reward/std": 0.5656284093856812, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 120.4140625, "completions/mean_terminated_length": 120.4140625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.24347218583529823, "frac_reward_zero_std": 0.0, "grad_norm": 0.554302453994751, "kl": 3.53125, "learning_rate": 5e-05, "loss": 0.0295, "num_tokens": 16967900.0, "reward": 6.697998046875, "reward_std": 0.19260527193546295, "rewards/helpfulness_reward/mean": 6.697998046875, "rewards/helpfulness_reward/std": 0.4552818536758423, "rewards/safety_reward/mean": 8.1943359375, "rewards/safety_reward/std": 0.5059460997581482, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 120.4609375, "completions/mean_terminated_length": 120.4609375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.24382150030565017, "frac_reward_zero_std": 0.0, "grad_norm": 0.3059175908565521, "kl": 3.51171875, "learning_rate": 5e-05, "loss": 0.0337, "num_tokens": 16988799.0, "reward": 6.46875, "reward_std": 0.23319415748119354, "rewards/helpfulness_reward/mean": 6.46875, "rewards/helpfulness_reward/std": 0.5001844763755798, "rewards/safety_reward/mean": 8.242431640625, "rewards/safety_reward/std": 0.45505326986312866, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 121.3203125, "completions/mean_terminated_length": 121.3203125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.2441708147760021, "frac_reward_zero_std": 0.0, "grad_norm": 0.4187367558479309, "kl": 3.419921875, "learning_rate": 5e-05, "loss": 0.033, "num_tokens": 17008376.0, "reward": 6.43115234375, "reward_std": 0.25299882888793945, "rewards/helpfulness_reward/mean": 6.43115234375, "rewards/helpfulness_reward/std": 0.5041559338569641, "rewards/safety_reward/mean": 8.1591796875, "rewards/safety_reward/std": 0.4772029519081116, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 122.734375, "completions/mean_terminated_length": 122.734375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.24452012924635402, "frac_reward_zero_std": 0.0, "grad_norm": 0.3484741747379303, "kl": 3.244140625, "learning_rate": 5e-05, "loss": 0.0369, "num_tokens": 17027726.0, "reward": 6.531005859375, "reward_std": 0.30124980211257935, "rewards/helpfulness_reward/mean": 6.531005859375, "rewards/helpfulness_reward/std": 0.5695168375968933, "rewards/safety_reward/mean": 7.995849609375, "rewards/safety_reward/std": 0.7305904626846313, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 121.71875, "completions/mean_terminated_length": 121.71875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.24486944371670596, "frac_reward_zero_std": 0.0, "grad_norm": 0.3550264239311218, "kl": 3.498046875, "learning_rate": 5e-05, "loss": 0.0381, "num_tokens": 17047330.0, "reward": 6.6083984375, "reward_std": 0.2482021600008011, "rewards/helpfulness_reward/mean": 6.6083984375, "rewards/helpfulness_reward/std": 0.41760769486427307, "rewards/safety_reward/mean": 8.232421875, "rewards/safety_reward/std": 0.5104974508285522, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 122.5078125, "completions/mean_terminated_length": 122.5078125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.2452187581870579, "frac_reward_zero_std": 0.0, "grad_norm": 0.56267911195755, "kl": 3.828125, "learning_rate": 5e-05, "loss": 0.0384, "num_tokens": 17066907.0, "reward": 6.30712890625, "reward_std": 0.27751874923706055, "rewards/helpfulness_reward/mean": 6.30712890625, "rewards/helpfulness_reward/std": 0.5573516488075256, "rewards/safety_reward/mean": 7.994384765625, "rewards/safety_reward/std": 0.5181623101234436, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 122.15625, "completions/mean_terminated_length": 122.15625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.24556807265740982, "frac_reward_zero_std": 0.0, "grad_norm": 0.4054076075553894, "kl": 3.3828125, "learning_rate": 5e-05, "loss": 0.0368, "num_tokens": 17087271.0, "reward": 6.624755859375, "reward_std": 0.24201038479804993, "rewards/helpfulness_reward/mean": 6.624755859375, "rewards/helpfulness_reward/std": 0.5451233386993408, "rewards/safety_reward/mean": 8.35888671875, "rewards/safety_reward/std": 0.4765506386756897, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 120.375, "completions/mean_terminated_length": 120.375, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.24591738712776176, "frac_reward_zero_std": 0.0, "grad_norm": 0.3543106019496918, "kl": 3.4921875, "learning_rate": 5e-05, "loss": 0.0304, "num_tokens": 17110319.0, "reward": 5.94390869140625, "reward_std": 0.23864299058914185, "rewards/helpfulness_reward/mean": 5.94390869140625, "rewards/helpfulness_reward/std": 1.090762972831726, "rewards/safety_reward/mean": 7.62249755859375, "rewards/safety_reward/std": 1.2622370719909668, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 122.4140625, "completions/mean_terminated_length": 122.4140625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.2462667015981137, "frac_reward_zero_std": 0.0, "grad_norm": 0.4336755573749542, "kl": 3.427734375, "learning_rate": 5e-05, "loss": 0.0404, "num_tokens": 17132284.0, "reward": 6.34423828125, "reward_std": 0.33294540643692017, "rewards/helpfulness_reward/mean": 6.34423828125, "rewards/helpfulness_reward/std": 0.7939565181732178, "rewards/safety_reward/mean": 7.95263671875, "rewards/safety_reward/std": 0.7917841076850891, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 120.4609375, "completions/mean_terminated_length": 120.4609375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.24661601606846564, "frac_reward_zero_std": 0.0, "grad_norm": 0.3150099813938141, "kl": 3.5, "learning_rate": 5e-05, "loss": 0.0276, "num_tokens": 17151743.0, "reward": 6.223876953125, "reward_std": 0.31973373889923096, "rewards/helpfulness_reward/mean": 6.223876953125, "rewards/helpfulness_reward/std": 0.6768198609352112, "rewards/safety_reward/mean": 7.8642578125, "rewards/safety_reward/std": 0.6604307293891907, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 119.6953125, "completions/mean_terminated_length": 119.6953125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.24696533053881756, "frac_reward_zero_std": 0.0, "grad_norm": 0.384539932012558, "kl": 3.458984375, "learning_rate": 5e-05, "loss": 0.0313, "num_tokens": 17172296.0, "reward": 6.38134765625, "reward_std": 0.3157775402069092, "rewards/helpfulness_reward/mean": 6.38134765625, "rewards/helpfulness_reward/std": 0.6159202456474304, "rewards/safety_reward/mean": 8.05908203125, "rewards/safety_reward/std": 0.693427324295044, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 119.9140625, "completions/mean_terminated_length": 119.9140625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.2473146450091695, "frac_reward_zero_std": 0.0, "grad_norm": 0.6608363389968872, "kl": 3.494140625, "learning_rate": 5e-05, "loss": 0.0315, "num_tokens": 17191829.0, "reward": 6.69970703125, "reward_std": 0.25390559434890747, "rewards/helpfulness_reward/mean": 6.69970703125, "rewards/helpfulness_reward/std": 0.4359370768070221, "rewards/safety_reward/mean": 8.1767578125, "rewards/safety_reward/std": 0.41881200671195984, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 119.8359375, "completions/mean_terminated_length": 119.8359375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.24766395947952144, "frac_reward_zero_std": 0.0, "grad_norm": 0.34109294414520264, "kl": 3.412109375, "learning_rate": 5e-05, "loss": 0.0211, "num_tokens": 17211064.0, "reward": 6.55908203125, "reward_std": 0.266074538230896, "rewards/helpfulness_reward/mean": 6.55908203125, "rewards/helpfulness_reward/std": 0.5464861989021301, "rewards/safety_reward/mean": 8.192626953125, "rewards/safety_reward/std": 0.5796247124671936, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 118.9921875, "completions/mean_terminated_length": 118.9921875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.24801327394987338, "frac_reward_zero_std": 0.0, "grad_norm": 0.3728678524494171, "kl": 3.62890625, "learning_rate": 5e-05, "loss": 0.0283, "num_tokens": 17230199.0, "reward": 6.380126953125, "reward_std": 0.26890242099761963, "rewards/helpfulness_reward/mean": 6.380126953125, "rewards/helpfulness_reward/std": 0.6200563907623291, "rewards/safety_reward/mean": 8.173095703125, "rewards/safety_reward/std": 0.7527722716331482, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 119.078125, "completions/mean_terminated_length": 119.078125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.2483625884202253, "frac_reward_zero_std": 0.0, "grad_norm": 0.48470279574394226, "kl": 3.7734375, "learning_rate": 5e-05, "loss": 0.0337, "num_tokens": 17249593.0, "reward": 6.8525390625, "reward_std": 0.2675965130329132, "rewards/helpfulness_reward/mean": 6.8525390625, "rewards/helpfulness_reward/std": 0.5579972863197327, "rewards/safety_reward/mean": 8.440673828125, "rewards/safety_reward/std": 0.5044834613800049, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 118.6953125, "completions/mean_terminated_length": 118.6953125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.24871190289057724, "frac_reward_zero_std": 0.0, "grad_norm": 0.5032156109809875, "kl": 3.599609375, "learning_rate": 5e-05, "loss": 0.0205, "num_tokens": 17269066.0, "reward": 6.5078125, "reward_std": 0.27217626571655273, "rewards/helpfulness_reward/mean": 6.5078125, "rewards/helpfulness_reward/std": 0.47113075852394104, "rewards/safety_reward/mean": 8.183837890625, "rewards/safety_reward/std": 0.3877749741077423, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 117.90625, "completions/mean_terminated_length": 117.90625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.24906121736092918, "frac_reward_zero_std": 0.0, "grad_norm": 0.36353617906570435, "kl": 3.291015625, "learning_rate": 5e-05, "loss": 0.0217, "num_tokens": 17291974.0, "reward": 6.363037109375, "reward_std": 0.32885292172431946, "rewards/helpfulness_reward/mean": 6.363037109375, "rewards/helpfulness_reward/std": 0.7441523671150208, "rewards/safety_reward/mean": 7.914794921875, "rewards/safety_reward/std": 0.7938957810401917, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 118.9296875, "completions/mean_terminated_length": 118.9296875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.24941053183128112, "frac_reward_zero_std": 0.0, "grad_norm": 0.38376450538635254, "kl": 3.494140625, "learning_rate": 5e-05, "loss": 0.0226, "num_tokens": 17312693.0, "reward": 6.431640625, "reward_std": 0.27132678031921387, "rewards/helpfulness_reward/mean": 6.431640625, "rewards/helpfulness_reward/std": 0.5361235737800598, "rewards/safety_reward/mean": 8.254150390625, "rewards/safety_reward/std": 0.5187694430351257, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 120.578125, "completions/mean_terminated_length": 120.578125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.24975984630163303, "frac_reward_zero_std": 0.0, "grad_norm": 0.2996848523616791, "kl": 3.5234375, "learning_rate": 5e-05, "loss": 0.0321, "num_tokens": 17331951.0, "reward": 6.551513671875, "reward_std": 0.22808417677879333, "rewards/helpfulness_reward/mean": 6.551513671875, "rewards/helpfulness_reward/std": 0.5989218354225159, "rewards/safety_reward/mean": 8.32861328125, "rewards/safety_reward/std": 0.6969199180603027, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 120.25, "completions/mean_terminated_length": 120.25, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.250109160771985, "frac_reward_zero_std": 0.0, "grad_norm": 0.32051002979278564, "kl": 3.548828125, "learning_rate": 5e-05, "loss": 0.0314, "num_tokens": 17353063.0, "reward": 6.27294921875, "reward_std": 0.26652276515960693, "rewards/helpfulness_reward/mean": 6.27294921875, "rewards/helpfulness_reward/std": 0.6320284605026245, "rewards/safety_reward/mean": 8.015869140625, "rewards/safety_reward/std": 0.4857560694217682, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 119.0546875, "completions/mean_terminated_length": 119.0546875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.2504584752423369, "frac_reward_zero_std": 0.0, "grad_norm": 0.37701281905174255, "kl": 3.67578125, "learning_rate": 5e-05, "loss": 0.029, "num_tokens": 17374262.0, "reward": 6.2147216796875, "reward_std": 0.2918081283569336, "rewards/helpfulness_reward/mean": 6.2147216796875, "rewards/helpfulness_reward/std": 0.9532797932624817, "rewards/safety_reward/mean": 8.080810546875, "rewards/safety_reward/std": 0.7734753489494324, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 121.1640625, "completions/mean_terminated_length": 121.1640625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.25080778971268886, "frac_reward_zero_std": 0.0, "grad_norm": 0.38885483145713806, "kl": 3.546875, "learning_rate": 5e-05, "loss": 0.0409, "num_tokens": 17393755.0, "reward": 6.532470703125, "reward_std": 0.3288189768791199, "rewards/helpfulness_reward/mean": 6.532470703125, "rewards/helpfulness_reward/std": 0.6107555031776428, "rewards/safety_reward/mean": 8.069091796875, "rewards/safety_reward/std": 0.6252663135528564, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 122.015625, "completions/mean_terminated_length": 122.015625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.25115710418304077, "frac_reward_zero_std": 0.0, "grad_norm": 0.42395731806755066, "kl": 3.634765625, "learning_rate": 5e-05, "loss": 0.0373, "num_tokens": 17412829.0, "reward": 6.67724609375, "reward_std": 0.22885054349899292, "rewards/helpfulness_reward/mean": 6.67724609375, "rewards/helpfulness_reward/std": 0.4545014500617981, "rewards/safety_reward/mean": 8.309814453125, "rewards/safety_reward/std": 0.4725490212440491, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 121.125, "completions/mean_terminated_length": 121.125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.25150641865339274, "frac_reward_zero_std": 0.0, "grad_norm": 0.3069835603237152, "kl": 3.603515625, "learning_rate": 5e-05, "loss": 0.034, "num_tokens": 17432549.0, "reward": 6.560791015625, "reward_std": 0.19864816963672638, "rewards/helpfulness_reward/mean": 6.560791015625, "rewards/helpfulness_reward/std": 0.4136182367801666, "rewards/safety_reward/mean": 8.31103515625, "rewards/safety_reward/std": 0.44433391094207764, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 121.5859375, "completions/mean_terminated_length": 121.5859375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.25185573312374465, "frac_reward_zero_std": 0.0, "grad_norm": 0.3051222562789917, "kl": 3.384765625, "learning_rate": 5e-05, "loss": 0.0351, "num_tokens": 17451920.0, "reward": 6.4892578125, "reward_std": 0.23535138368606567, "rewards/helpfulness_reward/mean": 6.4892578125, "rewards/helpfulness_reward/std": 0.6766711473464966, "rewards/safety_reward/mean": 8.087646484375, "rewards/safety_reward/std": 0.7371359467506409, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 121.2734375, "completions/mean_terminated_length": 121.2734375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.25220504759409657, "frac_reward_zero_std": 0.0, "grad_norm": 0.3174189031124115, "kl": 3.51171875, "learning_rate": 5e-05, "loss": 0.0352, "num_tokens": 17471587.0, "reward": 6.513916015625, "reward_std": 0.27315282821655273, "rewards/helpfulness_reward/mean": 6.513916015625, "rewards/helpfulness_reward/std": 0.5612930655479431, "rewards/safety_reward/mean": 7.995849609375, "rewards/safety_reward/std": 0.5978010296821594, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 121.109375, "completions/mean_terminated_length": 121.109375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.25255436206444853, "frac_reward_zero_std": 0.0, "grad_norm": 0.3511388599872589, "kl": 3.53125, "learning_rate": 5e-05, "loss": 0.0358, "num_tokens": 17491057.0, "reward": 6.539794921875, "reward_std": 0.2516654133796692, "rewards/helpfulness_reward/mean": 6.539794921875, "rewards/helpfulness_reward/std": 0.5014085173606873, "rewards/safety_reward/mean": 7.94140625, "rewards/safety_reward/std": 0.4948056936264038, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 120.8984375, "completions/mean_terminated_length": 120.8984375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.25290367653480045, "frac_reward_zero_std": 0.0, "grad_norm": 0.3196503520011902, "kl": 3.400390625, "learning_rate": 5e-05, "loss": 0.0284, "num_tokens": 17510236.0, "reward": 6.6083984375, "reward_std": 0.20679683983325958, "rewards/helpfulness_reward/mean": 6.6083984375, "rewards/helpfulness_reward/std": 0.44538965821266174, "rewards/safety_reward/mean": 8.054931640625, "rewards/safety_reward/std": 0.5865346789360046, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 121.515625, "completions/mean_terminated_length": 121.515625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.25325299100515236, "frac_reward_zero_std": 0.0, "grad_norm": 0.2730674743652344, "kl": 3.4140625, "learning_rate": 5e-05, "loss": 0.0345, "num_tokens": 17532366.0, "reward": 6.572509765625, "reward_std": 0.1764759123325348, "rewards/helpfulness_reward/mean": 6.572509765625, "rewards/helpfulness_reward/std": 0.8968052268028259, "rewards/safety_reward/mean": 8.197509765625, "rewards/safety_reward/std": 0.9165635704994202, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 121.3359375, "completions/mean_terminated_length": 121.3359375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.25360230547550433, "frac_reward_zero_std": 0.0, "grad_norm": 0.40002137422561646, "kl": 3.498046875, "learning_rate": 5e-05, "loss": 0.035, "num_tokens": 17551449.0, "reward": 6.3214111328125, "reward_std": 0.23204100131988525, "rewards/helpfulness_reward/mean": 6.3214111328125, "rewards/helpfulness_reward/std": 0.7777966856956482, "rewards/safety_reward/mean": 7.75, "rewards/safety_reward/std": 1.2230453491210938, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 120.8671875, "completions/mean_terminated_length": 120.8671875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.25395161994585624, "frac_reward_zero_std": 0.0, "grad_norm": 0.3639064133167267, "kl": 3.533203125, "learning_rate": 5e-05, "loss": 0.0374, "num_tokens": 17573232.0, "reward": 6.466796875, "reward_std": 0.2886878252029419, "rewards/helpfulness_reward/mean": 6.466796875, "rewards/helpfulness_reward/std": 0.743613600730896, "rewards/safety_reward/mean": 7.9443359375, "rewards/safety_reward/std": 0.9354878067970276, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 121.03125, "completions/mean_terminated_length": 121.03125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.2543009344162082, "frac_reward_zero_std": 0.0, "grad_norm": 0.398827463388443, "kl": 3.478515625, "learning_rate": 5e-05, "loss": 0.0328, "num_tokens": 17593892.0, "reward": 6.3818359375, "reward_std": 0.335668683052063, "rewards/helpfulness_reward/mean": 6.3818359375, "rewards/helpfulness_reward/std": 0.6944877505302429, "rewards/safety_reward/mean": 7.976318359375, "rewards/safety_reward/std": 0.5015009641647339, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 121.8671875, "completions/mean_terminated_length": 121.8671875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.2546502488865601, "frac_reward_zero_std": 0.0, "grad_norm": 0.39969107508659363, "kl": 3.650390625, "learning_rate": 5e-05, "loss": 0.0443, "num_tokens": 17613971.0, "reward": 6.48046875, "reward_std": 0.23287591338157654, "rewards/helpfulness_reward/mean": 6.48046875, "rewards/helpfulness_reward/std": 0.5570464730262756, "rewards/safety_reward/mean": 8.214599609375, "rewards/safety_reward/std": 0.5019702911376953, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 120.96875, "completions/mean_terminated_length": 120.96875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.25499956335691204, "frac_reward_zero_std": 0.0, "grad_norm": 0.3301944136619568, "kl": 3.60546875, "learning_rate": 5e-05, "loss": 0.0253, "num_tokens": 17634111.0, "reward": 6.785888671875, "reward_std": 0.25626006722450256, "rewards/helpfulness_reward/mean": 6.785888671875, "rewards/helpfulness_reward/std": 0.5904720425605774, "rewards/safety_reward/mean": 8.419921875, "rewards/safety_reward/std": 0.46676498651504517, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 120.8671875, "completions/mean_terminated_length": 120.8671875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.255348877827264, "frac_reward_zero_std": 0.0, "grad_norm": 0.32261013984680176, "kl": 3.396484375, "learning_rate": 5e-05, "loss": 0.0261, "num_tokens": 17655942.0, "reward": 6.63720703125, "reward_std": 0.2109335958957672, "rewards/helpfulness_reward/mean": 6.63720703125, "rewards/helpfulness_reward/std": 0.6013645529747009, "rewards/safety_reward/mean": 8.3212890625, "rewards/safety_reward/std": 0.47204285860061646, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 121.09375, "completions/mean_terminated_length": 121.09375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.2556981922976159, "frac_reward_zero_std": 0.0, "grad_norm": 0.4117641746997833, "kl": 3.625, "learning_rate": 5e-05, "loss": 0.0351, "num_tokens": 17675082.0, "reward": 6.839111328125, "reward_std": 0.19916611909866333, "rewards/helpfulness_reward/mean": 6.839111328125, "rewards/helpfulness_reward/std": 0.39327770471572876, "rewards/safety_reward/mean": 8.196533203125, "rewards/safety_reward/std": 0.6067734956741333, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 121.4296875, "completions/mean_terminated_length": 121.4296875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.25604750676796784, "frac_reward_zero_std": 0.0, "grad_norm": 0.38308748602867126, "kl": 3.376953125, "learning_rate": 5e-05, "loss": 0.0323, "num_tokens": 17694681.0, "reward": 6.737060546875, "reward_std": 0.1957615166902542, "rewards/helpfulness_reward/mean": 6.737060546875, "rewards/helpfulness_reward/std": 0.4429936110973358, "rewards/safety_reward/mean": 8.263427734375, "rewards/safety_reward/std": 0.6221475601196289, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 121.4296875, "completions/mean_terminated_length": 121.4296875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.2563968212383198, "frac_reward_zero_std": 0.0, "grad_norm": 0.4542280435562134, "kl": 3.810546875, "learning_rate": 5e-05, "loss": 0.0386, "num_tokens": 17714736.0, "reward": 6.757568359375, "reward_std": 0.20950672030448914, "rewards/helpfulness_reward/mean": 6.757568359375, "rewards/helpfulness_reward/std": 0.5020679831504822, "rewards/safety_reward/mean": 8.366455078125, "rewards/safety_reward/std": 0.5545799732208252, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 134.8515625, "completions/mean_terminated_length": 134.8515625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.2567461357086717, "frac_reward_zero_std": 0.0, "grad_norm": 614.5602416992188, "kl": 325.08984375, "learning_rate": 5e-05, "loss": 3.2417, "num_tokens": 17738349.0, "reward": 6.237953186035156, "reward_std": 0.18268178403377533, "rewards/helpfulness_reward/mean": 6.237953186035156, "rewards/helpfulness_reward/std": 1.7411792278289795, "rewards/safety_reward/mean": 7.5653533935546875, "rewards/safety_reward/std": 2.243513822555542, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 120.9921875, "completions/mean_terminated_length": 120.9921875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.2570954501790237, "frac_reward_zero_std": 0.0, "grad_norm": 0.36674362421035767, "kl": 3.4609375, "learning_rate": 5e-05, "loss": 0.0234, "num_tokens": 17759068.0, "reward": 6.5042724609375, "reward_std": 0.29646897315979004, "rewards/helpfulness_reward/mean": 6.5042724609375, "rewards/helpfulness_reward/std": 0.617666482925415, "rewards/safety_reward/mean": 8.03369140625, "rewards/safety_reward/std": 0.6266416907310486, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 121.0078125, "completions/mean_terminated_length": 121.0078125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.2574447646493756, "frac_reward_zero_std": 0.0, "grad_norm": 0.3453172445297241, "kl": 3.345703125, "learning_rate": 5e-05, "loss": 0.0226, "num_tokens": 17779181.0, "reward": 6.58251953125, "reward_std": 0.2232564091682434, "rewards/helpfulness_reward/mean": 6.58251953125, "rewards/helpfulness_reward/std": 0.5070816278457642, "rewards/safety_reward/mean": 8.290283203125, "rewards/safety_reward/std": 0.4423617720603943, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 121.71875, "completions/mean_terminated_length": 121.71875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.2577940791197275, "frac_reward_zero_std": 0.0, "grad_norm": 0.46091318130493164, "kl": 3.73046875, "learning_rate": 5e-05, "loss": 0.034, "num_tokens": 17799057.0, "reward": 6.76416015625, "reward_std": 0.19742906093597412, "rewards/helpfulness_reward/mean": 6.76416015625, "rewards/helpfulness_reward/std": 0.51747065782547, "rewards/safety_reward/mean": 8.2255859375, "rewards/safety_reward/std": 0.5802500247955322, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 120.890625, "completions/mean_terminated_length": 120.890625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.2581433935900795, "frac_reward_zero_std": 0.0, "grad_norm": 0.3578047752380371, "kl": 3.552734375, "learning_rate": 5e-05, "loss": 0.032, "num_tokens": 17819939.0, "reward": 6.469482421875, "reward_std": 0.18021312355995178, "rewards/helpfulness_reward/mean": 6.469482421875, "rewards/helpfulness_reward/std": 0.6702316999435425, "rewards/safety_reward/mean": 8.1728515625, "rewards/safety_reward/std": 0.8152487277984619, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 121.796875, "completions/mean_terminated_length": 121.796875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.2584927080604314, "frac_reward_zero_std": 0.0, "grad_norm": 0.31041276454925537, "kl": 3.630859375, "learning_rate": 5e-05, "loss": 0.0382, "num_tokens": 17840961.0, "reward": 6.4088134765625, "reward_std": 0.2638705372810364, "rewards/helpfulness_reward/mean": 6.4088134765625, "rewards/helpfulness_reward/std": 0.5638066530227661, "rewards/safety_reward/mean": 8.102294921875, "rewards/safety_reward/std": 0.5077632069587708, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 122.078125, "completions/mean_terminated_length": 122.078125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.2588420225307833, "frac_reward_zero_std": 0.0, "grad_norm": 0.3782041072845459, "kl": 3.544921875, "learning_rate": 5e-05, "loss": 0.0335, "num_tokens": 17860915.0, "reward": 6.62158203125, "reward_std": 0.19930383563041687, "rewards/helpfulness_reward/mean": 6.62158203125, "rewards/helpfulness_reward/std": 0.47935405373573303, "rewards/safety_reward/mean": 8.174560546875, "rewards/safety_reward/std": 0.5728827118873596, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 121.4296875, "completions/mean_terminated_length": 121.4296875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.2591913370011353, "frac_reward_zero_std": 0.0, "grad_norm": 0.38899365067481995, "kl": 3.58984375, "learning_rate": 5e-05, "loss": 0.0303, "num_tokens": 17882170.0, "reward": 6.45654296875, "reward_std": 0.19547805190086365, "rewards/helpfulness_reward/mean": 6.45654296875, "rewards/helpfulness_reward/std": 0.5533518195152283, "rewards/safety_reward/mean": 8.04541015625, "rewards/safety_reward/std": 0.5932493805885315, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 120.828125, "completions/mean_terminated_length": 120.828125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.2595406514714872, "frac_reward_zero_std": 0.0, "grad_norm": 0.2840179204940796, "kl": 3.462890625, "learning_rate": 5e-05, "loss": 0.0266, "num_tokens": 17904676.0, "reward": 6.3408203125, "reward_std": 0.24067208170890808, "rewards/helpfulness_reward/mean": 6.3408203125, "rewards/helpfulness_reward/std": 0.5976089239120483, "rewards/safety_reward/mean": 7.985107421875, "rewards/safety_reward/std": 0.66556316614151, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 120.953125, "completions/mean_terminated_length": 120.953125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.25988996594183916, "frac_reward_zero_std": 0.0, "grad_norm": 0.31399819254875183, "kl": 3.52734375, "learning_rate": 5e-05, "loss": 0.0227, "num_tokens": 17924462.0, "reward": 6.5150146484375, "reward_std": 0.27620741724967957, "rewards/helpfulness_reward/mean": 6.5150146484375, "rewards/helpfulness_reward/std": 0.623896062374115, "rewards/safety_reward/mean": 8.0537109375, "rewards/safety_reward/std": 0.7650931477546692, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 121.2109375, "completions/mean_terminated_length": 121.2109375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.2602392804121911, "frac_reward_zero_std": 0.0, "grad_norm": 0.30448317527770996, "kl": 3.59375, "learning_rate": 5e-05, "loss": 0.0298, "num_tokens": 17945945.0, "reward": 6.505706787109375, "reward_std": 0.31002259254455566, "rewards/helpfulness_reward/mean": 6.505706787109375, "rewards/helpfulness_reward/std": 1.0634011030197144, "rewards/safety_reward/mean": 8.0416259765625, "rewards/safety_reward/std": 0.8279271721839905, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 120.90625, "completions/mean_terminated_length": 120.90625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.260588594882543, "frac_reward_zero_std": 0.0, "grad_norm": 0.40632322430610657, "kl": 3.533203125, "learning_rate": 5e-05, "loss": 0.0307, "num_tokens": 17970613.0, "reward": 6.2982940673828125, "reward_std": 0.29728662967681885, "rewards/helpfulness_reward/mean": 6.2982940673828125, "rewards/helpfulness_reward/std": 1.3382930755615234, "rewards/safety_reward/mean": 7.7969970703125, "rewards/safety_reward/std": 1.194879174232483, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 121.28125, "completions/mean_terminated_length": 121.28125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.26093790935289496, "frac_reward_zero_std": 0.0, "grad_norm": 0.3331221342086792, "kl": 3.724609375, "learning_rate": 5e-05, "loss": 0.032, "num_tokens": 17990681.0, "reward": 6.609619140625, "reward_std": 0.24605287611484528, "rewards/helpfulness_reward/mean": 6.609619140625, "rewards/helpfulness_reward/std": 0.42784976959228516, "rewards/safety_reward/mean": 8.0244140625, "rewards/safety_reward/std": 0.5385447144508362, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 122.03125, "completions/mean_terminated_length": 122.03125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.2612872238232469, "frac_reward_zero_std": 0.0, "grad_norm": 0.28412124514579773, "kl": 3.380859375, "learning_rate": 5e-05, "loss": 0.0318, "num_tokens": 18010565.0, "reward": 6.572021484375, "reward_std": 0.2083304226398468, "rewards/helpfulness_reward/mean": 6.572021484375, "rewards/helpfulness_reward/std": 0.5109010934829712, "rewards/safety_reward/mean": 8.11474609375, "rewards/safety_reward/std": 0.3866129517555237, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 121.75, "completions/mean_terminated_length": 121.75, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.2616365382935988, "frac_reward_zero_std": 0.0, "grad_norm": 0.3260829746723175, "kl": 3.66015625, "learning_rate": 5e-05, "loss": 0.0319, "num_tokens": 18030949.0, "reward": 6.63525390625, "reward_std": 0.25642356276512146, "rewards/helpfulness_reward/mean": 6.63525390625, "rewards/helpfulness_reward/std": 0.48399755358695984, "rewards/safety_reward/mean": 8.0615234375, "rewards/safety_reward/std": 0.5193851590156555, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 122.0390625, "completions/mean_terminated_length": 122.0390625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.26198585276395076, "frac_reward_zero_std": 0.0, "grad_norm": 0.2995162606239319, "kl": 3.423828125, "learning_rate": 5e-05, "loss": 0.0338, "num_tokens": 18050002.0, "reward": 6.965087890625, "reward_std": 0.23446239531040192, "rewards/helpfulness_reward/mean": 6.965087890625, "rewards/helpfulness_reward/std": 0.3967917859554291, "rewards/safety_reward/mean": 8.32177734375, "rewards/safety_reward/std": 0.520796000957489, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 122.7734375, "completions/mean_terminated_length": 122.7734375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.26233516723430267, "frac_reward_zero_std": 0.0, "grad_norm": 0.3243657350540161, "kl": 3.400390625, "learning_rate": 5e-05, "loss": 0.046, "num_tokens": 18069581.0, "reward": 6.73779296875, "reward_std": 0.24078896641731262, "rewards/helpfulness_reward/mean": 6.73779296875, "rewards/helpfulness_reward/std": 0.49552369117736816, "rewards/safety_reward/mean": 8.170654296875, "rewards/safety_reward/std": 0.7436151504516602, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 121.859375, "completions/mean_terminated_length": 121.859375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.26268448170465464, "frac_reward_zero_std": 0.0, "grad_norm": 0.3758942186832428, "kl": 3.7265625, "learning_rate": 5e-05, "loss": 0.0355, "num_tokens": 18090523.0, "reward": 6.47802734375, "reward_std": 0.17392833530902863, "rewards/helpfulness_reward/mean": 6.47802734375, "rewards/helpfulness_reward/std": 0.5874335169792175, "rewards/safety_reward/mean": 8.19921875, "rewards/safety_reward/std": 0.4934985637664795, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 121.6640625, "completions/mean_terminated_length": 121.6640625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.26303379617500655, "frac_reward_zero_std": 0.0, "grad_norm": 14.299758911132812, "kl": 6.990234375, "learning_rate": 5e-05, "loss": 0.0662, "num_tokens": 18110280.0, "reward": 6.6650390625, "reward_std": 0.2567199468612671, "rewards/helpfulness_reward/mean": 6.6650390625, "rewards/helpfulness_reward/std": 0.5637826919555664, "rewards/safety_reward/mean": 7.84326171875, "rewards/safety_reward/std": 0.8547104597091675, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 122.0859375, "completions/mean_terminated_length": 122.0859375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.26338311064535846, "frac_reward_zero_std": 0.0, "grad_norm": 0.36373451352119446, "kl": 3.69140625, "learning_rate": 5e-05, "loss": 0.0382, "num_tokens": 18131659.0, "reward": 6.53564453125, "reward_std": 0.2395787090063095, "rewards/helpfulness_reward/mean": 6.53564453125, "rewards/helpfulness_reward/std": 0.6928337812423706, "rewards/safety_reward/mean": 8.154052734375, "rewards/safety_reward/std": 0.5087990760803223, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 121.4765625, "completions/mean_terminated_length": 121.4765625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.26373242511571043, "frac_reward_zero_std": 0.0, "grad_norm": 0.8530830144882202, "kl": 3.736328125, "learning_rate": 5e-05, "loss": 0.0369, "num_tokens": 18154280.0, "reward": 6.5426025390625, "reward_std": 0.29300788044929504, "rewards/helpfulness_reward/mean": 6.5426025390625, "rewards/helpfulness_reward/std": 0.6589765548706055, "rewards/safety_reward/mean": 7.8900146484375, "rewards/safety_reward/std": 0.9890816807746887, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 121.5234375, "completions/mean_terminated_length": 121.5234375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.26408173958606235, "frac_reward_zero_std": 0.0, "grad_norm": 0.38379889726638794, "kl": 3.53125, "learning_rate": 5e-05, "loss": 0.0351, "num_tokens": 18173755.0, "reward": 6.449462890625, "reward_std": 0.30495715141296387, "rewards/helpfulness_reward/mean": 6.449462890625, "rewards/helpfulness_reward/std": 0.566547155380249, "rewards/safety_reward/mean": 8.2705078125, "rewards/safety_reward/std": 0.5716991424560547, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 121.6796875, "completions/mean_terminated_length": 121.6796875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.26443105405641426, "frac_reward_zero_std": 0.0, "grad_norm": 0.3689285218715668, "kl": 3.55078125, "learning_rate": 5e-05, "loss": 0.0377, "num_tokens": 18193818.0, "reward": 6.59375, "reward_std": 0.24605430662631989, "rewards/helpfulness_reward/mean": 6.59375, "rewards/helpfulness_reward/std": 0.57635498046875, "rewards/safety_reward/mean": 8.1953125, "rewards/safety_reward/std": 0.47331276535987854, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 120.7109375, "completions/mean_terminated_length": 120.7109375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.26478036852676623, "frac_reward_zero_std": 0.0, "grad_norm": 0.3804476261138916, "kl": 3.60546875, "learning_rate": 5e-05, "loss": 0.0234, "num_tokens": 18214181.0, "reward": 6.5367431640625, "reward_std": 0.32791459560394287, "rewards/helpfulness_reward/mean": 6.5367431640625, "rewards/helpfulness_reward/std": 0.6317383646965027, "rewards/safety_reward/mean": 8.108154296875, "rewards/safety_reward/std": 0.5122890472412109, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 121.1796875, "completions/mean_terminated_length": 121.1796875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.26512968299711814, "frac_reward_zero_std": 0.0, "grad_norm": 0.41951411962509155, "kl": 3.591796875, "learning_rate": 5e-05, "loss": 0.0288, "num_tokens": 18234076.0, "reward": 6.594482421875, "reward_std": 0.18857115507125854, "rewards/helpfulness_reward/mean": 6.594482421875, "rewards/helpfulness_reward/std": 0.4514007866382599, "rewards/safety_reward/mean": 8.261962890625, "rewards/safety_reward/std": 0.38455912470817566, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 121.078125, "completions/mean_terminated_length": 121.078125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.2654789974674701, "frac_reward_zero_std": 0.0, "grad_norm": 0.38777342438697815, "kl": 3.5625, "learning_rate": 5e-05, "loss": 0.034, "num_tokens": 18253694.0, "reward": 6.6748046875, "reward_std": 0.32240626215934753, "rewards/helpfulness_reward/mean": 6.6748046875, "rewards/helpfulness_reward/std": 0.4800465703010559, "rewards/safety_reward/mean": 8.296142578125, "rewards/safety_reward/std": 0.5359401702880859, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 121.09375, "completions/mean_terminated_length": 121.09375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.265828311937822, "frac_reward_zero_std": 0.0, "grad_norm": 0.421627402305603, "kl": 3.6015625, "learning_rate": 5e-05, "loss": 0.0359, "num_tokens": 18272922.0, "reward": 6.505126953125, "reward_std": 0.26935845613479614, "rewards/helpfulness_reward/mean": 6.505126953125, "rewards/helpfulness_reward/std": 0.5112654566764832, "rewards/safety_reward/mean": 8.127685546875, "rewards/safety_reward/std": 0.6416702270507812, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 120.3046875, "completions/mean_terminated_length": 120.3046875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.26617762640817394, "frac_reward_zero_std": 0.0, "grad_norm": 0.5596006512641907, "kl": 3.908203125, "learning_rate": 5e-05, "loss": 0.0311, "num_tokens": 18292481.0, "reward": 6.599853515625, "reward_std": 0.26894229650497437, "rewards/helpfulness_reward/mean": 6.599853515625, "rewards/helpfulness_reward/std": 0.5379321575164795, "rewards/safety_reward/mean": 8.169189453125, "rewards/safety_reward/std": 0.5593817830085754, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 120.9921875, "completions/mean_terminated_length": 120.9921875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.2665269408785259, "frac_reward_zero_std": 0.0, "grad_norm": 0.46795815229415894, "kl": 3.779296875, "learning_rate": 5e-05, "loss": 0.0313, "num_tokens": 18311608.0, "reward": 6.75439453125, "reward_std": 0.222733736038208, "rewards/helpfulness_reward/mean": 6.75439453125, "rewards/helpfulness_reward/std": 0.6271583437919617, "rewards/safety_reward/mean": 8.244140625, "rewards/safety_reward/std": 0.4890652000904083, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 120.90625, "completions/mean_terminated_length": 120.90625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.2668762553488778, "frac_reward_zero_std": 0.0, "grad_norm": 0.5935874581336975, "kl": 3.85546875, "learning_rate": 5e-05, "loss": 0.0301, "num_tokens": 18331116.0, "reward": 6.80322265625, "reward_std": 0.24437358975410461, "rewards/helpfulness_reward/mean": 6.80322265625, "rewards/helpfulness_reward/std": 0.5341206192970276, "rewards/safety_reward/mean": 8.354248046875, "rewards/safety_reward/std": 0.3318626880645752, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 120.7109375, "completions/mean_terminated_length": 120.7109375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.26722556981922974, "frac_reward_zero_std": 0.0, "grad_norm": 0.4014729857444763, "kl": 3.384765625, "learning_rate": 5e-05, "loss": 0.0303, "num_tokens": 18352159.0, "reward": 6.643310546875, "reward_std": 0.264298677444458, "rewards/helpfulness_reward/mean": 6.643310546875, "rewards/helpfulness_reward/std": 0.7812108397483826, "rewards/safety_reward/mean": 8.12646484375, "rewards/safety_reward/std": 0.5159229636192322, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 121.7890625, "completions/mean_terminated_length": 121.7890625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.2675748842895817, "frac_reward_zero_std": 0.0, "grad_norm": 0.4141797423362732, "kl": 3.615234375, "learning_rate": 5e-05, "loss": 0.0387, "num_tokens": 18372468.0, "reward": 6.596435546875, "reward_std": 0.2917146682739258, "rewards/helpfulness_reward/mean": 6.596435546875, "rewards/helpfulness_reward/std": 0.6379445195198059, "rewards/safety_reward/mean": 8.15869140625, "rewards/safety_reward/std": 0.5749070048332214, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 121.5, "completions/mean_terminated_length": 121.5, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.2679241987599336, "frac_reward_zero_std": 0.0, "grad_norm": 0.2881162464618683, "kl": 3.43359375, "learning_rate": 5e-05, "loss": 0.032, "num_tokens": 18392308.0, "reward": 6.666259765625, "reward_std": 0.1799127161502838, "rewards/helpfulness_reward/mean": 6.666259765625, "rewards/helpfulness_reward/std": 0.3981906771659851, "rewards/safety_reward/mean": 8.20556640625, "rewards/safety_reward/std": 0.40463998913764954, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 121.0859375, "completions/mean_terminated_length": 121.0859375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.2682735132302856, "frac_reward_zero_std": 0.0, "grad_norm": 0.3050372302532196, "kl": 3.568359375, "learning_rate": 5e-05, "loss": 0.0304, "num_tokens": 18413951.0, "reward": 6.447998046875, "reward_std": 0.2697106599807739, "rewards/helpfulness_reward/mean": 6.447998046875, "rewards/helpfulness_reward/std": 0.5148101449012756, "rewards/safety_reward/mean": 8.094970703125, "rewards/safety_reward/std": 0.5641162395477295, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 121.3671875, "completions/mean_terminated_length": 121.3671875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.2686228277006375, "frac_reward_zero_std": 0.0, "grad_norm": 0.33245962858200073, "kl": 3.57421875, "learning_rate": 5e-05, "loss": 0.0356, "num_tokens": 18433318.0, "reward": 6.7027587890625, "reward_std": 0.26188385486602783, "rewards/helpfulness_reward/mean": 6.7027587890625, "rewards/helpfulness_reward/std": 0.6409908533096313, "rewards/safety_reward/mean": 8.28564453125, "rewards/safety_reward/std": 0.5073999166488647, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 120.5078125, "completions/mean_terminated_length": 120.5078125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.2689721421709894, "frac_reward_zero_std": 0.0, "grad_norm": 0.2882586121559143, "kl": 3.59375, "learning_rate": 5e-05, "loss": 0.0233, "num_tokens": 18453463.0, "reward": 6.68017578125, "reward_std": 0.24661722779273987, "rewards/helpfulness_reward/mean": 6.68017578125, "rewards/helpfulness_reward/std": 0.6235648393630981, "rewards/safety_reward/mean": 8.456787109375, "rewards/safety_reward/std": 0.4364815354347229, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 120.5390625, "completions/mean_terminated_length": 120.5390625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.2693214566413414, "frac_reward_zero_std": 0.0, "grad_norm": 0.3334254026412964, "kl": 3.6171875, "learning_rate": 5e-05, "loss": 0.0318, "num_tokens": 18473508.0, "reward": 6.6005859375, "reward_std": 0.2054293304681778, "rewards/helpfulness_reward/mean": 6.6005859375, "rewards/helpfulness_reward/std": 0.34886667132377625, "rewards/safety_reward/mean": 8.219482421875, "rewards/safety_reward/std": 0.48478105664253235, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 121.3984375, "completions/mean_terminated_length": 121.3984375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.2696707711116933, "frac_reward_zero_std": 0.0, "grad_norm": 0.29426443576812744, "kl": 3.5234375, "learning_rate": 5e-05, "loss": 0.0279, "num_tokens": 18494055.0, "reward": 6.47265625, "reward_std": 0.24163326621055603, "rewards/helpfulness_reward/mean": 6.47265625, "rewards/helpfulness_reward/std": 0.5932382345199585, "rewards/safety_reward/mean": 8.139892578125, "rewards/safety_reward/std": 0.4963567554950714, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 121.078125, "completions/mean_terminated_length": 121.078125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.2700200855820452, "frac_reward_zero_std": 0.0, "grad_norm": 0.3199301064014435, "kl": 3.634765625, "learning_rate": 5e-05, "loss": 0.0337, "num_tokens": 18514161.0, "reward": 6.5009765625, "reward_std": 0.2132146656513214, "rewards/helpfulness_reward/mean": 6.5009765625, "rewards/helpfulness_reward/std": 0.4590103030204773, "rewards/safety_reward/mean": 8.189697265625, "rewards/safety_reward/std": 0.5326187014579773, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 124.984375, "completions/mean_terminated_length": 124.984375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.2703694000523972, "frac_reward_zero_std": 0.0, "grad_norm": 0.32061830163002014, "kl": 3.38671875, "learning_rate": 5e-05, "loss": 0.0371, "num_tokens": 18536575.0, "reward": 6.3598175048828125, "reward_std": 0.3014945685863495, "rewards/helpfulness_reward/mean": 6.3598175048828125, "rewards/helpfulness_reward/std": 1.5243045091629028, "rewards/safety_reward/mean": 7.91070556640625, "rewards/safety_reward/std": 1.6210832595825195, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 121.4140625, "completions/mean_terminated_length": 121.4140625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.2707187145227491, "frac_reward_zero_std": 0.0, "grad_norm": 0.3282027244567871, "kl": 3.41796875, "learning_rate": 5e-05, "loss": 0.0347, "num_tokens": 18556108.0, "reward": 6.55908203125, "reward_std": 0.23279841244220734, "rewards/helpfulness_reward/mean": 6.55908203125, "rewards/helpfulness_reward/std": 0.5390912294387817, "rewards/safety_reward/mean": 8.250732421875, "rewards/safety_reward/std": 0.40399062633514404, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 139.03125, "completions/mean_terminated_length": 126.0157470703125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.27106802899310106, "frac_reward_zero_std": 0.0, "grad_norm": 0.3357503116130829, "kl": 3.326171875, "learning_rate": 5e-05, "loss": 0.1048, "num_tokens": 18580240.0, "reward": 6.320629119873047, "reward_std": 0.41289958357810974, "rewards/helpfulness_reward/mean": 6.320629119873047, "rewards/helpfulness_reward/std": 1.1881561279296875, "rewards/safety_reward/mean": 8.108329772949219, "rewards/safety_reward/std": 1.5020897388458252, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 122.0546875, "completions/mean_terminated_length": 122.0546875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.271417343463453, "frac_reward_zero_std": 0.0, "grad_norm": 0.3120361864566803, "kl": 3.595703125, "learning_rate": 5e-05, "loss": 0.0355, "num_tokens": 18599543.0, "reward": 6.65869140625, "reward_std": 0.2263159602880478, "rewards/helpfulness_reward/mean": 6.65869140625, "rewards/helpfulness_reward/std": 0.5667030215263367, "rewards/safety_reward/mean": 8.172119140625, "rewards/safety_reward/std": 0.7755053639411926, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 122.1328125, "completions/mean_terminated_length": 122.1328125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.2717666579338049, "frac_reward_zero_std": 0.0, "grad_norm": 0.320938378572464, "kl": 3.4921875, "learning_rate": 5e-05, "loss": 0.0339, "num_tokens": 18619504.0, "reward": 6.56005859375, "reward_std": 0.23227445781230927, "rewards/helpfulness_reward/mean": 6.56005859375, "rewards/helpfulness_reward/std": 0.6575038433074951, "rewards/safety_reward/mean": 8.2998046875, "rewards/safety_reward/std": 0.6268236041069031, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 122.0625, "completions/mean_terminated_length": 122.0625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.27211597240415686, "frac_reward_zero_std": 0.0, "grad_norm": 0.318129301071167, "kl": 3.802734375, "learning_rate": 5e-05, "loss": 0.0355, "num_tokens": 18639128.0, "reward": 6.498046875, "reward_std": 0.23643271625041962, "rewards/helpfulness_reward/mean": 6.498046875, "rewards/helpfulness_reward/std": 0.6002576947212219, "rewards/safety_reward/mean": 8.10498046875, "rewards/safety_reward/std": 0.604126513004303, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 121.1640625, "completions/mean_terminated_length": 121.1640625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.27246528687450877, "frac_reward_zero_std": 0.0, "grad_norm": 0.34662508964538574, "kl": 3.53515625, "learning_rate": 5e-05, "loss": 0.0321, "num_tokens": 18658189.0, "reward": 6.385009765625, "reward_std": 0.224930077791214, "rewards/helpfulness_reward/mean": 6.385009765625, "rewards/helpfulness_reward/std": 0.570628821849823, "rewards/safety_reward/mean": 7.865966796875, "rewards/safety_reward/std": 0.6526269912719727, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 122.4453125, "completions/mean_terminated_length": 122.4453125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.2728146013448607, "frac_reward_zero_std": 0.0, "grad_norm": 0.29240643978118896, "kl": 3.603515625, "learning_rate": 5e-05, "loss": 0.0346, "num_tokens": 18678158.0, "reward": 6.649658203125, "reward_std": 0.1548805981874466, "rewards/helpfulness_reward/mean": 6.649658203125, "rewards/helpfulness_reward/std": 0.43705427646636963, "rewards/safety_reward/mean": 8.155029296875, "rewards/safety_reward/std": 0.4919286072254181, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 122.9140625, "completions/mean_terminated_length": 122.9140625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.27316391581521265, "frac_reward_zero_std": 0.0, "grad_norm": 0.3074226677417755, "kl": 3.470703125, "learning_rate": 5e-05, "loss": 0.0382, "num_tokens": 18698299.0, "reward": 6.66259765625, "reward_std": 0.2061394453048706, "rewards/helpfulness_reward/mean": 6.66259765625, "rewards/helpfulness_reward/std": 0.4796607494354248, "rewards/safety_reward/mean": 8.283447265625, "rewards/safety_reward/std": 0.3834991455078125, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 121.828125, "completions/mean_terminated_length": 121.828125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.27351323028556457, "frac_reward_zero_std": 0.0, "grad_norm": 0.3775358498096466, "kl": 3.732421875, "learning_rate": 5e-05, "loss": 0.0331, "num_tokens": 18717493.0, "reward": 6.8197021484375, "reward_std": 0.34657347202301025, "rewards/helpfulness_reward/mean": 6.8197021484375, "rewards/helpfulness_reward/std": 0.6018692851066589, "rewards/safety_reward/mean": 8.101318359375, "rewards/safety_reward/std": 0.6863441467285156, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 122.2421875, "completions/mean_terminated_length": 122.2421875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.27386254475591654, "frac_reward_zero_std": 0.0, "grad_norm": 0.4447477161884308, "kl": 3.689453125, "learning_rate": 5e-05, "loss": 0.0414, "num_tokens": 18737212.0, "reward": 6.548828125, "reward_std": 0.21912464499473572, "rewards/helpfulness_reward/mean": 6.548828125, "rewards/helpfulness_reward/std": 0.5313693881034851, "rewards/safety_reward/mean": 8.24951171875, "rewards/safety_reward/std": 0.5608569383621216, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 122.1171875, "completions/mean_terminated_length": 122.1171875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.27421185922626845, "frac_reward_zero_std": 0.0, "grad_norm": 0.2934853434562683, "kl": 3.533203125, "learning_rate": 5e-05, "loss": 0.0332, "num_tokens": 18758715.0, "reward": 6.70556640625, "reward_std": 0.23076212406158447, "rewards/helpfulness_reward/mean": 6.70556640625, "rewards/helpfulness_reward/std": 0.5132743716239929, "rewards/safety_reward/mean": 8.426513671875, "rewards/safety_reward/std": 0.5553680062294006, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 122.3359375, "completions/mean_terminated_length": 122.3359375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.27456117369662036, "frac_reward_zero_std": 0.0, "grad_norm": 0.5149805545806885, "kl": 3.724609375, "learning_rate": 5e-05, "loss": 0.0366, "num_tokens": 18779150.0, "reward": 6.4892578125, "reward_std": 0.23599590361118317, "rewards/helpfulness_reward/mean": 6.4892578125, "rewards/helpfulness_reward/std": 0.5374799370765686, "rewards/safety_reward/mean": 8.0625, "rewards/safety_reward/std": 0.5773147344589233, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 121.8125, "completions/mean_terminated_length": 121.8125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.27491048816697233, "frac_reward_zero_std": 0.0, "grad_norm": 0.26162710785865784, "kl": 3.572265625, "learning_rate": 5e-05, "loss": 0.0325, "num_tokens": 18801366.0, "reward": 6.40478515625, "reward_std": 0.25880831480026245, "rewards/helpfulness_reward/mean": 6.40478515625, "rewards/helpfulness_reward/std": 0.7117119431495667, "rewards/safety_reward/mean": 8.147705078125, "rewards/safety_reward/std": 0.6144998073577881, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 122.2109375, "completions/mean_terminated_length": 122.2109375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.27525980263732425, "frac_reward_zero_std": 0.0, "grad_norm": 0.3167549669742584, "kl": 3.7109375, "learning_rate": 5e-05, "loss": 0.0327, "num_tokens": 18821721.0, "reward": 6.49658203125, "reward_std": 0.1788773238658905, "rewards/helpfulness_reward/mean": 6.49658203125, "rewards/helpfulness_reward/std": 0.9554283618927002, "rewards/safety_reward/mean": 8.19140625, "rewards/safety_reward/std": 0.855151355266571, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 123.1328125, "completions/mean_terminated_length": 123.1328125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.27560911710767616, "frac_reward_zero_std": 0.0, "grad_norm": 0.2915768027305603, "kl": 3.42578125, "learning_rate": 5e-05, "loss": 0.035, "num_tokens": 18842058.0, "reward": 6.65478515625, "reward_std": 0.2910012900829315, "rewards/helpfulness_reward/mean": 6.65478515625, "rewards/helpfulness_reward/std": 0.5363402366638184, "rewards/safety_reward/mean": 8.290771484375, "rewards/safety_reward/std": 0.47340306639671326, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 122.6171875, "completions/mean_terminated_length": 122.6171875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.27595843157802813, "frac_reward_zero_std": 0.0, "grad_norm": 0.3355342149734497, "kl": 3.587890625, "learning_rate": 5e-05, "loss": 0.0308, "num_tokens": 18861417.0, "reward": 6.681884765625, "reward_std": 0.18161159753799438, "rewards/helpfulness_reward/mean": 6.681884765625, "rewards/helpfulness_reward/std": 0.5581694841384888, "rewards/safety_reward/mean": 8.224609375, "rewards/safety_reward/std": 0.4791055917739868, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 122.4921875, "completions/mean_terminated_length": 122.4921875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.27630774604838004, "frac_reward_zero_std": 0.0, "grad_norm": 0.4461621940135956, "kl": 3.859375, "learning_rate": 5e-05, "loss": 0.0341, "num_tokens": 18880496.0, "reward": 6.591796875, "reward_std": 0.2978411912918091, "rewards/helpfulness_reward/mean": 6.591796875, "rewards/helpfulness_reward/std": 0.553108811378479, "rewards/safety_reward/mean": 7.9833984375, "rewards/safety_reward/std": 0.4670809805393219, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 122.8359375, "completions/mean_terminated_length": 122.8359375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.276657060518732, "frac_reward_zero_std": 0.0, "grad_norm": 0.2721949517726898, "kl": 3.560546875, "learning_rate": 5e-05, "loss": 0.0317, "num_tokens": 18899843.0, "reward": 6.989990234375, "reward_std": 0.19755128026008606, "rewards/helpfulness_reward/mean": 6.989990234375, "rewards/helpfulness_reward/std": 0.4478859603404999, "rewards/safety_reward/mean": 8.38232421875, "rewards/safety_reward/std": 0.38566455245018005, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 122.9921875, "completions/mean_terminated_length": 122.9921875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.2770063749890839, "frac_reward_zero_std": 0.0, "grad_norm": 0.4225562810897827, "kl": 4.068359375, "learning_rate": 5e-05, "loss": 0.0358, "num_tokens": 18919266.0, "reward": 6.681396484375, "reward_std": 0.27860313653945923, "rewards/helpfulness_reward/mean": 6.681396484375, "rewards/helpfulness_reward/std": 0.5479030609130859, "rewards/safety_reward/mean": 8.228759765625, "rewards/safety_reward/std": 0.5076150894165039, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 122.921875, "completions/mean_terminated_length": 122.921875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.27735568945943584, "frac_reward_zero_std": 0.0, "grad_norm": 0.28613758087158203, "kl": 3.583984375, "learning_rate": 5e-05, "loss": 0.0359, "num_tokens": 18939240.0, "reward": 6.670654296875, "reward_std": 0.24458825588226318, "rewards/helpfulness_reward/mean": 6.670654296875, "rewards/helpfulness_reward/std": 0.434836208820343, "rewards/safety_reward/mean": 8.186767578125, "rewards/safety_reward/std": 0.4316692650318146, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 124.2109375, "completions/mean_terminated_length": 124.2109375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.2777050039297878, "frac_reward_zero_std": 0.0, "grad_norm": 0.401102215051651, "kl": 3.673828125, "learning_rate": 5e-05, "loss": 0.0358, "num_tokens": 18960267.0, "reward": 6.553955078125, "reward_std": 0.218324214220047, "rewards/helpfulness_reward/mean": 6.553955078125, "rewards/helpfulness_reward/std": 0.6537048816680908, "rewards/safety_reward/mean": 8.1591796875, "rewards/safety_reward/std": 0.46863412857055664, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 124.265625, "completions/mean_terminated_length": 124.265625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.2780543184001397, "frac_reward_zero_std": 0.0, "grad_norm": 0.3515780568122864, "kl": 3.517578125, "learning_rate": 5e-05, "loss": 0.0326, "num_tokens": 18979629.0, "reward": 6.922119140625, "reward_std": 0.19122180342674255, "rewards/helpfulness_reward/mean": 6.922119140625, "rewards/helpfulness_reward/std": 0.4829460680484772, "rewards/safety_reward/mean": 8.447998046875, "rewards/safety_reward/std": 0.6210291981697083, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 124.3203125, "completions/mean_terminated_length": 124.3203125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.27840363287049164, "frac_reward_zero_std": 0.0, "grad_norm": 0.39420753717422485, "kl": 3.3671875, "learning_rate": 5e-05, "loss": 0.0372, "num_tokens": 18999406.0, "reward": 6.7166748046875, "reward_std": 0.2793922424316406, "rewards/helpfulness_reward/mean": 6.7166748046875, "rewards/helpfulness_reward/std": 0.5330827832221985, "rewards/safety_reward/mean": 8.144775390625, "rewards/safety_reward/std": 0.4813652038574219, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 124.3203125, "completions/mean_terminated_length": 124.3203125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.2787529473408436, "frac_reward_zero_std": 0.0, "grad_norm": 0.31383374333381653, "kl": 3.505859375, "learning_rate": 5e-05, "loss": 0.0359, "num_tokens": 19018847.0, "reward": 6.8505859375, "reward_std": 0.3268381953239441, "rewards/helpfulness_reward/mean": 6.8505859375, "rewards/helpfulness_reward/std": 0.5604586005210876, "rewards/safety_reward/mean": 8.253173828125, "rewards/safety_reward/std": 0.5898042917251587, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 124.4921875, "completions/mean_terminated_length": 124.4921875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.2791022618111955, "frac_reward_zero_std": 0.0, "grad_norm": 0.2874152958393097, "kl": 3.44921875, "learning_rate": 5e-05, "loss": 0.0361, "num_tokens": 19039118.0, "reward": 6.5902099609375, "reward_std": 0.2953406870365143, "rewards/helpfulness_reward/mean": 6.5902099609375, "rewards/helpfulness_reward/std": 0.5025128126144409, "rewards/safety_reward/mean": 8.09619140625, "rewards/safety_reward/std": 0.5949804186820984, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 124.6875, "completions/mean_terminated_length": 124.6875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.2794515762815475, "frac_reward_zero_std": 0.0, "grad_norm": 0.25621435046195984, "kl": 3.36328125, "learning_rate": 5e-05, "loss": 0.0356, "num_tokens": 19059142.0, "reward": 6.770751953125, "reward_std": 0.19572454690933228, "rewards/helpfulness_reward/mean": 6.770751953125, "rewards/helpfulness_reward/std": 0.4232107698917389, "rewards/safety_reward/mean": 8.41650390625, "rewards/safety_reward/std": 0.42051005363464355, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 123.03125, "completions/mean_terminated_length": 123.03125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.2798008907518994, "frac_reward_zero_std": 0.0, "grad_norm": 22.75128936767578, "kl": 16.904296875, "learning_rate": 5e-05, "loss": 0.1633, "num_tokens": 19079218.0, "reward": 6.6279296875, "reward_std": 0.19024315476417542, "rewards/helpfulness_reward/mean": 6.6279296875, "rewards/helpfulness_reward/std": 0.5510151386260986, "rewards/safety_reward/mean": 8.236328125, "rewards/safety_reward/std": 0.47322747111320496, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 124.421875, "completions/mean_terminated_length": 124.421875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.2801502052222513, "frac_reward_zero_std": 0.0, "grad_norm": 1.5474977493286133, "kl": 3.8203125, "learning_rate": 5e-05, "loss": 0.0395, "num_tokens": 19098696.0, "reward": 6.80517578125, "reward_std": 0.21316806972026825, "rewards/helpfulness_reward/mean": 6.80517578125, "rewards/helpfulness_reward/std": 0.5217419266700745, "rewards/safety_reward/mean": 8.031494140625, "rewards/safety_reward/std": 0.5529245138168335, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 123.9609375, "completions/mean_terminated_length": 123.9609375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.2804995196926033, "frac_reward_zero_std": 0.0, "grad_norm": 0.3325349688529968, "kl": 3.70703125, "learning_rate": 5e-05, "loss": 0.0402, "num_tokens": 19118475.0, "reward": 6.83837890625, "reward_std": 0.236969456076622, "rewards/helpfulness_reward/mean": 6.83837890625, "rewards/helpfulness_reward/std": 0.5479318499565125, "rewards/safety_reward/mean": 8.44775390625, "rewards/safety_reward/std": 0.4874802827835083, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 119.4765625, "completions/mean_terminated_length": 119.4765625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.2808488341629552, "frac_reward_zero_std": 0.0, "grad_norm": 0.3577045798301697, "kl": 3.39453125, "learning_rate": 5e-05, "loss": 0.0388, "num_tokens": 19139216.0, "reward": 6.288909912109375, "reward_std": 0.20894834399223328, "rewards/helpfulness_reward/mean": 6.288909912109375, "rewards/helpfulness_reward/std": 1.4189987182617188, "rewards/safety_reward/mean": 7.57977294921875, "rewards/safety_reward/std": 1.65172278881073, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 123.7890625, "completions/mean_terminated_length": 123.7890625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.2811981486333071, "frac_reward_zero_std": 0.0, "grad_norm": 0.3229028880596161, "kl": 3.5390625, "learning_rate": 5e-05, "loss": 0.0383, "num_tokens": 19159101.0, "reward": 6.627197265625, "reward_std": 0.23323120176792145, "rewards/helpfulness_reward/mean": 6.627197265625, "rewards/helpfulness_reward/std": 0.4388895034790039, "rewards/safety_reward/mean": 8.298095703125, "rewards/safety_reward/std": 0.2867571711540222, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 124.40625, "completions/mean_terminated_length": 124.40625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.2815474631036591, "frac_reward_zero_std": 0.0, "grad_norm": 0.35798871517181396, "kl": 3.63671875, "learning_rate": 5e-05, "loss": 0.0426, "num_tokens": 19180393.0, "reward": 6.616943359375, "reward_std": 0.2428077757358551, "rewards/helpfulness_reward/mean": 6.616943359375, "rewards/helpfulness_reward/std": 0.48545917868614197, "rewards/safety_reward/mean": 8.3037109375, "rewards/safety_reward/std": 0.5141773223876953, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 123.5078125, "completions/mean_terminated_length": 123.5078125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.281896777574011, "frac_reward_zero_std": 0.0, "grad_norm": 0.33834174275398254, "kl": 3.537109375, "learning_rate": 5e-05, "loss": 0.031, "num_tokens": 19201514.0, "reward": 6.6837158203125, "reward_std": 0.35064369440078735, "rewards/helpfulness_reward/mean": 6.6837158203125, "rewards/helpfulness_reward/std": 0.7349801063537598, "rewards/safety_reward/mean": 8.134765625, "rewards/safety_reward/std": 0.8053588271141052, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 124.859375, "completions/mean_terminated_length": 124.859375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.28224609204436296, "frac_reward_zero_std": 0.0, "grad_norm": 0.359640896320343, "kl": 3.509765625, "learning_rate": 5e-05, "loss": 0.0466, "num_tokens": 19223128.0, "reward": 6.385009765625, "reward_std": 0.2987923324108124, "rewards/helpfulness_reward/mean": 6.385009765625, "rewards/helpfulness_reward/std": 0.8437106013298035, "rewards/safety_reward/mean": 7.826171875, "rewards/safety_reward/std": 0.8953083753585815, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 123.2578125, "completions/mean_terminated_length": 123.2578125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.2825954065147149, "frac_reward_zero_std": 0.0, "grad_norm": 0.4522637128829956, "kl": 3.587890625, "learning_rate": 5e-05, "loss": 0.0409, "num_tokens": 19244361.0, "reward": 6.466064453125, "reward_std": 0.3659130334854126, "rewards/helpfulness_reward/mean": 6.466064453125, "rewards/helpfulness_reward/std": 0.8021373748779297, "rewards/safety_reward/mean": 7.96728515625, "rewards/safety_reward/std": 0.7582678198814392, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 122.296875, "completions/mean_terminated_length": 122.296875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.2829447209850668, "frac_reward_zero_std": 0.0, "grad_norm": 0.40905115008354187, "kl": 3.482421875, "learning_rate": 5e-05, "loss": 0.0465, "num_tokens": 19263847.0, "reward": 6.55224609375, "reward_std": 0.24507004022598267, "rewards/helpfulness_reward/mean": 6.55224609375, "rewards/helpfulness_reward/std": 0.402439147233963, "rewards/safety_reward/mean": 8.125732421875, "rewards/safety_reward/std": 0.4852091372013092, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 122.1796875, "completions/mean_terminated_length": 122.1796875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.28329403545541876, "frac_reward_zero_std": 0.0, "grad_norm": 0.7730554342269897, "kl": 3.947265625, "learning_rate": 5e-05, "loss": 0.0373, "num_tokens": 19284902.0, "reward": 6.6156005859375, "reward_std": 0.20528648793697357, "rewards/helpfulness_reward/mean": 6.6156005859375, "rewards/helpfulness_reward/std": 0.7983784675598145, "rewards/safety_reward/mean": 8.278076171875, "rewards/safety_reward/std": 0.6723664999008179, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 121.6875, "completions/mean_terminated_length": 121.6875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.28364334992577067, "frac_reward_zero_std": 0.0, "grad_norm": 1.0685721635818481, "kl": 3.83203125, "learning_rate": 5e-05, "loss": 0.0369, "num_tokens": 19305318.0, "reward": 6.275634765625, "reward_std": 0.21334561705589294, "rewards/helpfulness_reward/mean": 6.275634765625, "rewards/helpfulness_reward/std": 0.7356570959091187, "rewards/safety_reward/mean": 7.88330078125, "rewards/safety_reward/std": 1.130488634109497, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 121.5703125, "completions/mean_terminated_length": 121.5703125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.2839926643961226, "frac_reward_zero_std": 0.0, "grad_norm": 0.3019798696041107, "kl": 3.3203125, "learning_rate": 5e-05, "loss": 0.0288, "num_tokens": 19325111.0, "reward": 6.74658203125, "reward_std": 0.20227859914302826, "rewards/helpfulness_reward/mean": 6.74658203125, "rewards/helpfulness_reward/std": 0.5553769469261169, "rewards/safety_reward/mean": 8.253662109375, "rewards/safety_reward/std": 0.5194249153137207, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 122.28125, "completions/mean_terminated_length": 122.28125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.28434197886647455, "frac_reward_zero_std": 0.0, "grad_norm": 0.3413572311401367, "kl": 3.48046875, "learning_rate": 5e-05, "loss": 0.0387, "num_tokens": 19345763.0, "reward": 6.556640625, "reward_std": 0.22021138668060303, "rewards/helpfulness_reward/mean": 6.556640625, "rewards/helpfulness_reward/std": 0.5973812937736511, "rewards/safety_reward/mean": 8.115234375, "rewards/safety_reward/std": 0.38405147194862366, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 122.3671875, "completions/mean_terminated_length": 122.3671875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.28469129333682647, "frac_reward_zero_std": 0.0, "grad_norm": 0.41811585426330566, "kl": 3.734375, "learning_rate": 5e-05, "loss": 0.04, "num_tokens": 19366410.0, "reward": 6.3714599609375, "reward_std": 0.40225374698638916, "rewards/helpfulness_reward/mean": 6.3714599609375, "rewards/helpfulness_reward/std": 0.9131625890731812, "rewards/safety_reward/mean": 8.0338134765625, "rewards/safety_reward/std": 1.0320005416870117, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 121.8359375, "completions/mean_terminated_length": 121.8359375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.28504060780717844, "frac_reward_zero_std": 0.0, "grad_norm": 0.3272208273410797, "kl": 3.546875, "learning_rate": 5e-05, "loss": 0.0317, "num_tokens": 19385773.0, "reward": 6.712890625, "reward_std": 0.24832430481910706, "rewards/helpfulness_reward/mean": 6.712890625, "rewards/helpfulness_reward/std": 0.4095161259174347, "rewards/safety_reward/mean": 8.200439453125, "rewards/safety_reward/std": 0.47002822160720825, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 122.859375, "completions/mean_terminated_length": 122.859375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.28538992227753035, "frac_reward_zero_std": 0.0, "grad_norm": 0.35044094920158386, "kl": 3.560546875, "learning_rate": 5e-05, "loss": 0.0402, "num_tokens": 19405571.0, "reward": 6.70703125, "reward_std": 0.22435912489891052, "rewards/helpfulness_reward/mean": 6.70703125, "rewards/helpfulness_reward/std": 0.6638862490653992, "rewards/safety_reward/mean": 8.257080078125, "rewards/safety_reward/std": 0.47302374243736267, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 122.8515625, "completions/mean_terminated_length": 122.8515625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.28573923674788226, "frac_reward_zero_std": 0.0, "grad_norm": 0.26502716541290283, "kl": 3.419921875, "learning_rate": 5e-05, "loss": 0.037, "num_tokens": 19427240.0, "reward": 6.67333984375, "reward_std": 0.17520758509635925, "rewards/helpfulness_reward/mean": 6.67333984375, "rewards/helpfulness_reward/std": 0.4954402446746826, "rewards/safety_reward/mean": 8.19091796875, "rewards/safety_reward/std": 0.48344317078590393, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 122.4453125, "completions/mean_terminated_length": 122.4453125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.28608855121823423, "frac_reward_zero_std": 0.0, "grad_norm": 0.3119049072265625, "kl": 3.509765625, "learning_rate": 5e-05, "loss": 0.0339, "num_tokens": 19446713.0, "reward": 6.931396484375, "reward_std": 0.21699851751327515, "rewards/helpfulness_reward/mean": 6.931396484375, "rewards/helpfulness_reward/std": 0.41836416721343994, "rewards/safety_reward/mean": 8.312744140625, "rewards/safety_reward/std": 0.38872313499450684, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 122.578125, "completions/mean_terminated_length": 122.578125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.28643786568858615, "frac_reward_zero_std": 0.0, "grad_norm": 0.34789347648620605, "kl": 3.51171875, "learning_rate": 5e-05, "loss": 0.0313, "num_tokens": 19466579.0, "reward": 6.7193603515625, "reward_std": 0.2485279142856598, "rewards/helpfulness_reward/mean": 6.7193603515625, "rewards/helpfulness_reward/std": 0.6241024732589722, "rewards/safety_reward/mean": 8.517822265625, "rewards/safety_reward/std": 0.5000258684158325, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 122.6875, "completions/mean_terminated_length": 122.6875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.28678718015893806, "frac_reward_zero_std": 0.0, "grad_norm": 0.4233524203300476, "kl": 3.65234375, "learning_rate": 5e-05, "loss": 0.0392, "num_tokens": 19486699.0, "reward": 6.674560546875, "reward_std": 0.23308110237121582, "rewards/helpfulness_reward/mean": 6.674560546875, "rewards/helpfulness_reward/std": 0.5973526835441589, "rewards/safety_reward/mean": 8.079345703125, "rewards/safety_reward/std": 0.6390870809555054, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 122.984375, "completions/mean_terminated_length": 122.984375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.28713649462929003, "frac_reward_zero_std": 0.0, "grad_norm": 0.34752312302589417, "kl": 3.568359375, "learning_rate": 5e-05, "loss": 0.0361, "num_tokens": 19506849.0, "reward": 6.63671875, "reward_std": 0.19016656279563904, "rewards/helpfulness_reward/mean": 6.63671875, "rewards/helpfulness_reward/std": 0.4445439577102661, "rewards/safety_reward/mean": 8.376220703125, "rewards/safety_reward/std": 0.3859021067619324, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 123.1328125, "completions/mean_terminated_length": 123.1328125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.28748580909964194, "frac_reward_zero_std": 0.0, "grad_norm": 0.3444106876850128, "kl": 3.498046875, "learning_rate": 5e-05, "loss": 0.0411, "num_tokens": 19528866.0, "reward": 6.61767578125, "reward_std": 0.22550266981124878, "rewards/helpfulness_reward/mean": 6.61767578125, "rewards/helpfulness_reward/std": 0.8403345346450806, "rewards/safety_reward/mean": 8.138427734375, "rewards/safety_reward/std": 0.980420708656311, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 122.3671875, "completions/mean_terminated_length": 122.3671875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.2878351235699939, "frac_reward_zero_std": 0.0, "grad_norm": 0.44941920042037964, "kl": 3.716796875, "learning_rate": 5e-05, "loss": 0.0389, "num_tokens": 19548809.0, "reward": 6.642333984375, "reward_std": 0.19705408811569214, "rewards/helpfulness_reward/mean": 6.642333984375, "rewards/helpfulness_reward/std": 0.4393032491207123, "rewards/safety_reward/mean": 8.22119140625, "rewards/safety_reward/std": 0.49277520179748535, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 122.265625, "completions/mean_terminated_length": 122.265625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.2881844380403458, "frac_reward_zero_std": 0.0, "grad_norm": 0.3763573169708252, "kl": 3.533203125, "learning_rate": 5e-05, "loss": 0.0354, "num_tokens": 19568995.0, "reward": 6.4638671875, "reward_std": 0.15748010575771332, "rewards/helpfulness_reward/mean": 6.4638671875, "rewards/helpfulness_reward/std": 0.540703535079956, "rewards/safety_reward/mean": 8.01611328125, "rewards/safety_reward/std": 0.5583836436271667, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 122.953125, "completions/mean_terminated_length": 122.953125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.28853375251069774, "frac_reward_zero_std": 0.0, "grad_norm": 1.3115875720977783, "kl": 4.142578125, "learning_rate": 5e-05, "loss": 0.0465, "num_tokens": 19588821.0, "reward": 6.775390625, "reward_std": 0.20652805268764496, "rewards/helpfulness_reward/mean": 6.775390625, "rewards/helpfulness_reward/std": 0.5007491707801819, "rewards/safety_reward/mean": 8.413330078125, "rewards/safety_reward/std": 0.32993245124816895, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 122.171875, "completions/mean_terminated_length": 122.171875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.2888830669810497, "frac_reward_zero_std": 0.0, "grad_norm": 0.3592408001422882, "kl": 3.716796875, "learning_rate": 5e-05, "loss": 0.0411, "num_tokens": 19609115.0, "reward": 6.50732421875, "reward_std": 0.195842444896698, "rewards/helpfulness_reward/mean": 6.50732421875, "rewards/helpfulness_reward/std": 0.5450526475906372, "rewards/safety_reward/mean": 8.020751953125, "rewards/safety_reward/std": 0.4997119903564453, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 121.953125, "completions/mean_terminated_length": 121.953125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.2892323814514016, "frac_reward_zero_std": 0.0, "grad_norm": 0.2878507375717163, "kl": 3.61328125, "learning_rate": 5e-05, "loss": 0.0394, "num_tokens": 19628893.0, "reward": 6.69189453125, "reward_std": 0.19885015487670898, "rewards/helpfulness_reward/mean": 6.69189453125, "rewards/helpfulness_reward/std": 0.33832135796546936, "rewards/safety_reward/mean": 7.9794921875, "rewards/safety_reward/std": 0.3800196647644043, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 122.125, "completions/mean_terminated_length": 122.125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.28958169592175353, "frac_reward_zero_std": 0.0, "grad_norm": 0.321296751499176, "kl": 3.671875, "learning_rate": 5e-05, "loss": 0.0401, "num_tokens": 19648605.0, "reward": 6.742919921875, "reward_std": 0.24089908599853516, "rewards/helpfulness_reward/mean": 6.742919921875, "rewards/helpfulness_reward/std": 0.6075341701507568, "rewards/safety_reward/mean": 8.26025390625, "rewards/safety_reward/std": 0.5253348350524902, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 121.140625, "completions/mean_terminated_length": 121.140625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.2899310103921055, "frac_reward_zero_std": 0.0, "grad_norm": 0.3726164698600769, "kl": 3.861328125, "learning_rate": 5e-05, "loss": 0.0383, "num_tokens": 19668623.0, "reward": 6.40771484375, "reward_std": 0.388671338558197, "rewards/helpfulness_reward/mean": 6.40771484375, "rewards/helpfulness_reward/std": 0.7788680195808411, "rewards/safety_reward/mean": 8.030517578125, "rewards/safety_reward/std": 0.8552364110946655, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 121.578125, "completions/mean_terminated_length": 121.578125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.2902803248624574, "frac_reward_zero_std": 0.0, "grad_norm": 0.3957148492336273, "kl": 3.62109375, "learning_rate": 5e-05, "loss": 0.0382, "num_tokens": 19689089.0, "reward": 6.591552734375, "reward_std": 0.300356388092041, "rewards/helpfulness_reward/mean": 6.591552734375, "rewards/helpfulness_reward/std": 0.5171186923980713, "rewards/safety_reward/mean": 8.128173828125, "rewards/safety_reward/std": 0.5901301503181458, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 122.21875, "completions/mean_terminated_length": 122.21875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.2906296393328094, "frac_reward_zero_std": 0.0, "grad_norm": 0.3029744029045105, "kl": 3.66796875, "learning_rate": 5e-05, "loss": 0.038, "num_tokens": 19709029.0, "reward": 6.614990234375, "reward_std": 0.19297409057617188, "rewards/helpfulness_reward/mean": 6.614990234375, "rewards/helpfulness_reward/std": 0.44607964158058167, "rewards/safety_reward/mean": 8.083251953125, "rewards/safety_reward/std": 0.6384897232055664, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 121.375, "completions/mean_terminated_length": 121.375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.2909789538031613, "frac_reward_zero_std": 0.0, "grad_norm": 0.588921308517456, "kl": 3.93359375, "learning_rate": 5e-05, "loss": 0.0409, "num_tokens": 19728429.0, "reward": 6.753173828125, "reward_std": 0.2551179528236389, "rewards/helpfulness_reward/mean": 6.753173828125, "rewards/helpfulness_reward/std": 0.5062788724899292, "rewards/safety_reward/mean": 8.25244140625, "rewards/safety_reward/std": 0.35964736342430115, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 121.953125, "completions/mean_terminated_length": 121.953125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.2913282682735132, "frac_reward_zero_std": 0.0, "grad_norm": 0.3133939802646637, "kl": 3.68359375, "learning_rate": 5e-05, "loss": 0.0428, "num_tokens": 19748687.0, "reward": 6.558349609375, "reward_std": 0.2651151716709137, "rewards/helpfulness_reward/mean": 6.558349609375, "rewards/helpfulness_reward/std": 0.5723569393157959, "rewards/safety_reward/mean": 8.140380859375, "rewards/safety_reward/std": 0.6006962060928345, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 121.9296875, "completions/mean_terminated_length": 121.9296875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.2916775827438652, "frac_reward_zero_std": 0.0, "grad_norm": 0.3138387203216553, "kl": 3.693359375, "learning_rate": 5e-05, "loss": 0.0373, "num_tokens": 19767942.0, "reward": 6.4345703125, "reward_std": 0.26405882835388184, "rewards/helpfulness_reward/mean": 6.4345703125, "rewards/helpfulness_reward/std": 0.607093870639801, "rewards/safety_reward/mean": 8.10693359375, "rewards/safety_reward/std": 0.6274265050888062, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 121.5703125, "completions/mean_terminated_length": 121.5703125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.2920268972142171, "frac_reward_zero_std": 0.0, "grad_norm": 0.8333446979522705, "kl": 3.953125, "learning_rate": 5e-05, "loss": 0.0419, "num_tokens": 19787791.0, "reward": 6.572998046875, "reward_std": 0.2808069884777069, "rewards/helpfulness_reward/mean": 6.572998046875, "rewards/helpfulness_reward/std": 0.5102642774581909, "rewards/safety_reward/mean": 8.08154296875, "rewards/safety_reward/std": 0.4732353389263153, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 121.46875, "completions/mean_terminated_length": 121.46875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.292376211684569, "frac_reward_zero_std": 0.0, "grad_norm": 0.2771880030632019, "kl": 3.6328125, "learning_rate": 5e-05, "loss": 0.0388, "num_tokens": 19807259.0, "reward": 6.824951171875, "reward_std": 0.3081824481487274, "rewards/helpfulness_reward/mean": 6.824951171875, "rewards/helpfulness_reward/std": 0.5813807249069214, "rewards/safety_reward/mean": 8.296875, "rewards/safety_reward/std": 0.5547780394554138, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 121.65625, "completions/mean_terminated_length": 121.65625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.292725526154921, "frac_reward_zero_std": 0.0, "grad_norm": 0.2911149859428406, "kl": 3.56640625, "learning_rate": 5e-05, "loss": 0.0309, "num_tokens": 19828111.0, "reward": 6.7470703125, "reward_std": 0.1965523511171341, "rewards/helpfulness_reward/mean": 6.7470703125, "rewards/helpfulness_reward/std": 0.5306842923164368, "rewards/safety_reward/mean": 8.314697265625, "rewards/safety_reward/std": 0.5215458869934082, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 121.90625, "completions/mean_terminated_length": 121.90625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.2930748406252729, "frac_reward_zero_std": 0.0, "grad_norm": 0.6274298429489136, "kl": 3.9453125, "learning_rate": 5e-05, "loss": 0.0443, "num_tokens": 19848675.0, "reward": 6.4853515625, "reward_std": 0.24883243441581726, "rewards/helpfulness_reward/mean": 6.4853515625, "rewards/helpfulness_reward/std": 0.643440306186676, "rewards/safety_reward/mean": 8.12646484375, "rewards/safety_reward/std": 0.6865471005439758, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 120.96875, "completions/mean_terminated_length": 120.96875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.29342415509562486, "frac_reward_zero_std": 0.0, "grad_norm": 0.3696424067020416, "kl": 3.59765625, "learning_rate": 5e-05, "loss": 0.0312, "num_tokens": 19868231.0, "reward": 6.5938720703125, "reward_std": 0.30550330877304077, "rewards/helpfulness_reward/mean": 6.5938720703125, "rewards/helpfulness_reward/std": 0.7266250252723694, "rewards/safety_reward/mean": 8.131103515625, "rewards/safety_reward/std": 0.6293531656265259, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 120.578125, "completions/mean_terminated_length": 120.578125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.2937734695659768, "frac_reward_zero_std": 0.0, "grad_norm": 0.37078234553337097, "kl": 3.66015625, "learning_rate": 5e-05, "loss": 0.0236, "num_tokens": 19888073.0, "reward": 6.61474609375, "reward_std": 0.44856998324394226, "rewards/helpfulness_reward/mean": 6.61474609375, "rewards/helpfulness_reward/std": 0.7381294965744019, "rewards/safety_reward/mean": 8.15966796875, "rewards/safety_reward/std": 0.6233150959014893, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 120.7890625, "completions/mean_terminated_length": 120.7890625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.2941227840363287, "frac_reward_zero_std": 0.0, "grad_norm": 0.3043350875377655, "kl": 3.7421875, "learning_rate": 5e-05, "loss": 0.0303, "num_tokens": 19907926.0, "reward": 6.6038818359375, "reward_std": 0.29805633425712585, "rewards/helpfulness_reward/mean": 6.6038818359375, "rewards/helpfulness_reward/std": 0.6057365536689758, "rewards/safety_reward/mean": 8.28076171875, "rewards/safety_reward/std": 0.526984453201294, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 120.4765625, "completions/mean_terminated_length": 120.4765625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.29447209850668066, "frac_reward_zero_std": 0.0, "grad_norm": 0.3988349735736847, "kl": 3.80078125, "learning_rate": 5e-05, "loss": 0.0315, "num_tokens": 19929843.0, "reward": 6.24847412109375, "reward_std": 0.44166475534439087, "rewards/helpfulness_reward/mean": 6.24847412109375, "rewards/helpfulness_reward/std": 1.2318682670593262, "rewards/safety_reward/mean": 7.786865234375, "rewards/safety_reward/std": 1.5493178367614746, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 121.3515625, "completions/mean_terminated_length": 121.3515625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.29482141297703257, "frac_reward_zero_std": 0.0, "grad_norm": 4.283538341522217, "kl": 4.8984375, "learning_rate": 5e-05, "loss": 0.0432, "num_tokens": 19949680.0, "reward": 6.48974609375, "reward_std": 0.2759495973587036, "rewards/helpfulness_reward/mean": 6.48974609375, "rewards/helpfulness_reward/std": 0.5553613901138306, "rewards/safety_reward/mean": 7.927001953125, "rewards/safety_reward/std": 0.6443996429443359, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 122.4296875, "completions/mean_terminated_length": 122.4296875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.2951707274473845, "frac_reward_zero_std": 0.0, "grad_norm": 0.2921689450740814, "kl": 3.66796875, "learning_rate": 5e-05, "loss": 0.0269, "num_tokens": 19969015.0, "reward": 6.685302734375, "reward_std": 0.251525342464447, "rewards/helpfulness_reward/mean": 6.685302734375, "rewards/helpfulness_reward/std": 0.5212361216545105, "rewards/safety_reward/mean": 8.101806640625, "rewards/safety_reward/std": 0.3771723210811615, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 121.5, "completions/mean_terminated_length": 121.5, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.29552004191773645, "frac_reward_zero_std": 0.0, "grad_norm": 0.3388688266277313, "kl": 3.93359375, "learning_rate": 5e-05, "loss": 0.0354, "num_tokens": 19988647.0, "reward": 6.5517578125, "reward_std": 0.26949453353881836, "rewards/helpfulness_reward/mean": 6.5517578125, "rewards/helpfulness_reward/std": 0.4586164653301239, "rewards/safety_reward/mean": 8.1640625, "rewards/safety_reward/std": 0.5254796147346497, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 121.6796875, "completions/mean_terminated_length": 121.6796875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.29586935638808837, "frac_reward_zero_std": 0.0, "grad_norm": 0.3240537643432617, "kl": 3.73046875, "learning_rate": 5e-05, "loss": 0.0283, "num_tokens": 20009046.0, "reward": 6.852783203125, "reward_std": 0.21139903366565704, "rewards/helpfulness_reward/mean": 6.852783203125, "rewards/helpfulness_reward/std": 0.511361300945282, "rewards/safety_reward/mean": 8.38525390625, "rewards/safety_reward/std": 0.3889330327510834, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 122.5859375, "completions/mean_terminated_length": 122.5859375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.29621867085844034, "frac_reward_zero_std": 0.0, "grad_norm": 0.3118867874145508, "kl": 3.646484375, "learning_rate": 5e-05, "loss": 0.0332, "num_tokens": 20029481.0, "reward": 6.416015625, "reward_std": 0.22206513583660126, "rewards/helpfulness_reward/mean": 6.416015625, "rewards/helpfulness_reward/std": 0.6727199554443359, "rewards/safety_reward/mean": 7.9384765625, "rewards/safety_reward/std": 0.7544970512390137, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.125, "completions/mean_terminated_length": 123.125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.29656798532879225, "frac_reward_zero_std": 0.0, "grad_norm": 0.3681095242500305, "kl": 3.572265625, "learning_rate": 5e-05, "loss": 0.0331, "num_tokens": 20049561.0, "reward": 6.689208984375, "reward_std": 0.276374489068985, "rewards/helpfulness_reward/mean": 6.689208984375, "rewards/helpfulness_reward/std": 0.591346800327301, "rewards/safety_reward/mean": 8.247802734375, "rewards/safety_reward/std": 0.4712316691875458, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 122.1015625, "completions/mean_terminated_length": 122.1015625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.29691729979914416, "frac_reward_zero_std": 0.0, "grad_norm": 0.3153475522994995, "kl": 3.546875, "learning_rate": 5e-05, "loss": 0.0238, "num_tokens": 20069358.0, "reward": 6.716064453125, "reward_std": 0.22990038990974426, "rewards/helpfulness_reward/mean": 6.716064453125, "rewards/helpfulness_reward/std": 0.5642065405845642, "rewards/safety_reward/mean": 8.212646484375, "rewards/safety_reward/std": 0.3209022581577301, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 122.5, "completions/mean_terminated_length": 122.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.29726661426949613, "frac_reward_zero_std": 0.0, "grad_norm": 0.28820133209228516, "kl": 3.51171875, "learning_rate": 5e-05, "loss": 0.032, "num_tokens": 20088910.0, "reward": 6.57568359375, "reward_std": 0.1940680295228958, "rewards/helpfulness_reward/mean": 6.57568359375, "rewards/helpfulness_reward/std": 0.5003936886787415, "rewards/safety_reward/mean": 8.24658203125, "rewards/safety_reward/std": 0.36926040053367615, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 121.796875, "completions/mean_terminated_length": 121.796875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.29761592873984805, "frac_reward_zero_std": 0.0, "grad_norm": 0.32221177220344543, "kl": 3.537109375, "learning_rate": 5e-05, "loss": 0.0277, "num_tokens": 20110188.0, "reward": 6.617431640625, "reward_std": 0.24910284578800201, "rewards/helpfulness_reward/mean": 6.617431640625, "rewards/helpfulness_reward/std": 0.5623598098754883, "rewards/safety_reward/mean": 8.14794921875, "rewards/safety_reward/std": 0.5922845602035522, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.2421875, "completions/mean_terminated_length": 123.2421875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.29796524321019996, "frac_reward_zero_std": 0.0, "grad_norm": 0.40810805559158325, "kl": 3.748046875, "learning_rate": 5e-05, "loss": 0.0305, "num_tokens": 20130819.0, "reward": 6.5673828125, "reward_std": 0.24175503849983215, "rewards/helpfulness_reward/mean": 6.5673828125, "rewards/helpfulness_reward/std": 0.6104411482810974, "rewards/safety_reward/mean": 8.23095703125, "rewards/safety_reward/std": 0.4703505337238312, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 123.265625, "completions/mean_terminated_length": 123.265625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.29831455768055193, "frac_reward_zero_std": 0.0, "grad_norm": 0.3635454773902893, "kl": 3.4765625, "learning_rate": 5e-05, "loss": 0.0337, "num_tokens": 20152165.0, "reward": 6.680908203125, "reward_std": 0.23123514652252197, "rewards/helpfulness_reward/mean": 6.680908203125, "rewards/helpfulness_reward/std": 0.5900885462760925, "rewards/safety_reward/mean": 8.253662109375, "rewards/safety_reward/std": 0.6070708632469177, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 117.9296875, "completions/mean_terminated_length": 117.9296875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.29866387215090384, "frac_reward_zero_std": 0.0, "grad_norm": 0.5335740447044373, "kl": 3.8671875, "learning_rate": 5e-05, "loss": 0.021, "num_tokens": 20173228.0, "reward": 6.288578033447266, "reward_std": 0.35007983446121216, "rewards/helpfulness_reward/mean": 6.288578033447266, "rewards/helpfulness_reward/std": 1.5842156410217285, "rewards/safety_reward/mean": 7.504638671875, "rewards/safety_reward/std": 1.6998939514160156, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 123.25, "completions/mean_terminated_length": 123.25, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.2990131866212558, "frac_reward_zero_std": 0.0, "grad_norm": 1.141304850578308, "kl": 4.064453125, "learning_rate": 5e-05, "loss": 0.0335, "num_tokens": 20195412.0, "reward": 6.4892578125, "reward_std": 0.2969384789466858, "rewards/helpfulness_reward/mean": 6.4892578125, "rewards/helpfulness_reward/std": 0.9846507906913757, "rewards/safety_reward/mean": 7.730712890625, "rewards/safety_reward/std": 1.0118190050125122, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.3359375, "completions/mean_terminated_length": 123.3359375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.2993625010916077, "frac_reward_zero_std": 0.0, "grad_norm": 0.41420915722846985, "kl": 3.494140625, "learning_rate": 5e-05, "loss": 0.0307, "num_tokens": 20214991.0, "reward": 6.894775390625, "reward_std": 0.20094728469848633, "rewards/helpfulness_reward/mean": 6.894775390625, "rewards/helpfulness_reward/std": 0.4493076205253601, "rewards/safety_reward/mean": 8.2998046875, "rewards/safety_reward/std": 0.48070287704467773, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 123.5703125, "completions/mean_terminated_length": 123.5703125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.29971181556195964, "frac_reward_zero_std": 0.0, "grad_norm": 0.31301286816596985, "kl": 3.609375, "learning_rate": 5e-05, "loss": 0.0371, "num_tokens": 20235032.0, "reward": 6.80078125, "reward_std": 0.23017580807209015, "rewards/helpfulness_reward/mean": 6.80078125, "rewards/helpfulness_reward/std": 0.5507577657699585, "rewards/safety_reward/mean": 8.36376953125, "rewards/safety_reward/std": 0.46737387776374817, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 123.546875, "completions/mean_terminated_length": 123.546875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.3000611300323116, "frac_reward_zero_std": 0.0, "grad_norm": 0.3108864724636078, "kl": 3.396484375, "learning_rate": 5e-05, "loss": 0.0353, "num_tokens": 20254430.0, "reward": 6.7802734375, "reward_std": 0.22071608901023865, "rewards/helpfulness_reward/mean": 6.7802734375, "rewards/helpfulness_reward/std": 0.536068856716156, "rewards/safety_reward/mean": 8.05126953125, "rewards/safety_reward/std": 0.5332326889038086, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 123.0390625, "completions/mean_terminated_length": 123.0390625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3004104445026635, "frac_reward_zero_std": 0.0, "grad_norm": 0.5773299932479858, "kl": 3.90234375, "learning_rate": 5e-05, "loss": 0.0404, "num_tokens": 20275131.0, "reward": 6.692138671875, "reward_std": 0.26927387714385986, "rewards/helpfulness_reward/mean": 6.692138671875, "rewards/helpfulness_reward/std": 0.5546267628669739, "rewards/safety_reward/mean": 8.20166015625, "rewards/safety_reward/std": 0.48637089133262634, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 122.0703125, "completions/mean_terminated_length": 122.0703125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.30075975897301543, "frac_reward_zero_std": 0.0, "grad_norm": 0.47419875860214233, "kl": 3.609375, "learning_rate": 5e-05, "loss": 0.0366, "num_tokens": 20295260.0, "reward": 6.5244140625, "reward_std": 0.2775493264198303, "rewards/helpfulness_reward/mean": 6.5244140625, "rewards/helpfulness_reward/std": 0.6615998148918152, "rewards/safety_reward/mean": 8.005615234375, "rewards/safety_reward/std": 0.5962562561035156, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 122.453125, "completions/mean_terminated_length": 122.453125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.3011090734433674, "frac_reward_zero_std": 0.0, "grad_norm": 0.42739337682724, "kl": 3.68359375, "learning_rate": 5e-05, "loss": 0.0334, "num_tokens": 20314886.0, "reward": 6.818603515625, "reward_std": 0.30433738231658936, "rewards/helpfulness_reward/mean": 6.818603515625, "rewards/helpfulness_reward/std": 0.4491738975048065, "rewards/safety_reward/mean": 8.133056640625, "rewards/safety_reward/std": 0.3867150843143463, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 122.6015625, "completions/mean_terminated_length": 122.6015625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.3014583879137193, "frac_reward_zero_std": 0.0, "grad_norm": 0.3189779818058014, "kl": 3.529296875, "learning_rate": 5e-05, "loss": 0.0284, "num_tokens": 20334099.0, "reward": 6.724365234375, "reward_std": 0.2762157917022705, "rewards/helpfulness_reward/mean": 6.724365234375, "rewards/helpfulness_reward/std": 0.44993290305137634, "rewards/safety_reward/mean": 8.292724609375, "rewards/safety_reward/std": 0.5387579202651978, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 122.265625, "completions/mean_terminated_length": 122.265625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.3018077023840713, "frac_reward_zero_std": 0.0, "grad_norm": 0.3070511817932129, "kl": 3.400390625, "learning_rate": 5e-05, "loss": 0.03, "num_tokens": 20354101.0, "reward": 6.54931640625, "reward_std": 0.3583882749080658, "rewards/helpfulness_reward/mean": 6.54931640625, "rewards/helpfulness_reward/std": 0.5954325795173645, "rewards/safety_reward/mean": 8.1171875, "rewards/safety_reward/std": 0.42510926723480225, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 123.265625, "completions/mean_terminated_length": 123.265625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.3021570168544232, "frac_reward_zero_std": 0.0, "grad_norm": 0.32609257102012634, "kl": 3.642578125, "learning_rate": 5e-05, "loss": 0.0444, "num_tokens": 20373415.0, "reward": 6.6380615234375, "reward_std": 0.37586820125579834, "rewards/helpfulness_reward/mean": 6.6380615234375, "rewards/helpfulness_reward/std": 0.5750185251235962, "rewards/safety_reward/mean": 8.267822265625, "rewards/safety_reward/std": 0.6228886842727661, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 122.734375, "completions/mean_terminated_length": 122.734375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.3025063313247751, "frac_reward_zero_std": 0.0, "grad_norm": 0.31391283869743347, "kl": 3.51171875, "learning_rate": 5e-05, "loss": 0.035, "num_tokens": 20394229.0, "reward": 6.609619140625, "reward_std": 0.2989014983177185, "rewards/helpfulness_reward/mean": 6.609619140625, "rewards/helpfulness_reward/std": 0.6426637172698975, "rewards/safety_reward/mean": 8.129150390625, "rewards/safety_reward/std": 0.5091493725776672, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 121.4765625, "completions/mean_terminated_length": 121.4765625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.3028556457951271, "frac_reward_zero_std": 0.0, "grad_norm": 0.34199249744415283, "kl": 3.58203125, "learning_rate": 5e-05, "loss": 0.0298, "num_tokens": 20414850.0, "reward": 6.580078125, "reward_std": 0.2464853823184967, "rewards/helpfulness_reward/mean": 6.580078125, "rewards/helpfulness_reward/std": 0.5630431175231934, "rewards/safety_reward/mean": 8.195556640625, "rewards/safety_reward/std": 0.42891329526901245, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 122.1875, "completions/mean_terminated_length": 122.1875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.303204960265479, "frac_reward_zero_std": 0.0, "grad_norm": 0.36728668212890625, "kl": 3.849609375, "learning_rate": 5e-05, "loss": 0.0381, "num_tokens": 20434338.0, "reward": 6.6123046875, "reward_std": 0.29382410645484924, "rewards/helpfulness_reward/mean": 6.6123046875, "rewards/helpfulness_reward/std": 0.5756933093070984, "rewards/safety_reward/mean": 8.254150390625, "rewards/safety_reward/std": 0.5628740191459656, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 121.3203125, "completions/mean_terminated_length": 121.3203125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.3035542747358309, "frac_reward_zero_std": 0.0, "grad_norm": 0.36362016201019287, "kl": 3.54296875, "learning_rate": 5e-05, "loss": 0.032, "num_tokens": 20453675.0, "reward": 6.854736328125, "reward_std": 0.22084742784500122, "rewards/helpfulness_reward/mean": 6.854736328125, "rewards/helpfulness_reward/std": 0.4567878246307373, "rewards/safety_reward/mean": 8.16162109375, "rewards/safety_reward/std": 0.521426796913147, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 121.3984375, "completions/mean_terminated_length": 121.3984375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.3039035892061829, "frac_reward_zero_std": 0.0, "grad_norm": 0.9664633274078369, "kl": 3.8515625, "learning_rate": 5e-05, "loss": 0.0384, "num_tokens": 20473534.0, "reward": 6.688720703125, "reward_std": 0.2707352340221405, "rewards/helpfulness_reward/mean": 6.688720703125, "rewards/helpfulness_reward/std": 0.6520405411720276, "rewards/safety_reward/mean": 8.18505859375, "rewards/safety_reward/std": 0.5147404074668884, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 120.8828125, "completions/mean_terminated_length": 120.8828125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.3042529036765348, "frac_reward_zero_std": 0.0, "grad_norm": 0.3162759244441986, "kl": 3.427734375, "learning_rate": 5e-05, "loss": 0.0312, "num_tokens": 20493287.0, "reward": 6.592041015625, "reward_std": 0.3062294125556946, "rewards/helpfulness_reward/mean": 6.592041015625, "rewards/helpfulness_reward/std": 0.6906614303588867, "rewards/safety_reward/mean": 8.006591796875, "rewards/safety_reward/std": 0.6100784540176392, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 121.09375, "completions/mean_terminated_length": 121.09375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.30460221814688676, "frac_reward_zero_std": 0.0, "grad_norm": 0.3730362057685852, "kl": 3.65625, "learning_rate": 5e-05, "loss": 0.0311, "num_tokens": 20512355.0, "reward": 6.8447265625, "reward_std": 0.23565427958965302, "rewards/helpfulness_reward/mean": 6.8447265625, "rewards/helpfulness_reward/std": 0.4540753662586212, "rewards/safety_reward/mean": 8.331298828125, "rewards/safety_reward/std": 0.48368337750434875, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 120.8046875, "completions/mean_terminated_length": 120.8046875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.3049515326172387, "frac_reward_zero_std": 0.0, "grad_norm": 0.3144315779209137, "kl": 3.693359375, "learning_rate": 5e-05, "loss": 0.0324, "num_tokens": 20532250.0, "reward": 6.573974609375, "reward_std": 0.24558818340301514, "rewards/helpfulness_reward/mean": 6.573974609375, "rewards/helpfulness_reward/std": 0.5766199827194214, "rewards/safety_reward/mean": 8.009521484375, "rewards/safety_reward/std": 0.45469510555267334, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 120.53125, "completions/mean_terminated_length": 120.53125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.3053008470875906, "frac_reward_zero_std": 0.0, "grad_norm": 0.2864292562007904, "kl": 3.642578125, "learning_rate": 5e-05, "loss": 0.0287, "num_tokens": 20551030.0, "reward": 6.7822265625, "reward_std": 0.2642967104911804, "rewards/helpfulness_reward/mean": 6.7822265625, "rewards/helpfulness_reward/std": 0.4734254479408264, "rewards/safety_reward/mean": 8.03759765625, "rewards/safety_reward/std": 0.6035614013671875, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 120.265625, "completions/mean_terminated_length": 120.265625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.30565016155794256, "frac_reward_zero_std": 0.0, "grad_norm": 0.3495979309082031, "kl": 3.59765625, "learning_rate": 5e-05, "loss": 0.0173, "num_tokens": 20570792.0, "reward": 6.542724609375, "reward_std": 0.5391672253608704, "rewards/helpfulness_reward/mean": 6.542724609375, "rewards/helpfulness_reward/std": 0.9467402100563049, "rewards/safety_reward/mean": 7.923583984375, "rewards/safety_reward/std": 0.6376886963844299, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 120.4765625, "completions/mean_terminated_length": 120.4765625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.30599947602829447, "frac_reward_zero_std": 0.0, "grad_norm": 0.3272758722305298, "kl": 3.76171875, "learning_rate": 5e-05, "loss": 0.0246, "num_tokens": 20590829.0, "reward": 6.63916015625, "reward_std": 0.21035245060920715, "rewards/helpfulness_reward/mean": 6.63916015625, "rewards/helpfulness_reward/std": 0.5111613869667053, "rewards/safety_reward/mean": 8.22314453125, "rewards/safety_reward/std": 0.44155415892601013, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 120.6328125, "completions/mean_terminated_length": 120.6328125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.3063487904986464, "frac_reward_zero_std": 0.0, "grad_norm": 0.27482762932777405, "kl": 3.6875, "learning_rate": 5e-05, "loss": 0.0262, "num_tokens": 20610358.0, "reward": 6.711669921875, "reward_std": 0.1874057650566101, "rewards/helpfulness_reward/mean": 6.711669921875, "rewards/helpfulness_reward/std": 0.5198036432266235, "rewards/safety_reward/mean": 8.295654296875, "rewards/safety_reward/std": 0.41692519187927246, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 121.5625, "completions/mean_terminated_length": 121.5625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.30669810496899835, "frac_reward_zero_std": 0.0, "grad_norm": 0.24521872401237488, "kl": 3.78515625, "learning_rate": 5e-05, "loss": 0.0303, "num_tokens": 20630622.0, "reward": 6.791748046875, "reward_std": 0.2534835934638977, "rewards/helpfulness_reward/mean": 6.791748046875, "rewards/helpfulness_reward/std": 0.6315025687217712, "rewards/safety_reward/mean": 8.221923828125, "rewards/safety_reward/std": 0.499104768037796, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 122.2578125, "completions/mean_terminated_length": 122.2578125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.30704741943935027, "frac_reward_zero_std": 0.0, "grad_norm": 0.29959332942962646, "kl": 3.693359375, "learning_rate": 5e-05, "loss": 0.0305, "num_tokens": 20650823.0, "reward": 6.77294921875, "reward_std": 0.22665470838546753, "rewards/helpfulness_reward/mean": 6.77294921875, "rewards/helpfulness_reward/std": 0.524168074131012, "rewards/safety_reward/mean": 8.121826171875, "rewards/safety_reward/std": 0.6539885997772217, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 122.8046875, "completions/mean_terminated_length": 122.8046875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.30739673390970224, "frac_reward_zero_std": 0.0, "grad_norm": 0.3418130576610565, "kl": 3.705078125, "learning_rate": 5e-05, "loss": 0.0393, "num_tokens": 20671550.0, "reward": 6.753662109375, "reward_std": 0.2446623146533966, "rewards/helpfulness_reward/mean": 6.753662109375, "rewards/helpfulness_reward/std": 0.4812813103199005, "rewards/safety_reward/mean": 8.2265625, "rewards/safety_reward/std": 0.4335096478462219, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 122.78125, "completions/mean_terminated_length": 122.78125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.30774604838005415, "frac_reward_zero_std": 0.0, "grad_norm": 0.29165521264076233, "kl": 3.609375, "learning_rate": 5e-05, "loss": 0.0485, "num_tokens": 20691602.0, "reward": 6.83673095703125, "reward_std": 0.335172176361084, "rewards/helpfulness_reward/mean": 6.83673095703125, "rewards/helpfulness_reward/std": 0.7746875286102295, "rewards/safety_reward/mean": 8.1854248046875, "rewards/safety_reward/std": 0.7162512540817261, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 122.1953125, "completions/mean_terminated_length": 122.1953125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.30809536285040606, "frac_reward_zero_std": 0.0, "grad_norm": 0.40541526675224304, "kl": 3.666015625, "learning_rate": 5e-05, "loss": 0.0384, "num_tokens": 20711859.0, "reward": 6.884033203125, "reward_std": 0.18193461000919342, "rewards/helpfulness_reward/mean": 6.884033203125, "rewards/helpfulness_reward/std": 0.36712852120399475, "rewards/safety_reward/mean": 8.189208984375, "rewards/safety_reward/std": 0.43301817774772644, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 122.5625, "completions/mean_terminated_length": 122.5625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.30844467732075803, "frac_reward_zero_std": 0.0, "grad_norm": 0.47489693760871887, "kl": 3.794921875, "learning_rate": 5e-05, "loss": 0.0346, "num_tokens": 20732595.0, "reward": 6.585205078125, "reward_std": 0.1817113757133484, "rewards/helpfulness_reward/mean": 6.585205078125, "rewards/helpfulness_reward/std": 0.4969412386417389, "rewards/safety_reward/mean": 8.265625, "rewards/safety_reward/std": 0.2764935791492462, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 122.9453125, "completions/mean_terminated_length": 122.9453125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.30879399179110995, "frac_reward_zero_std": 0.0, "grad_norm": 0.2539450526237488, "kl": 3.57421875, "learning_rate": 5e-05, "loss": 0.0341, "num_tokens": 20752292.0, "reward": 6.781494140625, "reward_std": 0.18159613013267517, "rewards/helpfulness_reward/mean": 6.781494140625, "rewards/helpfulness_reward/std": 0.527633547782898, "rewards/safety_reward/mean": 8.046875, "rewards/safety_reward/std": 0.53005450963974, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.2421875, "completions/mean_terminated_length": 123.2421875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.30914330626146186, "frac_reward_zero_std": 0.0, "grad_norm": 0.26741325855255127, "kl": 3.69140625, "learning_rate": 5e-05, "loss": 0.0356, "num_tokens": 20772771.0, "reward": 6.76904296875, "reward_std": 0.19202345609664917, "rewards/helpfulness_reward/mean": 6.76904296875, "rewards/helpfulness_reward/std": 0.6060137152671814, "rewards/safety_reward/mean": 8.518798828125, "rewards/safety_reward/std": 0.47695958614349365, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 123.3203125, "completions/mean_terminated_length": 123.3203125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.30949262073181383, "frac_reward_zero_std": 0.0, "grad_norm": 0.30637484788894653, "kl": 3.607421875, "learning_rate": 5e-05, "loss": 0.0304, "num_tokens": 20793220.0, "reward": 6.802734375, "reward_std": 0.20775416493415833, "rewards/helpfulness_reward/mean": 6.802734375, "rewards/helpfulness_reward/std": 0.49598148465156555, "rewards/safety_reward/mean": 8.400634765625, "rewards/safety_reward/std": 0.45774370431900024, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.265625, "completions/mean_terminated_length": 123.265625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.30984193520216574, "frac_reward_zero_std": 0.0, "grad_norm": 0.2554667294025421, "kl": 3.505859375, "learning_rate": 5e-05, "loss": 0.0335, "num_tokens": 20812366.0, "reward": 6.8251953125, "reward_std": 0.19955241680145264, "rewards/helpfulness_reward/mean": 6.8251953125, "rewards/helpfulness_reward/std": 0.4817574620246887, "rewards/safety_reward/mean": 8.136962890625, "rewards/safety_reward/std": 0.40692368149757385, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 122.2109375, "completions/mean_terminated_length": 122.2109375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3101912496725177, "frac_reward_zero_std": 0.0, "grad_norm": 0.2629694938659668, "kl": 3.5625, "learning_rate": 5e-05, "loss": 0.0276, "num_tokens": 20831209.0, "reward": 6.858154296875, "reward_std": 0.20524175465106964, "rewards/helpfulness_reward/mean": 6.858154296875, "rewards/helpfulness_reward/std": 0.5411223769187927, "rewards/safety_reward/mean": 8.23046875, "rewards/safety_reward/std": 0.5671966075897217, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 123.7421875, "completions/mean_terminated_length": 123.7421875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.3105405641428696, "frac_reward_zero_std": 0.0, "grad_norm": 0.27946892380714417, "kl": 3.498046875, "learning_rate": 5e-05, "loss": 0.0347, "num_tokens": 20850848.0, "reward": 6.980224609375, "reward_std": 0.2063683569431305, "rewards/helpfulness_reward/mean": 6.980224609375, "rewards/helpfulness_reward/std": 0.5733703374862671, "rewards/safety_reward/mean": 8.346923828125, "rewards/safety_reward/std": 0.5594959855079651, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 122.921875, "completions/mean_terminated_length": 122.921875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.31088987861322154, "frac_reward_zero_std": 0.0, "grad_norm": 0.3490521013736725, "kl": 3.689453125, "learning_rate": 5e-05, "loss": 0.0321, "num_tokens": 20870446.0, "reward": 6.692138671875, "reward_std": 0.20793838798999786, "rewards/helpfulness_reward/mean": 6.692138671875, "rewards/helpfulness_reward/std": 0.5039077401161194, "rewards/safety_reward/mean": 8.354736328125, "rewards/safety_reward/std": 0.3878140151500702, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 123.5078125, "completions/mean_terminated_length": 123.5078125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.3112391930835735, "frac_reward_zero_std": 0.0, "grad_norm": 0.4262334704399109, "kl": 3.75390625, "learning_rate": 5e-05, "loss": 0.0362, "num_tokens": 20890623.0, "reward": 6.854248046875, "reward_std": 0.24138566851615906, "rewards/helpfulness_reward/mean": 6.854248046875, "rewards/helpfulness_reward/std": 0.46858227252960205, "rewards/safety_reward/mean": 8.396728515625, "rewards/safety_reward/std": 0.3389985263347626, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.9453125, "completions/mean_terminated_length": 123.9453125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.3115885075539254, "frac_reward_zero_std": 0.0, "grad_norm": 0.28549519181251526, "kl": 3.75, "learning_rate": 5e-05, "loss": 0.0344, "num_tokens": 20911016.0, "reward": 6.866455078125, "reward_std": 0.2624770700931549, "rewards/helpfulness_reward/mean": 6.866455078125, "rewards/helpfulness_reward/std": 0.5554389953613281, "rewards/safety_reward/mean": 8.3916015625, "rewards/safety_reward/std": 0.39703041315078735, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 124.0, "completions/mean_terminated_length": 124.0, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.31193782202427733, "frac_reward_zero_std": 0.0, "grad_norm": 1.0999526977539062, "kl": 4.30078125, "learning_rate": 5e-05, "loss": 0.045, "num_tokens": 20931168.0, "reward": 6.695556640625, "reward_std": 0.24250847101211548, "rewards/helpfulness_reward/mean": 6.695556640625, "rewards/helpfulness_reward/std": 0.5487050414085388, "rewards/safety_reward/mean": 8.183837890625, "rewards/safety_reward/std": 0.5272709131240845, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 116.0078125, "completions/mean_terminated_length": 116.0078125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.3122871364946293, "frac_reward_zero_std": 0.0, "grad_norm": 0.3330245018005371, "kl": 3.736328125, "learning_rate": 5e-05, "loss": 0.0501, "num_tokens": 20952817.0, "reward": 6.421133041381836, "reward_std": 0.23781496286392212, "rewards/helpfulness_reward/mean": 6.421133041381836, "rewards/helpfulness_reward/std": 1.2776652574539185, "rewards/safety_reward/mean": 7.602996826171875, "rewards/safety_reward/std": 1.5984879732131958, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 122.640625, "completions/mean_terminated_length": 122.640625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3126364509649812, "frac_reward_zero_std": 0.0, "grad_norm": 0.25533342361450195, "kl": 3.57421875, "learning_rate": 5e-05, "loss": 0.0285, "num_tokens": 20972827.0, "reward": 6.746337890625, "reward_std": 0.2466278225183487, "rewards/helpfulness_reward/mean": 6.746337890625, "rewards/helpfulness_reward/std": 0.7023671865463257, "rewards/safety_reward/mean": 8.1103515625, "rewards/safety_reward/std": 0.8851608633995056, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.21875, "completions/mean_terminated_length": 123.21875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.3129857654353332, "frac_reward_zero_std": 0.0, "grad_norm": 0.4437929093837738, "kl": 3.962890625, "learning_rate": 5e-05, "loss": 0.0359, "num_tokens": 20992999.0, "reward": 6.756591796875, "reward_std": 0.1919289082288742, "rewards/helpfulness_reward/mean": 6.756591796875, "rewards/helpfulness_reward/std": 0.7737945914268494, "rewards/safety_reward/mean": 8.295654296875, "rewards/safety_reward/std": 0.5219249129295349, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 122.953125, "completions/mean_terminated_length": 122.953125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.3133350799056851, "frac_reward_zero_std": 0.0, "grad_norm": 0.3194538354873657, "kl": 3.828125, "learning_rate": 5e-05, "loss": 0.0303, "num_tokens": 21012793.0, "reward": 6.81494140625, "reward_std": 0.21317172050476074, "rewards/helpfulness_reward/mean": 6.81494140625, "rewards/helpfulness_reward/std": 0.571983814239502, "rewards/safety_reward/mean": 8.253173828125, "rewards/safety_reward/std": 0.35765841603279114, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.5546875, "completions/mean_terminated_length": 123.5546875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.313684394376037, "frac_reward_zero_std": 0.0, "grad_norm": 0.2906832993030548, "kl": 3.603515625, "learning_rate": 5e-05, "loss": 0.0353, "num_tokens": 21032208.0, "reward": 6.79443359375, "reward_std": 0.26963552832603455, "rewards/helpfulness_reward/mean": 6.79443359375, "rewards/helpfulness_reward/std": 0.4318855404853821, "rewards/safety_reward/mean": 8.185791015625, "rewards/safety_reward/std": 0.3760710656642914, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 123.28125, "completions/mean_terminated_length": 123.28125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.314033708846389, "frac_reward_zero_std": 0.0, "grad_norm": 0.36667224764823914, "kl": 3.732421875, "learning_rate": 5e-05, "loss": 0.0344, "num_tokens": 21051540.0, "reward": 6.986328125, "reward_std": 0.2930539846420288, "rewards/helpfulness_reward/mean": 6.986328125, "rewards/helpfulness_reward/std": 0.5730602145195007, "rewards/safety_reward/mean": 8.39306640625, "rewards/safety_reward/std": 0.5684777498245239, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 121.765625, "completions/mean_terminated_length": 121.765625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.3143830233167409, "frac_reward_zero_std": 0.0, "grad_norm": 0.344419002532959, "kl": 3.701171875, "learning_rate": 5e-05, "loss": 0.0313, "num_tokens": 21070630.0, "reward": 6.638427734375, "reward_std": 0.3492825925350189, "rewards/helpfulness_reward/mean": 6.638427734375, "rewards/helpfulness_reward/std": 0.5324959754943848, "rewards/safety_reward/mean": 8.119384765625, "rewards/safety_reward/std": 0.5572043061256409, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 122.375, "completions/mean_terminated_length": 122.375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.3147323377870928, "frac_reward_zero_std": 0.0, "grad_norm": 0.2501136064529419, "kl": 3.572265625, "learning_rate": 5e-05, "loss": 0.0302, "num_tokens": 21090718.0, "reward": 6.722900390625, "reward_std": 0.25474393367767334, "rewards/helpfulness_reward/mean": 6.722900390625, "rewards/helpfulness_reward/std": 0.4686330258846283, "rewards/safety_reward/mean": 8.1416015625, "rewards/safety_reward/std": 0.56844162940979, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 122.765625, "completions/mean_terminated_length": 122.765625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.3150816522574448, "frac_reward_zero_std": 0.0, "grad_norm": 0.4532710015773773, "kl": 3.9140625, "learning_rate": 5e-05, "loss": 0.0294, "num_tokens": 21112008.0, "reward": 6.6085205078125, "reward_std": 0.35834264755249023, "rewards/helpfulness_reward/mean": 6.6085205078125, "rewards/helpfulness_reward/std": 0.7989464998245239, "rewards/safety_reward/mean": 7.943603515625, "rewards/safety_reward/std": 0.8402848243713379, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 122.140625, "completions/mean_terminated_length": 122.140625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.3154309667277967, "frac_reward_zero_std": 0.0, "grad_norm": 0.28203606605529785, "kl": 3.658203125, "learning_rate": 5e-05, "loss": 0.0301, "num_tokens": 21133146.0, "reward": 6.529052734375, "reward_std": 0.2713039517402649, "rewards/helpfulness_reward/mean": 6.529052734375, "rewards/helpfulness_reward/std": 0.6650224328041077, "rewards/safety_reward/mean": 8.03271484375, "rewards/safety_reward/std": 0.6654926538467407, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 122.4609375, "completions/mean_terminated_length": 122.4609375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.31578028119814866, "frac_reward_zero_std": 0.0, "grad_norm": 0.29974064230918884, "kl": 3.865234375, "learning_rate": 5e-05, "loss": 0.0286, "num_tokens": 21153741.0, "reward": 6.723876953125, "reward_std": 0.3074558973312378, "rewards/helpfulness_reward/mean": 6.723876953125, "rewards/helpfulness_reward/std": 0.443379670381546, "rewards/safety_reward/mean": 8.164306640625, "rewards/safety_reward/std": 0.5138677954673767, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 122.6484375, "completions/mean_terminated_length": 122.6484375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.3161295956685006, "frac_reward_zero_std": 0.0, "grad_norm": 0.3163100779056549, "kl": 3.513671875, "learning_rate": 5e-05, "loss": 0.031, "num_tokens": 21175704.0, "reward": 6.6025390625, "reward_std": 0.26529446244239807, "rewards/helpfulness_reward/mean": 6.6025390625, "rewards/helpfulness_reward/std": 0.9873296618461609, "rewards/safety_reward/mean": 8.0146484375, "rewards/safety_reward/std": 0.8389466404914856, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 122.6015625, "completions/mean_terminated_length": 122.6015625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.3164789101388525, "frac_reward_zero_std": 0.0, "grad_norm": 0.2831648290157318, "kl": 3.494140625, "learning_rate": 5e-05, "loss": 0.0227, "num_tokens": 21195749.0, "reward": 6.5556640625, "reward_std": 0.2273041307926178, "rewards/helpfulness_reward/mean": 6.5556640625, "rewards/helpfulness_reward/std": 0.4721568822860718, "rewards/safety_reward/mean": 7.974609375, "rewards/safety_reward/std": 0.429817795753479, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 122.96875, "completions/mean_terminated_length": 122.96875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.31682822460920446, "frac_reward_zero_std": 0.0, "grad_norm": 0.4426369071006775, "kl": 3.7734375, "learning_rate": 5e-05, "loss": 0.0317, "num_tokens": 21215377.0, "reward": 6.8671875, "reward_std": 0.3123186230659485, "rewards/helpfulness_reward/mean": 6.8671875, "rewards/helpfulness_reward/std": 0.5435482859611511, "rewards/safety_reward/mean": 8.239013671875, "rewards/safety_reward/std": 0.39034369587898254, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 123.5703125, "completions/mean_terminated_length": 123.5703125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.31717753907955637, "frac_reward_zero_std": 0.0, "grad_norm": 0.4057593047618866, "kl": 3.9765625, "learning_rate": 5e-05, "loss": 0.0363, "num_tokens": 21235834.0, "reward": 6.563720703125, "reward_std": 0.32661598920822144, "rewards/helpfulness_reward/mean": 6.563720703125, "rewards/helpfulness_reward/std": 0.7513561844825745, "rewards/safety_reward/mean": 7.984619140625, "rewards/safety_reward/std": 0.6774951219558716, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 122.5859375, "completions/mean_terminated_length": 122.5859375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.3175268535499083, "frac_reward_zero_std": 0.0, "grad_norm": 0.41203686594963074, "kl": 3.513671875, "learning_rate": 5e-05, "loss": 0.0344, "num_tokens": 21256981.0, "reward": 6.329899787902832, "reward_std": 0.3828624188899994, "rewards/helpfulness_reward/mean": 6.329899787902832, "rewards/helpfulness_reward/std": 1.5810472965240479, "rewards/safety_reward/mean": 7.566131591796875, "rewards/safety_reward/std": 2.047659158706665, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.5390625, "completions/mean_terminated_length": 123.5390625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.31787616802026025, "frac_reward_zero_std": 0.0, "grad_norm": 0.44696998596191406, "kl": 3.802734375, "learning_rate": 5e-05, "loss": 0.0316, "num_tokens": 21277650.0, "reward": 6.7255859375, "reward_std": 0.3177916705608368, "rewards/helpfulness_reward/mean": 6.7255859375, "rewards/helpfulness_reward/std": 0.5093541741371155, "rewards/safety_reward/mean": 8.044921875, "rewards/safety_reward/std": 0.5231440663337708, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 123.375, "completions/mean_terminated_length": 123.375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.31822548249061217, "frac_reward_zero_std": 0.0, "grad_norm": 0.24943965673446655, "kl": 3.634765625, "learning_rate": 5e-05, "loss": 0.0353, "num_tokens": 21297874.0, "reward": 6.5172119140625, "reward_std": 0.4359835386276245, "rewards/helpfulness_reward/mean": 6.5172119140625, "rewards/helpfulness_reward/std": 0.7158364057540894, "rewards/safety_reward/mean": 8.1376953125, "rewards/safety_reward/std": 0.8071107268333435, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 123.453125, "completions/mean_terminated_length": 123.453125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.31857479696096414, "frac_reward_zero_std": 0.0, "grad_norm": 0.3687186539173126, "kl": 3.818359375, "learning_rate": 5e-05, "loss": 0.0332, "num_tokens": 21319636.0, "reward": 6.7001953125, "reward_std": 0.20527467131614685, "rewards/helpfulness_reward/mean": 6.7001953125, "rewards/helpfulness_reward/std": 0.45538589358329773, "rewards/safety_reward/mean": 8.208251953125, "rewards/safety_reward/std": 0.34204068779945374, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.296875, "completions/mean_terminated_length": 123.296875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.31892411143131605, "frac_reward_zero_std": 0.0, "grad_norm": 0.3273080587387085, "kl": 3.810546875, "learning_rate": 5e-05, "loss": 0.0285, "num_tokens": 21340066.0, "reward": 6.770263671875, "reward_std": 0.28819575905799866, "rewards/helpfulness_reward/mean": 6.770263671875, "rewards/helpfulness_reward/std": 0.5511986613273621, "rewards/safety_reward/mean": 8.06298828125, "rewards/safety_reward/std": 0.5275094509124756, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.046875, "completions/mean_terminated_length": 123.046875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.31927342590166796, "frac_reward_zero_std": 0.0, "grad_norm": 0.31972891092300415, "kl": 3.73046875, "learning_rate": 5e-05, "loss": 0.0281, "num_tokens": 21359504.0, "reward": 6.975830078125, "reward_std": 0.40338218212127686, "rewards/helpfulness_reward/mean": 6.975830078125, "rewards/helpfulness_reward/std": 0.7356862425804138, "rewards/safety_reward/mean": 8.3125, "rewards/safety_reward/std": 0.8083822131156921, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 123.5234375, "completions/mean_terminated_length": 123.5234375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.31962274037201993, "frac_reward_zero_std": 0.0, "grad_norm": 0.2813856899738312, "kl": 3.572265625, "learning_rate": 5e-05, "loss": 0.029, "num_tokens": 21378859.0, "reward": 6.9461669921875, "reward_std": 0.330610454082489, "rewards/helpfulness_reward/mean": 6.9461669921875, "rewards/helpfulness_reward/std": 0.5407072901725769, "rewards/safety_reward/mean": 8.17431640625, "rewards/safety_reward/std": 0.5121345520019531, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.625, "completions/mean_terminated_length": 123.625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.31997205484237184, "frac_reward_zero_std": 0.0, "grad_norm": 0.2686629295349121, "kl": 3.716796875, "learning_rate": 5e-05, "loss": 0.0269, "num_tokens": 21400643.0, "reward": 6.50390625, "reward_std": 0.31599345803260803, "rewards/helpfulness_reward/mean": 6.50390625, "rewards/helpfulness_reward/std": 0.6240642666816711, "rewards/safety_reward/mean": 8.07275390625, "rewards/safety_reward/std": 0.4786456823348999, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.2421875, "completions/mean_terminated_length": 123.2421875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.32032136931272376, "frac_reward_zero_std": 0.0, "grad_norm": 0.3282928168773651, "kl": 3.640625, "learning_rate": 5e-05, "loss": 0.0367, "num_tokens": 21420026.0, "reward": 6.76025390625, "reward_std": 0.33678823709487915, "rewards/helpfulness_reward/mean": 6.76025390625, "rewards/helpfulness_reward/std": 0.6359536051750183, "rewards/safety_reward/mean": 8.116943359375, "rewards/safety_reward/std": 0.5334130525588989, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 123.8203125, "completions/mean_terminated_length": 123.8203125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.3206706837830757, "frac_reward_zero_std": 0.0, "grad_norm": 0.2851731479167938, "kl": 3.6875, "learning_rate": 5e-05, "loss": 0.0342, "num_tokens": 21440419.0, "reward": 6.900634765625, "reward_std": 0.29250550270080566, "rewards/helpfulness_reward/mean": 6.900634765625, "rewards/helpfulness_reward/std": 0.5874083042144775, "rewards/safety_reward/mean": 8.111328125, "rewards/safety_reward/std": 0.6230129599571228, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.6015625, "completions/mean_terminated_length": 123.6015625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.32101999825342764, "frac_reward_zero_std": 0.0, "grad_norm": 0.345880925655365, "kl": 3.67578125, "learning_rate": 5e-05, "loss": 0.0337, "num_tokens": 21460872.0, "reward": 6.8360595703125, "reward_std": 0.2998114824295044, "rewards/helpfulness_reward/mean": 6.8360595703125, "rewards/helpfulness_reward/std": 0.6785187125205994, "rewards/safety_reward/mean": 8.247314453125, "rewards/safety_reward/std": 0.5598970055580139, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.6953125, "completions/mean_terminated_length": 123.6953125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.3213693127237796, "frac_reward_zero_std": 0.0, "grad_norm": 0.2489052712917328, "kl": 3.669921875, "learning_rate": 5e-05, "loss": 0.0359, "num_tokens": 21480281.0, "reward": 6.94091796875, "reward_std": 0.2694767117500305, "rewards/helpfulness_reward/mean": 6.94091796875, "rewards/helpfulness_reward/std": 0.5608878135681152, "rewards/safety_reward/mean": 8.234130859375, "rewards/safety_reward/std": 0.507706880569458, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 123.5546875, "completions/mean_terminated_length": 123.5546875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.3217186271941315, "frac_reward_zero_std": 0.0, "grad_norm": 0.7947971224784851, "kl": 4.013671875, "learning_rate": 5e-05, "loss": 0.0344, "num_tokens": 21501296.0, "reward": 6.6649169921875, "reward_std": 0.27388253808021545, "rewards/helpfulness_reward/mean": 6.6649169921875, "rewards/helpfulness_reward/std": 0.7031887173652649, "rewards/safety_reward/mean": 8.162353515625, "rewards/safety_reward/std": 0.5857253670692444, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 123.3125, "completions/mean_terminated_length": 123.3125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.32206794166448344, "frac_reward_zero_std": 0.0, "grad_norm": 0.2937372028827667, "kl": 3.794921875, "learning_rate": 5e-05, "loss": 0.0312, "num_tokens": 21522344.0, "reward": 6.7740478515625, "reward_std": 0.2925602197647095, "rewards/helpfulness_reward/mean": 6.7740478515625, "rewards/helpfulness_reward/std": 0.6683677434921265, "rewards/safety_reward/mean": 8.1064453125, "rewards/safety_reward/std": 0.6523946523666382, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 122.984375, "completions/mean_terminated_length": 122.984375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.3224172561348354, "frac_reward_zero_std": 0.0, "grad_norm": 0.39254364371299744, "kl": 3.76171875, "learning_rate": 5e-05, "loss": 0.032, "num_tokens": 21542062.0, "reward": 6.9267578125, "reward_std": 0.23941022157669067, "rewards/helpfulness_reward/mean": 6.9267578125, "rewards/helpfulness_reward/std": 0.6017506122589111, "rewards/safety_reward/mean": 8.407958984375, "rewards/safety_reward/std": 0.39808204770088196, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 123.6171875, "completions/mean_terminated_length": 123.6171875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.3227665706051873, "frac_reward_zero_std": 0.0, "grad_norm": 0.3180893063545227, "kl": 3.490234375, "learning_rate": 5e-05, "loss": 0.0286, "num_tokens": 21562861.0, "reward": 6.90771484375, "reward_std": 0.22618591785430908, "rewards/helpfulness_reward/mean": 6.90771484375, "rewards/helpfulness_reward/std": 0.5766465067863464, "rewards/safety_reward/mean": 8.18701171875, "rewards/safety_reward/std": 0.6774042248725891, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 123.859375, "completions/mean_terminated_length": 123.859375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.32311588507553923, "frac_reward_zero_std": 0.0, "grad_norm": 0.3134085237979889, "kl": 3.6953125, "learning_rate": 5e-05, "loss": 0.0327, "num_tokens": 21582851.0, "reward": 7.0350341796875, "reward_std": 0.310152530670166, "rewards/helpfulness_reward/mean": 7.0350341796875, "rewards/helpfulness_reward/std": 0.6762988567352295, "rewards/safety_reward/mean": 8.23828125, "rewards/safety_reward/std": 0.6823920011520386, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 123.7421875, "completions/mean_terminated_length": 123.7421875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.3234651995458912, "frac_reward_zero_std": 0.0, "grad_norm": 0.3234570622444153, "kl": 3.681640625, "learning_rate": 5e-05, "loss": 0.0338, "num_tokens": 21603634.0, "reward": 6.849365234375, "reward_std": 0.30904191732406616, "rewards/helpfulness_reward/mean": 6.849365234375, "rewards/helpfulness_reward/std": 0.6123653054237366, "rewards/safety_reward/mean": 8.38134765625, "rewards/safety_reward/std": 0.5475562810897827, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 123.484375, "completions/mean_terminated_length": 123.484375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.3238145140162431, "frac_reward_zero_std": 0.0, "grad_norm": 0.30880215764045715, "kl": 3.59375, "learning_rate": 5e-05, "loss": 0.0376, "num_tokens": 21626000.0, "reward": 6.509765625, "reward_std": 0.29651209712028503, "rewards/helpfulness_reward/mean": 6.509765625, "rewards/helpfulness_reward/std": 0.8182597160339355, "rewards/safety_reward/mean": 8.172607421875, "rewards/safety_reward/std": 0.6213719248771667, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 123.0859375, "completions/mean_terminated_length": 123.0859375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.3241638284865951, "frac_reward_zero_std": 0.0, "grad_norm": 0.2551758289337158, "kl": 3.626953125, "learning_rate": 5e-05, "loss": 0.0333, "num_tokens": 21646195.0, "reward": 6.8438720703125, "reward_std": 0.23314666748046875, "rewards/helpfulness_reward/mean": 6.8438720703125, "rewards/helpfulness_reward/std": 0.7227627038955688, "rewards/safety_reward/mean": 8.321044921875, "rewards/safety_reward/std": 0.6746827363967896, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 122.96875, "completions/mean_terminated_length": 122.96875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.324513142956947, "frac_reward_zero_std": 0.0, "grad_norm": 0.5538408756256104, "kl": 3.853515625, "learning_rate": 5e-05, "loss": 0.0352, "num_tokens": 21666631.0, "reward": 6.91259765625, "reward_std": 0.1993541717529297, "rewards/helpfulness_reward/mean": 6.91259765625, "rewards/helpfulness_reward/std": 0.5630378127098083, "rewards/safety_reward/mean": 8.20703125, "rewards/safety_reward/std": 0.5137144327163696, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 123.3828125, "completions/mean_terminated_length": 123.3828125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.3248624574272989, "frac_reward_zero_std": 0.0, "grad_norm": 0.3850196599960327, "kl": 3.880859375, "learning_rate": 5e-05, "loss": 0.0394, "num_tokens": 21686416.0, "reward": 6.9742431640625, "reward_std": 0.2766277492046356, "rewards/helpfulness_reward/mean": 6.9742431640625, "rewards/helpfulness_reward/std": 0.6026917695999146, "rewards/safety_reward/mean": 8.37841796875, "rewards/safety_reward/std": 0.5948980450630188, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.171875, "completions/mean_terminated_length": 123.171875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.3252117718976509, "frac_reward_zero_std": 0.0, "grad_norm": 0.27959954738616943, "kl": 3.6484375, "learning_rate": 5e-05, "loss": 0.0348, "num_tokens": 21706374.0, "reward": 7.01904296875, "reward_std": 0.2188147008419037, "rewards/helpfulness_reward/mean": 7.01904296875, "rewards/helpfulness_reward/std": 0.42603498697280884, "rewards/safety_reward/mean": 8.32861328125, "rewards/safety_reward/std": 0.4177742302417755, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 122.9765625, "completions/mean_terminated_length": 122.9765625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.3255610863680028, "frac_reward_zero_std": 0.0, "grad_norm": 0.2574630677700043, "kl": 3.658203125, "learning_rate": 5e-05, "loss": 0.027, "num_tokens": 21726043.0, "reward": 7.147705078125, "reward_std": 0.3097951114177704, "rewards/helpfulness_reward/mean": 7.147705078125, "rewards/helpfulness_reward/std": 0.4926832318305969, "rewards/safety_reward/mean": 8.460693359375, "rewards/safety_reward/std": 0.61714106798172, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.15625, "completions/mean_terminated_length": 123.15625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.3259104008383547, "frac_reward_zero_std": 0.0, "grad_norm": 0.2858911454677582, "kl": 3.658203125, "learning_rate": 5e-05, "loss": 0.0327, "num_tokens": 21745823.0, "reward": 6.97802734375, "reward_std": 0.2916627824306488, "rewards/helpfulness_reward/mean": 6.97802734375, "rewards/helpfulness_reward/std": 0.7267703413963318, "rewards/safety_reward/mean": 8.37841796875, "rewards/safety_reward/std": 0.6058106422424316, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 123.3828125, "completions/mean_terminated_length": 123.3828125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3262597153087067, "frac_reward_zero_std": 0.0, "grad_norm": 0.3245081305503845, "kl": 3.599609375, "learning_rate": 5e-05, "loss": 0.028, "num_tokens": 21765960.0, "reward": 7.080078125, "reward_std": 0.31896400451660156, "rewards/helpfulness_reward/mean": 7.080078125, "rewards/helpfulness_reward/std": 0.6049026250839233, "rewards/safety_reward/mean": 8.21630859375, "rewards/safety_reward/std": 0.6494789123535156, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 123.3046875, "completions/mean_terminated_length": 123.3046875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.3266090297790586, "frac_reward_zero_std": 0.0, "grad_norm": 0.5974668264389038, "kl": 3.83984375, "learning_rate": 5e-05, "loss": 0.0353, "num_tokens": 21787055.0, "reward": 6.851318359375, "reward_std": 0.2342110574245453, "rewards/helpfulness_reward/mean": 6.851318359375, "rewards/helpfulness_reward/std": 0.749443531036377, "rewards/safety_reward/mean": 8.231201171875, "rewards/safety_reward/std": 0.545332670211792, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 122.96875, "completions/mean_terminated_length": 122.96875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.3269583442494105, "frac_reward_zero_std": 0.0, "grad_norm": 0.25280261039733887, "kl": 3.80078125, "learning_rate": 5e-05, "loss": 0.032, "num_tokens": 21806795.0, "reward": 7.0244140625, "reward_std": 0.275603711605072, "rewards/helpfulness_reward/mean": 7.0244140625, "rewards/helpfulness_reward/std": 0.5868386030197144, "rewards/safety_reward/mean": 8.334228515625, "rewards/safety_reward/std": 0.5326241254806519, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 123.6484375, "completions/mean_terminated_length": 123.6484375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.3273076587197625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2962500751018524, "kl": 3.76953125, "learning_rate": 5e-05, "loss": 0.0349, "num_tokens": 21828446.0, "reward": 6.81396484375, "reward_std": 0.2557346224784851, "rewards/helpfulness_reward/mean": 6.81396484375, "rewards/helpfulness_reward/std": 0.747482419013977, "rewards/safety_reward/mean": 8.218994140625, "rewards/safety_reward/std": 0.579886257648468, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 124.2734375, "completions/mean_terminated_length": 124.2734375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.3276569731901144, "frac_reward_zero_std": 0.0, "grad_norm": 0.40900641679763794, "kl": 3.76953125, "learning_rate": 5e-05, "loss": 0.0389, "num_tokens": 21848841.0, "reward": 6.976806640625, "reward_std": 0.23174187541007996, "rewards/helpfulness_reward/mean": 6.976806640625, "rewards/helpfulness_reward/std": 0.45960143208503723, "rewards/safety_reward/mean": 8.3115234375, "rewards/safety_reward/std": 0.5793880224227905, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 123.78125, "completions/mean_terminated_length": 123.78125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.32800628766046636, "frac_reward_zero_std": 0.0, "grad_norm": 0.2994992733001709, "kl": 3.65625, "learning_rate": 5e-05, "loss": 0.0397, "num_tokens": 21868549.0, "reward": 6.914703369140625, "reward_std": 0.4414321184158325, "rewards/helpfulness_reward/mean": 6.914703369140625, "rewards/helpfulness_reward/std": 0.8343000411987305, "rewards/safety_reward/mean": 8.088226318359375, "rewards/safety_reward/std": 0.8940563201904297, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 123.5859375, "completions/mean_terminated_length": 123.5859375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.32835560213081827, "frac_reward_zero_std": 0.0, "grad_norm": 0.6821057796478271, "kl": 3.8984375, "learning_rate": 5e-05, "loss": 0.0337, "num_tokens": 21891320.0, "reward": 6.88720703125, "reward_std": 0.3164084255695343, "rewards/helpfulness_reward/mean": 6.88720703125, "rewards/helpfulness_reward/std": 0.5884912014007568, "rewards/safety_reward/mean": 8.373291015625, "rewards/safety_reward/std": 0.5916977524757385, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 123.8125, "completions/mean_terminated_length": 123.8125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.3287049166011702, "frac_reward_zero_std": 0.0, "grad_norm": 0.23114962875843048, "kl": 3.546875, "learning_rate": 5e-05, "loss": 0.0327, "num_tokens": 21913256.0, "reward": 7.143798828125, "reward_std": 0.30146780610084534, "rewards/helpfulness_reward/mean": 7.143798828125, "rewards/helpfulness_reward/std": 0.6447106003761292, "rewards/safety_reward/mean": 8.255126953125, "rewards/safety_reward/std": 0.77626633644104, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 124.015625, "completions/mean_terminated_length": 124.015625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.32905423107152215, "frac_reward_zero_std": 0.0, "grad_norm": 3.6338186264038086, "kl": 4.28515625, "learning_rate": 5e-05, "loss": 0.0441, "num_tokens": 21936418.0, "reward": 6.5377349853515625, "reward_std": 0.37215664982795715, "rewards/helpfulness_reward/mean": 6.5377349853515625, "rewards/helpfulness_reward/std": 1.4548107385635376, "rewards/safety_reward/mean": 8.1190185546875, "rewards/safety_reward/std": 1.3849714994430542, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 123.578125, "completions/mean_terminated_length": 123.578125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.32940354554187407, "frac_reward_zero_std": 0.0, "grad_norm": 0.3179508447647095, "kl": 3.703125, "learning_rate": 5e-05, "loss": 0.0359, "num_tokens": 21958164.0, "reward": 6.6248779296875, "reward_std": 0.22465139627456665, "rewards/helpfulness_reward/mean": 6.6248779296875, "rewards/helpfulness_reward/std": 0.9056271910667419, "rewards/safety_reward/mean": 7.906982421875, "rewards/safety_reward/std": 0.8478952050209045, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.8046875, "completions/mean_terminated_length": 123.8046875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.329752860012226, "frac_reward_zero_std": 0.0, "grad_norm": 0.2726781666278839, "kl": 3.6015625, "learning_rate": 5e-05, "loss": 0.0336, "num_tokens": 21977971.0, "reward": 6.826171875, "reward_std": 0.1954975128173828, "rewards/helpfulness_reward/mean": 6.826171875, "rewards/helpfulness_reward/std": 0.6686843633651733, "rewards/safety_reward/mean": 8.1572265625, "rewards/safety_reward/std": 0.48136627674102783, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 123.859375, "completions/mean_terminated_length": 123.859375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.33010217448257795, "frac_reward_zero_std": 0.0, "grad_norm": 0.2543465793132782, "kl": 3.708984375, "learning_rate": 5e-05, "loss": 0.0329, "num_tokens": 21998089.0, "reward": 7.0118408203125, "reward_std": 0.3375548720359802, "rewards/helpfulness_reward/mean": 7.0118408203125, "rewards/helpfulness_reward/std": 0.613500714302063, "rewards/safety_reward/mean": 8.336181640625, "rewards/safety_reward/std": 0.6142749190330505, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.890625, "completions/mean_terminated_length": 123.890625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.33045148895292986, "frac_reward_zero_std": 0.0, "grad_norm": 0.32520100474357605, "kl": 3.697265625, "learning_rate": 5e-05, "loss": 0.0386, "num_tokens": 22019267.0, "reward": 6.8262939453125, "reward_std": 0.38342034816741943, "rewards/helpfulness_reward/mean": 6.8262939453125, "rewards/helpfulness_reward/std": 0.5780755281448364, "rewards/safety_reward/mean": 8.21142578125, "rewards/safety_reward/std": 0.5488064885139465, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 123.03125, "completions/mean_terminated_length": 123.03125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.33080080342328183, "frac_reward_zero_std": 0.0, "grad_norm": 0.2776974141597748, "kl": 3.9140625, "learning_rate": 5e-05, "loss": 0.0312, "num_tokens": 22039583.0, "reward": 6.949951171875, "reward_std": 0.31248611211776733, "rewards/helpfulness_reward/mean": 6.949951171875, "rewards/helpfulness_reward/std": 0.5893412828445435, "rewards/safety_reward/mean": 8.27001953125, "rewards/safety_reward/std": 0.5135757327079773, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 123.3125, "completions/mean_terminated_length": 123.3125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.33115011789363374, "frac_reward_zero_std": 0.0, "grad_norm": 0.293298602104187, "kl": 3.705078125, "learning_rate": 5e-05, "loss": 0.0307, "num_tokens": 22061711.0, "reward": 6.76953125, "reward_std": 0.24565628170967102, "rewards/helpfulness_reward/mean": 6.76953125, "rewards/helpfulness_reward/std": 0.8362573981285095, "rewards/safety_reward/mean": 8.099609375, "rewards/safety_reward/std": 0.9429528713226318, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 123.7734375, "completions/mean_terminated_length": 123.7734375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.33149943236398566, "frac_reward_zero_std": 0.0, "grad_norm": 0.3282162845134735, "kl": 3.751953125, "learning_rate": 5e-05, "loss": 0.0327, "num_tokens": 22082626.0, "reward": 6.6514892578125, "reward_std": 0.5515741109848022, "rewards/helpfulness_reward/mean": 6.6514892578125, "rewards/helpfulness_reward/std": 0.7407373785972595, "rewards/safety_reward/mean": 8.0859375, "rewards/safety_reward/std": 0.685730516910553, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.078125, "completions/mean_terminated_length": 123.078125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.3318487468343376, "frac_reward_zero_std": 0.0, "grad_norm": 0.2647554278373718, "kl": 3.6640625, "learning_rate": 5e-05, "loss": 0.0298, "num_tokens": 22102444.0, "reward": 6.59033203125, "reward_std": 0.32492703199386597, "rewards/helpfulness_reward/mean": 6.59033203125, "rewards/helpfulness_reward/std": 0.7403022646903992, "rewards/safety_reward/mean": 8.2919921875, "rewards/safety_reward/std": 0.49028512835502625, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 122.953125, "completions/mean_terminated_length": 122.953125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.33219806130468954, "frac_reward_zero_std": 0.0, "grad_norm": 0.25937652587890625, "kl": 3.640625, "learning_rate": 5e-05, "loss": 0.0196, "num_tokens": 22121918.0, "reward": 6.938720703125, "reward_std": 0.38436830043792725, "rewards/helpfulness_reward/mean": 6.938720703125, "rewards/helpfulness_reward/std": 0.5833752751350403, "rewards/safety_reward/mean": 8.121826171875, "rewards/safety_reward/std": 0.4666840732097626, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.8203125, "completions/mean_terminated_length": 123.8203125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.33254737577504145, "frac_reward_zero_std": 0.0, "grad_norm": 0.4492400586605072, "kl": 3.861328125, "learning_rate": 5e-05, "loss": 0.0322, "num_tokens": 22142487.0, "reward": 6.87744140625, "reward_std": 0.2540045976638794, "rewards/helpfulness_reward/mean": 6.87744140625, "rewards/helpfulness_reward/std": 0.5422964096069336, "rewards/safety_reward/mean": 8.356201171875, "rewards/safety_reward/std": 0.45250457525253296, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 123.6171875, "completions/mean_terminated_length": 123.6171875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.3328966902453934, "frac_reward_zero_std": 0.0, "grad_norm": 0.2996532917022705, "kl": 3.7890625, "learning_rate": 5e-05, "loss": 0.0303, "num_tokens": 22162446.0, "reward": 6.8681640625, "reward_std": 0.24314451217651367, "rewards/helpfulness_reward/mean": 6.8681640625, "rewards/helpfulness_reward/std": 0.5388302206993103, "rewards/safety_reward/mean": 8.294921875, "rewards/safety_reward/std": 0.451235294342041, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 123.390625, "completions/mean_terminated_length": 123.390625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.33324600471574534, "frac_reward_zero_std": 0.0, "grad_norm": 0.2692510485649109, "kl": 3.720703125, "learning_rate": 5e-05, "loss": 0.0289, "num_tokens": 22182032.0, "reward": 7.0107421875, "reward_std": 0.24995574355125427, "rewards/helpfulness_reward/mean": 7.0107421875, "rewards/helpfulness_reward/std": 0.5419535636901855, "rewards/safety_reward/mean": 8.36474609375, "rewards/safety_reward/std": 0.49841514229774475, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 124.390625, "completions/mean_terminated_length": 124.390625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.3335953191860973, "frac_reward_zero_std": 0.0, "grad_norm": 1.6374828815460205, "kl": 4.51953125, "learning_rate": 5e-05, "loss": 0.042, "num_tokens": 22202050.0, "reward": 6.9873046875, "reward_std": 0.2518870532512665, "rewards/helpfulness_reward/mean": 6.9873046875, "rewards/helpfulness_reward/std": 0.526991069316864, "rewards/safety_reward/mean": 8.23681640625, "rewards/safety_reward/std": 0.5280722379684448, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 124.2421875, "completions/mean_terminated_length": 124.2421875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.3339446336564492, "frac_reward_zero_std": 0.0, "grad_norm": 0.3756209909915924, "kl": 3.9296875, "learning_rate": 5e-05, "loss": 0.0395, "num_tokens": 22224105.0, "reward": 6.776611328125, "reward_std": 0.20533457398414612, "rewards/helpfulness_reward/mean": 6.776611328125, "rewards/helpfulness_reward/std": 0.6815134882926941, "rewards/safety_reward/mean": 8.2900390625, "rewards/safety_reward/std": 0.41426149010658264, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 124.0078125, "completions/mean_terminated_length": 124.0078125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.33429394812680113, "frac_reward_zero_std": 0.0, "grad_norm": 1.0364395380020142, "kl": 4.181640625, "learning_rate": 5e-05, "loss": 0.039, "num_tokens": 22244466.0, "reward": 6.96533203125, "reward_std": 0.30000919103622437, "rewards/helpfulness_reward/mean": 6.96533203125, "rewards/helpfulness_reward/std": 0.5669828057289124, "rewards/safety_reward/mean": 8.3505859375, "rewards/safety_reward/std": 0.4937312602996826, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 123.8984375, "completions/mean_terminated_length": 123.8984375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.3346432625971531, "frac_reward_zero_std": 0.0, "grad_norm": 0.4478715658187866, "kl": 3.779296875, "learning_rate": 5e-05, "loss": 0.0321, "num_tokens": 22264357.0, "reward": 6.913330078125, "reward_std": 0.28110113739967346, "rewards/helpfulness_reward/mean": 6.913330078125, "rewards/helpfulness_reward/std": 0.511843204498291, "rewards/safety_reward/mean": 8.191162109375, "rewards/safety_reward/std": 0.5269791483879089, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 123.5703125, "completions/mean_terminated_length": 123.5703125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.334992577067505, "frac_reward_zero_std": 0.0, "grad_norm": 0.3899720311164856, "kl": 3.81640625, "learning_rate": 5e-05, "loss": 0.0316, "num_tokens": 22284982.0, "reward": 6.928955078125, "reward_std": 0.2692888081073761, "rewards/helpfulness_reward/mean": 6.928955078125, "rewards/helpfulness_reward/std": 0.5182257890701294, "rewards/safety_reward/mean": 8.308837890625, "rewards/safety_reward/std": 0.5418710112571716, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 123.65625, "completions/mean_terminated_length": 123.65625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.33534189153785693, "frac_reward_zero_std": 0.0, "grad_norm": 0.29709509015083313, "kl": 3.74609375, "learning_rate": 5e-05, "loss": 0.0316, "num_tokens": 22305138.0, "reward": 6.6401824951171875, "reward_std": 0.4049210846424103, "rewards/helpfulness_reward/mean": 6.6401824951171875, "rewards/helpfulness_reward/std": 0.7393394112586975, "rewards/safety_reward/mean": 7.953765869140625, "rewards/safety_reward/std": 0.8331400156021118, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 123.078125, "completions/mean_terminated_length": 123.078125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.3356912060082089, "frac_reward_zero_std": 0.0, "grad_norm": 0.3525257706642151, "kl": 4.001953125, "learning_rate": 5e-05, "loss": 0.0358, "num_tokens": 22325124.0, "reward": 6.870849609375, "reward_std": 0.26582372188568115, "rewards/helpfulness_reward/mean": 6.870849609375, "rewards/helpfulness_reward/std": 0.510627269744873, "rewards/safety_reward/mean": 8.243896484375, "rewards/safety_reward/std": 0.5262554287910461, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 122.7578125, "completions/mean_terminated_length": 122.7578125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.3360405204785608, "frac_reward_zero_std": 0.0, "grad_norm": 0.35778099298477173, "kl": 3.912109375, "learning_rate": 5e-05, "loss": 0.0322, "num_tokens": 22345165.0, "reward": 6.69873046875, "reward_std": 0.24610696732997894, "rewards/helpfulness_reward/mean": 6.69873046875, "rewards/helpfulness_reward/std": 0.718410074710846, "rewards/safety_reward/mean": 8.25634765625, "rewards/safety_reward/std": 0.3912835717201233, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 123.1015625, "completions/mean_terminated_length": 123.1015625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.3363898349489128, "frac_reward_zero_std": 0.0, "grad_norm": 0.27618667483329773, "kl": 3.689453125, "learning_rate": 5e-05, "loss": 0.0271, "num_tokens": 22364610.0, "reward": 6.9649658203125, "reward_std": 0.27775976061820984, "rewards/helpfulness_reward/mean": 6.9649658203125, "rewards/helpfulness_reward/std": 0.7200239896774292, "rewards/safety_reward/mean": 8.22265625, "rewards/safety_reward/std": 0.5955538749694824, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 122.921875, "completions/mean_terminated_length": 122.921875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.3367391494192647, "frac_reward_zero_std": 0.0, "grad_norm": 0.44240617752075195, "kl": 3.873046875, "learning_rate": 5e-05, "loss": 0.0286, "num_tokens": 22386496.0, "reward": 6.8013763427734375, "reward_std": 0.4368356764316559, "rewards/helpfulness_reward/mean": 6.8013763427734375, "rewards/helpfulness_reward/std": 0.8787831664085388, "rewards/safety_reward/mean": 8.1441650390625, "rewards/safety_reward/std": 0.8229898810386658, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 123.21875, "completions/mean_terminated_length": 123.21875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.3370884638896166, "frac_reward_zero_std": 0.0, "grad_norm": 0.39530134201049805, "kl": 3.80859375, "learning_rate": 5e-05, "loss": 0.0262, "num_tokens": 22407852.0, "reward": 6.8067626953125, "reward_std": 0.345937043428421, "rewards/helpfulness_reward/mean": 6.8067626953125, "rewards/helpfulness_reward/std": 0.9319303631782532, "rewards/safety_reward/mean": 8.026123046875, "rewards/safety_reward/std": 0.7874974012374878, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 123.71875, "completions/mean_terminated_length": 123.71875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.3374377783599686, "frac_reward_zero_std": 0.0, "grad_norm": 0.33235734701156616, "kl": 3.634765625, "learning_rate": 5e-05, "loss": 0.0314, "num_tokens": 22427704.0, "reward": 7.01123046875, "reward_std": 0.3011094033718109, "rewards/helpfulness_reward/mean": 7.01123046875, "rewards/helpfulness_reward/std": 0.493229478597641, "rewards/safety_reward/mean": 8.305419921875, "rewards/safety_reward/std": 0.5511794686317444, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 129.0703125, "completions/mean_terminated_length": 129.0703125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.3377870928303205, "frac_reward_zero_std": 0.0, "grad_norm": 0.32559266686439514, "kl": 3.087890625, "learning_rate": 5e-05, "loss": 0.0363, "num_tokens": 22450297.0, "reward": 5.916416168212891, "reward_std": 0.38800984621047974, "rewards/helpfulness_reward/mean": 5.916416168212891, "rewards/helpfulness_reward/std": 2.2159273624420166, "rewards/safety_reward/mean": 7.254638671875, "rewards/safety_reward/std": 2.2672367095947266, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.296875, "completions/mean_terminated_length": 123.296875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.3381364073006724, "frac_reward_zero_std": 0.0, "grad_norm": 0.3370540738105774, "kl": 4.009765625, "learning_rate": 5e-05, "loss": 0.0328, "num_tokens": 22470191.0, "reward": 6.865234375, "reward_std": 0.3142293691635132, "rewards/helpfulness_reward/mean": 6.865234375, "rewards/helpfulness_reward/std": 0.4986409544944763, "rewards/safety_reward/mean": 8.2705078125, "rewards/safety_reward/std": 0.48260268568992615, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 123.6484375, "completions/mean_terminated_length": 123.6484375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.3384857217710244, "frac_reward_zero_std": 0.0, "grad_norm": 0.3001128137111664, "kl": 3.697265625, "learning_rate": 5e-05, "loss": 0.0366, "num_tokens": 22490098.0, "reward": 6.980712890625, "reward_std": 0.3495861887931824, "rewards/helpfulness_reward/mean": 6.980712890625, "rewards/helpfulness_reward/std": 0.7655370235443115, "rewards/safety_reward/mean": 8.404541015625, "rewards/safety_reward/std": 0.6204349398612976, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 122.9453125, "completions/mean_terminated_length": 122.9453125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3388350362413763, "frac_reward_zero_std": 0.0, "grad_norm": 0.31521710753440857, "kl": 3.802734375, "learning_rate": 5e-05, "loss": 0.0262, "num_tokens": 22510211.0, "reward": 6.79327392578125, "reward_std": 0.41820091009140015, "rewards/helpfulness_reward/mean": 6.79327392578125, "rewards/helpfulness_reward/std": 0.7232033610343933, "rewards/safety_reward/mean": 8.23291015625, "rewards/safety_reward/std": 0.5463402271270752, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 122.7890625, "completions/mean_terminated_length": 122.7890625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.33918435071172826, "frac_reward_zero_std": 0.0, "grad_norm": 0.32030826807022095, "kl": 3.78125, "learning_rate": 5e-05, "loss": 0.0341, "num_tokens": 22531168.0, "reward": 6.597412109375, "reward_std": 0.30204588174819946, "rewards/helpfulness_reward/mean": 6.597412109375, "rewards/helpfulness_reward/std": 0.5261175036430359, "rewards/safety_reward/mean": 8.066162109375, "rewards/safety_reward/std": 0.5477986335754395, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 122.9765625, "completions/mean_terminated_length": 122.9765625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.33953366518208017, "frac_reward_zero_std": 0.0, "grad_norm": 0.27699509263038635, "kl": 3.98828125, "learning_rate": 5e-05, "loss": 0.0281, "num_tokens": 22552805.0, "reward": 6.528564453125, "reward_std": 0.2819410562515259, "rewards/helpfulness_reward/mean": 6.528564453125, "rewards/helpfulness_reward/std": 0.9646178483963013, "rewards/safety_reward/mean": 7.831787109375, "rewards/safety_reward/std": 1.0718462467193604, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 123.9921875, "completions/mean_terminated_length": 123.9921875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.3398829796524321, "frac_reward_zero_std": 0.0, "grad_norm": 0.26421764492988586, "kl": 3.91796875, "learning_rate": 5e-05, "loss": 0.0318, "num_tokens": 22572996.0, "reward": 6.750732421875, "reward_std": 0.2769775390625, "rewards/helpfulness_reward/mean": 6.750732421875, "rewards/helpfulness_reward/std": 0.5442193746566772, "rewards/safety_reward/mean": 8.263671875, "rewards/safety_reward/std": 0.5894640684127808, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 124.2109375, "completions/mean_terminated_length": 124.2109375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.34023229412278405, "frac_reward_zero_std": 0.0, "grad_norm": 0.30272337794303894, "kl": 3.62890625, "learning_rate": 5e-05, "loss": 0.0364, "num_tokens": 22593071.0, "reward": 6.97509765625, "reward_std": 0.3150978684425354, "rewards/helpfulness_reward/mean": 6.97509765625, "rewards/helpfulness_reward/std": 0.5578515529632568, "rewards/safety_reward/mean": 8.41748046875, "rewards/safety_reward/std": 0.42320090532302856, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 123.2734375, "completions/mean_terminated_length": 123.2734375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.34058160859313596, "frac_reward_zero_std": 0.0, "grad_norm": 0.3725462257862091, "kl": 3.802734375, "learning_rate": 5e-05, "loss": 0.0294, "num_tokens": 22612410.0, "reward": 7.036865234375, "reward_std": 0.31536567211151123, "rewards/helpfulness_reward/mean": 7.036865234375, "rewards/helpfulness_reward/std": 0.5476415753364563, "rewards/safety_reward/mean": 8.374267578125, "rewards/safety_reward/std": 0.5017265677452087, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 124.03125, "completions/mean_terminated_length": 124.03125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.3409309230634879, "frac_reward_zero_std": 0.0, "grad_norm": 0.2612150013446808, "kl": 3.736328125, "learning_rate": 5e-05, "loss": 0.0358, "num_tokens": 22633718.0, "reward": 6.70849609375, "reward_std": 0.3484098017215729, "rewards/helpfulness_reward/mean": 6.70849609375, "rewards/helpfulness_reward/std": 0.985666811466217, "rewards/safety_reward/mean": 8.0419921875, "rewards/safety_reward/std": 0.9306874871253967, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 124.46875, "completions/mean_terminated_length": 124.46875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.34128023753383985, "frac_reward_zero_std": 0.0, "grad_norm": 0.2819872796535492, "kl": 3.87109375, "learning_rate": 5e-05, "loss": 0.0352, "num_tokens": 22653850.0, "reward": 6.80419921875, "reward_std": 0.25498801469802856, "rewards/helpfulness_reward/mean": 6.80419921875, "rewards/helpfulness_reward/std": 0.4671064615249634, "rewards/safety_reward/mean": 8.26171875, "rewards/safety_reward/std": 0.5945330262184143, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 123.4765625, "completions/mean_terminated_length": 123.4765625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.34162955200419176, "frac_reward_zero_std": 0.0, "grad_norm": 0.31529298424720764, "kl": 3.62109375, "learning_rate": 5e-05, "loss": 0.0276, "num_tokens": 22674039.0, "reward": 6.586273193359375, "reward_std": 0.5181601047515869, "rewards/helpfulness_reward/mean": 6.586273193359375, "rewards/helpfulness_reward/std": 1.2680777311325073, "rewards/safety_reward/mean": 8.13336181640625, "rewards/safety_reward/std": 1.421878695487976, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 124.546875, "completions/mean_terminated_length": 124.546875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.34197886647454373, "frac_reward_zero_std": 0.0, "grad_norm": 1.0407917499542236, "kl": 4.3359375, "learning_rate": 5e-05, "loss": 0.0395, "num_tokens": 22694429.0, "reward": 6.90087890625, "reward_std": 0.24674823880195618, "rewards/helpfulness_reward/mean": 6.90087890625, "rewards/helpfulness_reward/std": 0.5629114508628845, "rewards/safety_reward/mean": 8.222900390625, "rewards/safety_reward/std": 0.5209290385246277, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 124.6015625, "completions/mean_terminated_length": 124.6015625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.34232818094489564, "frac_reward_zero_std": 0.0, "grad_norm": 0.3505224287509918, "kl": 3.890625, "learning_rate": 5e-05, "loss": 0.0391, "num_tokens": 22715570.0, "reward": 6.77880859375, "reward_std": 0.273198664188385, "rewards/helpfulness_reward/mean": 6.77880859375, "rewards/helpfulness_reward/std": 0.8118240833282471, "rewards/safety_reward/mean": 8.3310546875, "rewards/safety_reward/std": 0.6410037279129028, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 124.1875, "completions/mean_terminated_length": 124.1875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.34267749541524756, "frac_reward_zero_std": 0.0, "grad_norm": 0.2581382393836975, "kl": 3.751953125, "learning_rate": 5e-05, "loss": 0.037, "num_tokens": 22734898.0, "reward": 7.076416015625, "reward_std": 0.2961975336074829, "rewards/helpfulness_reward/mean": 7.076416015625, "rewards/helpfulness_reward/std": 0.556546688079834, "rewards/safety_reward/mean": 8.199951171875, "rewards/safety_reward/std": 0.5430719256401062, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 128.40625, "completions/mean_terminated_length": 128.40625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.3430268098855995, "frac_reward_zero_std": 0.0, "grad_norm": 0.29937705397605896, "kl": 3.482421875, "learning_rate": 5e-05, "loss": 0.0231, "num_tokens": 22755534.0, "reward": 6.553466796875, "reward_std": 0.37368375062942505, "rewards/helpfulness_reward/mean": 6.553466796875, "rewards/helpfulness_reward/std": 1.3185462951660156, "rewards/safety_reward/mean": 7.994873046875, "rewards/safety_reward/std": 1.4729012250900269, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 123.6015625, "completions/mean_terminated_length": 123.6015625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.34337612435595144, "frac_reward_zero_std": 0.0, "grad_norm": 0.31833696365356445, "kl": 3.921875, "learning_rate": 5e-05, "loss": 0.0325, "num_tokens": 22776587.0, "reward": 6.744140625, "reward_std": 0.4437563419342041, "rewards/helpfulness_reward/mean": 6.744140625, "rewards/helpfulness_reward/std": 0.7686108350753784, "rewards/safety_reward/mean": 8.17724609375, "rewards/safety_reward/std": 0.6733752489089966, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 123.8046875, "completions/mean_terminated_length": 123.8046875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.34372543882630335, "frac_reward_zero_std": 0.0, "grad_norm": 0.2465653270483017, "kl": 3.78125, "learning_rate": 5e-05, "loss": 0.035, "num_tokens": 22796138.0, "reward": 6.697265625, "reward_std": 0.31651580333709717, "rewards/helpfulness_reward/mean": 6.697265625, "rewards/helpfulness_reward/std": 0.7907128930091858, "rewards/safety_reward/mean": 8.160400390625, "rewards/safety_reward/std": 0.5512374639511108, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 124.1328125, "completions/mean_terminated_length": 124.1328125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.3440747532966553, "frac_reward_zero_std": 0.0, "grad_norm": 0.2784130275249481, "kl": 3.79296875, "learning_rate": 5e-05, "loss": 0.0333, "num_tokens": 22816371.0, "reward": 6.952392578125, "reward_std": 0.2523908019065857, "rewards/helpfulness_reward/mean": 6.952392578125, "rewards/helpfulness_reward/std": 0.46417492628097534, "rewards/safety_reward/mean": 8.141845703125, "rewards/safety_reward/std": 0.49329444766044617, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 123.984375, "completions/mean_terminated_length": 123.984375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.34442406776700724, "frac_reward_zero_std": 0.0, "grad_norm": 0.23184055089950562, "kl": 3.62109375, "learning_rate": 5e-05, "loss": 0.0283, "num_tokens": 22837329.0, "reward": 6.948974609375, "reward_std": 0.349837064743042, "rewards/helpfulness_reward/mean": 6.948974609375, "rewards/helpfulness_reward/std": 0.8050161004066467, "rewards/safety_reward/mean": 8.132568359375, "rewards/safety_reward/std": 0.7600999474525452, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 123.515625, "completions/mean_terminated_length": 123.515625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.3447733822373592, "frac_reward_zero_std": 0.0, "grad_norm": 0.27751675248146057, "kl": 3.802734375, "learning_rate": 5e-05, "loss": 0.0291, "num_tokens": 22857563.0, "reward": 7.08154296875, "reward_std": 0.34786179661750793, "rewards/helpfulness_reward/mean": 7.08154296875, "rewards/helpfulness_reward/std": 0.7270546555519104, "rewards/safety_reward/mean": 8.43896484375, "rewards/safety_reward/std": 0.6916911005973816, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.3515625, "completions/mean_terminated_length": 123.3515625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.3451226967077111, "frac_reward_zero_std": 0.0, "grad_norm": 0.7191851735115051, "kl": 4.046875, "learning_rate": 5e-05, "loss": 0.0315, "num_tokens": 22877400.0, "reward": 6.77099609375, "reward_std": 0.26053738594055176, "rewards/helpfulness_reward/mean": 6.77099609375, "rewards/helpfulness_reward/std": 0.6534154415130615, "rewards/safety_reward/mean": 8.176025390625, "rewards/safety_reward/std": 0.48569077253341675, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 124.2265625, "completions/mean_terminated_length": 124.2265625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.34547201117806303, "frac_reward_zero_std": 0.0, "grad_norm": 0.26717573404312134, "kl": 3.8359375, "learning_rate": 5e-05, "loss": 0.0418, "num_tokens": 22897349.0, "reward": 6.94140625, "reward_std": 0.26187366247177124, "rewards/helpfulness_reward/mean": 6.94140625, "rewards/helpfulness_reward/std": 0.638811469078064, "rewards/safety_reward/mean": 8.29638671875, "rewards/safety_reward/std": 0.5683205127716064, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 124.046875, "completions/mean_terminated_length": 124.046875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.345821325648415, "frac_reward_zero_std": 0.0, "grad_norm": 1.5748494863510132, "kl": 4.328125, "learning_rate": 5e-05, "loss": 0.0458, "num_tokens": 22917051.0, "reward": 6.9921875, "reward_std": 0.29421281814575195, "rewards/helpfulness_reward/mean": 6.9921875, "rewards/helpfulness_reward/std": 0.5996456742286682, "rewards/safety_reward/mean": 8.30908203125, "rewards/safety_reward/std": 0.5516678094863892, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.859375, "completions/mean_terminated_length": 123.859375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3461706401187669, "frac_reward_zero_std": 0.0, "grad_norm": 0.3004021942615509, "kl": 3.5390625, "learning_rate": 5e-05, "loss": 0.0322, "num_tokens": 22936833.0, "reward": 6.872314453125, "reward_std": 0.39774879813194275, "rewards/helpfulness_reward/mean": 6.872314453125, "rewards/helpfulness_reward/std": 0.7456507682800293, "rewards/safety_reward/mean": 8.2060546875, "rewards/safety_reward/std": 0.663508951663971, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 123.859375, "completions/mean_terminated_length": 123.859375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.34651995458911883, "frac_reward_zero_std": 0.0, "grad_norm": 0.28535202145576477, "kl": 3.587890625, "learning_rate": 5e-05, "loss": 0.0285, "num_tokens": 22957439.0, "reward": 6.98828125, "reward_std": 0.2207815796136856, "rewards/helpfulness_reward/mean": 6.98828125, "rewards/helpfulness_reward/std": 0.5107572078704834, "rewards/safety_reward/mean": 8.380615234375, "rewards/safety_reward/std": 0.46139127016067505, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 124.515625, "completions/mean_terminated_length": 124.515625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.3468692690594708, "frac_reward_zero_std": 0.0, "grad_norm": 0.3241919279098511, "kl": 3.87109375, "learning_rate": 5e-05, "loss": 0.0398, "num_tokens": 22978769.0, "reward": 6.82373046875, "reward_std": 0.3214322626590729, "rewards/helpfulness_reward/mean": 6.82373046875, "rewards/helpfulness_reward/std": 0.6096760034561157, "rewards/safety_reward/mean": 8.374267578125, "rewards/safety_reward/std": 0.4281276762485504, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 124.09375, "completions/mean_terminated_length": 124.09375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.3472185835298227, "frac_reward_zero_std": 0.0, "grad_norm": 0.2971508800983429, "kl": 3.771484375, "learning_rate": 5e-05, "loss": 0.0367, "num_tokens": 22999485.0, "reward": 7.075927734375, "reward_std": 0.35434383153915405, "rewards/helpfulness_reward/mean": 7.075927734375, "rewards/helpfulness_reward/std": 0.7553703784942627, "rewards/safety_reward/mean": 8.483642578125, "rewards/safety_reward/std": 0.5774341821670532, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 123.84375, "completions/mean_terminated_length": 123.84375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.3475678980001747, "frac_reward_zero_std": 0.0, "grad_norm": 0.28162452578544617, "kl": 3.78125, "learning_rate": 5e-05, "loss": 0.0394, "num_tokens": 23019697.0, "reward": 6.75390625, "reward_std": 0.3725043833255768, "rewards/helpfulness_reward/mean": 6.75390625, "rewards/helpfulness_reward/std": 0.6745947599411011, "rewards/safety_reward/mean": 8.22607421875, "rewards/safety_reward/std": 0.5454615950584412, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 124.046875, "completions/mean_terminated_length": 124.046875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.3479172124705266, "frac_reward_zero_std": 0.0, "grad_norm": 0.26081427931785583, "kl": 3.646484375, "learning_rate": 5e-05, "loss": 0.0387, "num_tokens": 23040223.0, "reward": 6.764404296875, "reward_std": 0.33159273862838745, "rewards/helpfulness_reward/mean": 6.764404296875, "rewards/helpfulness_reward/std": 0.6079351305961609, "rewards/safety_reward/mean": 8.14892578125, "rewards/safety_reward/std": 0.5828225016593933, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 125.2109375, "completions/mean_terminated_length": 125.2109375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.3482665269408785, "frac_reward_zero_std": 0.0, "grad_norm": 0.328457236289978, "kl": 3.63671875, "learning_rate": 5e-05, "loss": 0.0383, "num_tokens": 23060490.0, "reward": 6.7120361328125, "reward_std": 0.3594512343406677, "rewards/helpfulness_reward/mean": 6.7120361328125, "rewards/helpfulness_reward/std": 1.0867953300476074, "rewards/safety_reward/mean": 8.1864013671875, "rewards/safety_reward/std": 1.0841929912567139, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 123.4921875, "completions/mean_terminated_length": 123.4921875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.3486158414112305, "frac_reward_zero_std": 0.0, "grad_norm": 1.4189132452011108, "kl": 3.88671875, "learning_rate": 5e-05, "loss": 0.0314, "num_tokens": 23083033.0, "reward": 6.7711181640625, "reward_std": 0.3480232357978821, "rewards/helpfulness_reward/mean": 6.7711181640625, "rewards/helpfulness_reward/std": 1.0693345069885254, "rewards/safety_reward/mean": 8.007568359375, "rewards/safety_reward/std": 0.8571429252624512, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 123.1171875, "completions/mean_terminated_length": 123.1171875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.3489651558815824, "frac_reward_zero_std": 0.0, "grad_norm": 0.2879256010055542, "kl": 3.62109375, "learning_rate": 5e-05, "loss": 0.0253, "num_tokens": 23102360.0, "reward": 7.035369873046875, "reward_std": 0.4014226794242859, "rewards/helpfulness_reward/mean": 7.035369873046875, "rewards/helpfulness_reward/std": 0.8154723644256592, "rewards/safety_reward/mean": 8.299072265625, "rewards/safety_reward/std": 0.6307765245437622, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.4921875, "completions/mean_terminated_length": 123.4921875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.3493144703519343, "frac_reward_zero_std": 0.0, "grad_norm": 0.27727609872817993, "kl": 3.859375, "learning_rate": 5e-05, "loss": 0.0315, "num_tokens": 23122063.0, "reward": 6.859375, "reward_std": 0.346643328666687, "rewards/helpfulness_reward/mean": 6.859375, "rewards/helpfulness_reward/std": 0.533755362033844, "rewards/safety_reward/mean": 8.154296875, "rewards/safety_reward/std": 0.6041776537895203, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 122.984375, "completions/mean_terminated_length": 122.984375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.34966378482228627, "frac_reward_zero_std": 0.0, "grad_norm": 0.4718151390552521, "kl": 3.99609375, "learning_rate": 5e-05, "loss": 0.0277, "num_tokens": 23142213.0, "reward": 6.900634765625, "reward_std": 0.295914888381958, "rewards/helpfulness_reward/mean": 6.900634765625, "rewards/helpfulness_reward/std": 0.49451708793640137, "rewards/safety_reward/mean": 8.248046875, "rewards/safety_reward/std": 0.5905197262763977, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 122.921875, "completions/mean_terminated_length": 122.921875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.3500130992926382, "frac_reward_zero_std": 0.0, "grad_norm": 0.20715047419071198, "kl": 3.677734375, "learning_rate": 5e-05, "loss": 0.0321, "num_tokens": 23162763.0, "reward": 6.900634765625, "reward_std": 0.18090477585792542, "rewards/helpfulness_reward/mean": 6.900634765625, "rewards/helpfulness_reward/std": 0.5693269371986389, "rewards/safety_reward/mean": 8.322998046875, "rewards/safety_reward/std": 0.5191236734390259, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 123.0078125, "completions/mean_terminated_length": 123.0078125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.35036241376299015, "frac_reward_zero_std": 0.0, "grad_norm": 0.2450132817029953, "kl": 3.650390625, "learning_rate": 5e-05, "loss": 0.0357, "num_tokens": 23182540.0, "reward": 7.029052734375, "reward_std": 0.2667156159877777, "rewards/helpfulness_reward/mean": 7.029052734375, "rewards/helpfulness_reward/std": 0.4345227777957916, "rewards/safety_reward/mean": 8.3720703125, "rewards/safety_reward/std": 0.5059536695480347, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.2578125, "completions/mean_terminated_length": 123.2578125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.35071172823334207, "frac_reward_zero_std": 0.0, "grad_norm": 0.315085768699646, "kl": 3.72265625, "learning_rate": 5e-05, "loss": 0.03, "num_tokens": 23202821.0, "reward": 6.9453125, "reward_std": 0.2273060828447342, "rewards/helpfulness_reward/mean": 6.9453125, "rewards/helpfulness_reward/std": 0.5392875671386719, "rewards/safety_reward/mean": 8.437255859375, "rewards/safety_reward/std": 0.38566479086875916, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 122.453125, "completions/mean_terminated_length": 122.453125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.351061042703694, "frac_reward_zero_std": 0.0, "grad_norm": 0.29105064272880554, "kl": 3.73046875, "learning_rate": 5e-05, "loss": 0.0246, "num_tokens": 23223335.0, "reward": 6.878662109375, "reward_std": 0.33357974886894226, "rewards/helpfulness_reward/mean": 6.878662109375, "rewards/helpfulness_reward/std": 0.7787767648696899, "rewards/safety_reward/mean": 8.300537109375, "rewards/safety_reward/std": 0.5440449118614197, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.4140625, "completions/mean_terminated_length": 123.4140625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.35141035717404595, "frac_reward_zero_std": 0.0, "grad_norm": 0.3639136254787445, "kl": 3.837890625, "learning_rate": 5e-05, "loss": 0.029, "num_tokens": 23243284.0, "reward": 6.832763671875, "reward_std": 0.27602964639663696, "rewards/helpfulness_reward/mean": 6.832763671875, "rewards/helpfulness_reward/std": 0.5556586980819702, "rewards/safety_reward/mean": 8.38916015625, "rewards/safety_reward/std": 0.40268269181251526, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 122.8203125, "completions/mean_terminated_length": 122.8203125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.35175967164439786, "frac_reward_zero_std": 0.0, "grad_norm": 0.25412166118621826, "kl": 3.6796875, "learning_rate": 5e-05, "loss": 0.0228, "num_tokens": 23264581.0, "reward": 7.082763671875, "reward_std": 0.23253507912158966, "rewards/helpfulness_reward/mean": 7.082763671875, "rewards/helpfulness_reward/std": 0.5508358478546143, "rewards/safety_reward/mean": 8.336181640625, "rewards/safety_reward/std": 0.3930949568748474, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 123.5625, "completions/mean_terminated_length": 123.5625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3521089861147498, "frac_reward_zero_std": 0.0, "grad_norm": 0.37033993005752563, "kl": 3.75, "learning_rate": 5e-05, "loss": 0.0309, "num_tokens": 23285405.0, "reward": 6.9197998046875, "reward_std": 0.2871144115924835, "rewards/helpfulness_reward/mean": 6.9197998046875, "rewards/helpfulness_reward/std": 0.6913767457008362, "rewards/safety_reward/mean": 8.393310546875, "rewards/safety_reward/std": 0.6073102951049805, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 123.2578125, "completions/mean_terminated_length": 123.2578125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.35245830058510175, "frac_reward_zero_std": 0.0, "grad_norm": 0.27572280168533325, "kl": 3.666015625, "learning_rate": 5e-05, "loss": 0.031, "num_tokens": 23306486.0, "reward": 7.207275390625, "reward_std": 0.21528260409832, "rewards/helpfulness_reward/mean": 7.207275390625, "rewards/helpfulness_reward/std": 0.6272212862968445, "rewards/safety_reward/mean": 8.644775390625, "rewards/safety_reward/std": 0.5146845579147339, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 123.3828125, "completions/mean_terminated_length": 123.3828125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.35280761505545366, "frac_reward_zero_std": 0.0, "grad_norm": 0.3026842474937439, "kl": 3.943359375, "learning_rate": 5e-05, "loss": 0.0324, "num_tokens": 23326335.0, "reward": 7.06005859375, "reward_std": 0.24140331149101257, "rewards/helpfulness_reward/mean": 7.06005859375, "rewards/helpfulness_reward/std": 0.48308318853378296, "rewards/safety_reward/mean": 8.41455078125, "rewards/safety_reward/std": 0.46883174777030945, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 123.0859375, "completions/mean_terminated_length": 123.0859375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.35315692952580563, "frac_reward_zero_std": 0.0, "grad_norm": 0.3580208420753479, "kl": 3.8125, "learning_rate": 5e-05, "loss": 0.0334, "num_tokens": 23346314.0, "reward": 7.09228515625, "reward_std": 0.2297808676958084, "rewards/helpfulness_reward/mean": 7.09228515625, "rewards/helpfulness_reward/std": 0.6170615553855896, "rewards/safety_reward/mean": 8.370361328125, "rewards/safety_reward/std": 0.4239698052406311, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 122.953125, "completions/mean_terminated_length": 122.953125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.35350624399615754, "frac_reward_zero_std": 0.0, "grad_norm": 0.3612865209579468, "kl": 3.853515625, "learning_rate": 5e-05, "loss": 0.0299, "num_tokens": 23366476.0, "reward": 6.81195068359375, "reward_std": 0.48002052307128906, "rewards/helpfulness_reward/mean": 6.81195068359375, "rewards/helpfulness_reward/std": 1.0629079341888428, "rewards/safety_reward/mean": 8.281341552734375, "rewards/safety_reward/std": 1.2035934925079346, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 124.5546875, "completions/mean_terminated_length": 124.5546875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.35385555846650946, "frac_reward_zero_std": 0.0, "grad_norm": 0.25046080350875854, "kl": 3.728515625, "learning_rate": 5e-05, "loss": 0.0407, "num_tokens": 23387523.0, "reward": 6.858154296875, "reward_std": 0.32021963596343994, "rewards/helpfulness_reward/mean": 6.858154296875, "rewards/helpfulness_reward/std": 0.5945317149162292, "rewards/safety_reward/mean": 8.30810546875, "rewards/safety_reward/std": 0.46921294927597046, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 123.9375, "completions/mean_terminated_length": 123.9375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.3542048729368614, "frac_reward_zero_std": 0.0, "grad_norm": 0.28864216804504395, "kl": 3.60546875, "learning_rate": 5e-05, "loss": 0.0325, "num_tokens": 23408067.0, "reward": 6.986328125, "reward_std": 0.3868858218193054, "rewards/helpfulness_reward/mean": 6.986328125, "rewards/helpfulness_reward/std": 0.6371330618858337, "rewards/safety_reward/mean": 8.292236328125, "rewards/safety_reward/std": 0.5693560242652893, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 123.8359375, "completions/mean_terminated_length": 123.8359375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.35455418740721334, "frac_reward_zero_std": 0.0, "grad_norm": 0.29941684007644653, "kl": 3.802734375, "learning_rate": 5e-05, "loss": 0.037, "num_tokens": 23428230.0, "reward": 6.85595703125, "reward_std": 0.2982313632965088, "rewards/helpfulness_reward/mean": 6.85595703125, "rewards/helpfulness_reward/std": 0.5401599407196045, "rewards/safety_reward/mean": 8.262451171875, "rewards/safety_reward/std": 0.4955253005027771, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 124.1171875, "completions/mean_terminated_length": 124.1171875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.35490350187756525, "frac_reward_zero_std": 0.0, "grad_norm": 0.28996455669403076, "kl": 3.80078125, "learning_rate": 5e-05, "loss": 0.0357, "num_tokens": 23447901.0, "reward": 7.0020751953125, "reward_std": 0.3783756494522095, "rewards/helpfulness_reward/mean": 7.0020751953125, "rewards/helpfulness_reward/std": 0.6963498592376709, "rewards/safety_reward/mean": 8.39794921875, "rewards/safety_reward/std": 0.5356749892234802, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.8984375, "completions/mean_terminated_length": 123.8984375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.3552528163479172, "frac_reward_zero_std": 0.0, "grad_norm": 0.2181229144334793, "kl": 3.716796875, "learning_rate": 5e-05, "loss": 0.036, "num_tokens": 23467520.0, "reward": 7.0675048828125, "reward_std": 0.29328131675720215, "rewards/helpfulness_reward/mean": 7.0675048828125, "rewards/helpfulness_reward/std": 0.7454492449760437, "rewards/safety_reward/mean": 8.3193359375, "rewards/safety_reward/std": 0.6773582696914673, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 123.96875, "completions/mean_terminated_length": 123.96875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.35560213081826914, "frac_reward_zero_std": 0.0, "grad_norm": 0.24552661180496216, "kl": 3.755859375, "learning_rate": 5e-05, "loss": 0.0376, "num_tokens": 23487148.0, "reward": 6.974365234375, "reward_std": 0.24196821451187134, "rewards/helpfulness_reward/mean": 6.974365234375, "rewards/helpfulness_reward/std": 0.44926589727401733, "rewards/safety_reward/mean": 8.35546875, "rewards/safety_reward/std": 0.30565837025642395, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 124.1953125, "completions/mean_terminated_length": 124.1953125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.3559514452886211, "frac_reward_zero_std": 0.0, "grad_norm": 0.33219054341316223, "kl": 3.921875, "learning_rate": 5e-05, "loss": 0.041, "num_tokens": 23506493.0, "reward": 7.095458984375, "reward_std": 0.20062552392482758, "rewards/helpfulness_reward/mean": 7.095458984375, "rewards/helpfulness_reward/std": 0.41672977805137634, "rewards/safety_reward/mean": 8.4365234375, "rewards/safety_reward/std": 0.4924837648868561, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 124.0859375, "completions/mean_terminated_length": 124.0859375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.356300759758973, "frac_reward_zero_std": 0.0, "grad_norm": 0.3653183877468109, "kl": 3.8125, "learning_rate": 5e-05, "loss": 0.0314, "num_tokens": 23527112.0, "reward": 6.692626953125, "reward_std": 0.31847018003463745, "rewards/helpfulness_reward/mean": 6.692626953125, "rewards/helpfulness_reward/std": 0.6865237355232239, "rewards/safety_reward/mean": 8.292236328125, "rewards/safety_reward/std": 0.4093253016471863, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 123.8125, "completions/mean_terminated_length": 123.8125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.35665007422932493, "frac_reward_zero_std": 0.0, "grad_norm": 0.31617534160614014, "kl": 3.74609375, "learning_rate": 5e-05, "loss": 0.0328, "num_tokens": 23547008.0, "reward": 7.001708984375, "reward_std": 0.262792706489563, "rewards/helpfulness_reward/mean": 7.001708984375, "rewards/helpfulness_reward/std": 0.5314931869506836, "rewards/safety_reward/mean": 8.283203125, "rewards/safety_reward/std": 0.6680976152420044, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 143.765625, "completions/mean_terminated_length": 143.765625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.3569993886996769, "frac_reward_zero_std": 0.0, "grad_norm": 0.2725832760334015, "kl": 3.19140625, "learning_rate": 5e-05, "loss": 0.0135, "num_tokens": 23572162.0, "reward": 6.2909088134765625, "reward_std": 0.31526613235473633, "rewards/helpfulness_reward/mean": 6.2909088134765625, "rewards/helpfulness_reward/std": 1.8187466859817505, "rewards/safety_reward/mean": 7.509513854980469, "rewards/safety_reward/std": 2.2031824588775635, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 127.3984375, "completions/mean_terminated_length": 127.3984375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3573487031700288, "frac_reward_zero_std": 0.0, "grad_norm": 0.3287450671195984, "kl": 3.564453125, "learning_rate": 5e-05, "loss": 0.0479, "num_tokens": 23592637.0, "reward": 6.7420654296875, "reward_std": 0.47943422198295593, "rewards/helpfulness_reward/mean": 6.7420654296875, "rewards/helpfulness_reward/std": 1.303890347480774, "rewards/safety_reward/mean": 7.9886474609375, "rewards/safety_reward/std": 1.5412877798080444, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.8203125, "completions/mean_terminated_length": 123.8203125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.3576980176403807, "frac_reward_zero_std": 0.0, "grad_norm": 0.28521278500556946, "kl": 3.6875, "learning_rate": 5e-05, "loss": 0.0351, "num_tokens": 23612390.0, "reward": 7.09521484375, "reward_std": 0.2142806351184845, "rewards/helpfulness_reward/mean": 7.09521484375, "rewards/helpfulness_reward/std": 0.4140186905860901, "rewards/safety_reward/mean": 8.384765625, "rewards/safety_reward/std": 0.423927366733551, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 124.015625, "completions/mean_terminated_length": 124.015625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.3580473321107327, "frac_reward_zero_std": 0.0, "grad_norm": 0.33221831917762756, "kl": 3.79296875, "learning_rate": 5e-05, "loss": 0.0331, "num_tokens": 23632104.0, "reward": 6.99560546875, "reward_std": 0.26752784848213196, "rewards/helpfulness_reward/mean": 6.99560546875, "rewards/helpfulness_reward/std": 0.5518280863761902, "rewards/safety_reward/mean": 8.4658203125, "rewards/safety_reward/std": 0.3827817440032959, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 123.6015625, "completions/mean_terminated_length": 123.6015625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3583966465810846, "frac_reward_zero_std": 0.0, "grad_norm": 0.5721572041511536, "kl": 3.90625, "learning_rate": 5e-05, "loss": 0.0369, "num_tokens": 23651973.0, "reward": 7.0458984375, "reward_std": 0.2933475971221924, "rewards/helpfulness_reward/mean": 7.0458984375, "rewards/helpfulness_reward/std": 0.5121244192123413, "rewards/safety_reward/mean": 8.34765625, "rewards/safety_reward/std": 0.5649961233139038, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.984375, "completions/mean_terminated_length": 123.984375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.3587459610514366, "frac_reward_zero_std": 0.0, "grad_norm": 0.22555553913116455, "kl": 3.609375, "learning_rate": 5e-05, "loss": 0.0365, "num_tokens": 23671579.0, "reward": 7.29248046875, "reward_std": 0.23850032687187195, "rewards/helpfulness_reward/mean": 7.29248046875, "rewards/helpfulness_reward/std": 0.5571946501731873, "rewards/safety_reward/mean": 8.34619140625, "rewards/safety_reward/std": 0.5647185444831848, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.6796875, "completions/mean_terminated_length": 123.6796875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.3590952755217885, "frac_reward_zero_std": 0.0, "grad_norm": 0.2754611074924469, "kl": 3.779296875, "learning_rate": 5e-05, "loss": 0.033, "num_tokens": 23692202.0, "reward": 6.99072265625, "reward_std": 0.26680654287338257, "rewards/helpfulness_reward/mean": 6.99072265625, "rewards/helpfulness_reward/std": 0.6547350883483887, "rewards/safety_reward/mean": 8.37548828125, "rewards/safety_reward/std": 0.5579565763473511, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.6640625, "completions/mean_terminated_length": 123.6640625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.3594445899921404, "frac_reward_zero_std": 0.0, "grad_norm": 0.24284575879573822, "kl": 3.673828125, "learning_rate": 5e-05, "loss": 0.0322, "num_tokens": 23711671.0, "reward": 6.923095703125, "reward_std": 0.22385722398757935, "rewards/helpfulness_reward/mean": 6.923095703125, "rewards/helpfulness_reward/std": 0.540382981300354, "rewards/safety_reward/mean": 8.343017578125, "rewards/safety_reward/std": 0.4669579565525055, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.46875, "completions/mean_terminated_length": 123.46875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.3597939044624924, "frac_reward_zero_std": 0.0, "grad_norm": 0.2640675902366638, "kl": 3.654296875, "learning_rate": 5e-05, "loss": 0.0326, "num_tokens": 23733179.0, "reward": 6.904052734375, "reward_std": 0.23039549589157104, "rewards/helpfulness_reward/mean": 6.904052734375, "rewards/helpfulness_reward/std": 0.7496634125709534, "rewards/safety_reward/mean": 8.093017578125, "rewards/safety_reward/std": 0.6796874403953552, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 124.1015625, "completions/mean_terminated_length": 124.1015625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.3601432189328443, "frac_reward_zero_std": 0.0, "grad_norm": 0.31614381074905396, "kl": 3.720703125, "learning_rate": 5e-05, "loss": 0.0342, "num_tokens": 23752744.0, "reward": 7.146240234375, "reward_std": 0.22953879833221436, "rewards/helpfulness_reward/mean": 7.146240234375, "rewards/helpfulness_reward/std": 0.484693318605423, "rewards/safety_reward/mean": 8.338134765625, "rewards/safety_reward/std": 0.5074105262756348, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 124.390625, "completions/mean_terminated_length": 124.390625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.3604925334031962, "frac_reward_zero_std": 0.0, "grad_norm": 0.26799601316452026, "kl": 3.66796875, "learning_rate": 5e-05, "loss": 0.0396, "num_tokens": 23772762.0, "reward": 6.87451171875, "reward_std": 0.2566344439983368, "rewards/helpfulness_reward/mean": 6.87451171875, "rewards/helpfulness_reward/std": 0.4470103979110718, "rewards/safety_reward/mean": 8.25927734375, "rewards/safety_reward/std": 0.37036505341529846, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 124.515625, "completions/mean_terminated_length": 124.515625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.36084184787354817, "frac_reward_zero_std": 0.0, "grad_norm": 0.23551146686077118, "kl": 3.66015625, "learning_rate": 5e-05, "loss": 0.0338, "num_tokens": 23792780.0, "reward": 6.8388671875, "reward_std": 0.3219788074493408, "rewards/helpfulness_reward/mean": 6.8388671875, "rewards/helpfulness_reward/std": 0.6132938861846924, "rewards/safety_reward/mean": 8.222900390625, "rewards/safety_reward/std": 0.49361342191696167, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 124.4375, "completions/mean_terminated_length": 124.4375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.3611911623439001, "frac_reward_zero_std": 0.0, "grad_norm": 0.29628968238830566, "kl": 3.61328125, "learning_rate": 5e-05, "loss": 0.037, "num_tokens": 23812572.0, "reward": 6.923828125, "reward_std": 0.21518293023109436, "rewards/helpfulness_reward/mean": 6.923828125, "rewards/helpfulness_reward/std": 0.579372227191925, "rewards/safety_reward/mean": 8.21728515625, "rewards/safety_reward/std": 0.45936068892478943, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 139.6484375, "completions/mean_terminated_length": 139.6484375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.36154047681425205, "frac_reward_zero_std": 0.0, "grad_norm": 0.21305741369724274, "kl": 3.21875, "learning_rate": 5e-05, "loss": 0.0583, "num_tokens": 23836551.0, "reward": 6.4793701171875, "reward_std": 0.274527370929718, "rewards/helpfulness_reward/mean": 6.4793701171875, "rewards/helpfulness_reward/std": 2.001458168029785, "rewards/safety_reward/mean": 7.713531494140625, "rewards/safety_reward/std": 2.426076889038086, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 124.2734375, "completions/mean_terminated_length": 124.2734375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.36188979128460397, "frac_reward_zero_std": 0.0, "grad_norm": 0.3925880789756775, "kl": 3.828125, "learning_rate": 5e-05, "loss": 0.0364, "num_tokens": 23856314.0, "reward": 7.033935546875, "reward_std": 0.3963046371936798, "rewards/helpfulness_reward/mean": 7.033935546875, "rewards/helpfulness_reward/std": 0.8220406770706177, "rewards/safety_reward/mean": 8.242431640625, "rewards/safety_reward/std": 0.6885588765144348, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 124.140625, "completions/mean_terminated_length": 124.140625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.3622391057549559, "frac_reward_zero_std": 0.0, "grad_norm": 0.29413551092147827, "kl": 3.533203125, "learning_rate": 5e-05, "loss": 0.0367, "num_tokens": 23875996.0, "reward": 7.269287109375, "reward_std": 0.26568400859832764, "rewards/helpfulness_reward/mean": 7.269287109375, "rewards/helpfulness_reward/std": 0.47951263189315796, "rewards/safety_reward/mean": 8.448974609375, "rewards/safety_reward/std": 0.5120483636856079, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 124.328125, "completions/mean_terminated_length": 124.328125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.36258842022530785, "frac_reward_zero_std": 0.0, "grad_norm": 0.22737540304660797, "kl": 3.685546875, "learning_rate": 5e-05, "loss": 0.0352, "num_tokens": 23896470.0, "reward": 6.964111328125, "reward_std": 0.23178933560848236, "rewards/helpfulness_reward/mean": 6.964111328125, "rewards/helpfulness_reward/std": 0.44590991735458374, "rewards/safety_reward/mean": 8.46044921875, "rewards/safety_reward/std": 0.36849433183670044, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 124.1875, "completions/mean_terminated_length": 124.1875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.36293773469565976, "frac_reward_zero_std": 0.0, "grad_norm": 0.3459704518318176, "kl": 3.728515625, "learning_rate": 5e-05, "loss": 0.037, "num_tokens": 23916750.0, "reward": 7.027099609375, "reward_std": 0.24907058477401733, "rewards/helpfulness_reward/mean": 7.027099609375, "rewards/helpfulness_reward/std": 0.5450933575630188, "rewards/safety_reward/mean": 8.37646484375, "rewards/safety_reward/std": 0.5663534998893738, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 124.5625, "completions/mean_terminated_length": 124.5625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.3632870491660117, "frac_reward_zero_std": 0.0, "grad_norm": 0.27452990412712097, "kl": 3.58203125, "learning_rate": 5e-05, "loss": 0.0367, "num_tokens": 23936126.0, "reward": 7.1376953125, "reward_std": 0.25608372688293457, "rewards/helpfulness_reward/mean": 7.1376953125, "rewards/helpfulness_reward/std": 0.5121169090270996, "rewards/safety_reward/mean": 8.313720703125, "rewards/safety_reward/std": 0.5603963732719421, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 124.40625, "completions/mean_terminated_length": 124.40625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.36363636363636365, "frac_reward_zero_std": 0.0, "grad_norm": 0.32697704434394836, "kl": 3.859375, "learning_rate": 5e-05, "loss": 0.035, "num_tokens": 23955866.0, "reward": 6.8690185546875, "reward_std": 0.36142346262931824, "rewards/helpfulness_reward/mean": 6.8690185546875, "rewards/helpfulness_reward/std": 0.6803186535835266, "rewards/safety_reward/mean": 8.3466796875, "rewards/safety_reward/std": 0.6031928658485413, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 123.828125, "completions/mean_terminated_length": 123.828125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.36398567810671556, "frac_reward_zero_std": 0.0, "grad_norm": 1.7497366666793823, "kl": 4.806640625, "learning_rate": 5e-05, "loss": 0.0364, "num_tokens": 23975700.0, "reward": 6.933837890625, "reward_std": 0.5834858417510986, "rewards/helpfulness_reward/mean": 6.933837890625, "rewards/helpfulness_reward/std": 1.119850754737854, "rewards/safety_reward/mean": 8.114990234375, "rewards/safety_reward/std": 0.6965330839157104, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 124.09375, "completions/mean_terminated_length": 124.09375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.36433499257706753, "frac_reward_zero_std": 0.0, "grad_norm": 0.36511918902397156, "kl": 3.888671875, "learning_rate": 5e-05, "loss": 0.0357, "num_tokens": 23996152.0, "reward": 7.194091796875, "reward_std": 0.2116793990135193, "rewards/helpfulness_reward/mean": 7.194091796875, "rewards/helpfulness_reward/std": 0.5206378698348999, "rewards/safety_reward/mean": 8.55419921875, "rewards/safety_reward/std": 0.48357439041137695, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 124.25, "completions/mean_terminated_length": 124.25, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.36468430704741944, "frac_reward_zero_std": 0.0, "grad_norm": 0.2788059115409851, "kl": 3.89453125, "learning_rate": 5e-05, "loss": 0.034, "num_tokens": 24015816.0, "reward": 7.14501953125, "reward_std": 0.2816089391708374, "rewards/helpfulness_reward/mean": 7.14501953125, "rewards/helpfulness_reward/std": 0.5952663421630859, "rewards/safety_reward/mean": 8.330810546875, "rewards/safety_reward/std": 0.6946049928665161, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 124.4765625, "completions/mean_terminated_length": 124.4765625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.36503362151777136, "frac_reward_zero_std": 0.0, "grad_norm": 0.1957363337278366, "kl": 3.845703125, "learning_rate": 5e-05, "loss": 0.0354, "num_tokens": 24036021.0, "reward": 6.990966796875, "reward_std": 0.15390262007713318, "rewards/helpfulness_reward/mean": 6.990966796875, "rewards/helpfulness_reward/std": 0.5029922723770142, "rewards/safety_reward/mean": 8.430419921875, "rewards/safety_reward/std": 0.3571716547012329, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 124.296875, "completions/mean_terminated_length": 124.296875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.3653829359881233, "frac_reward_zero_std": 0.0, "grad_norm": 0.23168614506721497, "kl": 3.646484375, "learning_rate": 5e-05, "loss": 0.0317, "num_tokens": 24055859.0, "reward": 7.142578125, "reward_std": 0.29621073603630066, "rewards/helpfulness_reward/mean": 7.142578125, "rewards/helpfulness_reward/std": 0.50996994972229, "rewards/safety_reward/mean": 8.3544921875, "rewards/safety_reward/std": 0.7091963887214661, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 124.4921875, "completions/mean_terminated_length": 124.4921875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.36573225045847524, "frac_reward_zero_std": 0.0, "grad_norm": 0.2791760563850403, "kl": 3.986328125, "learning_rate": 5e-05, "loss": 0.0356, "num_tokens": 24075634.0, "reward": 6.9158935546875, "reward_std": 0.46054452657699585, "rewards/helpfulness_reward/mean": 6.9158935546875, "rewards/helpfulness_reward/std": 0.7628142833709717, "rewards/safety_reward/mean": 8.180908203125, "rewards/safety_reward/std": 0.8327711224555969, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.9921875, "completions/mean_terminated_length": 123.9921875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.36608156492882715, "frac_reward_zero_std": 0.0, "grad_norm": 0.37864744663238525, "kl": 3.8046875, "learning_rate": 5e-05, "loss": 0.0252, "num_tokens": 24096201.0, "reward": 6.743980407714844, "reward_std": 0.4512878358364105, "rewards/helpfulness_reward/mean": 6.743980407714844, "rewards/helpfulness_reward/std": 0.835516095161438, "rewards/safety_reward/mean": 8.17431640625, "rewards/safety_reward/std": 0.6287487149238586, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 124.5859375, "completions/mean_terminated_length": 124.5859375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.3664308793991791, "frac_reward_zero_std": 0.0, "grad_norm": 0.3175310492515564, "kl": 3.787109375, "learning_rate": 5e-05, "loss": 0.0378, "num_tokens": 24117084.0, "reward": 7.076416015625, "reward_std": 0.3057834804058075, "rewards/helpfulness_reward/mean": 7.076416015625, "rewards/helpfulness_reward/std": 0.7261293530464172, "rewards/safety_reward/mean": 8.31591796875, "rewards/safety_reward/std": 0.5287361145019531, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 121.921875, "completions/mean_terminated_length": 121.921875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.36678019386953103, "frac_reward_zero_std": 0.0, "grad_norm": 1.2367223501205444, "kl": 4.328125, "learning_rate": 5e-05, "loss": 0.0355, "num_tokens": 24139114.0, "reward": 6.7731475830078125, "reward_std": 0.35442453622817993, "rewards/helpfulness_reward/mean": 6.7731475830078125, "rewards/helpfulness_reward/std": 1.5484274625778198, "rewards/safety_reward/mean": 8.0279541015625, "rewards/safety_reward/std": 1.5170361995697021, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 124.3203125, "completions/mean_terminated_length": 124.3203125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.367129508339883, "frac_reward_zero_std": 0.0, "grad_norm": 0.29901549220085144, "kl": 3.716796875, "learning_rate": 5e-05, "loss": 0.0328, "num_tokens": 24159483.0, "reward": 7.089599609375, "reward_std": 0.4878734350204468, "rewards/helpfulness_reward/mean": 7.089599609375, "rewards/helpfulness_reward/std": 0.7747551202774048, "rewards/safety_reward/mean": 8.334228515625, "rewards/safety_reward/std": 0.6077789664268494, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.875, "completions/mean_terminated_length": 123.875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.3674788228102349, "frac_reward_zero_std": 0.0, "grad_norm": 0.23826293647289276, "kl": 3.75390625, "learning_rate": 5e-05, "loss": 0.0224, "num_tokens": 24179467.0, "reward": 7.03009033203125, "reward_std": 0.5736564993858337, "rewards/helpfulness_reward/mean": 7.03009033203125, "rewards/helpfulness_reward/std": 1.0290476083755493, "rewards/safety_reward/mean": 8.433837890625, "rewards/safety_reward/std": 0.6618402004241943, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 124.9609375, "completions/mean_terminated_length": 124.9609375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.36782813728058683, "frac_reward_zero_std": 0.0, "grad_norm": 1.0329923629760742, "kl": 4.01953125, "learning_rate": 5e-05, "loss": 0.0455, "num_tokens": 24199750.0, "reward": 6.938232421875, "reward_std": 0.32296448945999146, "rewards/helpfulness_reward/mean": 6.938232421875, "rewards/helpfulness_reward/std": 0.8063865900039673, "rewards/safety_reward/mean": 8.41796875, "rewards/safety_reward/std": 0.6829889416694641, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 124.75, "completions/mean_terminated_length": 124.75, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.3681774517509388, "frac_reward_zero_std": 0.0, "grad_norm": 0.2580507695674896, "kl": 3.75390625, "learning_rate": 5e-05, "loss": 0.0388, "num_tokens": 24219174.0, "reward": 6.986328125, "reward_std": 0.3307257890701294, "rewards/helpfulness_reward/mean": 6.986328125, "rewards/helpfulness_reward/std": 0.6247877478599548, "rewards/safety_reward/mean": 8.315185546875, "rewards/safety_reward/std": 0.5516091585159302, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 128.140625, "completions/mean_terminated_length": 128.140625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3685267662212907, "frac_reward_zero_std": 0.0, "grad_norm": 0.3275890648365021, "kl": 3.73828125, "learning_rate": 5e-05, "loss": 0.0361, "num_tokens": 24241080.0, "reward": 6.35443115234375, "reward_std": 0.5111538171768188, "rewards/helpfulness_reward/mean": 6.35443115234375, "rewards/helpfulness_reward/std": 1.3856755495071411, "rewards/safety_reward/mean": 7.83441162109375, "rewards/safety_reward/std": 1.6321301460266113, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 124.3203125, "completions/mean_terminated_length": 124.3203125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.3688760806916426, "frac_reward_zero_std": 0.0, "grad_norm": 0.3022119998931885, "kl": 3.77734375, "learning_rate": 5e-05, "loss": 0.0328, "num_tokens": 24261321.0, "reward": 6.8662109375, "reward_std": 0.46285325288772583, "rewards/helpfulness_reward/mean": 6.8662109375, "rewards/helpfulness_reward/std": 0.7175796031951904, "rewards/safety_reward/mean": 8.2412109375, "rewards/safety_reward/std": 0.7516379952430725, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 124.5078125, "completions/mean_terminated_length": 124.5078125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.3692253951619946, "frac_reward_zero_std": 0.0, "grad_norm": 0.328652560710907, "kl": 3.71484375, "learning_rate": 5e-05, "loss": 0.0332, "num_tokens": 24281146.0, "reward": 6.868896484375, "reward_std": 0.3488815128803253, "rewards/helpfulness_reward/mean": 6.868896484375, "rewards/helpfulness_reward/std": 0.4601166844367981, "rewards/safety_reward/mean": 8.370361328125, "rewards/safety_reward/std": 0.39116033911705017, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 111.7734375, "completions/mean_terminated_length": 111.7734375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3695747096323465, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5993388891220093, "kl": 3.751953125, "learning_rate": 5e-05, "loss": 0.0119, "num_tokens": 24302389.0, "reward": 6.531494140625, "reward_std": 0.4314262866973877, "rewards/helpfulness_reward/mean": 6.531494140625, "rewards/helpfulness_reward/std": 1.66111421585083, "rewards/safety_reward/mean": 7.7242431640625, "rewards/safety_reward/std": 2.1551554203033447, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 124.65625, "completions/mean_terminated_length": 124.65625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.3699240241026985, "frac_reward_zero_std": 0.0, "grad_norm": 0.3091287314891815, "kl": 3.732421875, "learning_rate": 5e-05, "loss": 0.0381, "num_tokens": 24322865.0, "reward": 7.0364990234375, "reward_std": 0.38752639293670654, "rewards/helpfulness_reward/mean": 7.0364990234375, "rewards/helpfulness_reward/std": 0.7054172158241272, "rewards/safety_reward/mean": 8.311767578125, "rewards/safety_reward/std": 0.5789170265197754, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 122.1484375, "completions/mean_terminated_length": 122.1484375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.3702733385730504, "frac_reward_zero_std": 0.0, "grad_norm": 31.690534591674805, "kl": 13.755859375, "learning_rate": 5e-05, "loss": 0.1314, "num_tokens": 24344508.0, "reward": 6.337360382080078, "reward_std": 0.3826621174812317, "rewards/helpfulness_reward/mean": 6.337360382080078, "rewards/helpfulness_reward/std": 1.5753000974655151, "rewards/safety_reward/mean": 7.837181091308594, "rewards/safety_reward/std": 1.7706490755081177, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 124.328125, "completions/mean_terminated_length": 124.328125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3706226530434023, "frac_reward_zero_std": 0.0, "grad_norm": 0.3161573112010956, "kl": 3.619140625, "learning_rate": 5e-05, "loss": 0.0307, "num_tokens": 24365062.0, "reward": 6.6646728515625, "reward_std": 0.5387594699859619, "rewards/helpfulness_reward/mean": 6.6646728515625, "rewards/helpfulness_reward/std": 0.8101445436477661, "rewards/safety_reward/mean": 8.178466796875, "rewards/safety_reward/std": 0.6886081099510193, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.328125, "completions/mean_terminated_length": 123.328125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.3709719675137543, "frac_reward_zero_std": 0.0, "grad_norm": 0.31744295358657837, "kl": 3.623046875, "learning_rate": 5e-05, "loss": 0.0199, "num_tokens": 24385000.0, "reward": 6.7825927734375, "reward_std": 0.5573351383209229, "rewards/helpfulness_reward/mean": 6.7825927734375, "rewards/helpfulness_reward/std": 0.7498462796211243, "rewards/safety_reward/mean": 8.093505859375, "rewards/safety_reward/std": 0.6173684597015381, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 125.15625, "completions/mean_terminated_length": 125.15625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.3713212819841062, "frac_reward_zero_std": 0.0, "grad_norm": 0.3979454040527344, "kl": 3.91796875, "learning_rate": 5e-05, "loss": 0.039, "num_tokens": 24405444.0, "reward": 6.77685546875, "reward_std": 0.4053472876548767, "rewards/helpfulness_reward/mean": 6.77685546875, "rewards/helpfulness_reward/std": 0.6707279086112976, "rewards/safety_reward/mean": 8.279052734375, "rewards/safety_reward/std": 0.5000489354133606, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 123.3671875, "completions/mean_terminated_length": 123.3671875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.3716705964544581, "frac_reward_zero_std": 0.0, "grad_norm": 0.3871913552284241, "kl": 3.55859375, "learning_rate": 5e-05, "loss": 0.0252, "num_tokens": 24425075.0, "reward": 6.8183441162109375, "reward_std": 0.5589699745178223, "rewards/helpfulness_reward/mean": 6.8183441162109375, "rewards/helpfulness_reward/std": 0.8763617873191833, "rewards/safety_reward/mean": 8.1722412109375, "rewards/safety_reward/std": 0.7985714673995972, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 119.5625, "completions/mean_terminated_length": 119.5625, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.37201991092481007, "frac_reward_zero_std": 0.0, "grad_norm": 0.32434388995170593, "kl": 3.63671875, "learning_rate": 5e-05, "loss": 0.0382, "num_tokens": 24445771.0, "reward": 6.7380523681640625, "reward_std": 0.47052431106567383, "rewards/helpfulness_reward/mean": 6.7380523681640625, "rewards/helpfulness_reward/std": 1.2984157800674438, "rewards/safety_reward/mean": 7.9731903076171875, "rewards/safety_reward/std": 1.7036633491516113, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 123.9921875, "completions/mean_terminated_length": 123.9921875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.372369225395162, "frac_reward_zero_std": 0.0, "grad_norm": 0.28682741522789, "kl": 3.537109375, "learning_rate": 5e-05, "loss": 0.0277, "num_tokens": 24465754.0, "reward": 6.892822265625, "reward_std": 0.544150710105896, "rewards/helpfulness_reward/mean": 6.892822265625, "rewards/helpfulness_reward/std": 1.0178923606872559, "rewards/safety_reward/mean": 8.0111083984375, "rewards/safety_reward/std": 0.7874365448951721, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 123.984375, "completions/mean_terminated_length": 123.984375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.37271853986551395, "frac_reward_zero_std": 0.0, "grad_norm": 0.2567194998264313, "kl": 3.546875, "learning_rate": 5e-05, "loss": 0.035, "num_tokens": 24485088.0, "reward": 6.9547119140625, "reward_std": 0.37369826436042786, "rewards/helpfulness_reward/mean": 6.9547119140625, "rewards/helpfulness_reward/std": 0.6954036355018616, "rewards/safety_reward/mean": 8.39599609375, "rewards/safety_reward/std": 0.5647253394126892, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 123.140625, "completions/mean_terminated_length": 123.140625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.37306785433586587, "frac_reward_zero_std": 0.0, "grad_norm": 0.31092584133148193, "kl": 3.5078125, "learning_rate": 5e-05, "loss": 0.0302, "num_tokens": 24505394.0, "reward": 6.7874755859375, "reward_std": 0.4509502947330475, "rewards/helpfulness_reward/mean": 6.7874755859375, "rewards/helpfulness_reward/std": 0.7236288189888, "rewards/safety_reward/mean": 8.104248046875, "rewards/safety_reward/std": 0.6334114074707031, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 123.015625, "completions/mean_terminated_length": 123.015625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.3734171688062178, "frac_reward_zero_std": 0.0, "grad_norm": 0.3033026158809662, "kl": 3.759765625, "learning_rate": 5e-05, "loss": 0.0353, "num_tokens": 24525036.0, "reward": 6.6435546875, "reward_std": 0.3993479013442993, "rewards/helpfulness_reward/mean": 6.6435546875, "rewards/helpfulness_reward/std": 0.7055765390396118, "rewards/safety_reward/mean": 8.196044921875, "rewards/safety_reward/std": 0.5450417399406433, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1343.0, "completions/max_terminated_length": 1343.0, "completions/mean_length": 162.21875, "completions/mean_terminated_length": 162.21875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.37376648327656975, "frac_reward_zero_std": 0.0, "grad_norm": 0.2754082977771759, "kl": 3.091796875, "learning_rate": 5e-05, "loss": 0.0344, "num_tokens": 24551544.0, "reward": 6.0052337646484375, "reward_std": 0.5251710414886475, "rewards/helpfulness_reward/mean": 6.0052337646484375, "rewards/helpfulness_reward/std": 2.240507125854492, "rewards/safety_reward/mean": 7.175804138183594, "rewards/safety_reward/std": 2.602161169052124, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 123.296875, "completions/mean_terminated_length": 123.296875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.37411579774692166, "frac_reward_zero_std": 0.0, "grad_norm": 0.2567051947116852, "kl": 3.5390625, "learning_rate": 5e-05, "loss": 0.0346, "num_tokens": 24571358.0, "reward": 6.8687744140625, "reward_std": 0.35922712087631226, "rewards/helpfulness_reward/mean": 6.8687744140625, "rewards/helpfulness_reward/std": 0.6975220441818237, "rewards/safety_reward/mean": 8.424072265625, "rewards/safety_reward/std": 0.5143380165100098, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 125.5, "completions/mean_terminated_length": 125.5, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.3744651122172736, "frac_reward_zero_std": 0.0, "grad_norm": 0.368326336145401, "kl": 3.66015625, "learning_rate": 5e-05, "loss": 0.035, "num_tokens": 24593870.0, "reward": 6.3509674072265625, "reward_std": 0.4026340842247009, "rewards/helpfulness_reward/mean": 6.3509674072265625, "rewards/helpfulness_reward/std": 1.6421256065368652, "rewards/safety_reward/mean": 7.60455322265625, "rewards/safety_reward/std": 1.9479453563690186, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 124.21875, "completions/mean_terminated_length": 124.21875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.37481442668762555, "frac_reward_zero_std": 0.0, "grad_norm": 0.3082959055900574, "kl": 3.33203125, "learning_rate": 5e-05, "loss": 0.0498, "num_tokens": 24615554.0, "reward": 6.459564208984375, "reward_std": 0.5456075668334961, "rewards/helpfulness_reward/mean": 6.459564208984375, "rewards/helpfulness_reward/std": 2.0768425464630127, "rewards/safety_reward/mean": 7.763118743896484, "rewards/safety_reward/std": 2.345055103302002, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 123.71875, "completions/mean_terminated_length": 123.71875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.37516374115797746, "frac_reward_zero_std": 0.0, "grad_norm": 0.3028582036495209, "kl": 3.7265625, "learning_rate": 5e-05, "loss": 0.0355, "num_tokens": 24636222.0, "reward": 6.80712890625, "reward_std": 0.2289087176322937, "rewards/helpfulness_reward/mean": 6.80712890625, "rewards/helpfulness_reward/std": 0.6137284636497498, "rewards/safety_reward/mean": 8.234130859375, "rewards/safety_reward/std": 0.48026371002197266, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 123.5546875, "completions/mean_terminated_length": 123.5546875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.37551305562832943, "frac_reward_zero_std": 0.0, "grad_norm": 0.28650403022766113, "kl": 3.681640625, "learning_rate": 5e-05, "loss": 0.0374, "num_tokens": 24658165.0, "reward": 6.6845703125, "reward_std": 0.2859604060649872, "rewards/helpfulness_reward/mean": 6.6845703125, "rewards/helpfulness_reward/std": 0.5852838158607483, "rewards/safety_reward/mean": 8.171142578125, "rewards/safety_reward/std": 0.4686099588871002, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 123.46875, "completions/mean_terminated_length": 123.46875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.37586237009868134, "frac_reward_zero_std": 0.0, "grad_norm": 0.323742538690567, "kl": 3.884765625, "learning_rate": 5e-05, "loss": 0.0413, "num_tokens": 24678145.0, "reward": 6.896728515625, "reward_std": 0.25017353892326355, "rewards/helpfulness_reward/mean": 6.896728515625, "rewards/helpfulness_reward/std": 0.5105547904968262, "rewards/safety_reward/mean": 8.289794921875, "rewards/safety_reward/std": 0.43371737003326416, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 124.140625, "completions/mean_terminated_length": 124.140625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.37621168456903326, "frac_reward_zero_std": 0.0, "grad_norm": 0.32495835423469543, "kl": 3.8515625, "learning_rate": 5e-05, "loss": 0.0383, "num_tokens": 24698195.0, "reward": 6.99560546875, "reward_std": 0.26757633686065674, "rewards/helpfulness_reward/mean": 6.99560546875, "rewards/helpfulness_reward/std": 0.5724558234214783, "rewards/safety_reward/mean": 8.42041015625, "rewards/safety_reward/std": 0.470246285200119, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 126.2109375, "completions/mean_terminated_length": 126.2109375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.3765609990393852, "frac_reward_zero_std": 0.0, "grad_norm": 0.6142357587814331, "kl": 3.73828125, "learning_rate": 5e-05, "loss": 0.0421, "num_tokens": 24720758.0, "reward": 6.5301055908203125, "reward_std": 0.2915646433830261, "rewards/helpfulness_reward/mean": 6.5301055908203125, "rewards/helpfulness_reward/std": 1.7256499528884888, "rewards/safety_reward/mean": 8.06414794921875, "rewards/safety_reward/std": 1.9322757720947266, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.40625, "completions/mean_terminated_length": 123.40625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.37691031350973714, "frac_reward_zero_std": 0.0, "grad_norm": 0.29598480463027954, "kl": 3.69921875, "learning_rate": 5e-05, "loss": 0.036, "num_tokens": 24741338.0, "reward": 7.023193359375, "reward_std": 0.2425975501537323, "rewards/helpfulness_reward/mean": 7.023193359375, "rewards/helpfulness_reward/std": 0.45832812786102295, "rewards/safety_reward/mean": 8.447265625, "rewards/safety_reward/std": 0.5001652836799622, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 122.890625, "completions/mean_terminated_length": 122.890625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.37725962798008905, "frac_reward_zero_std": 0.0, "grad_norm": 1.0953662395477295, "kl": 4.224609375, "learning_rate": 5e-05, "loss": 0.0359, "num_tokens": 24763588.0, "reward": 6.501220703125, "reward_std": 0.22775974869728088, "rewards/helpfulness_reward/mean": 6.501220703125, "rewards/helpfulness_reward/std": 1.1136678457260132, "rewards/safety_reward/mean": 7.85693359375, "rewards/safety_reward/std": 0.9496960043907166, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 123.8515625, "completions/mean_terminated_length": 123.8515625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.377608942450441, "frac_reward_zero_std": 0.0, "grad_norm": 0.3183622360229492, "kl": 3.71875, "learning_rate": 5e-05, "loss": 0.0395, "num_tokens": 24783569.0, "reward": 6.947509765625, "reward_std": 0.39231640100479126, "rewards/helpfulness_reward/mean": 6.947509765625, "rewards/helpfulness_reward/std": 0.5697792172431946, "rewards/safety_reward/mean": 8.219482421875, "rewards/safety_reward/std": 0.7031751871109009, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1449.0, "completions/mean_length": 171.7890625, "completions/mean_terminated_length": 159.031494140625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.37795825692079293, "frac_reward_zero_std": 0.0, "grad_norm": 0.31793513894081116, "kl": 3.330078125, "learning_rate": 5e-05, "loss": 0.102, "num_tokens": 24810342.0, "reward": 6.835693359375, "reward_std": 0.43910521268844604, "rewards/helpfulness_reward/mean": 6.835693359375, "rewards/helpfulness_reward/std": 1.5707703828811646, "rewards/safety_reward/mean": 7.973888397216797, "rewards/safety_reward/std": 2.074110269546509, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 123.2421875, "completions/mean_terminated_length": 123.2421875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.3783075713911449, "frac_reward_zero_std": 0.0, "grad_norm": 0.32911890745162964, "kl": 3.79296875, "learning_rate": 5e-05, "loss": 0.0374, "num_tokens": 24830981.0, "reward": 6.78826904296875, "reward_std": 0.3240172266960144, "rewards/helpfulness_reward/mean": 6.78826904296875, "rewards/helpfulness_reward/std": 1.2376772165298462, "rewards/safety_reward/mean": 8.0814208984375, "rewards/safety_reward/std": 1.0535290241241455, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 140.28125, "completions/mean_terminated_length": 140.28125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.3786568858614968, "frac_reward_zero_std": 0.0, "grad_norm": 0.46904268860816956, "kl": 3.49609375, "learning_rate": 5e-05, "loss": 0.0322, "num_tokens": 24855593.0, "reward": 6.4641265869140625, "reward_std": 0.561673104763031, "rewards/helpfulness_reward/mean": 6.4641265869140625, "rewards/helpfulness_reward/std": 2.109532117843628, "rewards/safety_reward/mean": 7.598886489868164, "rewards/safety_reward/std": 2.4852981567382812, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 124.328125, "completions/mean_terminated_length": 124.328125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.37900620033184873, "frac_reward_zero_std": 0.0, "grad_norm": 0.241562619805336, "kl": 3.537109375, "learning_rate": 5e-05, "loss": 0.0389, "num_tokens": 24875859.0, "reward": 7.05810546875, "reward_std": 0.33920982480049133, "rewards/helpfulness_reward/mean": 7.05810546875, "rewards/helpfulness_reward/std": 0.6876871585845947, "rewards/safety_reward/mean": 8.24853515625, "rewards/safety_reward/std": 0.7149458527565002, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 124.078125, "completions/mean_terminated_length": 124.078125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.3793555148022007, "frac_reward_zero_std": 0.0, "grad_norm": 0.27545464038848877, "kl": 3.728515625, "learning_rate": 5e-05, "loss": 0.0362, "num_tokens": 24896293.0, "reward": 7.02294921875, "reward_std": 0.21094243228435516, "rewards/helpfulness_reward/mean": 7.02294921875, "rewards/helpfulness_reward/std": 0.5786117315292358, "rewards/safety_reward/mean": 8.47119140625, "rewards/safety_reward/std": 0.515083909034729, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 123.75, "completions/mean_terminated_length": 123.75, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.3797048292725526, "frac_reward_zero_std": 0.0, "grad_norm": 0.24318470060825348, "kl": 3.630859375, "learning_rate": 5e-05, "loss": 0.0337, "num_tokens": 24915917.0, "reward": 6.924560546875, "reward_std": 0.46712353825569153, "rewards/helpfulness_reward/mean": 6.924560546875, "rewards/helpfulness_reward/std": 0.9143595099449158, "rewards/safety_reward/mean": 8.30926513671875, "rewards/safety_reward/std": 0.7730758190155029, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 124.484375, "completions/mean_terminated_length": 124.484375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.3800541437429045, "frac_reward_zero_std": 0.0, "grad_norm": 0.2693217694759369, "kl": 3.64453125, "learning_rate": 5e-05, "loss": 0.0391, "num_tokens": 24935763.0, "reward": 6.970703125, "reward_std": 0.25539538264274597, "rewards/helpfulness_reward/mean": 6.970703125, "rewards/helpfulness_reward/std": 0.4756424129009247, "rewards/safety_reward/mean": 8.28271484375, "rewards/safety_reward/std": 0.49430596828460693, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 117.4765625, "completions/mean_terminated_length": 117.4765625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.3804034582132565, "frac_reward_zero_std": 0.0, "grad_norm": 0.45674771070480347, "kl": 3.755859375, "learning_rate": 5e-05, "loss": 0.042, "num_tokens": 24957976.0, "reward": 6.31915283203125, "reward_std": 0.43587726354599, "rewards/helpfulness_reward/mean": 6.31915283203125, "rewards/helpfulness_reward/std": 1.7943122386932373, "rewards/safety_reward/mean": 7.64569091796875, "rewards/safety_reward/std": 2.0822160243988037, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 123.8671875, "completions/mean_terminated_length": 123.8671875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.3807527726836084, "frac_reward_zero_std": 0.0, "grad_norm": 0.29088547825813293, "kl": 3.736328125, "learning_rate": 5e-05, "loss": 0.0384, "num_tokens": 24978599.0, "reward": 6.83349609375, "reward_std": 0.2364431917667389, "rewards/helpfulness_reward/mean": 6.83349609375, "rewards/helpfulness_reward/std": 0.5459072589874268, "rewards/safety_reward/mean": 8.39501953125, "rewards/safety_reward/std": 0.4042431116104126, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.1328125, "completions/mean_terminated_length": 123.1328125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.3811020871539604, "frac_reward_zero_std": 0.0, "grad_norm": 0.28439512848854065, "kl": 3.771484375, "learning_rate": 5e-05, "loss": 0.0323, "num_tokens": 24998016.0, "reward": 6.7587890625, "reward_std": 0.3244938254356384, "rewards/helpfulness_reward/mean": 6.7587890625, "rewards/helpfulness_reward/std": 0.6774150729179382, "rewards/safety_reward/mean": 8.10595703125, "rewards/safety_reward/std": 0.5304059982299805, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 124.03125, "completions/mean_terminated_length": 124.03125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.3814514016243123, "frac_reward_zero_std": 0.0, "grad_norm": 0.4436846971511841, "kl": 4.17578125, "learning_rate": 5e-05, "loss": 0.0386, "num_tokens": 25017660.0, "reward": 6.763671875, "reward_std": 0.39671680331230164, "rewards/helpfulness_reward/mean": 6.763671875, "rewards/helpfulness_reward/std": 0.7504381537437439, "rewards/safety_reward/mean": 8.053955078125, "rewards/safety_reward/std": 0.6070340275764465, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 122.8046875, "completions/mean_terminated_length": 122.8046875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.3818007160946642, "frac_reward_zero_std": 0.0, "grad_norm": 0.2797738015651703, "kl": 3.7890625, "learning_rate": 5e-05, "loss": 0.0224, "num_tokens": 25037059.0, "reward": 6.800323486328125, "reward_std": 0.4324350357055664, "rewards/helpfulness_reward/mean": 6.800323486328125, "rewards/helpfulness_reward/std": 0.8555091023445129, "rewards/safety_reward/mean": 8.1982421875, "rewards/safety_reward/std": 0.7359393239021301, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 123.796875, "completions/mean_terminated_length": 123.796875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3821500305650162, "frac_reward_zero_std": 0.0, "grad_norm": 0.29201996326446533, "kl": 3.640625, "learning_rate": 5e-05, "loss": 0.0341, "num_tokens": 25056601.0, "reward": 6.900146484375, "reward_std": 0.33314305543899536, "rewards/helpfulness_reward/mean": 6.900146484375, "rewards/helpfulness_reward/std": 0.5768758058547974, "rewards/safety_reward/mean": 8.352783203125, "rewards/safety_reward/std": 0.5432533025741577, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 123.796875, "completions/mean_terminated_length": 123.796875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.3824993450353681, "frac_reward_zero_std": 0.0, "grad_norm": 0.7334261536598206, "kl": 3.93359375, "learning_rate": 5e-05, "loss": 0.0369, "num_tokens": 25076959.0, "reward": 6.79052734375, "reward_std": 0.3138778805732727, "rewards/helpfulness_reward/mean": 6.79052734375, "rewards/helpfulness_reward/std": 0.4841127395629883, "rewards/safety_reward/mean": 8.249755859375, "rewards/safety_reward/std": 0.4076104462146759, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 124.9921875, "completions/mean_terminated_length": 124.9921875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.38284865950572, "frac_reward_zero_std": 0.0, "grad_norm": 0.3280932903289795, "kl": 3.662109375, "learning_rate": 5e-05, "loss": 0.049, "num_tokens": 25097862.0, "reward": 6.560302734375, "reward_std": 0.37416791915893555, "rewards/helpfulness_reward/mean": 6.560302734375, "rewards/helpfulness_reward/std": 0.8560382127761841, "rewards/safety_reward/mean": 8.1376953125, "rewards/safety_reward/std": 0.8506293892860413, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 126.6171875, "completions/mean_terminated_length": 126.6171875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.38319797397607197, "frac_reward_zero_std": 0.0, "grad_norm": 0.3507468104362488, "kl": 3.4140625, "learning_rate": 5e-05, "loss": 0.0353, "num_tokens": 25118589.0, "reward": 6.546089172363281, "reward_std": 0.33492743968963623, "rewards/helpfulness_reward/mean": 6.546089172363281, "rewards/helpfulness_reward/std": 1.549125075340271, "rewards/safety_reward/mean": 7.680145263671875, "rewards/safety_reward/std": 1.6324093341827393, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 123.6953125, "completions/mean_terminated_length": 123.6953125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.3835472884464239, "frac_reward_zero_std": 0.0, "grad_norm": 0.31812477111816406, "kl": 3.8515625, "learning_rate": 5e-05, "loss": 0.0454, "num_tokens": 25138694.0, "reward": 6.85546875, "reward_std": 0.34055501222610474, "rewards/helpfulness_reward/mean": 6.85546875, "rewards/helpfulness_reward/std": 0.49351415038108826, "rewards/safety_reward/mean": 8.19921875, "rewards/safety_reward/std": 0.46046650409698486, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 122.71875, "completions/mean_terminated_length": 122.71875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.38389660291677585, "frac_reward_zero_std": 0.0, "grad_norm": 0.30500176548957825, "kl": 3.6171875, "learning_rate": 5e-05, "loss": 0.0339, "num_tokens": 25158666.0, "reward": 6.8775634765625, "reward_std": 0.37842655181884766, "rewards/helpfulness_reward/mean": 6.8775634765625, "rewards/helpfulness_reward/std": 0.769754946231842, "rewards/safety_reward/mean": 8.286865234375, "rewards/safety_reward/std": 0.6960747241973877, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 123.0390625, "completions/mean_terminated_length": 123.0390625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.38424591738712777, "frac_reward_zero_std": 0.0, "grad_norm": 0.32618507742881775, "kl": 3.568359375, "learning_rate": 5e-05, "loss": 0.0209, "num_tokens": 25181071.0, "reward": 6.439697265625, "reward_std": 0.37120506167411804, "rewards/helpfulness_reward/mean": 6.439697265625, "rewards/helpfulness_reward/std": 1.5165483951568604, "rewards/safety_reward/mean": 7.8634033203125, "rewards/safety_reward/std": 1.6278482675552368, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 121.8671875, "completions/mean_terminated_length": 121.8671875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.3845952318574797, "frac_reward_zero_std": 0.0, "grad_norm": 0.39996832609176636, "kl": 3.9140625, "learning_rate": 5e-05, "loss": 0.0267, "num_tokens": 25201142.0, "reward": 6.858642578125, "reward_std": 0.3053748905658722, "rewards/helpfulness_reward/mean": 6.858642578125, "rewards/helpfulness_reward/std": 0.5654581785202026, "rewards/safety_reward/mean": 8.300048828125, "rewards/safety_reward/std": 0.5234025120735168, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 122.921875, "completions/mean_terminated_length": 122.921875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.38494454632783165, "frac_reward_zero_std": 0.0, "grad_norm": 0.4560830593109131, "kl": 4.052734375, "learning_rate": 5e-05, "loss": 0.0375, "num_tokens": 25221420.0, "reward": 6.855224609375, "reward_std": 0.24766182899475098, "rewards/helpfulness_reward/mean": 6.855224609375, "rewards/helpfulness_reward/std": 0.5609513521194458, "rewards/safety_reward/mean": 8.25390625, "rewards/safety_reward/std": 0.5058254599571228, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 122.3359375, "completions/mean_terminated_length": 122.3359375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.38529386079818356, "frac_reward_zero_std": 0.0, "grad_norm": 0.6618987321853638, "kl": 4.232421875, "learning_rate": 5e-05, "loss": 0.0331, "num_tokens": 25241487.0, "reward": 6.765869140625, "reward_std": 0.4107394218444824, "rewards/helpfulness_reward/mean": 6.765869140625, "rewards/helpfulness_reward/std": 0.6774951219558716, "rewards/safety_reward/mean": 8.344970703125, "rewards/safety_reward/std": 0.47913211584091187, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 123.1015625, "completions/mean_terminated_length": 123.1015625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.3856431752685355, "frac_reward_zero_std": 0.0, "grad_norm": 0.31114456057548523, "kl": 3.83203125, "learning_rate": 5e-05, "loss": 0.0329, "num_tokens": 25261492.0, "reward": 7.029541015625, "reward_std": 0.31474030017852783, "rewards/helpfulness_reward/mean": 7.029541015625, "rewards/helpfulness_reward/std": 0.638068437576294, "rewards/safety_reward/mean": 8.3251953125, "rewards/safety_reward/std": 0.572613000869751, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.3515625, "completions/mean_terminated_length": 123.3515625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.38599248973888745, "frac_reward_zero_std": 0.0, "grad_norm": 0.3274034559726715, "kl": 3.78515625, "learning_rate": 5e-05, "loss": 0.0296, "num_tokens": 25282889.0, "reward": 7.0947265625, "reward_std": 0.43854400515556335, "rewards/helpfulness_reward/mean": 7.0947265625, "rewards/helpfulness_reward/std": 0.746670126914978, "rewards/safety_reward/mean": 8.42626953125, "rewards/safety_reward/std": 0.6178943514823914, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 122.9453125, "completions/mean_terminated_length": 122.9453125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.38634180420923936, "frac_reward_zero_std": 0.0, "grad_norm": 0.2896014153957367, "kl": 3.859375, "learning_rate": 5e-05, "loss": 0.0273, "num_tokens": 25302826.0, "reward": 7.0535888671875, "reward_std": 0.4462570250034332, "rewards/helpfulness_reward/mean": 7.0535888671875, "rewards/helpfulness_reward/std": 0.8394458889961243, "rewards/safety_reward/mean": 8.40869140625, "rewards/safety_reward/std": 0.6772751212120056, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 125.5546875, "completions/mean_terminated_length": 125.5546875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.38669111867959133, "frac_reward_zero_std": 0.0, "grad_norm": 0.23555344343185425, "kl": 3.546875, "learning_rate": 5e-05, "loss": 0.0369, "num_tokens": 25323401.0, "reward": 6.72479248046875, "reward_std": 0.35852164030075073, "rewards/helpfulness_reward/mean": 6.72479248046875, "rewards/helpfulness_reward/std": 1.3253173828125, "rewards/safety_reward/mean": 8.0565185546875, "rewards/safety_reward/std": 1.3224906921386719, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 124.1953125, "completions/mean_terminated_length": 124.1953125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.38704043314994324, "frac_reward_zero_std": 0.0, "grad_norm": 0.2981061041355133, "kl": 3.732421875, "learning_rate": 5e-05, "loss": 0.0401, "num_tokens": 25343882.0, "reward": 7.1220703125, "reward_std": 0.22291593253612518, "rewards/helpfulness_reward/mean": 7.1220703125, "rewards/helpfulness_reward/std": 0.46501031517982483, "rewards/safety_reward/mean": 8.46435546875, "rewards/safety_reward/std": 0.5358651280403137, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 123.9453125, "completions/mean_terminated_length": 123.9453125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.38738974762029516, "frac_reward_zero_std": 0.0, "grad_norm": 0.2806764841079712, "kl": 3.701171875, "learning_rate": 5e-05, "loss": 0.0376, "num_tokens": 25363523.0, "reward": 7.107666015625, "reward_std": 0.22736506164073944, "rewards/helpfulness_reward/mean": 7.107666015625, "rewards/helpfulness_reward/std": 0.40354010462760925, "rewards/safety_reward/mean": 8.265625, "rewards/safety_reward/std": 0.3985201120376587, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 122.53125, "completions/mean_terminated_length": 122.53125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.3877390620906471, "frac_reward_zero_std": 0.0, "grad_norm": 0.2740795314311981, "kl": 3.84765625, "learning_rate": 5e-05, "loss": 0.0236, "num_tokens": 25382975.0, "reward": 6.98956298828125, "reward_std": 0.5325258374214172, "rewards/helpfulness_reward/mean": 6.98956298828125, "rewards/helpfulness_reward/std": 0.9706037640571594, "rewards/safety_reward/mean": 8.1968994140625, "rewards/safety_reward/std": 0.8765789270401001, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 136.5859375, "completions/mean_terminated_length": 136.5859375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.38808837656099904, "frac_reward_zero_std": 0.0, "grad_norm": 0.2862291634082794, "kl": 3.224609375, "learning_rate": 5e-05, "loss": 0.0172, "num_tokens": 25406362.0, "reward": 6.5362548828125, "reward_std": 0.34324872493743896, "rewards/helpfulness_reward/mean": 6.5362548828125, "rewards/helpfulness_reward/std": 1.9339048862457275, "rewards/safety_reward/mean": 7.8115692138671875, "rewards/safety_reward/std": 2.353045701980591, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 123.9375, "completions/mean_terminated_length": 123.9375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.38843769103135095, "frac_reward_zero_std": 0.0, "grad_norm": 0.2784320116043091, "kl": 3.841796875, "learning_rate": 5e-05, "loss": 0.0378, "num_tokens": 25426210.0, "reward": 6.96142578125, "reward_std": 0.328904926776886, "rewards/helpfulness_reward/mean": 6.96142578125, "rewards/helpfulness_reward/std": 0.5665385127067566, "rewards/safety_reward/mean": 8.40966796875, "rewards/safety_reward/std": 0.4449111223220825, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 123.3984375, "completions/mean_terminated_length": 123.3984375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3887870055017029, "frac_reward_zero_std": 0.0, "grad_norm": 0.3102491796016693, "kl": 3.814453125, "learning_rate": 5e-05, "loss": 0.0309, "num_tokens": 25446165.0, "reward": 7.175048828125, "reward_std": 0.21681925654411316, "rewards/helpfulness_reward/mean": 7.175048828125, "rewards/helpfulness_reward/std": 0.6116982698440552, "rewards/safety_reward/mean": 8.36865234375, "rewards/safety_reward/std": 0.5574378371238708, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 124.03125, "completions/mean_terminated_length": 124.03125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.38913631997205483, "frac_reward_zero_std": 0.0, "grad_norm": 0.2732921838760376, "kl": 3.52734375, "learning_rate": 5e-05, "loss": 0.0346, "num_tokens": 25465777.0, "reward": 6.952392578125, "reward_std": 0.2694653868675232, "rewards/helpfulness_reward/mean": 6.952392578125, "rewards/helpfulness_reward/std": 0.576301097869873, "rewards/safety_reward/mean": 8.228515625, "rewards/safety_reward/std": 0.6262382864952087, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 123.796875, "completions/mean_terminated_length": 123.796875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.3894856344424068, "frac_reward_zero_std": 0.0, "grad_norm": 0.22877921164035797, "kl": 3.6875, "learning_rate": 5e-05, "loss": 0.0343, "num_tokens": 25487703.0, "reward": 6.656982421875, "reward_std": 0.25407150387763977, "rewards/helpfulness_reward/mean": 6.656982421875, "rewards/helpfulness_reward/std": 0.7465004324913025, "rewards/safety_reward/mean": 7.989501953125, "rewards/safety_reward/std": 0.7318723797798157, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 124.0625, "completions/mean_terminated_length": 124.0625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.3898349489127587, "frac_reward_zero_std": 0.0, "grad_norm": 0.44975602626800537, "kl": 4.021484375, "learning_rate": 5e-05, "loss": 0.0403, "num_tokens": 25507711.0, "reward": 6.9925537109375, "reward_std": 0.30001765489578247, "rewards/helpfulness_reward/mean": 6.9925537109375, "rewards/helpfulness_reward/std": 0.581462025642395, "rewards/safety_reward/mean": 8.47998046875, "rewards/safety_reward/std": 0.4530886709690094, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 124.25, "completions/mean_terminated_length": 124.25, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.39018426338311063, "frac_reward_zero_std": 0.0, "grad_norm": 0.7960940003395081, "kl": 4.05859375, "learning_rate": 5e-05, "loss": 0.0402, "num_tokens": 25526991.0, "reward": 6.885498046875, "reward_std": 0.2598438858985901, "rewards/helpfulness_reward/mean": 6.885498046875, "rewards/helpfulness_reward/std": 0.5643870830535889, "rewards/safety_reward/mean": 8.28173828125, "rewards/safety_reward/std": 0.5335246324539185, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.96875, "completions/mean_terminated_length": 123.96875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.3905335778534626, "frac_reward_zero_std": 0.0, "grad_norm": 0.22839197516441345, "kl": 3.61328125, "learning_rate": 5e-05, "loss": 0.0376, "num_tokens": 25546299.0, "reward": 7.286865234375, "reward_std": 0.18761278688907623, "rewards/helpfulness_reward/mean": 7.286865234375, "rewards/helpfulness_reward/std": 0.347353458404541, "rewards/safety_reward/mean": 8.598876953125, "rewards/safety_reward/std": 0.391409695148468, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 124.109375, "completions/mean_terminated_length": 124.109375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.3908828923238145, "frac_reward_zero_std": 0.0, "grad_norm": 0.22078773379325867, "kl": 3.82421875, "learning_rate": 5e-05, "loss": 0.0403, "num_tokens": 25566425.0, "reward": 6.935546875, "reward_std": 0.22781407833099365, "rewards/helpfulness_reward/mean": 6.935546875, "rewards/helpfulness_reward/std": 0.5855112671852112, "rewards/safety_reward/mean": 8.423583984375, "rewards/safety_reward/std": 0.5247739553451538, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 124.296875, "completions/mean_terminated_length": 124.296875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.3912322067941664, "frac_reward_zero_std": 0.0, "grad_norm": 0.8262118697166443, "kl": 4.072265625, "learning_rate": 5e-05, "loss": 0.044, "num_tokens": 25586775.0, "reward": 7.113525390625, "reward_std": 0.3376794159412384, "rewards/helpfulness_reward/mean": 7.113525390625, "rewards/helpfulness_reward/std": 0.6251091361045837, "rewards/safety_reward/mean": 8.26708984375, "rewards/safety_reward/std": 0.5805537104606628, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 123.6796875, "completions/mean_terminated_length": 123.6796875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.3915815212645184, "frac_reward_zero_std": 0.0, "grad_norm": 0.2895571291446686, "kl": 3.859375, "learning_rate": 5e-05, "loss": 0.0272, "num_tokens": 25606750.0, "reward": 6.9979248046875, "reward_std": 0.5120706558227539, "rewards/helpfulness_reward/mean": 6.9979248046875, "rewards/helpfulness_reward/std": 0.800163209438324, "rewards/safety_reward/mean": 8.293212890625, "rewards/safety_reward/std": 0.6462209224700928, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 124.5390625, "completions/mean_terminated_length": 124.5390625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.3919308357348703, "frac_reward_zero_std": 0.0, "grad_norm": 0.3152885138988495, "kl": 3.73046875, "learning_rate": 5e-05, "loss": 0.0423, "num_tokens": 25626595.0, "reward": 7.014404296875, "reward_std": 0.2776491045951843, "rewards/helpfulness_reward/mean": 7.014404296875, "rewards/helpfulness_reward/std": 0.4699852764606476, "rewards/safety_reward/mean": 8.253173828125, "rewards/safety_reward/std": 0.607934296131134, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 123.9296875, "completions/mean_terminated_length": 123.9296875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.3922801502052223, "frac_reward_zero_std": 0.0, "grad_norm": 0.8438903093338013, "kl": 3.984375, "learning_rate": 5e-05, "loss": 0.0405, "num_tokens": 25646658.0, "reward": 6.93701171875, "reward_std": 0.35819628834724426, "rewards/helpfulness_reward/mean": 6.93701171875, "rewards/helpfulness_reward/std": 0.5372301936149597, "rewards/safety_reward/mean": 8.18310546875, "rewards/safety_reward/std": 0.573891282081604, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 124.09375, "completions/mean_terminated_length": 124.09375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.3926294646755742, "frac_reward_zero_std": 0.0, "grad_norm": 0.27460813522338867, "kl": 3.875, "learning_rate": 5e-05, "loss": 0.038, "num_tokens": 25666342.0, "reward": 6.999755859375, "reward_std": 0.39515310525894165, "rewards/helpfulness_reward/mean": 6.999755859375, "rewards/helpfulness_reward/std": 0.8042289018630981, "rewards/safety_reward/mean": 8.499267578125, "rewards/safety_reward/std": 0.6540077328681946, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 124.390625, "completions/mean_terminated_length": 124.390625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.3929787791459261, "frac_reward_zero_std": 0.0, "grad_norm": 0.561272919178009, "kl": 3.99609375, "learning_rate": 5e-05, "loss": 0.0443, "num_tokens": 25686856.0, "reward": 6.69482421875, "reward_std": 0.37898826599121094, "rewards/helpfulness_reward/mean": 6.69482421875, "rewards/helpfulness_reward/std": 0.6193206906318665, "rewards/safety_reward/mean": 8.133056640625, "rewards/safety_reward/std": 0.581194281578064, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 123.625, "completions/mean_terminated_length": 123.625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.3933280936162781, "frac_reward_zero_std": 0.0, "grad_norm": 0.2967069745063782, "kl": 3.9453125, "learning_rate": 5e-05, "loss": 0.0402, "num_tokens": 25706704.0, "reward": 6.833984375, "reward_std": 0.3041961193084717, "rewards/helpfulness_reward/mean": 6.833984375, "rewards/helpfulness_reward/std": 0.544477641582489, "rewards/safety_reward/mean": 8.29345703125, "rewards/safety_reward/std": 0.45083436369895935, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.2890625, "completions/mean_terminated_length": 123.2890625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.39367740808663, "frac_reward_zero_std": 0.0, "grad_norm": 0.25671830773353577, "kl": 3.728515625, "learning_rate": 5e-05, "loss": 0.0388, "num_tokens": 25726685.0, "reward": 6.95703125, "reward_std": 0.3023136854171753, "rewards/helpfulness_reward/mean": 6.95703125, "rewards/helpfulness_reward/std": 0.5126656293869019, "rewards/safety_reward/mean": 8.331298828125, "rewards/safety_reward/std": 0.5147961378097534, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 123.46875, "completions/mean_terminated_length": 123.46875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3940267225569819, "frac_reward_zero_std": 0.0, "grad_norm": 0.2976648211479187, "kl": 3.93359375, "learning_rate": 5e-05, "loss": 0.0375, "num_tokens": 25746521.0, "reward": 6.822509765625, "reward_std": 0.3929898142814636, "rewards/helpfulness_reward/mean": 6.822509765625, "rewards/helpfulness_reward/std": 0.7422492504119873, "rewards/safety_reward/mean": 7.982666015625, "rewards/safety_reward/std": 0.8614834547042847, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.0234375, "completions/mean_terminated_length": 123.0234375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.39437603702733387, "frac_reward_zero_std": 0.0, "grad_norm": 0.25232404470443726, "kl": 3.859375, "learning_rate": 5e-05, "loss": 0.0377, "num_tokens": 25766052.0, "reward": 6.9061279296875, "reward_std": 0.4277488887310028, "rewards/helpfulness_reward/mean": 6.9061279296875, "rewards/helpfulness_reward/std": 0.7684544920921326, "rewards/safety_reward/mean": 8.178466796875, "rewards/safety_reward/std": 0.752498984336853, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.09375, "completions/mean_terminated_length": 123.09375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.3947253514976858, "frac_reward_zero_std": 0.0, "grad_norm": 0.27320432662963867, "kl": 3.9296875, "learning_rate": 5e-05, "loss": 0.0316, "num_tokens": 25785752.0, "reward": 6.7750244140625, "reward_std": 0.4582080543041229, "rewards/helpfulness_reward/mean": 6.7750244140625, "rewards/helpfulness_reward/std": 0.8119848370552063, "rewards/safety_reward/mean": 8.11572265625, "rewards/safety_reward/std": 0.6197535395622253, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 123.4921875, "completions/mean_terminated_length": 123.4921875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.39507466596803775, "frac_reward_zero_std": 0.0, "grad_norm": 0.2850111126899719, "kl": 3.62109375, "learning_rate": 5e-05, "loss": 0.0301, "num_tokens": 25805159.0, "reward": 7.093017578125, "reward_std": 0.46220865845680237, "rewards/helpfulness_reward/mean": 7.093017578125, "rewards/helpfulness_reward/std": 0.7388073801994324, "rewards/safety_reward/mean": 8.38427734375, "rewards/safety_reward/std": 0.7668341398239136, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 124.078125, "completions/mean_terminated_length": 124.078125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.39542398043838967, "frac_reward_zero_std": 0.0, "grad_norm": 0.2765052914619446, "kl": 3.853515625, "learning_rate": 5e-05, "loss": 0.0389, "num_tokens": 25825657.0, "reward": 6.739013671875, "reward_std": 0.3286249041557312, "rewards/helpfulness_reward/mean": 6.739013671875, "rewards/helpfulness_reward/std": 0.708373486995697, "rewards/safety_reward/mean": 8.263916015625, "rewards/safety_reward/std": 0.5184348821640015, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 122.8984375, "completions/mean_terminated_length": 122.8984375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.3957732949087416, "frac_reward_zero_std": 0.0, "grad_norm": 0.29227814078330994, "kl": 3.6328125, "learning_rate": 5e-05, "loss": 0.0304, "num_tokens": 25846892.0, "reward": 6.6561279296875, "reward_std": 0.3736920952796936, "rewards/helpfulness_reward/mean": 6.6561279296875, "rewards/helpfulness_reward/std": 0.936015248298645, "rewards/safety_reward/mean": 8.245361328125, "rewards/safety_reward/std": 0.7706423997879028, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 123.1171875, "completions/mean_terminated_length": 123.1171875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.39612260937909355, "frac_reward_zero_std": 0.0, "grad_norm": 0.30620914697647095, "kl": 3.56640625, "learning_rate": 5e-05, "loss": 0.0316, "num_tokens": 25868947.0, "reward": 6.876708984375, "reward_std": 0.31817758083343506, "rewards/helpfulness_reward/mean": 6.876708984375, "rewards/helpfulness_reward/std": 0.6987856030464172, "rewards/safety_reward/mean": 8.23291015625, "rewards/safety_reward/std": 0.6088572144508362, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 122.4765625, "completions/mean_terminated_length": 122.4765625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.39647192384944546, "frac_reward_zero_std": 0.0, "grad_norm": 0.3042376637458801, "kl": 3.650390625, "learning_rate": 5e-05, "loss": 0.027, "num_tokens": 25889112.0, "reward": 6.72216796875, "reward_std": 0.30403220653533936, "rewards/helpfulness_reward/mean": 6.72216796875, "rewards/helpfulness_reward/std": 0.6915173530578613, "rewards/safety_reward/mean": 8.231201171875, "rewards/safety_reward/std": 0.5505251288414001, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 123.0625, "completions/mean_terminated_length": 123.0625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.3968212383197974, "frac_reward_zero_std": 0.0, "grad_norm": 0.34589144587516785, "kl": 3.888671875, "learning_rate": 5e-05, "loss": 0.0314, "num_tokens": 25909720.0, "reward": 6.854736328125, "reward_std": 0.366772323846817, "rewards/helpfulness_reward/mean": 6.854736328125, "rewards/helpfulness_reward/std": 0.6154110431671143, "rewards/safety_reward/mean": 8.284423828125, "rewards/safety_reward/std": 0.5180941224098206, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 125.5234375, "completions/mean_terminated_length": 125.5234375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.39717055279014934, "frac_reward_zero_std": 0.0, "grad_norm": 0.2791305482387543, "kl": 3.650390625, "learning_rate": 5e-05, "loss": 0.0421, "num_tokens": 25931651.0, "reward": 6.641582489013672, "reward_std": 0.38323718309402466, "rewards/helpfulness_reward/mean": 6.641582489013672, "rewards/helpfulness_reward/std": 1.5604265928268433, "rewards/safety_reward/mean": 7.932432174682617, "rewards/safety_reward/std": 1.7487963438034058, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 123.0390625, "completions/mean_terminated_length": 123.0390625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.39751986726050126, "frac_reward_zero_std": 0.0, "grad_norm": 0.27149587869644165, "kl": 3.625, "learning_rate": 5e-05, "loss": 0.035, "num_tokens": 25951312.0, "reward": 7.0406494140625, "reward_std": 0.3148924708366394, "rewards/helpfulness_reward/mean": 7.0406494140625, "rewards/helpfulness_reward/std": 0.6313266754150391, "rewards/safety_reward/mean": 8.17041015625, "rewards/safety_reward/std": 0.7683718204498291, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 123.7109375, "completions/mean_terminated_length": 123.7109375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3978691817308532, "frac_reward_zero_std": 0.0, "grad_norm": 0.28576311469078064, "kl": 3.912109375, "learning_rate": 5e-05, "loss": 0.0395, "num_tokens": 25970947.0, "reward": 6.9853515625, "reward_std": 0.2839333713054657, "rewards/helpfulness_reward/mean": 6.9853515625, "rewards/helpfulness_reward/std": 0.44687193632125854, "rewards/safety_reward/mean": 8.461669921875, "rewards/safety_reward/std": 0.5790731310844421, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 123.4765625, "completions/mean_terminated_length": 123.4765625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.39821849620120514, "frac_reward_zero_std": 0.0, "grad_norm": 0.2574527859687805, "kl": 3.798828125, "learning_rate": 5e-05, "loss": 0.0318, "num_tokens": 25990672.0, "reward": 7.04833984375, "reward_std": 0.22392858564853668, "rewards/helpfulness_reward/mean": 7.04833984375, "rewards/helpfulness_reward/std": 0.4675733149051666, "rewards/safety_reward/mean": 8.2802734375, "rewards/safety_reward/std": 0.491201788187027, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 125.8046875, "completions/mean_terminated_length": 125.8046875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.39856781067155705, "frac_reward_zero_std": 0.0, "grad_norm": 0.2715247571468353, "kl": 3.57421875, "learning_rate": 5e-05, "loss": 0.0187, "num_tokens": 26010807.0, "reward": 6.88299560546875, "reward_std": 0.27498120069503784, "rewards/helpfulness_reward/mean": 6.88299560546875, "rewards/helpfulness_reward/std": 1.4420255422592163, "rewards/safety_reward/mean": 7.89935302734375, "rewards/safety_reward/std": 1.623557686805725, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.4140625, "completions/mean_terminated_length": 123.4140625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.398917125141909, "frac_reward_zero_std": 0.0, "grad_norm": 0.8143539428710938, "kl": 4.26171875, "learning_rate": 5e-05, "loss": 0.0342, "num_tokens": 26031324.0, "reward": 6.9024658203125, "reward_std": 0.3646683692932129, "rewards/helpfulness_reward/mean": 6.9024658203125, "rewards/helpfulness_reward/std": 0.7406651973724365, "rewards/safety_reward/mean": 8.36865234375, "rewards/safety_reward/std": 0.5525192022323608, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 123.671875, "completions/mean_terminated_length": 123.671875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.39926643961226094, "frac_reward_zero_std": 0.0, "grad_norm": 0.6657756567001343, "kl": 4.23828125, "learning_rate": 5e-05, "loss": 0.0383, "num_tokens": 26051346.0, "reward": 6.908447265625, "reward_std": 0.2126668393611908, "rewards/helpfulness_reward/mean": 6.908447265625, "rewards/helpfulness_reward/std": 0.5170146226882935, "rewards/safety_reward/mean": 8.354248046875, "rewards/safety_reward/std": 0.5632141828536987, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 123.6171875, "completions/mean_terminated_length": 123.6171875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.39961575408261285, "frac_reward_zero_std": 0.0, "grad_norm": 0.27173706889152527, "kl": 3.64453125, "learning_rate": 5e-05, "loss": 0.0319, "num_tokens": 26071073.0, "reward": 6.97900390625, "reward_std": 0.2671075463294983, "rewards/helpfulness_reward/mean": 6.97900390625, "rewards/helpfulness_reward/std": 0.461119145154953, "rewards/safety_reward/mean": 8.226318359375, "rewards/safety_reward/std": 0.6482005715370178, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 123.078125, "completions/mean_terminated_length": 123.078125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.3999650685529648, "frac_reward_zero_std": 0.0, "grad_norm": 0.6029937863349915, "kl": 4.16796875, "learning_rate": 5e-05, "loss": 0.0327, "num_tokens": 26090323.0, "reward": 7.052001953125, "reward_std": 0.3726353943347931, "rewards/helpfulness_reward/mean": 7.052001953125, "rewards/helpfulness_reward/std": 0.6853185892105103, "rewards/safety_reward/mean": 8.46337890625, "rewards/safety_reward/std": 0.6036856174468994, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 123.3125, "completions/mean_terminated_length": 123.3125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.40031438302331673, "frac_reward_zero_std": 0.0, "grad_norm": 0.306031733751297, "kl": 3.8984375, "learning_rate": 5e-05, "loss": 0.0386, "num_tokens": 26110115.0, "reward": 6.920166015625, "reward_std": 0.29332342743873596, "rewards/helpfulness_reward/mean": 6.920166015625, "rewards/helpfulness_reward/std": 0.7188923358917236, "rewards/safety_reward/mean": 8.340576171875, "rewards/safety_reward/std": 0.5536936521530151, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 123.2734375, "completions/mean_terminated_length": 123.2734375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4006636974936687, "frac_reward_zero_std": 0.0, "grad_norm": 0.327629417181015, "kl": 3.958984375, "learning_rate": 5e-05, "loss": 0.0398, "num_tokens": 26129582.0, "reward": 7.09716796875, "reward_std": 0.2807144522666931, "rewards/helpfulness_reward/mean": 7.09716796875, "rewards/helpfulness_reward/std": 0.5434950590133667, "rewards/safety_reward/mean": 8.37109375, "rewards/safety_reward/std": 0.5162081718444824, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 123.5078125, "completions/mean_terminated_length": 123.5078125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.4010130119640206, "frac_reward_zero_std": 0.0, "grad_norm": 0.6378138661384583, "kl": 4.064453125, "learning_rate": 5e-05, "loss": 0.0374, "num_tokens": 26149575.0, "reward": 7.072509765625, "reward_std": 0.24078796803951263, "rewards/helpfulness_reward/mean": 7.072509765625, "rewards/helpfulness_reward/std": 0.4484521448612213, "rewards/safety_reward/mean": 8.283203125, "rewards/safety_reward/std": 0.6227536797523499, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 122.890625, "completions/mean_terminated_length": 122.890625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.40136232643437253, "frac_reward_zero_std": 0.0, "grad_norm": 0.2660507559776306, "kl": 3.66015625, "learning_rate": 5e-05, "loss": 0.0356, "num_tokens": 26170065.0, "reward": 7.0478515625, "reward_std": 0.2844647765159607, "rewards/helpfulness_reward/mean": 7.0478515625, "rewards/helpfulness_reward/std": 0.516609251499176, "rewards/safety_reward/mean": 8.396240234375, "rewards/safety_reward/std": 0.576474130153656, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 122.3671875, "completions/mean_terminated_length": 122.3671875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.4017116409047245, "frac_reward_zero_std": 0.0, "grad_norm": 0.2732886075973511, "kl": 3.873046875, "learning_rate": 5e-05, "loss": 0.0339, "num_tokens": 26189584.0, "reward": 6.855712890625, "reward_std": 0.39851999282836914, "rewards/helpfulness_reward/mean": 6.855712890625, "rewards/helpfulness_reward/std": 0.7390118837356567, "rewards/safety_reward/mean": 8.279052734375, "rewards/safety_reward/std": 0.6324626207351685, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 123.0546875, "completions/mean_terminated_length": 123.0546875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.4020609553750764, "frac_reward_zero_std": 0.0, "grad_norm": 0.24557621777057648, "kl": 3.759765625, "learning_rate": 5e-05, "loss": 0.032, "num_tokens": 26209415.0, "reward": 7.029052734375, "reward_std": 0.2511119842529297, "rewards/helpfulness_reward/mean": 7.029052734375, "rewards/helpfulness_reward/std": 0.5550300478935242, "rewards/safety_reward/mean": 8.208251953125, "rewards/safety_reward/std": 0.3938870131969452, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 122.46875, "completions/mean_terminated_length": 122.46875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.4024102698454283, "frac_reward_zero_std": 0.0, "grad_norm": 0.7570000290870667, "kl": 3.908203125, "learning_rate": 5e-05, "loss": 0.0294, "num_tokens": 26230923.0, "reward": 6.79296875, "reward_std": 0.3342941403388977, "rewards/helpfulness_reward/mean": 6.79296875, "rewards/helpfulness_reward/std": 0.7511781454086304, "rewards/safety_reward/mean": 8.4052734375, "rewards/safety_reward/std": 0.5511198043823242, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 123.34375, "completions/mean_terminated_length": 123.34375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.4027595843157803, "frac_reward_zero_std": 0.0, "grad_norm": 0.8965139985084534, "kl": 3.98828125, "learning_rate": 5e-05, "loss": 0.0367, "num_tokens": 26250783.0, "reward": 6.958740234375, "reward_std": 0.26545262336730957, "rewards/helpfulness_reward/mean": 6.958740234375, "rewards/helpfulness_reward/std": 0.5892707109451294, "rewards/safety_reward/mean": 8.193359375, "rewards/safety_reward/std": 0.5118813514709473, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 123.171875, "completions/mean_terminated_length": 123.171875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.4031088987861322, "frac_reward_zero_std": 0.0, "grad_norm": 0.33302041888237, "kl": 3.77734375, "learning_rate": 5e-05, "loss": 0.0378, "num_tokens": 26271693.0, "reward": 6.9189453125, "reward_std": 0.26160961389541626, "rewards/helpfulness_reward/mean": 6.9189453125, "rewards/helpfulness_reward/std": 0.5826503038406372, "rewards/safety_reward/mean": 8.2734375, "rewards/safety_reward/std": 0.4192444086074829, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 122.6953125, "completions/mean_terminated_length": 122.6953125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.4034582132564842, "frac_reward_zero_std": 0.0, "grad_norm": 26.365459442138672, "kl": 11.744140625, "learning_rate": 5e-05, "loss": 0.1149, "num_tokens": 26295646.0, "reward": 6.2059783935546875, "reward_std": 0.43849897384643555, "rewards/helpfulness_reward/mean": 6.2059783935546875, "rewards/helpfulness_reward/std": 1.3851038217544556, "rewards/safety_reward/mean": 8.009132385253906, "rewards/safety_reward/std": 1.4531866312026978, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 123.3828125, "completions/mean_terminated_length": 123.3828125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.4038075277268361, "frac_reward_zero_std": 0.0, "grad_norm": 0.25469276309013367, "kl": 3.6796875, "learning_rate": 5e-05, "loss": 0.0288, "num_tokens": 26315559.0, "reward": 6.92333984375, "reward_std": 0.3186781406402588, "rewards/helpfulness_reward/mean": 6.92333984375, "rewards/helpfulness_reward/std": 0.4816843867301941, "rewards/safety_reward/mean": 8.255615234375, "rewards/safety_reward/std": 0.4996989965438843, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 123.4375, "completions/mean_terminated_length": 123.4375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.404156842197188, "frac_reward_zero_std": 0.0, "grad_norm": 0.26418712735176086, "kl": 3.890625, "learning_rate": 5e-05, "loss": 0.0385, "num_tokens": 26335967.0, "reward": 6.996337890625, "reward_std": 0.3229372501373291, "rewards/helpfulness_reward/mean": 6.996337890625, "rewards/helpfulness_reward/std": 0.5975983738899231, "rewards/safety_reward/mean": 8.26953125, "rewards/safety_reward/std": 0.6326062679290771, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 125.9375, "completions/mean_terminated_length": 125.9375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.40450615666754, "frac_reward_zero_std": 0.0, "grad_norm": 0.2935038208961487, "kl": 3.513671875, "learning_rate": 5e-05, "loss": 0.0526, "num_tokens": 26357687.0, "reward": 6.7688751220703125, "reward_std": 0.4071313440799713, "rewards/helpfulness_reward/mean": 6.7688751220703125, "rewards/helpfulness_reward/std": 1.4211680889129639, "rewards/safety_reward/mean": 8.155689239501953, "rewards/safety_reward/std": 1.5397316217422485, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 123.3984375, "completions/mean_terminated_length": 123.3984375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.4048554711378919, "frac_reward_zero_std": 0.0, "grad_norm": 0.5819442272186279, "kl": 4.080078125, "learning_rate": 5e-05, "loss": 0.0335, "num_tokens": 26378298.0, "reward": 6.87255859375, "reward_std": 0.40561914443969727, "rewards/helpfulness_reward/mean": 6.87255859375, "rewards/helpfulness_reward/std": 0.6362376809120178, "rewards/safety_reward/mean": 8.336669921875, "rewards/safety_reward/std": 0.6004173159599304, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 133.671875, "completions/mean_terminated_length": 133.671875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4052047856082438, "frac_reward_zero_std": 0.0, "grad_norm": 0.299113392829895, "kl": 3.234375, "learning_rate": 5e-05, "loss": 0.052, "num_tokens": 26401352.0, "reward": 6.456634521484375, "reward_std": 0.28976014256477356, "rewards/helpfulness_reward/mean": 6.456634521484375, "rewards/helpfulness_reward/std": 1.4723650217056274, "rewards/safety_reward/mean": 7.7066650390625, "rewards/safety_reward/std": 1.7309038639068604, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 123.453125, "completions/mean_terminated_length": 123.453125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.40555410007859577, "frac_reward_zero_std": 0.0, "grad_norm": 0.3007562756538391, "kl": 3.7109375, "learning_rate": 5e-05, "loss": 0.0361, "num_tokens": 26420818.0, "reward": 6.95599365234375, "reward_std": 0.4004669189453125, "rewards/helpfulness_reward/mean": 6.95599365234375, "rewards/helpfulness_reward/std": 0.7107605934143066, "rewards/safety_reward/mean": 8.246337890625, "rewards/safety_reward/std": 0.6300438642501831, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 123.5703125, "completions/mean_terminated_length": 123.5703125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.4059034145489477, "frac_reward_zero_std": 0.0, "grad_norm": 0.3596142828464508, "kl": 3.76171875, "learning_rate": 5e-05, "loss": 0.038, "num_tokens": 26440891.0, "reward": 6.662841796875, "reward_std": 0.35065269470214844, "rewards/helpfulness_reward/mean": 6.662841796875, "rewards/helpfulness_reward/std": 0.6415098905563354, "rewards/safety_reward/mean": 8.25146484375, "rewards/safety_reward/std": 0.39642804861068726, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 123.8203125, "completions/mean_terminated_length": 123.8203125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.40625272901929965, "frac_reward_zero_std": 0.0, "grad_norm": 0.2859029471874237, "kl": 3.9296875, "learning_rate": 5e-05, "loss": 0.0403, "num_tokens": 26461004.0, "reward": 6.8583984375, "reward_std": 0.39776211977005005, "rewards/helpfulness_reward/mean": 6.8583984375, "rewards/helpfulness_reward/std": 0.7476786375045776, "rewards/safety_reward/mean": 8.226806640625, "rewards/safety_reward/std": 0.5998796224594116, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.4296875, "completions/mean_terminated_length": 123.4296875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.40660204348965157, "frac_reward_zero_std": 0.0, "grad_norm": 0.32089510560035706, "kl": 3.70703125, "learning_rate": 5e-05, "loss": 0.035, "num_tokens": 26482291.0, "reward": 6.6318359375, "reward_std": 0.4124923348426819, "rewards/helpfulness_reward/mean": 6.6318359375, "rewards/helpfulness_reward/std": 0.884265661239624, "rewards/safety_reward/mean": 8.15185546875, "rewards/safety_reward/std": 0.42915505170822144, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 122.8046875, "completions/mean_terminated_length": 122.8046875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.4069513579600035, "frac_reward_zero_std": 0.0, "grad_norm": 5.6559247970581055, "kl": 5.2421875, "learning_rate": 5e-05, "loss": 0.0503, "num_tokens": 26502370.0, "reward": 6.797119140625, "reward_std": 0.42621487379074097, "rewards/helpfulness_reward/mean": 6.797119140625, "rewards/helpfulness_reward/std": 0.6920732259750366, "rewards/safety_reward/mean": 8.17236328125, "rewards/safety_reward/std": 0.6047530770301819, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 123.0625, "completions/mean_terminated_length": 123.0625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.40730067243035545, "frac_reward_zero_std": 0.0, "grad_norm": 0.2952583134174347, "kl": 3.78515625, "learning_rate": 5e-05, "loss": 0.0355, "num_tokens": 26522202.0, "reward": 7.037841796875, "reward_std": 0.36850887537002563, "rewards/helpfulness_reward/mean": 7.037841796875, "rewards/helpfulness_reward/std": 0.5875158309936523, "rewards/safety_reward/mean": 8.31591796875, "rewards/safety_reward/std": 0.5269734859466553, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 121.6796875, "completions/mean_terminated_length": 121.6796875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.40764998690070736, "frac_reward_zero_std": 0.0, "grad_norm": 0.31921663880348206, "kl": 3.802734375, "learning_rate": 5e-05, "loss": 0.0222, "num_tokens": 26542361.0, "reward": 6.7410888671875, "reward_std": 0.5449591279029846, "rewards/helpfulness_reward/mean": 6.7410888671875, "rewards/helpfulness_reward/std": 0.9669697284698486, "rewards/safety_reward/mean": 8.1357421875, "rewards/safety_reward/std": 0.8836262226104736, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 122.390625, "completions/mean_terminated_length": 122.390625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.4079993013710593, "frac_reward_zero_std": 0.0, "grad_norm": 0.35713645815849304, "kl": 3.962890625, "learning_rate": 5e-05, "loss": 0.0265, "num_tokens": 26563163.0, "reward": 6.5582275390625, "reward_std": 0.46787452697753906, "rewards/helpfulness_reward/mean": 6.5582275390625, "rewards/helpfulness_reward/std": 0.8710490465164185, "rewards/safety_reward/mean": 8.113525390625, "rewards/safety_reward/std": 0.9465838074684143, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 123.21875, "completions/mean_terminated_length": 123.21875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.40834861584141124, "frac_reward_zero_std": 0.0, "grad_norm": 0.26351767778396606, "kl": 3.8125, "learning_rate": 5e-05, "loss": 0.0347, "num_tokens": 26582727.0, "reward": 6.7205810546875, "reward_std": 0.43379032611846924, "rewards/helpfulness_reward/mean": 6.7205810546875, "rewards/helpfulness_reward/std": 0.7871763706207275, "rewards/safety_reward/mean": 8.156494140625, "rewards/safety_reward/std": 0.6150349378585815, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 121.5, "completions/mean_terminated_length": 121.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.40869793031176316, "frac_reward_zero_std": 0.0, "grad_norm": 0.2799062728881836, "kl": 3.638671875, "learning_rate": 5e-05, "loss": 0.0257, "num_tokens": 26602799.0, "reward": 6.894775390625, "reward_std": 0.40542370080947876, "rewards/helpfulness_reward/mean": 6.894775390625, "rewards/helpfulness_reward/std": 0.8671067953109741, "rewards/safety_reward/mean": 8.22314453125, "rewards/safety_reward/std": 0.6513129472732544, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 121.0703125, "completions/mean_terminated_length": 121.0703125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.40904724478211507, "frac_reward_zero_std": 0.0, "grad_norm": 0.29472628235816956, "kl": 3.912109375, "learning_rate": 5e-05, "loss": 0.012, "num_tokens": 26622592.0, "reward": 6.80682373046875, "reward_std": 0.5995458960533142, "rewards/helpfulness_reward/mean": 6.80682373046875, "rewards/helpfulness_reward/std": 1.064251184463501, "rewards/safety_reward/mean": 8.3363037109375, "rewards/safety_reward/std": 0.7184663414955139, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 122.8046875, "completions/mean_terminated_length": 122.8046875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.40939655925246704, "frac_reward_zero_std": 0.0, "grad_norm": 0.312311053276062, "kl": 3.826171875, "learning_rate": 5e-05, "loss": 0.0355, "num_tokens": 26642487.0, "reward": 6.78271484375, "reward_std": 0.5681902170181274, "rewards/helpfulness_reward/mean": 6.78271484375, "rewards/helpfulness_reward/std": 0.9546529650688171, "rewards/safety_reward/mean": 8.0614013671875, "rewards/safety_reward/std": 1.1117234230041504, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 121.859375, "completions/mean_terminated_length": 121.859375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.40974587372281895, "frac_reward_zero_std": 0.0, "grad_norm": 0.3111889064311981, "kl": 3.708984375, "learning_rate": 5e-05, "loss": 0.0231, "num_tokens": 26661621.0, "reward": 6.84716796875, "reward_std": 0.4325390160083771, "rewards/helpfulness_reward/mean": 6.84716796875, "rewards/helpfulness_reward/std": 0.8486908078193665, "rewards/safety_reward/mean": 8.24609375, "rewards/safety_reward/std": 0.8460207581520081, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 122.90625, "completions/mean_terminated_length": 122.90625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.4100951881931709, "frac_reward_zero_std": 0.0, "grad_norm": 0.3356730043888092, "kl": 3.626953125, "learning_rate": 5e-05, "loss": 0.0306, "num_tokens": 26682025.0, "reward": 6.67303466796875, "reward_std": 0.7450251579284668, "rewards/helpfulness_reward/mean": 6.67303466796875, "rewards/helpfulness_reward/std": 1.2978014945983887, "rewards/safety_reward/mean": 7.92694091796875, "rewards/safety_reward/std": 1.3179137706756592, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1792.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 153.2421875, "completions/mean_terminated_length": 127.23016357421875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.41044450266352284, "frac_reward_zero_std": 0.0, "grad_norm": 0.2730296850204468, "kl": 3.27734375, "learning_rate": 5e-05, "loss": 0.02, "num_tokens": 26708408.0, "reward": 6.587860107421875, "reward_std": 0.42858022451400757, "rewards/helpfulness_reward/mean": 6.587860107421875, "rewards/helpfulness_reward/std": 1.3159267902374268, "rewards/safety_reward/mean": 7.7431640625, "rewards/safety_reward/std": 1.7242484092712402, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 122.75, "completions/mean_terminated_length": 122.75, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.41079381713387475, "frac_reward_zero_std": 0.0, "grad_norm": 0.29368382692337036, "kl": 3.59375, "learning_rate": 5e-05, "loss": 0.0284, "num_tokens": 26727736.0, "reward": 7.041015625, "reward_std": 0.2884865999221802, "rewards/helpfulness_reward/mean": 7.041015625, "rewards/helpfulness_reward/std": 0.5462401509284973, "rewards/safety_reward/mean": 8.18017578125, "rewards/safety_reward/std": 0.547347366809845, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.3046875, "completions/mean_terminated_length": 123.3046875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.4111431316042267, "frac_reward_zero_std": 0.0, "grad_norm": 0.28887858986854553, "kl": 3.67578125, "learning_rate": 5e-05, "loss": 0.0341, "num_tokens": 26747247.0, "reward": 7.048095703125, "reward_std": 0.3038061559200287, "rewards/helpfulness_reward/mean": 7.048095703125, "rewards/helpfulness_reward/std": 0.5509099960327148, "rewards/safety_reward/mean": 8.302978515625, "rewards/safety_reward/std": 0.46449941396713257, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.0234375, "completions/mean_terminated_length": 123.0234375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.41149244607457863, "frac_reward_zero_std": 0.0, "grad_norm": 0.25623178482055664, "kl": 3.83203125, "learning_rate": 5e-05, "loss": 0.0304, "num_tokens": 26766770.0, "reward": 7.219482421875, "reward_std": 0.25697585940361023, "rewards/helpfulness_reward/mean": 7.219482421875, "rewards/helpfulness_reward/std": 0.3794735074043274, "rewards/safety_reward/mean": 8.37890625, "rewards/safety_reward/std": 0.40457141399383545, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 123.671875, "completions/mean_terminated_length": 123.671875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.41184176054493055, "frac_reward_zero_std": 0.0, "grad_norm": 0.2801611125469208, "kl": 3.69140625, "learning_rate": 5e-05, "loss": 0.0358, "num_tokens": 26786240.0, "reward": 7.19970703125, "reward_std": 0.28624868392944336, "rewards/helpfulness_reward/mean": 7.19970703125, "rewards/helpfulness_reward/std": 0.6343691945075989, "rewards/safety_reward/mean": 8.693359375, "rewards/safety_reward/std": 0.5132914781570435, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.7265625, "completions/mean_terminated_length": 123.7265625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.4121910750152825, "frac_reward_zero_std": 0.0, "grad_norm": 0.35390883684158325, "kl": 3.884765625, "learning_rate": 5e-05, "loss": 0.0375, "num_tokens": 26805813.0, "reward": 7.029296875, "reward_std": 0.4462606906890869, "rewards/helpfulness_reward/mean": 7.029296875, "rewards/helpfulness_reward/std": 0.6512776613235474, "rewards/safety_reward/mean": 8.249267578125, "rewards/safety_reward/std": 0.5112306475639343, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 124.03125, "completions/mean_terminated_length": 124.03125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.41254038948563443, "frac_reward_zero_std": 0.0, "grad_norm": 0.3156339228153229, "kl": 3.85546875, "learning_rate": 5e-05, "loss": 0.0385, "num_tokens": 26826321.0, "reward": 6.8026123046875, "reward_std": 0.3455246388912201, "rewards/helpfulness_reward/mean": 6.8026123046875, "rewards/helpfulness_reward/std": 0.6548214554786682, "rewards/safety_reward/mean": 8.365478515625, "rewards/safety_reward/std": 0.49399006366729736, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 123.9765625, "completions/mean_terminated_length": 123.9765625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.4128897039559864, "frac_reward_zero_std": 0.0, "grad_norm": 0.2913846969604492, "kl": 3.541015625, "learning_rate": 5e-05, "loss": 0.0361, "num_tokens": 26846494.0, "reward": 7.06494140625, "reward_std": 0.2578188478946686, "rewards/helpfulness_reward/mean": 7.06494140625, "rewards/helpfulness_reward/std": 0.5466181039810181, "rewards/safety_reward/mean": 8.428955078125, "rewards/safety_reward/std": 0.4295487105846405, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 122.3828125, "completions/mean_terminated_length": 122.3828125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.4132390184263383, "frac_reward_zero_std": 0.0, "grad_norm": 0.2855990529060364, "kl": 3.837890625, "learning_rate": 5e-05, "loss": 0.0326, "num_tokens": 26867343.0, "reward": 6.7513427734375, "reward_std": 0.33097541332244873, "rewards/helpfulness_reward/mean": 6.7513427734375, "rewards/helpfulness_reward/std": 0.8264950513839722, "rewards/safety_reward/mean": 8.2001953125, "rewards/safety_reward/std": 0.9875087738037109, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 123.9921875, "completions/mean_terminated_length": 123.9921875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.4135883328966902, "frac_reward_zero_std": 0.0, "grad_norm": 0.2562224268913269, "kl": 3.79296875, "learning_rate": 5e-05, "loss": 0.0412, "num_tokens": 26887494.0, "reward": 7.0992431640625, "reward_std": 0.21831916272640228, "rewards/helpfulness_reward/mean": 7.0992431640625, "rewards/helpfulness_reward/std": 0.517170250415802, "rewards/safety_reward/mean": 8.415771484375, "rewards/safety_reward/std": 0.6416525840759277, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 123.84375, "completions/mean_terminated_length": 123.84375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4139376473670422, "frac_reward_zero_std": 0.0, "grad_norm": 0.3307873606681824, "kl": 3.70703125, "learning_rate": 5e-05, "loss": 0.0383, "num_tokens": 26907698.0, "reward": 7.1103515625, "reward_std": 0.19054286181926727, "rewards/helpfulness_reward/mean": 7.1103515625, "rewards/helpfulness_reward/std": 0.38736459612846375, "rewards/safety_reward/mean": 8.36328125, "rewards/safety_reward/std": 0.3984622061252594, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 123.7421875, "completions/mean_terminated_length": 123.7421875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.4142869618373941, "frac_reward_zero_std": 0.0, "grad_norm": 0.3672562539577484, "kl": 3.826171875, "learning_rate": 5e-05, "loss": 0.0369, "num_tokens": 26928281.0, "reward": 7.2060546875, "reward_std": 0.2333821952342987, "rewards/helpfulness_reward/mean": 7.2060546875, "rewards/helpfulness_reward/std": 0.595159113407135, "rewards/safety_reward/mean": 8.45361328125, "rewards/safety_reward/std": 0.5775542259216309, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 131.75, "completions/mean_terminated_length": 131.75, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.414636276307746, "frac_reward_zero_std": 0.0, "grad_norm": 0.4340304732322693, "kl": 3.4765625, "learning_rate": 5e-05, "loss": 0.0396, "num_tokens": 26951393.0, "reward": 6.70050048828125, "reward_std": 0.19510230422019958, "rewards/helpfulness_reward/mean": 6.70050048828125, "rewards/helpfulness_reward/std": 1.3233944177627563, "rewards/safety_reward/mean": 8.01568603515625, "rewards/safety_reward/std": 1.7290596961975098, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 124.140625, "completions/mean_terminated_length": 124.140625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.414985590778098, "frac_reward_zero_std": 0.0, "grad_norm": 0.3062778115272522, "kl": 3.7578125, "learning_rate": 5e-05, "loss": 0.0392, "num_tokens": 26971939.0, "reward": 7.027099609375, "reward_std": 0.22575540840625763, "rewards/helpfulness_reward/mean": 7.027099609375, "rewards/helpfulness_reward/std": 0.49123987555503845, "rewards/safety_reward/mean": 8.345703125, "rewards/safety_reward/std": 0.6027504801750183, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 123.65625, "completions/mean_terminated_length": 123.65625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.4153349052484499, "frac_reward_zero_std": 0.0, "grad_norm": 0.2319786250591278, "kl": 3.73046875, "learning_rate": 5e-05, "loss": 0.0411, "num_tokens": 26993159.0, "reward": 6.8572998046875, "reward_std": 0.26155510544776917, "rewards/helpfulness_reward/mean": 6.8572998046875, "rewards/helpfulness_reward/std": 0.9332218170166016, "rewards/safety_reward/mean": 8.19677734375, "rewards/safety_reward/std": 0.7582728862762451, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.5703125, "completions/mean_terminated_length": 123.5703125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.4156842197188019, "frac_reward_zero_std": 0.0, "grad_norm": 0.25773343443870544, "kl": 3.861328125, "learning_rate": 5e-05, "loss": 0.0358, "num_tokens": 27013048.0, "reward": 7.298828125, "reward_std": 0.17542922496795654, "rewards/helpfulness_reward/mean": 7.298828125, "rewards/helpfulness_reward/std": 0.522879421710968, "rewards/safety_reward/mean": 8.697509765625, "rewards/safety_reward/std": 0.411822646856308, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 123.953125, "completions/mean_terminated_length": 123.953125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.4160335341891538, "frac_reward_zero_std": 0.0, "grad_norm": 0.2724013328552246, "kl": 3.763671875, "learning_rate": 5e-05, "loss": 0.0363, "num_tokens": 27033498.0, "reward": 7.128173828125, "reward_std": 0.22772464156150818, "rewards/helpfulness_reward/mean": 7.128173828125, "rewards/helpfulness_reward/std": 0.4723343849182129, "rewards/safety_reward/mean": 8.25, "rewards/safety_reward/std": 0.39235591888427734, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 124.015625, "completions/mean_terminated_length": 124.015625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.4163828486595057, "frac_reward_zero_std": 0.0, "grad_norm": 0.2120278924703598, "kl": 3.83984375, "learning_rate": 5e-05, "loss": 0.0372, "num_tokens": 27053652.0, "reward": 7.18017578125, "reward_std": 0.198802649974823, "rewards/helpfulness_reward/mean": 7.18017578125, "rewards/helpfulness_reward/std": 0.4377545118331909, "rewards/safety_reward/mean": 8.41943359375, "rewards/safety_reward/std": 0.46615272760391235, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 126.6171875, "completions/mean_terminated_length": 126.6171875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.41673216312985767, "frac_reward_zero_std": 0.0, "grad_norm": 0.2401939332485199, "kl": 3.69140625, "learning_rate": 5e-05, "loss": 0.0492, "num_tokens": 27073995.0, "reward": 6.96954345703125, "reward_std": 0.3732057511806488, "rewards/helpfulness_reward/mean": 6.96954345703125, "rewards/helpfulness_reward/std": 0.9415951371192932, "rewards/safety_reward/mean": 8.4666748046875, "rewards/safety_reward/std": 0.9866160154342651, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 124.265625, "completions/mean_terminated_length": 124.265625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.4170814776002096, "frac_reward_zero_std": 0.0, "grad_norm": 0.27284103631973267, "kl": 3.720703125, "learning_rate": 5e-05, "loss": 0.0364, "num_tokens": 27093525.0, "reward": 7.259765625, "reward_std": 0.29621028900146484, "rewards/helpfulness_reward/mean": 7.259765625, "rewards/helpfulness_reward/std": 0.6337084174156189, "rewards/safety_reward/mean": 8.481689453125, "rewards/safety_reward/std": 0.6007542014122009, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 124.59375, "completions/mean_terminated_length": 124.59375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.4174307920705615, "frac_reward_zero_std": 0.0, "grad_norm": 0.2746811509132385, "kl": 3.841796875, "learning_rate": 5e-05, "loss": 0.0408, "num_tokens": 27113873.0, "reward": 7.02734375, "reward_std": 0.19191290438175201, "rewards/helpfulness_reward/mean": 7.02734375, "rewards/helpfulness_reward/std": 0.49038803577423096, "rewards/safety_reward/mean": 8.24072265625, "rewards/safety_reward/std": 0.6009312272071838, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 123.5625, "completions/mean_terminated_length": 123.5625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.41778010654091347, "frac_reward_zero_std": 0.0, "grad_norm": 0.32327115535736084, "kl": 3.802734375, "learning_rate": 5e-05, "loss": 0.0276, "num_tokens": 27134609.0, "reward": 7.126220703125, "reward_std": 0.19524937868118286, "rewards/helpfulness_reward/mean": 7.126220703125, "rewards/helpfulness_reward/std": 0.5729884505271912, "rewards/safety_reward/mean": 8.3408203125, "rewards/safety_reward/std": 0.5769075155258179, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 123.7890625, "completions/mean_terminated_length": 123.7890625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.4181294210112654, "frac_reward_zero_std": 0.0, "grad_norm": 0.23792602121829987, "kl": 3.70703125, "learning_rate": 5e-05, "loss": 0.0288, "num_tokens": 27154222.0, "reward": 7.31982421875, "reward_std": 0.20879262685775757, "rewards/helpfulness_reward/mean": 7.31982421875, "rewards/helpfulness_reward/std": 0.39870306849479675, "rewards/safety_reward/mean": 8.401611328125, "rewards/safety_reward/std": 0.4821189045906067, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 124.6796875, "completions/mean_terminated_length": 124.6796875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.41847873548161735, "frac_reward_zero_std": 0.0, "grad_norm": 0.5250773429870605, "kl": 4.029296875, "learning_rate": 5e-05, "loss": 0.0414, "num_tokens": 27174133.0, "reward": 6.94189453125, "reward_std": 0.3462895154953003, "rewards/helpfulness_reward/mean": 6.94189453125, "rewards/helpfulness_reward/std": 0.6667379140853882, "rewards/safety_reward/mean": 8.061767578125, "rewards/safety_reward/std": 0.5916607975959778, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 124.6328125, "completions/mean_terminated_length": 124.6328125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.41882804995196926, "frac_reward_zero_std": 0.0, "grad_norm": 0.2594152092933655, "kl": 3.9921875, "learning_rate": 5e-05, "loss": 0.0385, "num_tokens": 27194454.0, "reward": 7.079345703125, "reward_std": 0.2834722399711609, "rewards/helpfulness_reward/mean": 7.079345703125, "rewards/helpfulness_reward/std": 0.6264243125915527, "rewards/safety_reward/mean": 8.30419921875, "rewards/safety_reward/std": 0.6046449542045593, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 124.15625, "completions/mean_terminated_length": 124.15625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4191773644223212, "frac_reward_zero_std": 0.0, "grad_norm": 0.2814221978187561, "kl": 3.90625, "learning_rate": 5e-05, "loss": 0.032, "num_tokens": 27215466.0, "reward": 6.9417724609375, "reward_std": 0.2709079384803772, "rewards/helpfulness_reward/mean": 6.9417724609375, "rewards/helpfulness_reward/std": 0.5994844436645508, "rewards/safety_reward/mean": 8.170166015625, "rewards/safety_reward/std": 0.5608408451080322, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 124.5234375, "completions/mean_terminated_length": 124.5234375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.41952667889267314, "frac_reward_zero_std": 0.0, "grad_norm": 0.3422923684120178, "kl": 4.150390625, "learning_rate": 5e-05, "loss": 0.0419, "num_tokens": 27235645.0, "reward": 7.09716796875, "reward_std": 0.28064584732055664, "rewards/helpfulness_reward/mean": 7.09716796875, "rewards/helpfulness_reward/std": 0.5184115171432495, "rewards/safety_reward/mean": 8.453125, "rewards/safety_reward/std": 0.4877939224243164, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 124.2578125, "completions/mean_terminated_length": 124.2578125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.41987599336302506, "frac_reward_zero_std": 0.0, "grad_norm": 0.282366007566452, "kl": 3.93359375, "learning_rate": 5e-05, "loss": 0.0331, "num_tokens": 27255102.0, "reward": 7.087646484375, "reward_std": 0.2699028253555298, "rewards/helpfulness_reward/mean": 7.087646484375, "rewards/helpfulness_reward/std": 0.5404278635978699, "rewards/safety_reward/mean": 8.326416015625, "rewards/safety_reward/std": 0.4392857253551483, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 162.5625, "completions/mean_terminated_length": 149.73228454589844, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.42022530783337697, "frac_reward_zero_std": 0.0, "grad_norm": 0.2555203437805176, "kl": 3.03125, "learning_rate": 5e-05, "loss": 0.0783, "num_tokens": 27283662.0, "reward": 6.102630615234375, "reward_std": 0.3853135108947754, "rewards/helpfulness_reward/mean": 6.102630615234375, "rewards/helpfulness_reward/std": 2.413639783859253, "rewards/safety_reward/mean": 7.36590576171875, "rewards/safety_reward/std": 2.9279210567474365, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 124.7421875, "completions/mean_terminated_length": 124.7421875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.42057462230372894, "frac_reward_zero_std": 0.0, "grad_norm": 0.594588041305542, "kl": 3.8984375, "learning_rate": 5e-05, "loss": 0.0388, "num_tokens": 27304117.0, "reward": 7.265380859375, "reward_std": 0.20887550711631775, "rewards/helpfulness_reward/mean": 7.265380859375, "rewards/helpfulness_reward/std": 0.5538485646247864, "rewards/safety_reward/mean": 8.526123046875, "rewards/safety_reward/std": 0.5498324632644653, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 124.125, "completions/mean_terminated_length": 124.125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.42092393677408085, "frac_reward_zero_std": 0.0, "grad_norm": 0.2207709103822708, "kl": 3.78125, "learning_rate": 5e-05, "loss": 0.0333, "num_tokens": 27323997.0, "reward": 6.93408203125, "reward_std": 0.2731107473373413, "rewards/helpfulness_reward/mean": 6.93408203125, "rewards/helpfulness_reward/std": 0.5581116080284119, "rewards/safety_reward/mean": 8.43359375, "rewards/safety_reward/std": 0.5033261775970459, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 124.5703125, "completions/mean_terminated_length": 124.5703125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.4212732512444328, "frac_reward_zero_std": 0.0, "grad_norm": 0.2655874788761139, "kl": 3.6953125, "learning_rate": 5e-05, "loss": 0.0383, "num_tokens": 27343710.0, "reward": 7.067626953125, "reward_std": 0.36388450860977173, "rewards/helpfulness_reward/mean": 7.067626953125, "rewards/helpfulness_reward/std": 0.7294571399688721, "rewards/safety_reward/mean": 8.36376953125, "rewards/safety_reward/std": 0.5759109854698181, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 124.5546875, "completions/mean_terminated_length": 124.5546875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.42162256571478474, "frac_reward_zero_std": 0.0, "grad_norm": 0.2769879400730133, "kl": 3.87109375, "learning_rate": 5e-05, "loss": 0.0357, "num_tokens": 27363669.0, "reward": 7.106201171875, "reward_std": 0.2997280955314636, "rewards/helpfulness_reward/mean": 7.106201171875, "rewards/helpfulness_reward/std": 0.6123892068862915, "rewards/safety_reward/mean": 8.23974609375, "rewards/safety_reward/std": 0.5776491165161133, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 917.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 144.71875, "completions/mean_terminated_length": 144.71875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.42197188018513665, "frac_reward_zero_std": 0.0, "grad_norm": 0.3769305348396301, "kl": 3.466796875, "learning_rate": 5e-05, "loss": 0.0877, "num_tokens": 27389481.0, "reward": 5.967292785644531, "reward_std": 0.4670793414115906, "rewards/helpfulness_reward/mean": 5.967292785644531, "rewards/helpfulness_reward/std": 2.3041162490844727, "rewards/safety_reward/mean": 7.015838623046875, "rewards/safety_reward/std": 2.6410200595855713, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 124.2734375, "completions/mean_terminated_length": 124.2734375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.4223211946554886, "frac_reward_zero_std": 0.0, "grad_norm": 0.6874379515647888, "kl": 4.4140625, "learning_rate": 5e-05, "loss": 0.0399, "num_tokens": 27409188.0, "reward": 7.0517578125, "reward_std": 0.29679107666015625, "rewards/helpfulness_reward/mean": 7.0517578125, "rewards/helpfulness_reward/std": 0.6834890842437744, "rewards/safety_reward/mean": 8.440673828125, "rewards/safety_reward/std": 0.5286431312561035, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 124.3515625, "completions/mean_terminated_length": 124.3515625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.42267050912584053, "frac_reward_zero_std": 0.0, "grad_norm": 0.210683211684227, "kl": 3.802734375, "learning_rate": 5e-05, "loss": 0.0318, "num_tokens": 27428721.0, "reward": 7.13037109375, "reward_std": 0.28142300248146057, "rewards/helpfulness_reward/mean": 7.13037109375, "rewards/helpfulness_reward/std": 0.5836744904518127, "rewards/safety_reward/mean": 8.40576171875, "rewards/safety_reward/std": 0.5588929653167725, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 124.5625, "completions/mean_terminated_length": 124.5625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.42301982359619245, "frac_reward_zero_std": 0.0, "grad_norm": 0.2694905400276184, "kl": 3.970703125, "learning_rate": 5e-05, "loss": 0.0378, "num_tokens": 27448937.0, "reward": 7.112060546875, "reward_std": 0.29029083251953125, "rewards/helpfulness_reward/mean": 7.112060546875, "rewards/helpfulness_reward/std": 0.5827570557594299, "rewards/safety_reward/mean": 8.40234375, "rewards/safety_reward/std": 0.3541618585586548, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 124.0859375, "completions/mean_terminated_length": 124.0859375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.4233691380665444, "frac_reward_zero_std": 0.0, "grad_norm": 0.41393545269966125, "kl": 3.92578125, "learning_rate": 5e-05, "loss": 0.0335, "num_tokens": 27470100.0, "reward": 6.8583984375, "reward_std": 0.3163858652114868, "rewards/helpfulness_reward/mean": 6.8583984375, "rewards/helpfulness_reward/std": 0.8333368301391602, "rewards/safety_reward/mean": 8.2158203125, "rewards/safety_reward/std": 0.6063461303710938, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 139.953125, "completions/mean_terminated_length": 126.94487762451172, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.42371845253689633, "frac_reward_zero_std": 0.0, "grad_norm": 0.2364964783191681, "kl": 3.484375, "learning_rate": 5e-05, "loss": 0.0837, "num_tokens": 27494654.0, "reward": 6.65289306640625, "reward_std": 0.3180864453315735, "rewards/helpfulness_reward/mean": 6.65289306640625, "rewards/helpfulness_reward/std": 1.643993854522705, "rewards/safety_reward/mean": 7.8278961181640625, "rewards/safety_reward/std": 2.021986961364746, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 124.640625, "completions/mean_terminated_length": 124.640625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.4240677670072483, "frac_reward_zero_std": 0.0, "grad_norm": 0.25231337547302246, "kl": 3.685546875, "learning_rate": 5e-05, "loss": 0.0389, "num_tokens": 27514456.0, "reward": 7.032470703125, "reward_std": 0.2807440757751465, "rewards/helpfulness_reward/mean": 7.032470703125, "rewards/helpfulness_reward/std": 0.48628461360931396, "rewards/safety_reward/mean": 8.39501953125, "rewards/safety_reward/std": 0.4843251407146454, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 124.4375, "completions/mean_terminated_length": 124.4375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.4244170814776002, "frac_reward_zero_std": 0.0, "grad_norm": 0.1868339627981186, "kl": 3.783203125, "learning_rate": 5e-05, "loss": 0.033, "num_tokens": 27534176.0, "reward": 7.22998046875, "reward_std": 0.2611245810985565, "rewards/helpfulness_reward/mean": 7.22998046875, "rewards/helpfulness_reward/std": 0.5523434281349182, "rewards/safety_reward/mean": 8.435791015625, "rewards/safety_reward/std": 0.36587294936180115, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 124.5078125, "completions/mean_terminated_length": 124.5078125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.4247663959479521, "frac_reward_zero_std": 0.0, "grad_norm": 0.28502774238586426, "kl": 3.84765625, "learning_rate": 5e-05, "loss": 0.038, "num_tokens": 27553937.0, "reward": 7.07568359375, "reward_std": 0.2586135268211365, "rewards/helpfulness_reward/mean": 7.07568359375, "rewards/helpfulness_reward/std": 0.6859616041183472, "rewards/safety_reward/mean": 8.317626953125, "rewards/safety_reward/std": 0.5321072936058044, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 123.3046875, "completions/mean_terminated_length": 123.3046875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.4251157104183041, "frac_reward_zero_std": 0.0, "grad_norm": 0.20572584867477417, "kl": 3.759765625, "learning_rate": 5e-05, "loss": 0.0269, "num_tokens": 27574288.0, "reward": 6.7803955078125, "reward_std": 0.3894854784011841, "rewards/helpfulness_reward/mean": 6.7803955078125, "rewards/helpfulness_reward/std": 1.2646522521972656, "rewards/safety_reward/mean": 8.1561279296875, "rewards/safety_reward/std": 0.9708600640296936, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 134.8984375, "completions/mean_terminated_length": 134.8984375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.425465024888656, "frac_reward_zero_std": 0.0, "grad_norm": 0.23186084628105164, "kl": 3.494140625, "learning_rate": 5e-05, "loss": 0.0273, "num_tokens": 27598595.0, "reward": 6.49737548828125, "reward_std": 0.32902854681015015, "rewards/helpfulness_reward/mean": 6.49737548828125, "rewards/helpfulness_reward/std": 1.611227035522461, "rewards/safety_reward/mean": 7.666099548339844, "rewards/safety_reward/std": 2.0048999786376953, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 124.5, "completions/mean_terminated_length": 124.5, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.4258143393590079, "frac_reward_zero_std": 0.0, "grad_norm": 0.23992443084716797, "kl": 3.767578125, "learning_rate": 5e-05, "loss": 0.0344, "num_tokens": 27619243.0, "reward": 7.061279296875, "reward_std": 0.2936737835407257, "rewards/helpfulness_reward/mean": 7.061279296875, "rewards/helpfulness_reward/std": 0.5678524374961853, "rewards/safety_reward/mean": 8.324462890625, "rewards/safety_reward/std": 0.5133269429206848, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 124.3203125, "completions/mean_terminated_length": 124.3203125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.4261636538293599, "frac_reward_zero_std": 0.0, "grad_norm": 0.24946323037147522, "kl": 3.82421875, "learning_rate": 5e-05, "loss": 0.0358, "num_tokens": 27639412.0, "reward": 7.065673828125, "reward_std": 0.2927505671977997, "rewards/helpfulness_reward/mean": 7.065673828125, "rewards/helpfulness_reward/std": 0.4765644669532776, "rewards/safety_reward/mean": 8.4990234375, "rewards/safety_reward/std": 0.4175892770290375, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 124.890625, "completions/mean_terminated_length": 124.890625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.4265129682997118, "frac_reward_zero_std": 0.0, "grad_norm": 0.2784913182258606, "kl": 3.974609375, "learning_rate": 5e-05, "loss": 0.041, "num_tokens": 27659094.0, "reward": 7.045654296875, "reward_std": 0.29609963297843933, "rewards/helpfulness_reward/mean": 7.045654296875, "rewards/helpfulness_reward/std": 0.5759665966033936, "rewards/safety_reward/mean": 8.281005859375, "rewards/safety_reward/std": 0.4891397953033447, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 124.734375, "completions/mean_terminated_length": 124.734375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.4268622827700638, "frac_reward_zero_std": 0.0, "grad_norm": 0.2706528306007385, "kl": 4.15234375, "learning_rate": 5e-05, "loss": 0.0436, "num_tokens": 27678932.0, "reward": 7.1630859375, "reward_std": 0.3285480737686157, "rewards/helpfulness_reward/mean": 7.1630859375, "rewards/helpfulness_reward/std": 0.49229636788368225, "rewards/safety_reward/mean": 8.413818359375, "rewards/safety_reward/std": 0.502267062664032, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 125.109375, "completions/mean_terminated_length": 125.109375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.4272115972404157, "frac_reward_zero_std": 0.0, "grad_norm": 0.28150370717048645, "kl": 3.98828125, "learning_rate": 5e-05, "loss": 0.0386, "num_tokens": 27699522.0, "reward": 6.946533203125, "reward_std": 0.2724371552467346, "rewards/helpfulness_reward/mean": 6.946533203125, "rewards/helpfulness_reward/std": 0.5220906138420105, "rewards/safety_reward/mean": 8.434326171875, "rewards/safety_reward/std": 0.4316581189632416, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 124.7109375, "completions/mean_terminated_length": 124.7109375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.4275609117107676, "frac_reward_zero_std": 0.0, "grad_norm": 0.38927823305130005, "kl": 3.80078125, "learning_rate": 5e-05, "loss": 0.0416, "num_tokens": 27719381.0, "reward": 7.0601806640625, "reward_std": 0.29397106170654297, "rewards/helpfulness_reward/mean": 7.0601806640625, "rewards/helpfulness_reward/std": 0.6725316047668457, "rewards/safety_reward/mean": 8.374267578125, "rewards/safety_reward/std": 0.551671028137207, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 124.484375, "completions/mean_terminated_length": 124.484375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.42791022618111957, "frac_reward_zero_std": 0.0, "grad_norm": 0.23565657436847687, "kl": 3.67578125, "learning_rate": 5e-05, "loss": 0.0361, "num_tokens": 27739291.0, "reward": 7.143310546875, "reward_std": 0.31246864795684814, "rewards/helpfulness_reward/mean": 7.143310546875, "rewards/helpfulness_reward/std": 0.49437856674194336, "rewards/safety_reward/mean": 8.314697265625, "rewards/safety_reward/std": 0.5741250514984131, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 124.7421875, "completions/mean_terminated_length": 124.7421875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.4282595406514715, "frac_reward_zero_std": 0.0, "grad_norm": 9.805963516235352, "kl": 6.3046875, "learning_rate": 5e-05, "loss": 0.0611, "num_tokens": 27761706.0, "reward": 6.7454833984375, "reward_std": 0.3212214708328247, "rewards/helpfulness_reward/mean": 6.7454833984375, "rewards/helpfulness_reward/std": 1.070542812347412, "rewards/safety_reward/mean": 8.071044921875, "rewards/safety_reward/std": 1.1912301778793335, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 124.5234375, "completions/mean_terminated_length": 124.5234375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4286088551218234, "frac_reward_zero_std": 0.0, "grad_norm": 0.23948875069618225, "kl": 3.79296875, "learning_rate": 5e-05, "loss": 0.0382, "num_tokens": 27781501.0, "reward": 7.04638671875, "reward_std": 0.21891365945339203, "rewards/helpfulness_reward/mean": 7.04638671875, "rewards/helpfulness_reward/std": 0.43734151124954224, "rewards/safety_reward/mean": 8.246826171875, "rewards/safety_reward/std": 0.4293719232082367, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1327.0, "completions/max_terminated_length": 1327.0, "completions/mean_length": 152.625, "completions/mean_terminated_length": 152.625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.42895816959217536, "frac_reward_zero_std": 0.0, "grad_norm": 0.31468304991722107, "kl": 3.32421875, "learning_rate": 5e-05, "loss": 0.1393, "num_tokens": 27806525.0, "reward": 6.883209228515625, "reward_std": 0.3906583786010742, "rewards/helpfulness_reward/mean": 6.883209228515625, "rewards/helpfulness_reward/std": 1.2063593864440918, "rewards/safety_reward/mean": 8.227561950683594, "rewards/safety_reward/std": 1.3840092420578003, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.9609375, "completions/mean_terminated_length": 123.9609375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.4293074840625273, "frac_reward_zero_std": 0.0, "grad_norm": 0.260051429271698, "kl": 3.779296875, "learning_rate": 5e-05, "loss": 0.0372, "num_tokens": 27827272.0, "reward": 7.093994140625, "reward_std": 0.2354201376438141, "rewards/helpfulness_reward/mean": 7.093994140625, "rewards/helpfulness_reward/std": 0.6791105270385742, "rewards/safety_reward/mean": 8.300048828125, "rewards/safety_reward/std": 0.4168225824832916, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1045.0, "completions/mean_length": 159.7109375, "completions/mean_terminated_length": 133.8015899658203, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.42965679853287925, "frac_reward_zero_std": 0.0, "grad_norm": 0.23570868372917175, "kl": 3.37890625, "learning_rate": 5e-05, "loss": 0.088, "num_tokens": 27853987.0, "reward": 6.741397857666016, "reward_std": 0.4228941798210144, "rewards/helpfulness_reward/mean": 6.741397857666016, "rewards/helpfulness_reward/std": 1.695508599281311, "rewards/safety_reward/mean": 7.968910217285156, "rewards/safety_reward/std": 1.8681515455245972, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 124.3671875, "completions/mean_terminated_length": 124.3671875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.43000611300323116, "frac_reward_zero_std": 0.0, "grad_norm": 0.36140716075897217, "kl": 3.849609375, "learning_rate": 5e-05, "loss": 0.0504, "num_tokens": 27876346.0, "reward": 6.33172607421875, "reward_std": 0.4622044563293457, "rewards/helpfulness_reward/mean": 6.33172607421875, "rewards/helpfulness_reward/std": 1.6969788074493408, "rewards/safety_reward/mean": 7.539854049682617, "rewards/safety_reward/std": 2.226928949356079, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 130.03125, "completions/mean_terminated_length": 130.03125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.4303554274735831, "frac_reward_zero_std": 0.0, "grad_norm": 0.28504443168640137, "kl": 3.533203125, "learning_rate": 5e-05, "loss": 0.0252, "num_tokens": 27898710.0, "reward": 6.555183410644531, "reward_std": 0.444622665643692, "rewards/helpfulness_reward/mean": 6.555183410644531, "rewards/helpfulness_reward/std": 1.6416149139404297, "rewards/safety_reward/mean": 7.91436767578125, "rewards/safety_reward/std": 1.930720567703247, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 123.9453125, "completions/mean_terminated_length": 123.9453125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.43070474194393504, "frac_reward_zero_std": 0.0, "grad_norm": 0.26223400235176086, "kl": 3.7421875, "learning_rate": 5e-05, "loss": 0.0351, "num_tokens": 27918975.0, "reward": 7.01025390625, "reward_std": 0.2414456009864807, "rewards/helpfulness_reward/mean": 7.01025390625, "rewards/helpfulness_reward/std": 0.6261690258979797, "rewards/safety_reward/mean": 8.48095703125, "rewards/safety_reward/std": 0.45387715101242065, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 123.84375, "completions/mean_terminated_length": 123.84375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.43105405641428696, "frac_reward_zero_std": 0.0, "grad_norm": 0.2535575330257416, "kl": 3.884765625, "learning_rate": 5e-05, "loss": 0.0404, "num_tokens": 27938419.0, "reward": 7.151123046875, "reward_std": 0.27428144216537476, "rewards/helpfulness_reward/mean": 7.151123046875, "rewards/helpfulness_reward/std": 0.5320928692817688, "rewards/safety_reward/mean": 8.242919921875, "rewards/safety_reward/std": 0.5676602721214294, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 123.9921875, "completions/mean_terminated_length": 123.9921875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.43140337088463887, "frac_reward_zero_std": 0.0, "grad_norm": 0.27188488841056824, "kl": 3.865234375, "learning_rate": 5e-05, "loss": 0.039, "num_tokens": 27958226.0, "reward": 7.03515625, "reward_std": 0.28213727474212646, "rewards/helpfulness_reward/mean": 7.03515625, "rewards/helpfulness_reward/std": 0.5504505634307861, "rewards/safety_reward/mean": 8.406982421875, "rewards/safety_reward/std": 0.4433569014072418, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 124.125, "completions/mean_terminated_length": 124.125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.43175268535499084, "frac_reward_zero_std": 0.0, "grad_norm": 0.3480391800403595, "kl": 3.892578125, "learning_rate": 5e-05, "loss": 0.0417, "num_tokens": 27978210.0, "reward": 7.086181640625, "reward_std": 0.27287420630455017, "rewards/helpfulness_reward/mean": 7.086181640625, "rewards/helpfulness_reward/std": 0.4706367254257202, "rewards/safety_reward/mean": 8.455322265625, "rewards/safety_reward/std": 0.5759903788566589, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 123.359375, "completions/mean_terminated_length": 123.359375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.43210199982534275, "frac_reward_zero_std": 0.0, "grad_norm": 0.336334228515625, "kl": 3.955078125, "learning_rate": 5e-05, "loss": 0.0488, "num_tokens": 28000256.0, "reward": 6.66302490234375, "reward_std": 0.34988272190093994, "rewards/helpfulness_reward/mean": 6.66302490234375, "rewards/helpfulness_reward/std": 1.1967087984085083, "rewards/safety_reward/mean": 8.170166015625, "rewards/safety_reward/std": 1.1113845109939575, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 123.40625, "completions/mean_terminated_length": 123.40625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.4324513142956947, "frac_reward_zero_std": 0.0, "grad_norm": 0.2954374849796295, "kl": 3.892578125, "learning_rate": 5e-05, "loss": 0.031, "num_tokens": 28019876.0, "reward": 6.95123291015625, "reward_std": 0.4833470284938812, "rewards/helpfulness_reward/mean": 6.95123291015625, "rewards/helpfulness_reward/std": 0.9509021639823914, "rewards/safety_reward/mean": 8.121826171875, "rewards/safety_reward/std": 0.44509437680244446, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 123.75, "completions/mean_terminated_length": 123.75, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.43280062876604664, "frac_reward_zero_std": 0.0, "grad_norm": 0.21463800966739655, "kl": 3.81640625, "learning_rate": 5e-05, "loss": 0.0371, "num_tokens": 28040132.0, "reward": 6.99462890625, "reward_std": 0.2904343605041504, "rewards/helpfulness_reward/mean": 6.99462890625, "rewards/helpfulness_reward/std": 0.5883327126502991, "rewards/safety_reward/mean": 8.334228515625, "rewards/safety_reward/std": 0.4247031509876251, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 123.5, "completions/mean_terminated_length": 123.5, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.43314994323639855, "frac_reward_zero_std": 0.0, "grad_norm": 0.27230942249298096, "kl": 3.80859375, "learning_rate": 5e-05, "loss": 0.0339, "num_tokens": 28060564.0, "reward": 6.820556640625, "reward_std": 0.29046061635017395, "rewards/helpfulness_reward/mean": 6.820556640625, "rewards/helpfulness_reward/std": 0.46933600306510925, "rewards/safety_reward/mean": 8.345458984375, "rewards/safety_reward/std": 0.5343789458274841, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 123.5859375, "completions/mean_terminated_length": 123.5859375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.4334992577067505, "frac_reward_zero_std": 0.0, "grad_norm": 0.6781226396560669, "kl": 3.9609375, "learning_rate": 5e-05, "loss": 0.0381, "num_tokens": 28080271.0, "reward": 6.7939453125, "reward_std": 0.2644663453102112, "rewards/helpfulness_reward/mean": 6.7939453125, "rewards/helpfulness_reward/std": 0.5242700576782227, "rewards/safety_reward/mean": 8.254638671875, "rewards/safety_reward/std": 0.48911771178245544, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 125.9140625, "completions/mean_terminated_length": 125.9140625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.43384857217710243, "frac_reward_zero_std": 0.0, "grad_norm": 0.27460741996765137, "kl": 3.708984375, "learning_rate": 5e-05, "loss": 0.0397, "num_tokens": 28102364.0, "reward": 6.63568115234375, "reward_std": 0.37008529901504517, "rewards/helpfulness_reward/mean": 6.63568115234375, "rewards/helpfulness_reward/std": 1.4687873125076294, "rewards/safety_reward/mean": 7.85498046875, "rewards/safety_reward/std": 1.634228229522705, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.71875, "completions/mean_terminated_length": 123.71875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.43419788664745435, "frac_reward_zero_std": 0.0, "grad_norm": 0.5221396088600159, "kl": 4.271484375, "learning_rate": 5e-05, "loss": 0.0394, "num_tokens": 28122608.0, "reward": 6.88671875, "reward_std": 0.23218221962451935, "rewards/helpfulness_reward/mean": 6.88671875, "rewards/helpfulness_reward/std": 0.5034942030906677, "rewards/safety_reward/mean": 8.337890625, "rewards/safety_reward/std": 0.4051744043827057, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.0703125, "completions/mean_terminated_length": 123.0703125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.4345472011178063, "frac_reward_zero_std": 0.0, "grad_norm": 0.3092725872993469, "kl": 3.82421875, "learning_rate": 5e-05, "loss": 0.0286, "num_tokens": 28142257.0, "reward": 6.93701171875, "reward_std": 0.35776039958000183, "rewards/helpfulness_reward/mean": 6.93701171875, "rewards/helpfulness_reward/std": 0.6173994541168213, "rewards/safety_reward/mean": 8.135009765625, "rewards/safety_reward/std": 0.5629636645317078, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 122.890625, "completions/mean_terminated_length": 122.890625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.43489651558815823, "frac_reward_zero_std": 0.0, "grad_norm": 0.22102098166942596, "kl": 3.72265625, "learning_rate": 5e-05, "loss": 0.0263, "num_tokens": 28161883.0, "reward": 6.87060546875, "reward_std": 0.24634215235710144, "rewards/helpfulness_reward/mean": 6.87060546875, "rewards/helpfulness_reward/std": 0.49743643403053284, "rewards/safety_reward/mean": 8.17138671875, "rewards/safety_reward/std": 0.5537928342819214, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.0546875, "completions/mean_terminated_length": 123.0546875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.4352458300585102, "frac_reward_zero_std": 0.0, "grad_norm": 0.2067001312971115, "kl": 3.720703125, "learning_rate": 5e-05, "loss": 0.0339, "num_tokens": 28181826.0, "reward": 6.96337890625, "reward_std": 0.2618047595024109, "rewards/helpfulness_reward/mean": 6.96337890625, "rewards/helpfulness_reward/std": 0.5398164391517639, "rewards/safety_reward/mean": 8.40869140625, "rewards/safety_reward/std": 0.5878162384033203, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.1953125, "completions/mean_terminated_length": 123.1953125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.4355951445288621, "frac_reward_zero_std": 0.0, "grad_norm": 0.29313844442367554, "kl": 3.63671875, "learning_rate": 5e-05, "loss": 0.0267, "num_tokens": 28201947.0, "reward": 7.1259765625, "reward_std": 0.30047494173049927, "rewards/helpfulness_reward/mean": 7.1259765625, "rewards/helpfulness_reward/std": 0.6210744976997375, "rewards/safety_reward/mean": 8.5419921875, "rewards/safety_reward/std": 0.34859102964401245, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 122.984375, "completions/mean_terminated_length": 122.984375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.435944458999214, "frac_reward_zero_std": 0.0, "grad_norm": 1.0333755016326904, "kl": 4.34765625, "learning_rate": 5e-05, "loss": 0.0384, "num_tokens": 28221481.0, "reward": 7.1796875, "reward_std": 0.2893142104148865, "rewards/helpfulness_reward/mean": 7.1796875, "rewards/helpfulness_reward/std": 0.44907015562057495, "rewards/safety_reward/mean": 8.329345703125, "rewards/safety_reward/std": 0.4218025803565979, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.0546875, "completions/mean_terminated_length": 123.0546875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.436293773469566, "frac_reward_zero_std": 0.0, "grad_norm": 0.23286086320877075, "kl": 3.66796875, "learning_rate": 5e-05, "loss": 0.0237, "num_tokens": 28240896.0, "reward": 7.075927734375, "reward_std": 0.38336461782455444, "rewards/helpfulness_reward/mean": 7.075927734375, "rewards/helpfulness_reward/std": 0.5766708254814148, "rewards/safety_reward/mean": 8.162109375, "rewards/safety_reward/std": 0.5676946043968201, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 123.59375, "completions/mean_terminated_length": 123.59375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.4366430879399179, "frac_reward_zero_std": 0.0, "grad_norm": 0.23615297675132751, "kl": 3.654296875, "learning_rate": 5e-05, "loss": 0.0324, "num_tokens": 28260396.0, "reward": 7.10546875, "reward_std": 0.3128359019756317, "rewards/helpfulness_reward/mean": 7.10546875, "rewards/helpfulness_reward/std": 0.5091135501861572, "rewards/safety_reward/mean": 8.366943359375, "rewards/safety_reward/std": 0.5239297747612, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 123.484375, "completions/mean_terminated_length": 123.484375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.4369924024102698, "frac_reward_zero_std": 0.0, "grad_norm": 0.2661806046962738, "kl": 3.8515625, "learning_rate": 5e-05, "loss": 0.0343, "num_tokens": 28280402.0, "reward": 6.997802734375, "reward_std": 0.3335323929786682, "rewards/helpfulness_reward/mean": 6.997802734375, "rewards/helpfulness_reward/std": 0.6317935585975647, "rewards/safety_reward/mean": 8.277587890625, "rewards/safety_reward/std": 0.5602072477340698, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 123.5390625, "completions/mean_terminated_length": 123.5390625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4373417168806218, "frac_reward_zero_std": 0.0, "grad_norm": 0.2743144929409027, "kl": 3.705078125, "learning_rate": 5e-05, "loss": 0.0376, "num_tokens": 28300639.0, "reward": 6.94482421875, "reward_std": 0.3043210208415985, "rewards/helpfulness_reward/mean": 6.94482421875, "rewards/helpfulness_reward/std": 0.6031296849250793, "rewards/safety_reward/mean": 8.32763671875, "rewards/safety_reward/std": 0.44761204719543457, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.5625, "completions/mean_terminated_length": 123.5625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.4376910313509737, "frac_reward_zero_std": 0.0, "grad_norm": 0.27161070704460144, "kl": 3.634765625, "learning_rate": 5e-05, "loss": 0.033, "num_tokens": 28322175.0, "reward": 6.900146484375, "reward_std": 0.2653469145298004, "rewards/helpfulness_reward/mean": 6.900146484375, "rewards/helpfulness_reward/std": 0.7768632173538208, "rewards/safety_reward/mean": 8.1943359375, "rewards/safety_reward/std": 0.8712212443351746, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.84375, "completions/mean_terminated_length": 123.84375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.43804034582132567, "frac_reward_zero_std": 0.0, "grad_norm": 0.5051295161247253, "kl": 3.87890625, "learning_rate": 5e-05, "loss": 0.0354, "num_tokens": 28343563.0, "reward": 6.889404296875, "reward_std": 0.307118684053421, "rewards/helpfulness_reward/mean": 6.889404296875, "rewards/helpfulness_reward/std": 0.7379891872406006, "rewards/safety_reward/mean": 8.0908203125, "rewards/safety_reward/std": 0.6716639399528503, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.40625, "completions/mean_terminated_length": 123.40625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4383896602916776, "frac_reward_zero_std": 0.0, "grad_norm": 1.1891018152236938, "kl": 4.52734375, "learning_rate": 5e-05, "loss": 0.0439, "num_tokens": 28363263.0, "reward": 6.9423828125, "reward_std": 0.2978785037994385, "rewards/helpfulness_reward/mean": 6.9423828125, "rewards/helpfulness_reward/std": 0.6133440136909485, "rewards/safety_reward/mean": 8.418701171875, "rewards/safety_reward/std": 0.5730278491973877, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 123.625, "completions/mean_terminated_length": 123.625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.4387389747620295, "frac_reward_zero_std": 0.0, "grad_norm": 0.307432621717453, "kl": 3.6171875, "learning_rate": 5e-05, "loss": 0.0345, "num_tokens": 28382687.0, "reward": 7.058837890625, "reward_std": 0.31995952129364014, "rewards/helpfulness_reward/mean": 7.058837890625, "rewards/helpfulness_reward/std": 0.43728238344192505, "rewards/safety_reward/mean": 8.107421875, "rewards/safety_reward/std": 0.4584540128707886, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 123.9609375, "completions/mean_terminated_length": 123.9609375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.43908828923238147, "frac_reward_zero_std": 0.0, "grad_norm": 0.2777639925479889, "kl": 3.7265625, "learning_rate": 5e-05, "loss": 0.035, "num_tokens": 28402562.0, "reward": 6.964111328125, "reward_std": 0.3677389621734619, "rewards/helpfulness_reward/mean": 6.964111328125, "rewards/helpfulness_reward/std": 0.6300252079963684, "rewards/safety_reward/mean": 8.12451171875, "rewards/safety_reward/std": 0.5376737117767334, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 123.8828125, "completions/mean_terminated_length": 123.8828125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.4394376037027334, "frac_reward_zero_std": 0.0, "grad_norm": 0.7128207087516785, "kl": 4.017578125, "learning_rate": 5e-05, "loss": 0.0388, "num_tokens": 28422379.0, "reward": 6.84326171875, "reward_std": 0.3762146234512329, "rewards/helpfulness_reward/mean": 6.84326171875, "rewards/helpfulness_reward/std": 0.7244774103164673, "rewards/safety_reward/mean": 8.15966796875, "rewards/safety_reward/std": 0.7497460246086121, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 124.0703125, "completions/mean_terminated_length": 124.0703125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4397869181730853, "frac_reward_zero_std": 0.0, "grad_norm": 0.2730371654033661, "kl": 3.7421875, "learning_rate": 5e-05, "loss": 0.033, "num_tokens": 28442156.0, "reward": 7.031005859375, "reward_std": 0.23767898976802826, "rewards/helpfulness_reward/mean": 7.031005859375, "rewards/helpfulness_reward/std": 0.617816686630249, "rewards/safety_reward/mean": 8.421630859375, "rewards/safety_reward/std": 0.6012080311775208, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 124.1484375, "completions/mean_terminated_length": 124.1484375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.44013623264343726, "frac_reward_zero_std": 0.0, "grad_norm": 0.24429196119308472, "kl": 3.779296875, "learning_rate": 5e-05, "loss": 0.0344, "num_tokens": 28461879.0, "reward": 7.140869140625, "reward_std": 0.302362859249115, "rewards/helpfulness_reward/mean": 7.140869140625, "rewards/helpfulness_reward/std": 0.6704901456832886, "rewards/safety_reward/mean": 8.451416015625, "rewards/safety_reward/std": 0.5695885419845581, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 143.359375, "completions/mean_terminated_length": 130.37794494628906, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.4404855471137892, "frac_reward_zero_std": 0.0, "grad_norm": 0.2885024845600128, "kl": 3.3125, "learning_rate": 5e-05, "loss": 0.0616, "num_tokens": 28487437.0, "reward": 6.600147247314453, "reward_std": 0.32152941823005676, "rewards/helpfulness_reward/mean": 6.600147247314453, "rewards/helpfulness_reward/std": 1.601521372795105, "rewards/safety_reward/mean": 7.935520172119141, "rewards/safety_reward/std": 1.855072021484375, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 124.0234375, "completions/mean_terminated_length": 124.0234375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.44083486158414115, "frac_reward_zero_std": 0.0, "grad_norm": 0.26696136593818665, "kl": 3.73046875, "learning_rate": 5e-05, "loss": 0.0374, "num_tokens": 28507616.0, "reward": 6.82763671875, "reward_std": 0.25519585609436035, "rewards/helpfulness_reward/mean": 6.82763671875, "rewards/helpfulness_reward/std": 0.7345227003097534, "rewards/safety_reward/mean": 8.1923828125, "rewards/safety_reward/std": 0.7232499718666077, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 124.9296875, "completions/mean_terminated_length": 124.9296875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.44118417605449306, "frac_reward_zero_std": 0.0, "grad_norm": 0.8608049154281616, "kl": 4.189453125, "learning_rate": 5e-05, "loss": 0.0501, "num_tokens": 28527295.0, "reward": 7.101318359375, "reward_std": 0.24194423854351044, "rewards/helpfulness_reward/mean": 7.101318359375, "rewards/helpfulness_reward/std": 0.6197831630706787, "rewards/safety_reward/mean": 8.56884765625, "rewards/safety_reward/std": 0.3628985583782196, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1701.0, "completions/max_terminated_length": 1701.0, "completions/mean_length": 136.453125, "completions/mean_terminated_length": 136.453125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.441533490524845, "frac_reward_zero_std": 0.0, "grad_norm": 0.2935852110385895, "kl": 3.443359375, "learning_rate": 5e-05, "loss": 0.1214, "num_tokens": 28553121.0, "reward": 6.239013671875, "reward_std": 0.41862547397613525, "rewards/helpfulness_reward/mean": 6.239013671875, "rewards/helpfulness_reward/std": 1.6928812265396118, "rewards/safety_reward/mean": 7.5719757080078125, "rewards/safety_reward/std": 2.2915167808532715, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 124.7890625, "completions/mean_terminated_length": 124.7890625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.44188280499519694, "frac_reward_zero_std": 0.0, "grad_norm": 0.2939378619194031, "kl": 3.748046875, "learning_rate": 5e-05, "loss": 0.0432, "num_tokens": 28573278.0, "reward": 6.9169921875, "reward_std": 0.3603147268295288, "rewards/helpfulness_reward/mean": 6.9169921875, "rewards/helpfulness_reward/std": 0.6472353339195251, "rewards/safety_reward/mean": 8.214111328125, "rewards/safety_reward/std": 0.5926688313484192, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 123.7890625, "completions/mean_terminated_length": 123.7890625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.44223211946554886, "frac_reward_zero_std": 0.0, "grad_norm": 0.23341059684753418, "kl": 3.734375, "learning_rate": 5e-05, "loss": 0.0353, "num_tokens": 28594475.0, "reward": 7.151611328125, "reward_std": 0.29463112354278564, "rewards/helpfulness_reward/mean": 7.151611328125, "rewards/helpfulness_reward/std": 0.4833613634109497, "rewards/safety_reward/mean": 8.271240234375, "rewards/safety_reward/std": 0.4856284260749817, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 123.6328125, "completions/mean_terminated_length": 123.6328125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.44258143393590077, "frac_reward_zero_std": 0.0, "grad_norm": 0.3413330018520355, "kl": 3.98046875, "learning_rate": 5e-05, "loss": 0.0374, "num_tokens": 28615140.0, "reward": 6.7236328125, "reward_std": 0.265849232673645, "rewards/helpfulness_reward/mean": 6.7236328125, "rewards/helpfulness_reward/std": 0.5510011911392212, "rewards/safety_reward/mean": 8.272705078125, "rewards/safety_reward/std": 0.4198623299598694, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 124.7109375, "completions/mean_terminated_length": 124.7109375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.44293074840625274, "frac_reward_zero_std": 0.0, "grad_norm": 0.9119359850883484, "kl": 4.2578125, "learning_rate": 5e-05, "loss": 0.0455, "num_tokens": 28635055.0, "reward": 7.061279296875, "reward_std": 0.29080337285995483, "rewards/helpfulness_reward/mean": 7.061279296875, "rewards/helpfulness_reward/std": 0.5156328678131104, "rewards/safety_reward/mean": 8.5107421875, "rewards/safety_reward/std": 0.4833509624004364, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 123.8359375, "completions/mean_terminated_length": 123.8359375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.44328006287660465, "frac_reward_zero_std": 0.0, "grad_norm": 0.27192768454551697, "kl": 3.8125, "learning_rate": 5e-05, "loss": 0.0411, "num_tokens": 28655202.0, "reward": 6.96728515625, "reward_std": 0.3928609788417816, "rewards/helpfulness_reward/mean": 6.96728515625, "rewards/helpfulness_reward/std": 0.5965969562530518, "rewards/safety_reward/mean": 8.36865234375, "rewards/safety_reward/std": 0.6523937582969666, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 123.90625, "completions/mean_terminated_length": 123.90625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.4436293773469566, "frac_reward_zero_std": 0.0, "grad_norm": 0.26485946774482727, "kl": 3.6328125, "learning_rate": 5e-05, "loss": 0.0354, "num_tokens": 28675262.0, "reward": 6.975341796875, "reward_std": 0.3845977783203125, "rewards/helpfulness_reward/mean": 6.975341796875, "rewards/helpfulness_reward/std": 0.5778160095214844, "rewards/safety_reward/mean": 8.15625, "rewards/safety_reward/std": 0.5703331232070923, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 124.515625, "completions/mean_terminated_length": 124.515625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.44397869181730854, "frac_reward_zero_std": 0.0, "grad_norm": 0.3552599847316742, "kl": 4.017578125, "learning_rate": 5e-05, "loss": 0.0464, "num_tokens": 28696048.0, "reward": 6.799560546875, "reward_std": 0.4327350854873657, "rewards/helpfulness_reward/mean": 6.799560546875, "rewards/helpfulness_reward/std": 0.6444026231765747, "rewards/safety_reward/mean": 8.2490234375, "rewards/safety_reward/std": 0.5915109515190125, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 124.03125, "completions/mean_terminated_length": 124.03125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.44432800628766045, "frac_reward_zero_std": 0.0, "grad_norm": 0.2831800878047943, "kl": 3.74609375, "learning_rate": 5e-05, "loss": 0.0409, "num_tokens": 28716348.0, "reward": 6.86865234375, "reward_std": 0.3996872901916504, "rewards/helpfulness_reward/mean": 6.86865234375, "rewards/helpfulness_reward/std": 0.5661023259162903, "rewards/safety_reward/mean": 8.385498046875, "rewards/safety_reward/std": 0.44910913705825806, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 123.96875, "completions/mean_terminated_length": 123.96875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4446773207580124, "frac_reward_zero_std": 0.0, "grad_norm": 0.28072813153266907, "kl": 3.759765625, "learning_rate": 5e-05, "loss": 0.0382, "num_tokens": 28737464.0, "reward": 6.786865234375, "reward_std": 0.41499245166778564, "rewards/helpfulness_reward/mean": 6.786865234375, "rewards/helpfulness_reward/std": 0.6995238661766052, "rewards/safety_reward/mean": 8.19189453125, "rewards/safety_reward/std": 0.5502931475639343, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1792.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 143.25, "completions/mean_terminated_length": 130.26771545410156, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.44502663522836433, "frac_reward_zero_std": 0.0, "grad_norm": 0.2788839042186737, "kl": 3.478515625, "learning_rate": 5e-05, "loss": 0.0975, "num_tokens": 28761728.0, "reward": 6.505344390869141, "reward_std": 0.3900550603866577, "rewards/helpfulness_reward/mean": 6.505344390869141, "rewards/helpfulness_reward/std": 1.5725860595703125, "rewards/safety_reward/mean": 7.891956329345703, "rewards/safety_reward/std": 1.8979958295822144, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1068.0, "completions/max_terminated_length": 1068.0, "completions/mean_length": 142.0078125, "completions/mean_terminated_length": 142.0078125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.44537594969871624, "frac_reward_zero_std": 0.0, "grad_norm": 7.051403522491455, "kl": 5.421875, "learning_rate": 5e-05, "loss": 0.1006, "num_tokens": 28785537.0, "reward": 6.6006927490234375, "reward_std": 0.42113929986953735, "rewards/helpfulness_reward/mean": 6.6006927490234375, "rewards/helpfulness_reward/std": 1.6063830852508545, "rewards/safety_reward/mean": 7.802520751953125, "rewards/safety_reward/std": 1.752656102180481, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 123.21875, "completions/mean_terminated_length": 123.21875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.4457252641690682, "frac_reward_zero_std": 0.0, "grad_norm": 0.2724766731262207, "kl": 3.818359375, "learning_rate": 5e-05, "loss": 0.0297, "num_tokens": 28805501.0, "reward": 6.682861328125, "reward_std": 0.35290369391441345, "rewards/helpfulness_reward/mean": 6.682861328125, "rewards/helpfulness_reward/std": 0.5964323282241821, "rewards/safety_reward/mean": 8.156982421875, "rewards/safety_reward/std": 0.39361727237701416, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 123.671875, "completions/mean_terminated_length": 123.671875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.4460745786394201, "frac_reward_zero_std": 0.0, "grad_norm": 0.30706948041915894, "kl": 3.734375, "learning_rate": 5e-05, "loss": 0.036, "num_tokens": 28824843.0, "reward": 6.7783203125, "reward_std": 0.39354801177978516, "rewards/helpfulness_reward/mean": 6.7783203125, "rewards/helpfulness_reward/std": 0.5635712146759033, "rewards/safety_reward/mean": 8.03857421875, "rewards/safety_reward/std": 0.4696674644947052, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 123.921875, "completions/mean_terminated_length": 123.921875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.4464238931097721, "frac_reward_zero_std": 0.0, "grad_norm": 0.3492777645587921, "kl": 3.728515625, "learning_rate": 5e-05, "loss": 0.0341, "num_tokens": 28845201.0, "reward": 6.897705078125, "reward_std": 0.42988258600234985, "rewards/helpfulness_reward/mean": 6.897705078125, "rewards/helpfulness_reward/std": 0.6882234215736389, "rewards/safety_reward/mean": 8.275390625, "rewards/safety_reward/std": 0.5538035035133362, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 125.3984375, "completions/mean_terminated_length": 125.3984375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.446773207580124, "frac_reward_zero_std": 0.0, "grad_norm": 0.3549439609050751, "kl": 3.7109375, "learning_rate": 5e-05, "loss": 0.0486, "num_tokens": 28866100.0, "reward": 6.6981201171875, "reward_std": 0.47602736949920654, "rewards/helpfulness_reward/mean": 6.6981201171875, "rewards/helpfulness_reward/std": 0.752653181552887, "rewards/safety_reward/mean": 8.022705078125, "rewards/safety_reward/std": 0.5874643325805664, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 120.3203125, "completions/mean_terminated_length": 120.3203125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.4471225220504759, "frac_reward_zero_std": 0.0, "grad_norm": 0.48609957098960876, "kl": 3.73828125, "learning_rate": 5e-05, "loss": 0.0351, "num_tokens": 28888581.0, "reward": 6.2291107177734375, "reward_std": 0.43556809425354004, "rewards/helpfulness_reward/mean": 6.2291107177734375, "rewards/helpfulness_reward/std": 1.6170977354049683, "rewards/safety_reward/mean": 7.729338645935059, "rewards/safety_reward/std": 2.0104849338531494, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 125.4453125, "completions/mean_terminated_length": 125.4453125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.4474718365208279, "frac_reward_zero_std": 0.0, "grad_norm": 0.3778114318847656, "kl": 3.8046875, "learning_rate": 5e-05, "loss": 0.0416, "num_tokens": 28909758.0, "reward": 6.8502197265625, "reward_std": 0.3895696997642517, "rewards/helpfulness_reward/mean": 6.8502197265625, "rewards/helpfulness_reward/std": 0.7674858570098877, "rewards/safety_reward/mean": 8.278564453125, "rewards/safety_reward/std": 0.6093264222145081, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 125.0078125, "completions/mean_terminated_length": 125.0078125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4478211509911798, "frac_reward_zero_std": 0.0, "grad_norm": 0.5733938813209534, "kl": 3.962890625, "learning_rate": 5e-05, "loss": 0.0509, "num_tokens": 28929823.0, "reward": 6.81591796875, "reward_std": 0.4298514127731323, "rewards/helpfulness_reward/mean": 6.81591796875, "rewards/helpfulness_reward/std": 0.6378867626190186, "rewards/safety_reward/mean": 8.24267578125, "rewards/safety_reward/std": 0.5695571303367615, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 125.625, "completions/mean_terminated_length": 125.625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.4481704654615317, "frac_reward_zero_std": 0.0, "grad_norm": 0.3345149755477905, "kl": 3.732421875, "learning_rate": 5e-05, "loss": 0.0485, "num_tokens": 28949983.0, "reward": 6.739501953125, "reward_std": 0.41098618507385254, "rewards/helpfulness_reward/mean": 6.739501953125, "rewards/helpfulness_reward/std": 0.7069792747497559, "rewards/safety_reward/mean": 8.131103515625, "rewards/safety_reward/std": 0.6423652768135071, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 125.6796875, "completions/mean_terminated_length": 125.6796875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.4485197799318837, "frac_reward_zero_std": 0.0, "grad_norm": 0.27223363518714905, "kl": 3.669921875, "learning_rate": 5e-05, "loss": 0.0473, "num_tokens": 28970646.0, "reward": 6.6754150390625, "reward_std": 0.4315507113933563, "rewards/helpfulness_reward/mean": 6.6754150390625, "rewards/helpfulness_reward/std": 0.6814340949058533, "rewards/safety_reward/mean": 8.316162109375, "rewards/safety_reward/std": 0.5247272253036499, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 125.3515625, "completions/mean_terminated_length": 125.3515625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.4488690944022356, "frac_reward_zero_std": 0.0, "grad_norm": 0.3242233395576477, "kl": 3.576171875, "learning_rate": 5e-05, "loss": 0.0486, "num_tokens": 28990795.0, "reward": 6.90673828125, "reward_std": 0.40644437074661255, "rewards/helpfulness_reward/mean": 6.90673828125, "rewards/helpfulness_reward/std": 0.6696637868881226, "rewards/safety_reward/mean": 8.314208984375, "rewards/safety_reward/std": 0.6194053888320923, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 124.9609375, "completions/mean_terminated_length": 124.9609375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.44921840887258757, "frac_reward_zero_std": 0.0, "grad_norm": 0.3341382145881653, "kl": 3.728515625, "learning_rate": 5e-05, "loss": 0.0436, "num_tokens": 29010662.0, "reward": 6.892333984375, "reward_std": 0.3188062310218811, "rewards/helpfulness_reward/mean": 6.892333984375, "rewards/helpfulness_reward/std": 0.6276372075080872, "rewards/safety_reward/mean": 8.23876953125, "rewards/safety_reward/std": 0.518819272518158, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 124.8125, "completions/mean_terminated_length": 124.8125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.4495677233429395, "frac_reward_zero_std": 0.0, "grad_norm": 0.2593269944190979, "kl": 3.587890625, "learning_rate": 5e-05, "loss": 0.0427, "num_tokens": 29030590.0, "reward": 7.053955078125, "reward_std": 0.28361862897872925, "rewards/helpfulness_reward/mean": 7.053955078125, "rewards/helpfulness_reward/std": 0.5329871773719788, "rewards/safety_reward/mean": 8.20068359375, "rewards/safety_reward/std": 0.6968606114387512, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 124.6796875, "completions/mean_terminated_length": 124.6796875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.4499170378132914, "frac_reward_zero_std": 0.0, "grad_norm": 0.2889939546585083, "kl": 3.89453125, "learning_rate": 5e-05, "loss": 0.0466, "num_tokens": 29051653.0, "reward": 6.58740234375, "reward_std": 0.3374907970428467, "rewards/helpfulness_reward/mean": 6.58740234375, "rewards/helpfulness_reward/std": 0.6694958209991455, "rewards/safety_reward/mean": 8.2470703125, "rewards/safety_reward/std": 0.4980034828186035, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 124.046875, "completions/mean_terminated_length": 124.046875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.45026635228364337, "frac_reward_zero_std": 0.0, "grad_norm": 0.2795979976654053, "kl": 3.7421875, "learning_rate": 5e-05, "loss": 0.0391, "num_tokens": 29072115.0, "reward": 6.99609375, "reward_std": 0.2919301688671112, "rewards/helpfulness_reward/mean": 6.99609375, "rewards/helpfulness_reward/std": 0.5164613127708435, "rewards/safety_reward/mean": 8.2783203125, "rewards/safety_reward/std": 0.5211366415023804, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 123.84375, "completions/mean_terminated_length": 123.84375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.4506156667539953, "frac_reward_zero_std": 0.0, "grad_norm": 0.2683001756668091, "kl": 3.68359375, "learning_rate": 5e-05, "loss": 0.0389, "num_tokens": 29093231.0, "reward": 6.73291015625, "reward_std": 0.24833935499191284, "rewards/helpfulness_reward/mean": 6.73291015625, "rewards/helpfulness_reward/std": 0.8172357678413391, "rewards/safety_reward/mean": 8.333740234375, "rewards/safety_reward/std": 0.7857491374015808, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.8046875, "completions/mean_terminated_length": 123.8046875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.4509649812243472, "frac_reward_zero_std": 0.0, "grad_norm": 0.2641695439815521, "kl": 3.630859375, "learning_rate": 5e-05, "loss": 0.0341, "num_tokens": 29113094.0, "reward": 6.876220703125, "reward_std": 0.279558926820755, "rewards/helpfulness_reward/mean": 6.876220703125, "rewards/helpfulness_reward/std": 0.6207211017608643, "rewards/safety_reward/mean": 8.216552734375, "rewards/safety_reward/std": 0.482423335313797, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 123.1640625, "completions/mean_terminated_length": 123.1640625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.45131429569469916, "frac_reward_zero_std": 0.0, "grad_norm": 0.19164139032363892, "kl": 3.7109375, "learning_rate": 5e-05, "loss": 0.034, "num_tokens": 29134315.0, "reward": 6.875732421875, "reward_std": 0.3047022223472595, "rewards/helpfulness_reward/mean": 6.875732421875, "rewards/helpfulness_reward/std": 0.8342362642288208, "rewards/safety_reward/mean": 8.177978515625, "rewards/safety_reward/std": 0.8881495594978333, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.4375, "completions/mean_terminated_length": 123.4375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.4516636101650511, "frac_reward_zero_std": 0.0, "grad_norm": 0.6331080198287964, "kl": 4.052734375, "learning_rate": 5e-05, "loss": 0.0376, "num_tokens": 29156107.0, "reward": 6.88037109375, "reward_std": 0.2561916708946228, "rewards/helpfulness_reward/mean": 6.88037109375, "rewards/helpfulness_reward/std": 0.6554950475692749, "rewards/safety_reward/mean": 8.370849609375, "rewards/safety_reward/std": 0.5842441320419312, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 124.0078125, "completions/mean_terminated_length": 124.0078125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.45201292463540305, "frac_reward_zero_std": 0.0, "grad_norm": 0.22328490018844604, "kl": 3.62890625, "learning_rate": 5e-05, "loss": 0.0377, "num_tokens": 29176556.0, "reward": 6.872314453125, "reward_std": 0.2541502118110657, "rewards/helpfulness_reward/mean": 6.872314453125, "rewards/helpfulness_reward/std": 0.485503226518631, "rewards/safety_reward/mean": 8.192138671875, "rewards/safety_reward/std": 0.4774872958660126, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 121.8203125, "completions/mean_terminated_length": 121.8203125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.45236223910575496, "frac_reward_zero_std": 0.0, "grad_norm": 0.27134838700294495, "kl": 3.708984375, "learning_rate": 5e-05, "loss": 0.0232, "num_tokens": 29199205.0, "reward": 6.82177734375, "reward_std": 0.30656832456588745, "rewards/helpfulness_reward/mean": 6.82177734375, "rewards/helpfulness_reward/std": 0.9299594759941101, "rewards/safety_reward/mean": 8.1407470703125, "rewards/safety_reward/std": 0.8683209419250488, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.4609375, "completions/mean_terminated_length": 123.4609375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4527115535761069, "frac_reward_zero_std": 0.0, "grad_norm": 0.3334880471229553, "kl": 3.90625, "learning_rate": 5e-05, "loss": 0.0312, "num_tokens": 29219752.0, "reward": 7.148193359375, "reward_std": 0.22396422922611237, "rewards/helpfulness_reward/mean": 7.148193359375, "rewards/helpfulness_reward/std": 0.5004956722259521, "rewards/safety_reward/mean": 8.47900390625, "rewards/safety_reward/std": 0.4236072599887848, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 123.3125, "completions/mean_terminated_length": 123.3125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.45306086804645884, "frac_reward_zero_std": 0.0, "grad_norm": 0.3811635673046112, "kl": 4.2109375, "learning_rate": 5e-05, "loss": 0.0361, "num_tokens": 29240472.0, "reward": 6.86474609375, "reward_std": 0.28661349415779114, "rewards/helpfulness_reward/mean": 6.86474609375, "rewards/helpfulness_reward/std": 0.7825246453285217, "rewards/safety_reward/mean": 8.384765625, "rewards/safety_reward/std": 0.46282756328582764, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 123.9140625, "completions/mean_terminated_length": 123.9140625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.45341018251681076, "frac_reward_zero_std": 0.0, "grad_norm": 0.224607452750206, "kl": 3.826171875, "learning_rate": 5e-05, "loss": 0.038, "num_tokens": 29261541.0, "reward": 7.072021484375, "reward_std": 0.2663927674293518, "rewards/helpfulness_reward/mean": 7.072021484375, "rewards/helpfulness_reward/std": 0.6157603859901428, "rewards/safety_reward/mean": 8.38916015625, "rewards/safety_reward/std": 0.5960940718650818, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.390625, "completions/mean_terminated_length": 123.390625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.45375949698716267, "frac_reward_zero_std": 0.0, "grad_norm": 0.25725042819976807, "kl": 3.822265625, "learning_rate": 5e-05, "loss": 0.0352, "num_tokens": 29280863.0, "reward": 7.25830078125, "reward_std": 0.1831497848033905, "rewards/helpfulness_reward/mean": 7.25830078125, "rewards/helpfulness_reward/std": 0.42029741406440735, "rewards/safety_reward/mean": 8.478515625, "rewards/safety_reward/std": 0.4245436191558838, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 123.203125, "completions/mean_terminated_length": 123.203125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.45410881145751464, "frac_reward_zero_std": 0.0, "grad_norm": 0.22569634020328522, "kl": 3.84375, "learning_rate": 5e-05, "loss": 0.0309, "num_tokens": 29301009.0, "reward": 6.91552734375, "reward_std": 0.2528674602508545, "rewards/helpfulness_reward/mean": 6.91552734375, "rewards/helpfulness_reward/std": 0.5726522207260132, "rewards/safety_reward/mean": 8.21435546875, "rewards/safety_reward/std": 0.6325787305831909, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 124.1484375, "completions/mean_terminated_length": 124.1484375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.45445812592786655, "frac_reward_zero_std": 0.0, "grad_norm": 0.20358583331108093, "kl": 3.73046875, "learning_rate": 5e-05, "loss": 0.0394, "num_tokens": 29320820.0, "reward": 7.093505859375, "reward_std": 0.272752970457077, "rewards/helpfulness_reward/mean": 7.093505859375, "rewards/helpfulness_reward/std": 0.5998955965042114, "rewards/safety_reward/mean": 8.29931640625, "rewards/safety_reward/std": 0.5858901143074036, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 123.1484375, "completions/mean_terminated_length": 123.1484375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.4548074403982185, "frac_reward_zero_std": 0.0, "grad_norm": 0.23025605082511902, "kl": 3.72265625, "learning_rate": 5e-05, "loss": 0.0303, "num_tokens": 29341391.0, "reward": 6.887939453125, "reward_std": 0.19018948078155518, "rewards/helpfulness_reward/mean": 6.887939453125, "rewards/helpfulness_reward/std": 0.8444858193397522, "rewards/safety_reward/mean": 8.25439453125, "rewards/safety_reward/std": 0.8673449754714966, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 124.2734375, "completions/mean_terminated_length": 124.2734375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.45515675486857043, "frac_reward_zero_std": 0.0, "grad_norm": 0.20652063190937042, "kl": 3.669921875, "learning_rate": 5e-05, "loss": 0.0352, "num_tokens": 29362138.0, "reward": 7.036865234375, "reward_std": 0.237380713224411, "rewards/helpfulness_reward/mean": 7.036865234375, "rewards/helpfulness_reward/std": 0.5428891181945801, "rewards/safety_reward/mean": 8.4072265625, "rewards/safety_reward/std": 0.33027541637420654, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 123.53125, "completions/mean_terminated_length": 123.53125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.45550606933892235, "frac_reward_zero_std": 0.0, "grad_norm": 0.5052892565727234, "kl": 3.81640625, "learning_rate": 5e-05, "loss": 0.0343, "num_tokens": 29382142.0, "reward": 6.912841796875, "reward_std": 0.23987412452697754, "rewards/helpfulness_reward/mean": 6.912841796875, "rewards/helpfulness_reward/std": 0.469735711812973, "rewards/safety_reward/mean": 8.2724609375, "rewards/safety_reward/std": 0.4180677533149719, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.6015625, "completions/mean_terminated_length": 123.6015625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.4558553838092743, "frac_reward_zero_std": 0.0, "grad_norm": 0.2415364384651184, "kl": 3.861328125, "learning_rate": 5e-05, "loss": 0.0331, "num_tokens": 29402539.0, "reward": 6.998291015625, "reward_std": 0.2510127127170563, "rewards/helpfulness_reward/mean": 6.998291015625, "rewards/helpfulness_reward/std": 0.5408154726028442, "rewards/safety_reward/mean": 8.469970703125, "rewards/safety_reward/std": 0.545967698097229, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 123.3359375, "completions/mean_terminated_length": 123.3359375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.45620469827962623, "frac_reward_zero_std": 0.0, "grad_norm": 0.5230164527893066, "kl": 3.962890625, "learning_rate": 5e-05, "loss": 0.032, "num_tokens": 29422374.0, "reward": 7.05908203125, "reward_std": 0.26491808891296387, "rewards/helpfulness_reward/mean": 7.05908203125, "rewards/helpfulness_reward/std": 0.5506772994995117, "rewards/safety_reward/mean": 8.412841796875, "rewards/safety_reward/std": 0.4227994978427887, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 123.7421875, "completions/mean_terminated_length": 123.7421875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.45655401274997814, "frac_reward_zero_std": 0.0, "grad_norm": 0.2368285208940506, "kl": 3.666015625, "learning_rate": 5e-05, "loss": 0.0385, "num_tokens": 29441845.0, "reward": 7.089111328125, "reward_std": 0.21009258925914764, "rewards/helpfulness_reward/mean": 7.089111328125, "rewards/helpfulness_reward/std": 0.4833931624889374, "rewards/safety_reward/mean": 8.353515625, "rewards/safety_reward/std": 0.49934980273246765, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 123.7265625, "completions/mean_terminated_length": 123.7265625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.4569033272203301, "frac_reward_zero_std": 0.0, "grad_norm": 0.23646190762519836, "kl": 3.810546875, "learning_rate": 5e-05, "loss": 0.0326, "num_tokens": 29462274.0, "reward": 6.815185546875, "reward_std": 0.2479824423789978, "rewards/helpfulness_reward/mean": 6.815185546875, "rewards/helpfulness_reward/std": 0.8217248916625977, "rewards/safety_reward/mean": 7.802978515625, "rewards/safety_reward/std": 1.1275094747543335, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 124.1640625, "completions/mean_terminated_length": 124.1640625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.457252641690682, "frac_reward_zero_std": 0.0, "grad_norm": 0.3456934690475464, "kl": 3.80859375, "learning_rate": 5e-05, "loss": 0.0404, "num_tokens": 29484119.0, "reward": 6.9599609375, "reward_std": 0.24747171998023987, "rewards/helpfulness_reward/mean": 6.9599609375, "rewards/helpfulness_reward/std": 0.5032335519790649, "rewards/safety_reward/mean": 8.46630859375, "rewards/safety_reward/std": 0.4860348105430603, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.328125, "completions/mean_terminated_length": 123.328125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.457601956161034, "frac_reward_zero_std": 0.0, "grad_norm": 0.3016493022441864, "kl": 3.712890625, "learning_rate": 5e-05, "loss": 0.0314, "num_tokens": 29504033.0, "reward": 7.084228515625, "reward_std": 0.2916874885559082, "rewards/helpfulness_reward/mean": 7.084228515625, "rewards/helpfulness_reward/std": 0.5241398215293884, "rewards/safety_reward/mean": 8.470458984375, "rewards/safety_reward/std": 0.3756004869937897, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 124.7734375, "completions/mean_terminated_length": 124.7734375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.4579512706313859, "frac_reward_zero_std": 0.0, "grad_norm": 0.37830302119255066, "kl": 3.85546875, "learning_rate": 5e-05, "loss": 0.0442, "num_tokens": 29524556.0, "reward": 7.10498046875, "reward_std": 0.23955415189266205, "rewards/helpfulness_reward/mean": 7.10498046875, "rewards/helpfulness_reward/std": 0.3874002695083618, "rewards/safety_reward/mean": 8.3115234375, "rewards/safety_reward/std": 0.5100632309913635, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 124.171875, "completions/mean_terminated_length": 124.171875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.4583005851017378, "frac_reward_zero_std": 0.0, "grad_norm": 0.24303163588047028, "kl": 3.748046875, "learning_rate": 5e-05, "loss": 0.0396, "num_tokens": 29544570.0, "reward": 6.985107421875, "reward_std": 0.32479164004325867, "rewards/helpfulness_reward/mean": 6.985107421875, "rewards/helpfulness_reward/std": 0.7362060546875, "rewards/safety_reward/mean": 8.197265625, "rewards/safety_reward/std": 0.8298120498657227, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.859375, "completions/mean_terminated_length": 123.859375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.4586498995720898, "frac_reward_zero_std": 0.0, "grad_norm": 0.27992475032806396, "kl": 3.994140625, "learning_rate": 5e-05, "loss": 0.0403, "num_tokens": 29564248.0, "reward": 7.065185546875, "reward_std": 0.2595895528793335, "rewards/helpfulness_reward/mean": 7.065185546875, "rewards/helpfulness_reward/std": 0.5863679051399231, "rewards/safety_reward/mean": 8.553466796875, "rewards/safety_reward/std": 0.4534859359264374, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 124.0625, "completions/mean_terminated_length": 124.0625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.4589992140424417, "frac_reward_zero_std": 0.0, "grad_norm": 2.507127046585083, "kl": 4.78125, "learning_rate": 5e-05, "loss": 0.0496, "num_tokens": 29584336.0, "reward": 6.963191986083984, "reward_std": 0.4519602358341217, "rewards/helpfulness_reward/mean": 6.963191986083984, "rewards/helpfulness_reward/std": 0.9058918356895447, "rewards/safety_reward/mean": 8.290283203125, "rewards/safety_reward/std": 0.9016961455345154, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 124.4453125, "completions/mean_terminated_length": 124.4453125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4593485285127936, "frac_reward_zero_std": 0.0, "grad_norm": 0.2447735071182251, "kl": 3.904296875, "learning_rate": 5e-05, "loss": 0.0407, "num_tokens": 29605273.0, "reward": 7.0208740234375, "reward_std": 0.23792234063148499, "rewards/helpfulness_reward/mean": 7.0208740234375, "rewards/helpfulness_reward/std": 0.6999704241752625, "rewards/safety_reward/mean": 8.30224609375, "rewards/safety_reward/std": 0.5788592100143433, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 124.0859375, "completions/mean_terminated_length": 124.0859375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4596978429831456, "frac_reward_zero_std": 0.0, "grad_norm": 0.2547249495983124, "kl": 3.919921875, "learning_rate": 5e-05, "loss": 0.0384, "num_tokens": 29625892.0, "reward": 6.95947265625, "reward_std": 0.301763117313385, "rewards/helpfulness_reward/mean": 6.95947265625, "rewards/helpfulness_reward/std": 0.7732350826263428, "rewards/safety_reward/mean": 8.431396484375, "rewards/safety_reward/std": 0.6591442823410034, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 123.9140625, "completions/mean_terminated_length": 123.9140625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.4600471574534975, "frac_reward_zero_std": 0.0, "grad_norm": 4.1546807289123535, "kl": 5.501953125, "learning_rate": 5e-05, "loss": 0.0544, "num_tokens": 29646289.0, "reward": 6.950439453125, "reward_std": 0.325276255607605, "rewards/helpfulness_reward/mean": 6.950439453125, "rewards/helpfulness_reward/std": 0.6647141575813293, "rewards/safety_reward/mean": 8.239990234375, "rewards/safety_reward/std": 0.6654357314109802, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 123.3046875, "completions/mean_terminated_length": 123.3046875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.46039647192384947, "frac_reward_zero_std": 0.0, "grad_norm": 0.27327704429626465, "kl": 3.87109375, "learning_rate": 5e-05, "loss": 0.0337, "num_tokens": 29666104.0, "reward": 6.9976806640625, "reward_std": 0.413138210773468, "rewards/helpfulness_reward/mean": 6.9976806640625, "rewards/helpfulness_reward/std": 0.7867646813392639, "rewards/safety_reward/mean": 8.2421875, "rewards/safety_reward/std": 0.7193007469177246, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 123.5703125, "completions/mean_terminated_length": 123.5703125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.4607457863942014, "frac_reward_zero_std": 0.0, "grad_norm": 0.27711984515190125, "kl": 3.73828125, "learning_rate": 5e-05, "loss": 0.0343, "num_tokens": 29685657.0, "reward": 7.0093994140625, "reward_std": 0.5483342409133911, "rewards/helpfulness_reward/mean": 7.0093994140625, "rewards/helpfulness_reward/std": 0.7348142266273499, "rewards/safety_reward/mean": 8.322998046875, "rewards/safety_reward/std": 0.7719702124595642, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 124.0703125, "completions/mean_terminated_length": 124.0703125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.4610951008645533, "frac_reward_zero_std": 0.0, "grad_norm": 0.24247752130031586, "kl": 3.93359375, "learning_rate": 5e-05, "loss": 0.0361, "num_tokens": 29705586.0, "reward": 6.9808349609375, "reward_std": 0.3983568549156189, "rewards/helpfulness_reward/mean": 6.9808349609375, "rewards/helpfulness_reward/std": 0.6234732270240784, "rewards/safety_reward/mean": 8.380615234375, "rewards/safety_reward/std": 0.6264185309410095, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 124.1796875, "completions/mean_terminated_length": 124.1796875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.46144441533490527, "frac_reward_zero_std": 0.0, "grad_norm": 0.27234792709350586, "kl": 3.900390625, "learning_rate": 5e-05, "loss": 0.039, "num_tokens": 29725265.0, "reward": 7.158447265625, "reward_std": 0.4896416962146759, "rewards/helpfulness_reward/mean": 7.158447265625, "rewards/helpfulness_reward/std": 0.6809953451156616, "rewards/safety_reward/mean": 8.39013671875, "rewards/safety_reward/std": 0.757832944393158, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 123.609375, "completions/mean_terminated_length": 123.609375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.4617937298052572, "frac_reward_zero_std": 0.0, "grad_norm": 0.23707666993141174, "kl": 3.6796875, "learning_rate": 5e-05, "loss": 0.0281, "num_tokens": 29745567.0, "reward": 6.74029541015625, "reward_std": 0.6531228423118591, "rewards/helpfulness_reward/mean": 6.74029541015625, "rewards/helpfulness_reward/std": 0.9259260296821594, "rewards/safety_reward/mean": 8.150146484375, "rewards/safety_reward/std": 0.7984961867332458, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 122.9296875, "completions/mean_terminated_length": 122.9296875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.4621430442756091, "frac_reward_zero_std": 0.0, "grad_norm": 0.2646704912185669, "kl": 3.76953125, "learning_rate": 5e-05, "loss": 0.0288, "num_tokens": 29765694.0, "reward": 6.76922607421875, "reward_std": 0.7635955810546875, "rewards/helpfulness_reward/mean": 6.76922607421875, "rewards/helpfulness_reward/std": 1.1029706001281738, "rewards/safety_reward/mean": 8.220947265625, "rewards/safety_reward/std": 0.859028697013855, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 123.4921875, "completions/mean_terminated_length": 123.4921875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.46249235874596106, "frac_reward_zero_std": 0.0, "grad_norm": 0.22288133203983307, "kl": 3.724609375, "learning_rate": 5e-05, "loss": 0.0328, "num_tokens": 29785709.0, "reward": 6.97210693359375, "reward_std": 0.39847251772880554, "rewards/helpfulness_reward/mean": 6.97210693359375, "rewards/helpfulness_reward/std": 0.8682792782783508, "rewards/safety_reward/mean": 8.311279296875, "rewards/safety_reward/std": 0.8052122592926025, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 123.2890625, "completions/mean_terminated_length": 123.2890625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.462841673216313, "frac_reward_zero_std": 0.0, "grad_norm": 0.2553658187389374, "kl": 3.833984375, "learning_rate": 5e-05, "loss": 0.0278, "num_tokens": 29805426.0, "reward": 6.73968505859375, "reward_std": 0.8732641339302063, "rewards/helpfulness_reward/mean": 6.73968505859375, "rewards/helpfulness_reward/std": 1.167894721031189, "rewards/safety_reward/mean": 8.24560546875, "rewards/safety_reward/std": 0.9709321856498718, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 123.8984375, "completions/mean_terminated_length": 123.8984375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.46319098768666495, "frac_reward_zero_std": 0.0, "grad_norm": 0.2565155029296875, "kl": 3.689453125, "learning_rate": 5e-05, "loss": 0.0358, "num_tokens": 29825925.0, "reward": 6.772705078125, "reward_std": 0.39558783173561096, "rewards/helpfulness_reward/mean": 6.772705078125, "rewards/helpfulness_reward/std": 0.7336819171905518, "rewards/safety_reward/mean": 8.268310546875, "rewards/safety_reward/std": 0.5402068495750427, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 123.2578125, "completions/mean_terminated_length": 123.2578125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.46354030215701686, "frac_reward_zero_std": 0.0, "grad_norm": 0.2708018124103546, "kl": 3.6640625, "learning_rate": 5e-05, "loss": 0.027, "num_tokens": 29847102.0, "reward": 6.57037353515625, "reward_std": 0.5649454593658447, "rewards/helpfulness_reward/mean": 6.57037353515625, "rewards/helpfulness_reward/std": 0.8719714283943176, "rewards/safety_reward/mean": 8.059814453125, "rewards/safety_reward/std": 0.7201985120773315, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 123.2578125, "completions/mean_terminated_length": 123.2578125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.4638896166273688, "frac_reward_zero_std": 0.0, "grad_norm": 0.30228909850120544, "kl": 3.875, "learning_rate": 5e-05, "loss": 0.0318, "num_tokens": 29868191.0, "reward": 6.8016357421875, "reward_std": 0.4058375954627991, "rewards/helpfulness_reward/mean": 6.8016357421875, "rewards/helpfulness_reward/std": 0.9191351532936096, "rewards/safety_reward/mean": 8.24462890625, "rewards/safety_reward/std": 0.7337070107460022, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 123.1015625, "completions/mean_terminated_length": 123.1015625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.46423893109772074, "frac_reward_zero_std": 0.0, "grad_norm": 0.2433619499206543, "kl": 3.775390625, "learning_rate": 5e-05, "loss": 0.0303, "num_tokens": 29888988.0, "reward": 7.0009765625, "reward_std": 0.26905378699302673, "rewards/helpfulness_reward/mean": 7.0009765625, "rewards/helpfulness_reward/std": 0.5172340273857117, "rewards/safety_reward/mean": 8.32421875, "rewards/safety_reward/std": 0.539073646068573, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 124.0078125, "completions/mean_terminated_length": 124.0078125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.46458824556807266, "frac_reward_zero_std": 0.0, "grad_norm": 0.2670438587665558, "kl": 3.779296875, "learning_rate": 5e-05, "loss": 0.0407, "num_tokens": 29908397.0, "reward": 6.962890625, "reward_std": 0.3393658399581909, "rewards/helpfulness_reward/mean": 6.962890625, "rewards/helpfulness_reward/std": 0.6868705749511719, "rewards/safety_reward/mean": 8.260498046875, "rewards/safety_reward/std": 0.6344666481018066, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 122.9765625, "completions/mean_terminated_length": 122.9765625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.46493756003842457, "frac_reward_zero_std": 0.0, "grad_norm": 0.23702548444271088, "kl": 3.83984375, "learning_rate": 5e-05, "loss": 0.0176, "num_tokens": 29928042.0, "reward": 6.81512451171875, "reward_std": 0.7752572894096375, "rewards/helpfulness_reward/mean": 6.81512451171875, "rewards/helpfulness_reward/std": 1.2117260694503784, "rewards/safety_reward/mean": 8.203857421875, "rewards/safety_reward/std": 0.9681740999221802, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 124.0078125, "completions/mean_terminated_length": 124.0078125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.46528687450877654, "frac_reward_zero_std": 0.0, "grad_norm": 0.22457267343997955, "kl": 3.69921875, "learning_rate": 5e-05, "loss": 0.0305, "num_tokens": 29948899.0, "reward": 6.94024658203125, "reward_std": 0.45633482933044434, "rewards/helpfulness_reward/mean": 6.94024658203125, "rewards/helpfulness_reward/std": 0.8762572407722473, "rewards/safety_reward/mean": 8.291259765625, "rewards/safety_reward/std": 0.5559401512145996, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.7109375, "completions/mean_terminated_length": 123.7109375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.46563618897912845, "frac_reward_zero_std": 0.0, "grad_norm": 0.2711069881916046, "kl": 3.859375, "learning_rate": 5e-05, "loss": 0.037, "num_tokens": 29969062.0, "reward": 6.918212890625, "reward_std": 0.21860015392303467, "rewards/helpfulness_reward/mean": 6.918212890625, "rewards/helpfulness_reward/std": 0.5990442037582397, "rewards/safety_reward/mean": 8.324462890625, "rewards/safety_reward/std": 0.571439266204834, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.5703125, "completions/mean_terminated_length": 123.5703125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.4659855034494804, "frac_reward_zero_std": 0.0, "grad_norm": 0.2503884434700012, "kl": 3.828125, "learning_rate": 5e-05, "loss": 0.0254, "num_tokens": 29990007.0, "reward": 6.885009765625, "reward_std": 0.5862501263618469, "rewards/helpfulness_reward/mean": 6.885009765625, "rewards/helpfulness_reward/std": 0.9616531133651733, "rewards/safety_reward/mean": 8.147705078125, "rewards/safety_reward/std": 0.8179962038993835, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.7890625, "completions/mean_terminated_length": 123.7890625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.46633481791983233, "frac_reward_zero_std": 0.0, "grad_norm": 0.27244916558265686, "kl": 3.607421875, "learning_rate": 5e-05, "loss": 0.0382, "num_tokens": 30009852.0, "reward": 7.1097412109375, "reward_std": 0.3140352964401245, "rewards/helpfulness_reward/mean": 7.1097412109375, "rewards/helpfulness_reward/std": 0.6635834574699402, "rewards/safety_reward/mean": 8.447509765625, "rewards/safety_reward/std": 0.6524240374565125, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 123.5078125, "completions/mean_terminated_length": 123.5078125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.46668413239018425, "frac_reward_zero_std": 0.0, "grad_norm": 0.2015158236026764, "kl": 3.76953125, "learning_rate": 5e-05, "loss": 0.0319, "num_tokens": 30029509.0, "reward": 7.219482421875, "reward_std": 0.4198615550994873, "rewards/helpfulness_reward/mean": 7.219482421875, "rewards/helpfulness_reward/std": 0.6999418139457703, "rewards/safety_reward/mean": 8.45361328125, "rewards/safety_reward/std": 0.5916285514831543, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 123.8828125, "completions/mean_terminated_length": 123.8828125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.4670334468605362, "frac_reward_zero_std": 0.0, "grad_norm": 0.33083248138427734, "kl": 3.908203125, "learning_rate": 5e-05, "loss": 0.0377, "num_tokens": 30049726.0, "reward": 6.966552734375, "reward_std": 0.2595941722393036, "rewards/helpfulness_reward/mean": 6.966552734375, "rewards/helpfulness_reward/std": 0.4769570827484131, "rewards/safety_reward/mean": 8.3017578125, "rewards/safety_reward/std": 0.6186368465423584, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.0, "completions/mean_terminated_length": 123.0, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.46738276133088813, "frac_reward_zero_std": 0.0, "grad_norm": 0.2956068813800812, "kl": 3.61328125, "learning_rate": 5e-05, "loss": 0.0253, "num_tokens": 30070886.0, "reward": 6.838134765625, "reward_std": 0.3084554672241211, "rewards/helpfulness_reward/mean": 6.838134765625, "rewards/helpfulness_reward/std": 0.6320764422416687, "rewards/safety_reward/mean": 8.294677734375, "rewards/safety_reward/std": 0.41914689540863037, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.765625, "completions/mean_terminated_length": 123.765625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.46773207580124004, "frac_reward_zero_std": 0.0, "grad_norm": 0.23682019114494324, "kl": 3.865234375, "learning_rate": 5e-05, "loss": 0.0343, "num_tokens": 30092256.0, "reward": 6.91656494140625, "reward_std": 0.29888930916786194, "rewards/helpfulness_reward/mean": 6.91656494140625, "rewards/helpfulness_reward/std": 0.7837686538696289, "rewards/safety_reward/mean": 8.13671875, "rewards/safety_reward/std": 0.673465371131897, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 124.1640625, "completions/mean_terminated_length": 124.1640625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.468081390271592, "frac_reward_zero_std": 0.0, "grad_norm": 0.2590554356575012, "kl": 3.736328125, "learning_rate": 5e-05, "loss": 0.0371, "num_tokens": 30113309.0, "reward": 6.8572998046875, "reward_std": 0.28747403621673584, "rewards/helpfulness_reward/mean": 6.8572998046875, "rewards/helpfulness_reward/std": 0.6949654221534729, "rewards/safety_reward/mean": 8.28857421875, "rewards/safety_reward/std": 0.5678941607475281, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 123.6484375, "completions/mean_terminated_length": 123.6484375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4684307047419439, "frac_reward_zero_std": 0.0, "grad_norm": 0.25384822487831116, "kl": 4.025390625, "learning_rate": 5e-05, "loss": 0.0385, "num_tokens": 30133472.0, "reward": 7.105224609375, "reward_std": 0.23879697918891907, "rewards/helpfulness_reward/mean": 7.105224609375, "rewards/helpfulness_reward/std": 0.5677639842033386, "rewards/safety_reward/mean": 8.4755859375, "rewards/safety_reward/std": 0.42806434631347656, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 124.1171875, "completions/mean_terminated_length": 124.1171875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.4687800192122959, "frac_reward_zero_std": 0.0, "grad_norm": 0.21427670121192932, "kl": 3.859375, "learning_rate": 5e-05, "loss": 0.0402, "num_tokens": 30153303.0, "reward": 7.1904296875, "reward_std": 0.21718528866767883, "rewards/helpfulness_reward/mean": 7.1904296875, "rewards/helpfulness_reward/std": 0.5765475034713745, "rewards/safety_reward/mean": 8.55517578125, "rewards/safety_reward/std": 0.39783424139022827, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 123.6171875, "completions/mean_terminated_length": 123.6171875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.4691293336826478, "frac_reward_zero_std": 0.0, "grad_norm": 0.2642286419868469, "kl": 3.91015625, "learning_rate": 5e-05, "loss": 0.0348, "num_tokens": 30174230.0, "reward": 6.876220703125, "reward_std": 0.21390590071678162, "rewards/helpfulness_reward/mean": 6.876220703125, "rewards/helpfulness_reward/std": 0.4793888032436371, "rewards/safety_reward/mean": 8.383056640625, "rewards/safety_reward/std": 0.45951777696609497, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 142.9453125, "completions/mean_terminated_length": 142.9453125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.4694786481529997, "frac_reward_zero_std": 0.0, "grad_norm": 0.2748671770095825, "kl": 3.478515625, "learning_rate": 5e-05, "loss": 0.045, "num_tokens": 30198887.0, "reward": 6.581718444824219, "reward_std": 0.2671365439891815, "rewards/helpfulness_reward/mean": 6.581718444824219, "rewards/helpfulness_reward/std": 1.6358816623687744, "rewards/safety_reward/mean": 7.8230743408203125, "rewards/safety_reward/std": 2.104809045791626, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 123.703125, "completions/mean_terminated_length": 123.703125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.4698279626233517, "frac_reward_zero_std": 0.0, "grad_norm": 0.24597276747226715, "kl": 3.908203125, "learning_rate": 5e-05, "loss": 0.0386, "num_tokens": 30219249.0, "reward": 7.09130859375, "reward_std": 0.33163902163505554, "rewards/helpfulness_reward/mean": 7.09130859375, "rewards/helpfulness_reward/std": 0.5984212160110474, "rewards/safety_reward/mean": 8.26708984375, "rewards/safety_reward/std": 0.5879502892494202, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 124.0859375, "completions/mean_terminated_length": 124.0859375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4701772770937036, "frac_reward_zero_std": 0.0, "grad_norm": 0.22964921593666077, "kl": 3.880859375, "learning_rate": 5e-05, "loss": 0.0367, "num_tokens": 30239116.0, "reward": 7.132568359375, "reward_std": 0.20353403687477112, "rewards/helpfulness_reward/mean": 7.132568359375, "rewards/helpfulness_reward/std": 0.4855937957763672, "rewards/safety_reward/mean": 8.396728515625, "rewards/safety_reward/std": 0.4352013170719147, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 124.2265625, "completions/mean_terminated_length": 124.2265625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.4705265915640555, "frac_reward_zero_std": 0.0, "grad_norm": 0.2725902199745178, "kl": 3.826171875, "learning_rate": 5e-05, "loss": 0.0419, "num_tokens": 30258953.0, "reward": 7.214599609375, "reward_std": 0.21588560938835144, "rewards/helpfulness_reward/mean": 7.214599609375, "rewards/helpfulness_reward/std": 0.5287090539932251, "rewards/safety_reward/mean": 8.41015625, "rewards/safety_reward/std": 0.4443363547325134, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 123.7109375, "completions/mean_terminated_length": 123.7109375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.4708759060344075, "frac_reward_zero_std": 0.0, "grad_norm": 0.4511606693267822, "kl": 4.15625, "learning_rate": 5e-05, "loss": 0.042, "num_tokens": 30278356.0, "reward": 6.911865234375, "reward_std": 0.2267380952835083, "rewards/helpfulness_reward/mean": 6.911865234375, "rewards/helpfulness_reward/std": 0.407194048166275, "rewards/safety_reward/mean": 8.265869140625, "rewards/safety_reward/std": 0.4647549092769623, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 124.25, "completions/mean_terminated_length": 124.25, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.4712252205047594, "frac_reward_zero_std": 0.0, "grad_norm": 0.20626144111156464, "kl": 3.77734375, "learning_rate": 5e-05, "loss": 0.038, "num_tokens": 30298036.0, "reward": 7.28955078125, "reward_std": 0.23209819197654724, "rewards/helpfulness_reward/mean": 7.28955078125, "rewards/helpfulness_reward/std": 0.44083523750305176, "rewards/safety_reward/mean": 8.470703125, "rewards/safety_reward/std": 0.5393267869949341, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 124.03125, "completions/mean_terminated_length": 124.03125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.47157453497511137, "frac_reward_zero_std": 0.0, "grad_norm": 0.2315271496772766, "kl": 3.8203125, "learning_rate": 5e-05, "loss": 0.0321, "num_tokens": 30319216.0, "reward": 7.075927734375, "reward_std": 0.25590988993644714, "rewards/helpfulness_reward/mean": 7.075927734375, "rewards/helpfulness_reward/std": 0.8712807893753052, "rewards/safety_reward/mean": 8.46142578125, "rewards/safety_reward/std": 0.8214554190635681, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 124.09375, "completions/mean_terminated_length": 124.09375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.4719238494454633, "frac_reward_zero_std": 0.0, "grad_norm": 0.24083106219768524, "kl": 3.892578125, "learning_rate": 5e-05, "loss": 0.0375, "num_tokens": 30339076.0, "reward": 7.232666015625, "reward_std": 0.24015170335769653, "rewards/helpfulness_reward/mean": 7.232666015625, "rewards/helpfulness_reward/std": 0.713222324848175, "rewards/safety_reward/mean": 8.490966796875, "rewards/safety_reward/std": 0.6233880519866943, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 124.7265625, "completions/mean_terminated_length": 124.7265625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.4722731639158152, "frac_reward_zero_std": 0.0, "grad_norm": 0.2042216807603836, "kl": 3.751953125, "learning_rate": 5e-05, "loss": 0.0398, "num_tokens": 30359713.0, "reward": 7.132568359375, "reward_std": 0.29549115896224976, "rewards/helpfulness_reward/mean": 7.132568359375, "rewards/helpfulness_reward/std": 0.6090073585510254, "rewards/safety_reward/mean": 8.103759765625, "rewards/safety_reward/std": 0.6721159219741821, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.6875, "completions/mean_terminated_length": 123.6875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.47262247838616717, "frac_reward_zero_std": 0.0, "grad_norm": 0.26178687810897827, "kl": 3.900390625, "learning_rate": 5e-05, "loss": 0.0319, "num_tokens": 30379977.0, "reward": 6.821533203125, "reward_std": 0.21879532933235168, "rewards/helpfulness_reward/mean": 6.821533203125, "rewards/helpfulness_reward/std": 0.872871994972229, "rewards/safety_reward/mean": 8.0535888671875, "rewards/safety_reward/std": 1.1425871849060059, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 124.546875, "completions/mean_terminated_length": 124.546875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.4729717928565191, "frac_reward_zero_std": 0.0, "grad_norm": 0.2810583710670471, "kl": 3.822265625, "learning_rate": 5e-05, "loss": 0.0395, "num_tokens": 30399855.0, "reward": 7.0185546875, "reward_std": 0.2027127742767334, "rewards/helpfulness_reward/mean": 7.0185546875, "rewards/helpfulness_reward/std": 0.3656443953514099, "rewards/safety_reward/mean": 8.114501953125, "rewards/safety_reward/std": 0.43897873163223267, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 124.4375, "completions/mean_terminated_length": 124.4375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.473321107326871, "frac_reward_zero_std": 0.0, "grad_norm": 0.3105400502681732, "kl": 3.775390625, "learning_rate": 5e-05, "loss": 0.0363, "num_tokens": 30419951.0, "reward": 6.9669189453125, "reward_std": 0.27578502893447876, "rewards/helpfulness_reward/mean": 6.9669189453125, "rewards/helpfulness_reward/std": 0.5702442526817322, "rewards/safety_reward/mean": 8.332275390625, "rewards/safety_reward/std": 0.5945830345153809, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 124.6796875, "completions/mean_terminated_length": 124.6796875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.47367042179722296, "frac_reward_zero_std": 0.0, "grad_norm": 0.2647165060043335, "kl": 4.068359375, "learning_rate": 5e-05, "loss": 0.0409, "num_tokens": 30439702.0, "reward": 7.1610107421875, "reward_std": 0.27842777967453003, "rewards/helpfulness_reward/mean": 7.1610107421875, "rewards/helpfulness_reward/std": 0.6203213334083557, "rewards/safety_reward/mean": 8.312255859375, "rewards/safety_reward/std": 0.5700701475143433, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 124.640625, "completions/mean_terminated_length": 124.640625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.4740197362675749, "frac_reward_zero_std": 0.0, "grad_norm": 0.2564045488834381, "kl": 4.005859375, "learning_rate": 5e-05, "loss": 0.0379, "num_tokens": 30459392.0, "reward": 7.119384765625, "reward_std": 0.30803972482681274, "rewards/helpfulness_reward/mean": 7.119384765625, "rewards/helpfulness_reward/std": 0.49931415915489197, "rewards/safety_reward/mean": 8.489990234375, "rewards/safety_reward/std": 0.47925296425819397, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 125.171875, "completions/mean_terminated_length": 125.171875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.47436905073792685, "frac_reward_zero_std": 0.0, "grad_norm": 0.25903138518333435, "kl": 3.876953125, "learning_rate": 5e-05, "loss": 0.0371, "num_tokens": 30481766.0, "reward": 6.8116455078125, "reward_std": 0.28884631395339966, "rewards/helpfulness_reward/mean": 6.8116455078125, "rewards/helpfulness_reward/std": 0.6465951204299927, "rewards/safety_reward/mean": 8.275390625, "rewards/safety_reward/std": 0.6648092865943909, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 127.6640625, "completions/mean_terminated_length": 127.6640625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.47471836520827876, "frac_reward_zero_std": 0.0, "grad_norm": 0.2635326087474823, "kl": 3.634765625, "learning_rate": 5e-05, "loss": 0.0645, "num_tokens": 30503787.0, "reward": 6.909210205078125, "reward_std": 0.2681763470172882, "rewards/helpfulness_reward/mean": 6.909210205078125, "rewards/helpfulness_reward/std": 1.2609349489212036, "rewards/safety_reward/mean": 8.11407470703125, "rewards/safety_reward/std": 1.4183546304702759, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 125.09375, "completions/mean_terminated_length": 125.09375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.47506767967863067, "frac_reward_zero_std": 0.0, "grad_norm": 0.23807580769062042, "kl": 3.748046875, "learning_rate": 5e-05, "loss": 0.0428, "num_tokens": 30525007.0, "reward": 6.916015625, "reward_std": 0.25919753313064575, "rewards/helpfulness_reward/mean": 6.916015625, "rewards/helpfulness_reward/std": 0.8115791082382202, "rewards/safety_reward/mean": 8.315185546875, "rewards/safety_reward/std": 0.5819053649902344, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 125.265625, "completions/mean_terminated_length": 125.265625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.47541699414898264, "frac_reward_zero_std": 0.0, "grad_norm": 0.2511480152606964, "kl": 3.857421875, "learning_rate": 5e-05, "loss": 0.0419, "num_tokens": 30545625.0, "reward": 7.093505859375, "reward_std": 0.26621443033218384, "rewards/helpfulness_reward/mean": 7.093505859375, "rewards/helpfulness_reward/std": 0.6315295696258545, "rewards/safety_reward/mean": 8.374267578125, "rewards/safety_reward/std": 0.5616170763969421, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 125.1328125, "completions/mean_terminated_length": 125.1328125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.47576630861933455, "frac_reward_zero_std": 0.0, "grad_norm": 0.21001090109348297, "kl": 3.779296875, "learning_rate": 5e-05, "loss": 0.0406, "num_tokens": 30565554.0, "reward": 7.10009765625, "reward_std": 0.2946227490901947, "rewards/helpfulness_reward/mean": 7.10009765625, "rewards/helpfulness_reward/std": 0.5417395830154419, "rewards/safety_reward/mean": 8.376220703125, "rewards/safety_reward/std": 0.4298652410507202, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 125.359375, "completions/mean_terminated_length": 125.359375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.47611562308968647, "frac_reward_zero_std": 0.0, "grad_norm": 0.25922608375549316, "kl": 4.01171875, "learning_rate": 5e-05, "loss": 0.0458, "num_tokens": 30585176.0, "reward": 7.021240234375, "reward_std": 0.25043731927871704, "rewards/helpfulness_reward/mean": 7.021240234375, "rewards/helpfulness_reward/std": 0.6831344962120056, "rewards/safety_reward/mean": 8.130615234375, "rewards/safety_reward/std": 0.6683931350708008, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 125.1953125, "completions/mean_terminated_length": 125.1953125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.47646493756003844, "frac_reward_zero_std": 0.0, "grad_norm": 0.2565097510814667, "kl": 3.947265625, "learning_rate": 5e-05, "loss": 0.0435, "num_tokens": 30605593.0, "reward": 6.8311767578125, "reward_std": 0.33252689242362976, "rewards/helpfulness_reward/mean": 6.8311767578125, "rewards/helpfulness_reward/std": 0.7717459797859192, "rewards/safety_reward/mean": 8.337646484375, "rewards/safety_reward/std": 0.6335543394088745, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 125.03125, "completions/mean_terminated_length": 125.03125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.47681425203039035, "frac_reward_zero_std": 0.0, "grad_norm": 2.577789306640625, "kl": 4.4375, "learning_rate": 5e-05, "loss": 0.0421, "num_tokens": 30625885.0, "reward": 6.9593505859375, "reward_std": 0.3305606245994568, "rewards/helpfulness_reward/mean": 6.9593505859375, "rewards/helpfulness_reward/std": 0.6272270083427429, "rewards/safety_reward/mean": 8.259521484375, "rewards/safety_reward/std": 0.5034263730049133, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 124.5859375, "completions/mean_terminated_length": 124.5859375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.4771635665007423, "frac_reward_zero_std": 0.0, "grad_norm": 0.22217296063899994, "kl": 3.875, "learning_rate": 5e-05, "loss": 0.0407, "num_tokens": 30649208.0, "reward": 7.0059814453125, "reward_std": 0.3178804814815521, "rewards/helpfulness_reward/mean": 7.0059814453125, "rewards/helpfulness_reward/std": 0.8656593561172485, "rewards/safety_reward/mean": 8.26953125, "rewards/safety_reward/std": 0.6612169146537781, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 124.9296875, "completions/mean_terminated_length": 124.9296875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.47751288097109423, "frac_reward_zero_std": 0.0, "grad_norm": 0.2932836711406708, "kl": 4.037109375, "learning_rate": 5e-05, "loss": 0.0413, "num_tokens": 30669847.0, "reward": 7.0062255859375, "reward_std": 0.406585693359375, "rewards/helpfulness_reward/mean": 7.0062255859375, "rewards/helpfulness_reward/std": 0.8228639960289001, "rewards/safety_reward/mean": 8.33935546875, "rewards/safety_reward/std": 0.5567528605461121, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 124.203125, "completions/mean_terminated_length": 124.203125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.47786219544144615, "frac_reward_zero_std": 0.0, "grad_norm": 0.29642170667648315, "kl": 3.83203125, "learning_rate": 5e-05, "loss": 0.0303, "num_tokens": 30690209.0, "reward": 7.179931640625, "reward_std": 0.20451262593269348, "rewards/helpfulness_reward/mean": 7.179931640625, "rewards/helpfulness_reward/std": 0.44052571058273315, "rewards/safety_reward/mean": 8.566162109375, "rewards/safety_reward/std": 0.3828055262565613, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 125.1015625, "completions/mean_terminated_length": 125.1015625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.4782115099117981, "frac_reward_zero_std": 0.0, "grad_norm": 0.2773381769657135, "kl": 3.873046875, "learning_rate": 5e-05, "loss": 0.0446, "num_tokens": 30710454.0, "reward": 7.00439453125, "reward_std": 0.3335106670856476, "rewards/helpfulness_reward/mean": 7.00439453125, "rewards/helpfulness_reward/std": 0.718249499797821, "rewards/safety_reward/mean": 8.28173828125, "rewards/safety_reward/std": 0.7006941437721252, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 124.34375, "completions/mean_terminated_length": 124.34375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.47856082438215003, "frac_reward_zero_std": 0.0, "grad_norm": 0.2522553503513336, "kl": 3.90234375, "learning_rate": 5e-05, "loss": 0.0374, "num_tokens": 30730866.0, "reward": 6.97607421875, "reward_std": 0.3170294761657715, "rewards/helpfulness_reward/mean": 6.97607421875, "rewards/helpfulness_reward/std": 0.5065922737121582, "rewards/safety_reward/mean": 8.403076171875, "rewards/safety_reward/std": 0.45012781023979187, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1792.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 152.484375, "completions/mean_terminated_length": 126.4603271484375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.47891013885250194, "frac_reward_zero_std": 0.0, "grad_norm": 0.38144800066947937, "kl": 3.701171875, "learning_rate": 5e-05, "loss": 0.151, "num_tokens": 30757872.0, "reward": 6.7149658203125, "reward_std": 0.428478479385376, "rewards/helpfulness_reward/mean": 6.7149658203125, "rewards/helpfulness_reward/std": 1.6896330118179321, "rewards/safety_reward/mean": 7.9609375, "rewards/safety_reward/std": 1.8357805013656616, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 124.609375, "completions/mean_terminated_length": 124.609375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.4792594533228539, "frac_reward_zero_std": 0.0, "grad_norm": 0.2680182456970215, "kl": 3.826171875, "learning_rate": 5e-05, "loss": 0.0405, "num_tokens": 30778494.0, "reward": 6.9356689453125, "reward_std": 0.4035825729370117, "rewards/helpfulness_reward/mean": 6.9356689453125, "rewards/helpfulness_reward/std": 0.6579048037528992, "rewards/safety_reward/mean": 8.375244140625, "rewards/safety_reward/std": 0.5810651779174805, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 124.5703125, "completions/mean_terminated_length": 124.5703125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4796087677932058, "frac_reward_zero_std": 0.0, "grad_norm": 0.2790914475917816, "kl": 3.88671875, "learning_rate": 5e-05, "loss": 0.0327, "num_tokens": 30798879.0, "reward": 7.076904296875, "reward_std": 0.29448896646499634, "rewards/helpfulness_reward/mean": 7.076904296875, "rewards/helpfulness_reward/std": 0.5099270343780518, "rewards/safety_reward/mean": 8.27734375, "rewards/safety_reward/std": 0.4888804256916046, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 124.78125, "completions/mean_terminated_length": 124.78125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.4799580822635578, "frac_reward_zero_std": 0.0, "grad_norm": 0.3137427270412445, "kl": 4.126953125, "learning_rate": 5e-05, "loss": 0.0395, "num_tokens": 30818603.0, "reward": 7.110595703125, "reward_std": 0.3136868476867676, "rewards/helpfulness_reward/mean": 7.110595703125, "rewards/helpfulness_reward/std": 0.6151775121688843, "rewards/safety_reward/mean": 8.302978515625, "rewards/safety_reward/std": 0.7158024907112122, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 124.0390625, "completions/mean_terminated_length": 124.0390625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.4803073967339097, "frac_reward_zero_std": 0.0, "grad_norm": 0.2843286693096161, "kl": 4.01953125, "learning_rate": 5e-05, "loss": 0.0379, "num_tokens": 30838144.0, "reward": 7.13916015625, "reward_std": 0.33175885677337646, "rewards/helpfulness_reward/mean": 7.13916015625, "rewards/helpfulness_reward/std": 0.5059601068496704, "rewards/safety_reward/mean": 8.2685546875, "rewards/safety_reward/std": 0.5249149799346924, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 124.7890625, "completions/mean_terminated_length": 124.7890625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.4806567112042616, "frac_reward_zero_std": 0.0, "grad_norm": 0.2734219431877136, "kl": 3.966796875, "learning_rate": 5e-05, "loss": 0.0412, "num_tokens": 30858325.0, "reward": 7.0751953125, "reward_std": 0.35772112011909485, "rewards/helpfulness_reward/mean": 7.0751953125, "rewards/helpfulness_reward/std": 0.6761709451675415, "rewards/safety_reward/mean": 8.33984375, "rewards/safety_reward/std": 0.6580811738967896, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 124.234375, "completions/mean_terminated_length": 124.234375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4810060256746136, "frac_reward_zero_std": 0.0, "grad_norm": 0.2719162404537201, "kl": 3.84765625, "learning_rate": 5e-05, "loss": 0.0365, "num_tokens": 30878251.0, "reward": 7.00244140625, "reward_std": 0.1888502985239029, "rewards/helpfulness_reward/mean": 7.00244140625, "rewards/helpfulness_reward/std": 0.5028769373893738, "rewards/safety_reward/mean": 8.369873046875, "rewards/safety_reward/std": 0.42696428298950195, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 123.953125, "completions/mean_terminated_length": 123.953125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.4813553401449655, "frac_reward_zero_std": 0.0, "grad_norm": 0.7182046175003052, "kl": 4.20703125, "learning_rate": 5e-05, "loss": 0.0283, "num_tokens": 30898277.0, "reward": 7.06884765625, "reward_std": 0.3390803337097168, "rewards/helpfulness_reward/mean": 7.06884765625, "rewards/helpfulness_reward/std": 0.6395856738090515, "rewards/safety_reward/mean": 8.392333984375, "rewards/safety_reward/std": 0.45091238617897034, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 124.6171875, "completions/mean_terminated_length": 124.6171875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.4817046546153174, "frac_reward_zero_std": 0.0, "grad_norm": 0.2856948673725128, "kl": 3.91015625, "learning_rate": 5e-05, "loss": 0.0345, "num_tokens": 30918252.0, "reward": 6.8675537109375, "reward_std": 0.40537160634994507, "rewards/helpfulness_reward/mean": 6.8675537109375, "rewards/helpfulness_reward/std": 0.7672615051269531, "rewards/safety_reward/mean": 8.24169921875, "rewards/safety_reward/std": 0.6286126375198364, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 124.015625, "completions/mean_terminated_length": 124.015625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4820539690856694, "frac_reward_zero_std": 0.0, "grad_norm": 0.2584080994129181, "kl": 3.998046875, "learning_rate": 5e-05, "loss": 0.033, "num_tokens": 30937758.0, "reward": 7.077392578125, "reward_std": 0.29476428031921387, "rewards/helpfulness_reward/mean": 7.077392578125, "rewards/helpfulness_reward/std": 0.4954884648323059, "rewards/safety_reward/mean": 8.3115234375, "rewards/safety_reward/std": 0.504896342754364, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 124.5234375, "completions/mean_terminated_length": 124.5234375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.4824032835560213, "frac_reward_zero_std": 0.0, "grad_norm": 0.2839701771736145, "kl": 3.982421875, "learning_rate": 5e-05, "loss": 0.0426, "num_tokens": 30957985.0, "reward": 7.045166015625, "reward_std": 0.28882670402526855, "rewards/helpfulness_reward/mean": 7.045166015625, "rewards/helpfulness_reward/std": 0.5690077543258667, "rewards/safety_reward/mean": 8.31494140625, "rewards/safety_reward/std": 0.39932215213775635, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 124.328125, "completions/mean_terminated_length": 124.328125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.48275259802637327, "frac_reward_zero_std": 0.0, "grad_norm": 0.26278313994407654, "kl": 3.76171875, "learning_rate": 5e-05, "loss": 0.034, "num_tokens": 30978651.0, "reward": 7.024169921875, "reward_std": 0.28463777899742126, "rewards/helpfulness_reward/mean": 7.024169921875, "rewards/helpfulness_reward/std": 0.5232978463172913, "rewards/safety_reward/mean": 8.21337890625, "rewards/safety_reward/std": 0.5398734211921692, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 124.8671875, "completions/mean_terminated_length": 124.8671875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.4831019124967252, "frac_reward_zero_std": 0.0, "grad_norm": 0.4365592300891876, "kl": 4.1015625, "learning_rate": 5e-05, "loss": 0.0449, "num_tokens": 30998618.0, "reward": 6.782470703125, "reward_std": 0.30661165714263916, "rewards/helpfulness_reward/mean": 6.782470703125, "rewards/helpfulness_reward/std": 0.6802973747253418, "rewards/safety_reward/mean": 8.000244140625, "rewards/safety_reward/std": 0.7528600692749023, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 135.4921875, "completions/mean_terminated_length": 135.4921875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.4834512269670771, "frac_reward_zero_std": 0.0, "grad_norm": 0.23025964200496674, "kl": 3.447265625, "learning_rate": 5e-05, "loss": 0.0722, "num_tokens": 31022209.0, "reward": 6.651185989379883, "reward_std": 0.3912205696105957, "rewards/helpfulness_reward/mean": 6.651185989379883, "rewards/helpfulness_reward/std": 1.3204084634780884, "rewards/safety_reward/mean": 7.955352783203125, "rewards/safety_reward/std": 1.4717488288879395, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 140.6171875, "completions/mean_terminated_length": 140.6171875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.48380054143742907, "frac_reward_zero_std": 0.0, "grad_norm": 0.31727418303489685, "kl": 3.609375, "learning_rate": 5e-05, "loss": 0.0515, "num_tokens": 31045784.0, "reward": 6.574563980102539, "reward_std": 0.4329647719860077, "rewards/helpfulness_reward/mean": 6.574563980102539, "rewards/helpfulness_reward/std": 1.9245601892471313, "rewards/safety_reward/mean": 7.75469970703125, "rewards/safety_reward/std": 2.394477128982544, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 124.171875, "completions/mean_terminated_length": 124.171875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.484149855907781, "frac_reward_zero_std": 0.0, "grad_norm": 0.24799808859825134, "kl": 3.845703125, "learning_rate": 5e-05, "loss": 0.0331, "num_tokens": 31067286.0, "reward": 6.981689453125, "reward_std": 0.34260523319244385, "rewards/helpfulness_reward/mean": 6.981689453125, "rewards/helpfulness_reward/std": 0.6084613800048828, "rewards/safety_reward/mean": 8.26171875, "rewards/safety_reward/std": 0.6260940432548523, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.546875, "completions/mean_terminated_length": 123.546875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.4844991703781329, "frac_reward_zero_std": 0.0, "grad_norm": 0.2662702202796936, "kl": 4.013671875, "learning_rate": 5e-05, "loss": 0.0337, "num_tokens": 31088644.0, "reward": 6.7828369140625, "reward_std": 0.4129350185394287, "rewards/helpfulness_reward/mean": 6.7828369140625, "rewards/helpfulness_reward/std": 0.6139144897460938, "rewards/safety_reward/mean": 8.071044921875, "rewards/safety_reward/std": 0.6444764733314514, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 124.1015625, "completions/mean_terminated_length": 124.1015625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.48484848484848486, "frac_reward_zero_std": 0.0, "grad_norm": 0.25688785314559937, "kl": 3.91796875, "learning_rate": 5e-05, "loss": 0.0372, "num_tokens": 31111017.0, "reward": 6.7960205078125, "reward_std": 0.4479483962059021, "rewards/helpfulness_reward/mean": 6.7960205078125, "rewards/helpfulness_reward/std": 0.6836790442466736, "rewards/safety_reward/mean": 8.161865234375, "rewards/safety_reward/std": 0.7287469506263733, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 124.6015625, "completions/mean_terminated_length": 124.6015625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.4851977993188368, "frac_reward_zero_std": 0.0, "grad_norm": 0.2965444326400757, "kl": 3.93359375, "learning_rate": 5e-05, "loss": 0.0359, "num_tokens": 31131206.0, "reward": 6.948486328125, "reward_std": 0.28695860505104065, "rewards/helpfulness_reward/mean": 6.948486328125, "rewards/helpfulness_reward/std": 0.5946958065032959, "rewards/safety_reward/mean": 8.232421875, "rewards/safety_reward/std": 0.4202473759651184, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 124.1875, "completions/mean_terminated_length": 124.1875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.48554711378918874, "frac_reward_zero_std": 0.0, "grad_norm": 0.235796257853508, "kl": 3.966796875, "learning_rate": 5e-05, "loss": 0.0401, "num_tokens": 31151350.0, "reward": 6.9185791015625, "reward_std": 0.3129566013813019, "rewards/helpfulness_reward/mean": 6.9185791015625, "rewards/helpfulness_reward/std": 0.649520993232727, "rewards/safety_reward/mean": 8.284912109375, "rewards/safety_reward/std": 0.600947380065918, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 124.0703125, "completions/mean_terminated_length": 124.0703125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.48589642825954066, "frac_reward_zero_std": 0.0, "grad_norm": 0.23828056454658508, "kl": 3.8203125, "learning_rate": 5e-05, "loss": 0.0334, "num_tokens": 31171527.0, "reward": 6.994140625, "reward_std": 0.32799768447875977, "rewards/helpfulness_reward/mean": 6.994140625, "rewards/helpfulness_reward/std": 0.5730736255645752, "rewards/safety_reward/mean": 8.322021484375, "rewards/safety_reward/std": 0.45153865218162537, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 124.3828125, "completions/mean_terminated_length": 124.3828125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.48624574272989257, "frac_reward_zero_std": 0.0, "grad_norm": 0.2580499053001404, "kl": 3.822265625, "learning_rate": 5e-05, "loss": 0.0358, "num_tokens": 31193056.0, "reward": 6.916748046875, "reward_std": 0.4362908899784088, "rewards/helpfulness_reward/mean": 6.916748046875, "rewards/helpfulness_reward/std": 0.7840871214866638, "rewards/safety_reward/mean": 8.220703125, "rewards/safety_reward/std": 0.6571545004844666, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 124.1015625, "completions/mean_terminated_length": 124.1015625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.48659505720024454, "frac_reward_zero_std": 0.0, "grad_norm": 0.22159141302108765, "kl": 3.8828125, "learning_rate": 5e-05, "loss": 0.034, "num_tokens": 31212933.0, "reward": 6.7435302734375, "reward_std": 0.4160325527191162, "rewards/helpfulness_reward/mean": 6.7435302734375, "rewards/helpfulness_reward/std": 0.7253196239471436, "rewards/safety_reward/mean": 8.29345703125, "rewards/safety_reward/std": 0.6010704040527344, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 124.1328125, "completions/mean_terminated_length": 124.1328125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.48694437167059645, "frac_reward_zero_std": 0.0, "grad_norm": 0.24752074480056763, "kl": 3.71875, "learning_rate": 5e-05, "loss": 0.0304, "num_tokens": 31234374.0, "reward": 7.0294189453125, "reward_std": 0.42559242248535156, "rewards/helpfulness_reward/mean": 7.0294189453125, "rewards/helpfulness_reward/std": 0.7462835311889648, "rewards/safety_reward/mean": 8.333984375, "rewards/safety_reward/std": 0.5961055755615234, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 124.453125, "completions/mean_terminated_length": 124.453125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.48729368614094837, "frac_reward_zero_std": 0.0, "grad_norm": 0.3138134181499481, "kl": 4.017578125, "learning_rate": 5e-05, "loss": 0.0324, "num_tokens": 31255888.0, "reward": 6.941162109375, "reward_std": 0.5071414709091187, "rewards/helpfulness_reward/mean": 6.941162109375, "rewards/helpfulness_reward/std": 0.9242636561393738, "rewards/safety_reward/mean": 8.224853515625, "rewards/safety_reward/std": 0.8205241560935974, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 123.7734375, "completions/mean_terminated_length": 123.7734375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.48764300061130034, "frac_reward_zero_std": 0.0, "grad_norm": 0.278585821390152, "kl": 3.92578125, "learning_rate": 5e-05, "loss": 0.022, "num_tokens": 31275299.0, "reward": 6.98291015625, "reward_std": 0.6091130375862122, "rewards/helpfulness_reward/mean": 6.98291015625, "rewards/helpfulness_reward/std": 0.8891667127609253, "rewards/safety_reward/mean": 8.296875, "rewards/safety_reward/std": 0.7081934213638306, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 124.59375, "completions/mean_terminated_length": 124.59375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.48799231508165225, "frac_reward_zero_std": 0.0, "grad_norm": 0.2527458369731903, "kl": 3.9375, "learning_rate": 5e-05, "loss": 0.0339, "num_tokens": 31295135.0, "reward": 7.11376953125, "reward_std": 0.3856995701789856, "rewards/helpfulness_reward/mean": 7.11376953125, "rewards/helpfulness_reward/std": 0.786849856376648, "rewards/safety_reward/mean": 8.458740234375, "rewards/safety_reward/std": 0.5512592792510986, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 124.640625, "completions/mean_terminated_length": 124.640625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.4883416295520042, "frac_reward_zero_std": 0.0, "grad_norm": 0.28243154287338257, "kl": 3.91796875, "learning_rate": 5e-05, "loss": 0.0396, "num_tokens": 31315577.0, "reward": 7.098876953125, "reward_std": 0.2089935541152954, "rewards/helpfulness_reward/mean": 7.098876953125, "rewards/helpfulness_reward/std": 0.5384567379951477, "rewards/safety_reward/mean": 8.440673828125, "rewards/safety_reward/std": 0.3925878703594208, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 123.3359375, "completions/mean_terminated_length": 123.3359375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.48869094402235613, "frac_reward_zero_std": 0.0, "grad_norm": 0.29432904720306396, "kl": 4.080078125, "learning_rate": 5e-05, "loss": 0.0339, "num_tokens": 31337724.0, "reward": 6.6307373046875, "reward_std": 0.3020572066307068, "rewards/helpfulness_reward/mean": 6.6307373046875, "rewards/helpfulness_reward/std": 0.8763256669044495, "rewards/safety_reward/mean": 7.977294921875, "rewards/safety_reward/std": 0.9470130205154419, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 124.7734375, "completions/mean_terminated_length": 124.7734375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.48904025849270805, "frac_reward_zero_std": 0.0, "grad_norm": 0.23934026062488556, "kl": 3.962890625, "learning_rate": 5e-05, "loss": 0.0382, "num_tokens": 31357415.0, "reward": 7.037841796875, "reward_std": 0.38781288266181946, "rewards/helpfulness_reward/mean": 7.037841796875, "rewards/helpfulness_reward/std": 0.7443077564239502, "rewards/safety_reward/mean": 8.44287109375, "rewards/safety_reward/std": 0.6349855661392212, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 124.5703125, "completions/mean_terminated_length": 124.5703125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.48938957296306, "frac_reward_zero_std": 0.0, "grad_norm": 0.21812111139297485, "kl": 3.908203125, "learning_rate": 5e-05, "loss": 0.0402, "num_tokens": 31377928.0, "reward": 7.005615234375, "reward_std": 0.26168379187583923, "rewards/helpfulness_reward/mean": 7.005615234375, "rewards/helpfulness_reward/std": 0.6042499542236328, "rewards/safety_reward/mean": 8.456298828125, "rewards/safety_reward/std": 0.4505631923675537, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.921875, "completions/mean_terminated_length": 123.921875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.48973888743341193, "frac_reward_zero_std": 0.0, "grad_norm": 0.2743241786956787, "kl": 3.97265625, "learning_rate": 5e-05, "loss": 0.0353, "num_tokens": 31398366.0, "reward": 6.9150390625, "reward_std": 0.3783436119556427, "rewards/helpfulness_reward/mean": 6.9150390625, "rewards/helpfulness_reward/std": 0.8120071291923523, "rewards/safety_reward/mean": 8.339111328125, "rewards/safety_reward/std": 0.6943897604942322, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 124.5625, "completions/mean_terminated_length": 124.5625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.49008820190376384, "frac_reward_zero_std": 0.0, "grad_norm": 0.3133423924446106, "kl": 3.826171875, "learning_rate": 5e-05, "loss": 0.0397, "num_tokens": 31418438.0, "reward": 7.175048828125, "reward_std": 0.2560729384422302, "rewards/helpfulness_reward/mean": 7.175048828125, "rewards/helpfulness_reward/std": 0.5682982802391052, "rewards/safety_reward/mean": 8.499267578125, "rewards/safety_reward/std": 0.4061168134212494, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 124.5859375, "completions/mean_terminated_length": 124.5859375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.4904375163741158, "frac_reward_zero_std": 0.0, "grad_norm": 0.21173793077468872, "kl": 3.8671875, "learning_rate": 5e-05, "loss": 0.0389, "num_tokens": 31439593.0, "reward": 6.988037109375, "reward_std": 0.30153602361679077, "rewards/helpfulness_reward/mean": 6.988037109375, "rewards/helpfulness_reward/std": 0.8159198760986328, "rewards/safety_reward/mean": 8.24072265625, "rewards/safety_reward/std": 0.6781685948371887, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 124.1796875, "completions/mean_terminated_length": 124.1796875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.4907868308444677, "frac_reward_zero_std": 0.0, "grad_norm": 0.5873265862464905, "kl": 4.109375, "learning_rate": 5e-05, "loss": 0.0379, "num_tokens": 31459616.0, "reward": 7.042724609375, "reward_std": 0.2432328760623932, "rewards/helpfulness_reward/mean": 7.042724609375, "rewards/helpfulness_reward/std": 0.5244523882865906, "rewards/safety_reward/mean": 8.4453125, "rewards/safety_reward/std": 0.4386409521102905, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 124.7890625, "completions/mean_terminated_length": 124.7890625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.49113614531481964, "frac_reward_zero_std": 0.0, "grad_norm": 0.22270216047763824, "kl": 3.953125, "learning_rate": 5e-05, "loss": 0.043, "num_tokens": 31480773.0, "reward": 6.92041015625, "reward_std": 0.3027998208999634, "rewards/helpfulness_reward/mean": 6.92041015625, "rewards/helpfulness_reward/std": 0.6619591116905212, "rewards/safety_reward/mean": 8.341552734375, "rewards/safety_reward/std": 0.7359053492546082, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 124.484375, "completions/mean_terminated_length": 124.484375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.4914854597851716, "frac_reward_zero_std": 0.0, "grad_norm": 0.27114441990852356, "kl": 3.861328125, "learning_rate": 5e-05, "loss": 0.0418, "num_tokens": 31500483.0, "reward": 7.05078125, "reward_std": 0.3236759901046753, "rewards/helpfulness_reward/mean": 7.05078125, "rewards/helpfulness_reward/std": 0.5682801008224487, "rewards/safety_reward/mean": 8.212890625, "rewards/safety_reward/std": 0.587621808052063, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 124.5625, "completions/mean_terminated_length": 124.5625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4918347742555235, "frac_reward_zero_std": 0.0, "grad_norm": 0.2608587145805359, "kl": 3.904296875, "learning_rate": 5e-05, "loss": 0.0409, "num_tokens": 31520803.0, "reward": 7.23876953125, "reward_std": 0.1956290751695633, "rewards/helpfulness_reward/mean": 7.23876953125, "rewards/helpfulness_reward/std": 0.5300942063331604, "rewards/safety_reward/mean": 8.4169921875, "rewards/safety_reward/std": 0.5035619735717773, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 124.1484375, "completions/mean_terminated_length": 124.1484375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.4921840887258755, "frac_reward_zero_std": 0.0, "grad_norm": 0.3000038266181946, "kl": 4.046875, "learning_rate": 5e-05, "loss": 0.0326, "num_tokens": 31540190.0, "reward": 6.979248046875, "reward_std": 0.3784061074256897, "rewards/helpfulness_reward/mean": 6.979248046875, "rewards/helpfulness_reward/std": 0.6568095684051514, "rewards/safety_reward/mean": 8.242431640625, "rewards/safety_reward/std": 0.5599343776702881, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.578125, "completions/mean_terminated_length": 123.578125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.4925334031962274, "frac_reward_zero_std": 0.0, "grad_norm": 0.23874060809612274, "kl": 3.921875, "learning_rate": 5e-05, "loss": 0.0362, "num_tokens": 31561504.0, "reward": 6.994140625, "reward_std": 0.22904309630393982, "rewards/helpfulness_reward/mean": 6.994140625, "rewards/helpfulness_reward/std": 1.1483389139175415, "rewards/safety_reward/mean": 8.278564453125, "rewards/safety_reward/std": 1.095601201057434, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 123.75, "completions/mean_terminated_length": 123.75, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4928827176665793, "frac_reward_zero_std": 0.0, "grad_norm": 0.23207499086856842, "kl": 4.052734375, "learning_rate": 5e-05, "loss": 0.0359, "num_tokens": 31582296.0, "reward": 6.9251708984375, "reward_std": 0.2634502649307251, "rewards/helpfulness_reward/mean": 6.9251708984375, "rewards/helpfulness_reward/std": 0.6945677399635315, "rewards/safety_reward/mean": 8.43798828125, "rewards/safety_reward/std": 0.5304893255233765, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 124.2265625, "completions/mean_terminated_length": 124.2265625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.4932320321369313, "frac_reward_zero_std": 0.0, "grad_norm": 0.23193040490150452, "kl": 4.0546875, "learning_rate": 5e-05, "loss": 0.039, "num_tokens": 31601941.0, "reward": 6.95068359375, "reward_std": 0.20122143626213074, "rewards/helpfulness_reward/mean": 6.95068359375, "rewards/helpfulness_reward/std": 0.538953959941864, "rewards/safety_reward/mean": 8.163330078125, "rewards/safety_reward/std": 0.5156598687171936, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.875, "completions/mean_terminated_length": 123.875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.4935813466072832, "frac_reward_zero_std": 0.0, "grad_norm": 0.27773281931877136, "kl": 4.0546875, "learning_rate": 5e-05, "loss": 0.033, "num_tokens": 31621805.0, "reward": 7.27734375, "reward_std": 0.3670138418674469, "rewards/helpfulness_reward/mean": 7.27734375, "rewards/helpfulness_reward/std": 0.6847767233848572, "rewards/safety_reward/mean": 8.479248046875, "rewards/safety_reward/std": 0.5074232816696167, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 124.1875, "completions/mean_terminated_length": 124.1875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.4939306610776351, "frac_reward_zero_std": 0.0, "grad_norm": 0.2043972909450531, "kl": 3.71484375, "learning_rate": 5e-05, "loss": 0.0333, "num_tokens": 31642229.0, "reward": 7.125732421875, "reward_std": 0.1959557831287384, "rewards/helpfulness_reward/mean": 7.125732421875, "rewards/helpfulness_reward/std": 0.4833514094352722, "rewards/safety_reward/mean": 8.264892578125, "rewards/safety_reward/std": 0.5567560791969299, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 124.03125, "completions/mean_terminated_length": 124.03125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.4942799755479871, "frac_reward_zero_std": 0.0, "grad_norm": 0.23976895213127136, "kl": 3.82421875, "learning_rate": 5e-05, "loss": 0.0316, "num_tokens": 31662305.0, "reward": 7.15087890625, "reward_std": 0.20642493665218353, "rewards/helpfulness_reward/mean": 7.15087890625, "rewards/helpfulness_reward/std": 0.5584955215454102, "rewards/safety_reward/mean": 8.37548828125, "rewards/safety_reward/std": 0.5541537404060364, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 124.4140625, "completions/mean_terminated_length": 124.4140625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.494629290018339, "frac_reward_zero_std": 0.0, "grad_norm": 1.0375018119812012, "kl": 4.564453125, "learning_rate": 5e-05, "loss": 0.0449, "num_tokens": 31682870.0, "reward": 7.255859375, "reward_std": 0.2547474503517151, "rewards/helpfulness_reward/mean": 7.255859375, "rewards/helpfulness_reward/std": 0.5410776138305664, "rewards/safety_reward/mean": 8.7060546875, "rewards/safety_reward/std": 0.3863011300563812, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 123.9765625, "completions/mean_terminated_length": 123.9765625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.49497860448869097, "frac_reward_zero_std": 0.0, "grad_norm": 0.24341721832752228, "kl": 3.765625, "learning_rate": 5e-05, "loss": 0.036, "num_tokens": 31702739.0, "reward": 7.06298828125, "reward_std": 0.20267005264759064, "rewards/helpfulness_reward/mean": 7.06298828125, "rewards/helpfulness_reward/std": 0.6036203503608704, "rewards/safety_reward/mean": 8.25341796875, "rewards/safety_reward/std": 0.4780207574367523, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 124.2109375, "completions/mean_terminated_length": 124.2109375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.4953279189590429, "frac_reward_zero_std": 0.0, "grad_norm": 0.23024742305278778, "kl": 3.65234375, "learning_rate": 5e-05, "loss": 0.0362, "num_tokens": 31723766.0, "reward": 7.146240234375, "reward_std": 0.2520865797996521, "rewards/helpfulness_reward/mean": 7.146240234375, "rewards/helpfulness_reward/std": 0.6404216289520264, "rewards/safety_reward/mean": 8.385986328125, "rewards/safety_reward/std": 0.5315406918525696, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 124.1640625, "completions/mean_terminated_length": 124.1640625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4956772334293948, "frac_reward_zero_std": 0.0, "grad_norm": 0.1848979890346527, "kl": 3.83984375, "learning_rate": 5e-05, "loss": 0.0382, "num_tokens": 31743651.0, "reward": 7.19189453125, "reward_std": 0.24222469329833984, "rewards/helpfulness_reward/mean": 7.19189453125, "rewards/helpfulness_reward/std": 0.4203934669494629, "rewards/safety_reward/mean": 8.26708984375, "rewards/safety_reward/std": 0.30484873056411743, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 124.3046875, "completions/mean_terminated_length": 124.3046875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.49602654789974676, "frac_reward_zero_std": 0.0, "grad_norm": 0.37058016657829285, "kl": 4.109375, "learning_rate": 5e-05, "loss": 0.0412, "num_tokens": 31764586.0, "reward": 7.18798828125, "reward_std": 0.23633429408073425, "rewards/helpfulness_reward/mean": 7.18798828125, "rewards/helpfulness_reward/std": 0.5057035684585571, "rewards/safety_reward/mean": 8.412841796875, "rewards/safety_reward/std": 0.5455344319343567, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 124.015625, "completions/mean_terminated_length": 124.015625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.4963758623700987, "frac_reward_zero_std": 0.0, "grad_norm": 0.24180279672145844, "kl": 3.86328125, "learning_rate": 5e-05, "loss": 0.0404, "num_tokens": 31787548.0, "reward": 6.875732421875, "reward_std": 0.3108929991722107, "rewards/helpfulness_reward/mean": 6.875732421875, "rewards/helpfulness_reward/std": 0.7405745983123779, "rewards/safety_reward/mean": 8.123291015625, "rewards/safety_reward/std": 0.8444678783416748, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 124.1171875, "completions/mean_terminated_length": 124.1171875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4967251768404506, "frac_reward_zero_std": 0.0, "grad_norm": 0.25468364357948303, "kl": 3.8984375, "learning_rate": 5e-05, "loss": 0.0404, "num_tokens": 31807419.0, "reward": 7.09423828125, "reward_std": 0.30613356828689575, "rewards/helpfulness_reward/mean": 7.09423828125, "rewards/helpfulness_reward/std": 0.6843159794807434, "rewards/safety_reward/mean": 8.39697265625, "rewards/safety_reward/std": 0.6687143445014954, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 127.6640625, "completions/mean_terminated_length": 127.6640625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.49707449131080256, "frac_reward_zero_std": 0.0, "grad_norm": 0.25659674406051636, "kl": 3.791015625, "learning_rate": 5e-05, "loss": 0.0421, "num_tokens": 31832648.0, "reward": 6.664741516113281, "reward_std": 0.44986480474472046, "rewards/helpfulness_reward/mean": 6.664741516113281, "rewards/helpfulness_reward/std": 1.2902467250823975, "rewards/safety_reward/mean": 8.041473388671875, "rewards/safety_reward/std": 1.4182440042495728, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 124.4921875, "completions/mean_terminated_length": 124.4921875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.49742380578115447, "frac_reward_zero_std": 0.0, "grad_norm": 0.24020670354366302, "kl": 3.671875, "learning_rate": 5e-05, "loss": 0.0423, "num_tokens": 31853407.0, "reward": 6.830810546875, "reward_std": 0.2529035806655884, "rewards/helpfulness_reward/mean": 6.830810546875, "rewards/helpfulness_reward/std": 0.49431636929512024, "rewards/safety_reward/mean": 8.185791015625, "rewards/safety_reward/std": 0.38546159863471985, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 124.515625, "completions/mean_terminated_length": 124.515625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.49777312025150644, "frac_reward_zero_std": 0.0, "grad_norm": 0.3743833303451538, "kl": 3.849609375, "learning_rate": 5e-05, "loss": 0.0449, "num_tokens": 31873761.0, "reward": 7.06298828125, "reward_std": 0.2960488200187683, "rewards/helpfulness_reward/mean": 7.06298828125, "rewards/helpfulness_reward/std": 0.6231392621994019, "rewards/safety_reward/mean": 8.295166015625, "rewards/safety_reward/std": 0.47820690274238586, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 124.84375, "completions/mean_terminated_length": 124.84375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.49812243472185835, "frac_reward_zero_std": 0.0, "grad_norm": 0.2615688145160675, "kl": 3.57421875, "learning_rate": 5e-05, "loss": 0.0397, "num_tokens": 31895405.0, "reward": 6.797607421875, "reward_std": 0.40819257497787476, "rewards/helpfulness_reward/mean": 6.797607421875, "rewards/helpfulness_reward/std": 1.0791432857513428, "rewards/safety_reward/mean": 8.04754638671875, "rewards/safety_reward/std": 1.3334531784057617, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 124.15625, "completions/mean_terminated_length": 124.15625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.49847174919221027, "frac_reward_zero_std": 0.0, "grad_norm": 0.2619244158267975, "kl": 3.8046875, "learning_rate": 5e-05, "loss": 0.042, "num_tokens": 31916201.0, "reward": 6.970947265625, "reward_std": 0.3614727258682251, "rewards/helpfulness_reward/mean": 6.970947265625, "rewards/helpfulness_reward/std": 0.6838236451148987, "rewards/safety_reward/mean": 8.26220703125, "rewards/safety_reward/std": 0.6927727460861206, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 124.0703125, "completions/mean_terminated_length": 124.0703125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.49882106366256224, "frac_reward_zero_std": 0.0, "grad_norm": 0.24783998727798462, "kl": 4.15234375, "learning_rate": 5e-05, "loss": 0.0431, "num_tokens": 31935970.0, "reward": 7.046875, "reward_std": 0.28787505626678467, "rewards/helpfulness_reward/mean": 7.046875, "rewards/helpfulness_reward/std": 0.5648327469825745, "rewards/safety_reward/mean": 8.27978515625, "rewards/safety_reward/std": 0.41253018379211426, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 124.03125, "completions/mean_terminated_length": 124.03125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.49917037813291415, "frac_reward_zero_std": 0.0, "grad_norm": 0.2518630623817444, "kl": 3.80078125, "learning_rate": 5e-05, "loss": 0.0377, "num_tokens": 31956726.0, "reward": 6.798828125, "reward_std": 0.33716708421707153, "rewards/helpfulness_reward/mean": 6.798828125, "rewards/helpfulness_reward/std": 0.6294715404510498, "rewards/safety_reward/mean": 8.219482421875, "rewards/safety_reward/std": 0.6309566497802734, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 123.4921875, "completions/mean_terminated_length": 123.4921875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.49951969260326606, "frac_reward_zero_std": 0.0, "grad_norm": 0.23696109652519226, "kl": 3.890625, "learning_rate": 5e-05, "loss": 0.0334, "num_tokens": 31977197.0, "reward": 6.83154296875, "reward_std": 0.36525973677635193, "rewards/helpfulness_reward/mean": 6.83154296875, "rewards/helpfulness_reward/std": 0.5921254754066467, "rewards/safety_reward/mean": 8.21044921875, "rewards/safety_reward/std": 0.5379810929298401, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 123.5859375, "completions/mean_terminated_length": 123.5859375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.49986900707361803, "frac_reward_zero_std": 0.0, "grad_norm": 0.23072990775108337, "kl": 4.03125, "learning_rate": 5e-05, "loss": 0.0363, "num_tokens": 31997640.0, "reward": 7.1162109375, "reward_std": 0.31566867232322693, "rewards/helpfulness_reward/mean": 7.1162109375, "rewards/helpfulness_reward/std": 0.6557042002677917, "rewards/safety_reward/mean": 8.374267578125, "rewards/safety_reward/std": 0.5423085689544678, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 123.546875, "completions/mean_terminated_length": 123.546875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.50021832154397, "frac_reward_zero_std": 0.0, "grad_norm": 0.20670320093631744, "kl": 3.759765625, "learning_rate": 5e-05, "loss": 0.0336, "num_tokens": 32017486.0, "reward": 7.072021484375, "reward_std": 0.3239514231681824, "rewards/helpfulness_reward/mean": 7.072021484375, "rewards/helpfulness_reward/std": 0.533849835395813, "rewards/safety_reward/mean": 8.287841796875, "rewards/safety_reward/std": 0.5290039777755737, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 123.90625, "completions/mean_terminated_length": 123.90625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.5005676360143219, "frac_reward_zero_std": 0.0, "grad_norm": 0.26787298917770386, "kl": 3.85546875, "learning_rate": 5e-05, "loss": 0.0371, "num_tokens": 32037298.0, "reward": 7.021484375, "reward_std": 0.328640341758728, "rewards/helpfulness_reward/mean": 7.021484375, "rewards/helpfulness_reward/std": 0.6378929615020752, "rewards/safety_reward/mean": 8.14208984375, "rewards/safety_reward/std": 0.6555625200271606, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 123.3828125, "completions/mean_terminated_length": 123.3828125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.5009169504846738, "frac_reward_zero_std": 0.0, "grad_norm": 0.23157401382923126, "kl": 3.873046875, "learning_rate": 5e-05, "loss": 0.0334, "num_tokens": 32057235.0, "reward": 7.024169921875, "reward_std": 0.2708158493041992, "rewards/helpfulness_reward/mean": 7.024169921875, "rewards/helpfulness_reward/std": 0.5958019495010376, "rewards/safety_reward/mean": 8.490966796875, "rewards/safety_reward/std": 0.4189169406890869, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 124.0390625, "completions/mean_terminated_length": 124.0390625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.5012662649550258, "frac_reward_zero_std": 0.0, "grad_norm": 0.310306578874588, "kl": 3.845703125, "learning_rate": 5e-05, "loss": 0.0391, "num_tokens": 32078528.0, "reward": 6.91650390625, "reward_std": 0.23438437283039093, "rewards/helpfulness_reward/mean": 6.91650390625, "rewards/helpfulness_reward/std": 0.5843591690063477, "rewards/safety_reward/mean": 8.288818359375, "rewards/safety_reward/std": 0.5893596410751343, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 123.578125, "completions/mean_terminated_length": 123.578125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.5016155794253777, "frac_reward_zero_std": 0.0, "grad_norm": 0.2536904513835907, "kl": 3.78515625, "learning_rate": 5e-05, "loss": 0.0401, "num_tokens": 32099258.0, "reward": 6.913818359375, "reward_std": 0.3488733768463135, "rewards/helpfulness_reward/mean": 6.913818359375, "rewards/helpfulness_reward/std": 0.9258939027786255, "rewards/safety_reward/mean": 8.0927734375, "rewards/safety_reward/std": 0.9432234764099121, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 123.9921875, "completions/mean_terminated_length": 123.9921875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.5019648938957296, "frac_reward_zero_std": 0.0, "grad_norm": 0.22190611064434052, "kl": 4.10546875, "learning_rate": 5e-05, "loss": 0.0423, "num_tokens": 32119561.0, "reward": 6.9814453125, "reward_std": 0.23060473799705505, "rewards/helpfulness_reward/mean": 6.9814453125, "rewards/helpfulness_reward/std": 0.5607535243034363, "rewards/safety_reward/mean": 8.48681640625, "rewards/safety_reward/std": 0.5782145857810974, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 124.2109375, "completions/mean_terminated_length": 124.2109375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.5023142083660815, "frac_reward_zero_std": 0.0, "grad_norm": 0.263271301984787, "kl": 4.087890625, "learning_rate": 5e-05, "loss": 0.0464, "num_tokens": 32139452.0, "reward": 7.16064453125, "reward_std": 0.24794432520866394, "rewards/helpfulness_reward/mean": 7.16064453125, "rewards/helpfulness_reward/std": 0.6193361878395081, "rewards/safety_reward/mean": 8.458251953125, "rewards/safety_reward/std": 0.4490235149860382, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.6953125, "completions/mean_terminated_length": 123.6953125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.5026635228364335, "frac_reward_zero_std": 0.0, "grad_norm": 0.25036266446113586, "kl": 3.86328125, "learning_rate": 5e-05, "loss": 0.0325, "num_tokens": 32159749.0, "reward": 7.0665283203125, "reward_std": 0.3238564133644104, "rewards/helpfulness_reward/mean": 7.0665283203125, "rewards/helpfulness_reward/std": 0.5920712947845459, "rewards/safety_reward/mean": 8.425537109375, "rewards/safety_reward/std": 0.4783244729042053, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 124.375, "completions/mean_terminated_length": 124.375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.5030128373067855, "frac_reward_zero_std": 0.0, "grad_norm": 0.260174036026001, "kl": 3.826171875, "learning_rate": 5e-05, "loss": 0.0413, "num_tokens": 32179029.0, "reward": 7.24462890625, "reward_std": 0.31714847683906555, "rewards/helpfulness_reward/mean": 7.24462890625, "rewards/helpfulness_reward/std": 0.6866058707237244, "rewards/safety_reward/mean": 8.400390625, "rewards/safety_reward/std": 0.6200065016746521, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 123.71875, "completions/mean_terminated_length": 123.71875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.5033621517771374, "frac_reward_zero_std": 0.0, "grad_norm": 0.1893099844455719, "kl": 3.759765625, "learning_rate": 5e-05, "loss": 0.0325, "num_tokens": 32201249.0, "reward": 7.1396484375, "reward_std": 0.19261738657951355, "rewards/helpfulness_reward/mean": 7.1396484375, "rewards/helpfulness_reward/std": 0.6917030811309814, "rewards/safety_reward/mean": 8.37890625, "rewards/safety_reward/std": 0.6936686038970947, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 124.03125, "completions/mean_terminated_length": 124.03125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.5037114662474893, "frac_reward_zero_std": 0.0, "grad_norm": 0.2559945285320282, "kl": 3.681640625, "learning_rate": 5e-05, "loss": 0.0345, "num_tokens": 32221765.0, "reward": 7.11083984375, "reward_std": 0.417977511882782, "rewards/helpfulness_reward/mean": 7.11083984375, "rewards/helpfulness_reward/std": 0.7560768127441406, "rewards/safety_reward/mean": 8.32666015625, "rewards/safety_reward/std": 0.6710846424102783, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 123.921875, "completions/mean_terminated_length": 123.921875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.5040607807178412, "frac_reward_zero_std": 0.0, "grad_norm": 0.23919501900672913, "kl": 3.888671875, "learning_rate": 5e-05, "loss": 0.0363, "num_tokens": 32244219.0, "reward": 6.7857666015625, "reward_std": 0.3201100826263428, "rewards/helpfulness_reward/mean": 6.7857666015625, "rewards/helpfulness_reward/std": 0.9133351445198059, "rewards/safety_reward/mean": 8.1552734375, "rewards/safety_reward/std": 0.7991400361061096, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 124.09375, "completions/mean_terminated_length": 124.09375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.5044100951881931, "frac_reward_zero_std": 0.0, "grad_norm": 0.2395879477262497, "kl": 3.990234375, "learning_rate": 5e-05, "loss": 0.0382, "num_tokens": 32264943.0, "reward": 7.0223388671875, "reward_std": 0.29740917682647705, "rewards/helpfulness_reward/mean": 7.0223388671875, "rewards/helpfulness_reward/std": 0.7548076510429382, "rewards/safety_reward/mean": 8.458251953125, "rewards/safety_reward/std": 0.5977355241775513, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 124.1328125, "completions/mean_terminated_length": 124.1328125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.5047594096585452, "frac_reward_zero_std": 0.0, "grad_norm": 0.2198948860168457, "kl": 3.87109375, "learning_rate": 5e-05, "loss": 0.0388, "num_tokens": 32284728.0, "reward": 7.1949462890625, "reward_std": 0.246735617518425, "rewards/helpfulness_reward/mean": 7.1949462890625, "rewards/helpfulness_reward/std": 0.5391162633895874, "rewards/safety_reward/mean": 8.523681640625, "rewards/safety_reward/std": 0.45330944657325745, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 124.0234375, "completions/mean_terminated_length": 124.0234375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.5051087241288971, "frac_reward_zero_std": 0.0, "grad_norm": 0.2347557097673416, "kl": 3.80859375, "learning_rate": 5e-05, "loss": 0.0355, "num_tokens": 32305507.0, "reward": 6.8857421875, "reward_std": 0.25049543380737305, "rewards/helpfulness_reward/mean": 6.8857421875, "rewards/helpfulness_reward/std": 0.7163810729980469, "rewards/safety_reward/mean": 8.4404296875, "rewards/safety_reward/std": 0.44276633858680725, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.640625, "completions/mean_terminated_length": 123.640625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.505458038599249, "frac_reward_zero_std": 0.0, "grad_norm": 0.3098159730434418, "kl": 4.025390625, "learning_rate": 5e-05, "loss": 0.0323, "num_tokens": 32326085.0, "reward": 7.2154541015625, "reward_std": 0.3317933976650238, "rewards/helpfulness_reward/mean": 7.2154541015625, "rewards/helpfulness_reward/std": 0.7258233428001404, "rewards/safety_reward/mean": 8.3212890625, "rewards/safety_reward/std": 0.6909689903259277, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 123.890625, "completions/mean_terminated_length": 123.890625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.5058073530696009, "frac_reward_zero_std": 0.0, "grad_norm": 0.4535006880760193, "kl": 3.95703125, "learning_rate": 5e-05, "loss": 0.034, "num_tokens": 32347471.0, "reward": 7.1953125, "reward_std": 0.2666522264480591, "rewards/helpfulness_reward/mean": 7.1953125, "rewards/helpfulness_reward/std": 0.5890105366706848, "rewards/safety_reward/mean": 8.28515625, "rewards/safety_reward/std": 0.6162278056144714, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 124.3046875, "completions/mean_terminated_length": 124.3046875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.5061566675399528, "frac_reward_zero_std": 0.0, "grad_norm": 0.22673220932483673, "kl": 3.595703125, "learning_rate": 5e-05, "loss": 0.0371, "num_tokens": 32367694.0, "reward": 6.857666015625, "reward_std": 0.21600370109081268, "rewards/helpfulness_reward/mean": 6.857666015625, "rewards/helpfulness_reward/std": 0.5932746529579163, "rewards/safety_reward/mean": 8.214599609375, "rewards/safety_reward/std": 0.6016475558280945, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 124.328125, "completions/mean_terminated_length": 124.328125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.5065059820103047, "frac_reward_zero_std": 0.0, "grad_norm": 0.34256136417388916, "kl": 3.923828125, "learning_rate": 5e-05, "loss": 0.0401, "num_tokens": 32387560.0, "reward": 7.2001953125, "reward_std": 0.16775977611541748, "rewards/helpfulness_reward/mean": 7.2001953125, "rewards/helpfulness_reward/std": 0.35083380341529846, "rewards/safety_reward/mean": 8.124267578125, "rewards/safety_reward/std": 0.4418976306915283, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 124.25, "completions/mean_terminated_length": 124.25, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.5068552964806567, "frac_reward_zero_std": 0.0, "grad_norm": 0.24472324550151825, "kl": 3.830078125, "learning_rate": 5e-05, "loss": 0.0373, "num_tokens": 32407720.0, "reward": 7.030517578125, "reward_std": 0.15417438745498657, "rewards/helpfulness_reward/mean": 7.030517578125, "rewards/helpfulness_reward/std": 0.6989303827285767, "rewards/safety_reward/mean": 8.29296875, "rewards/safety_reward/std": 0.6805527806282043, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.578125, "completions/mean_terminated_length": 123.578125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.5072046109510087, "frac_reward_zero_std": 0.0, "grad_norm": 0.20808105170726776, "kl": 3.8046875, "learning_rate": 5e-05, "loss": 0.0299, "num_tokens": 32428346.0, "reward": 7.10546875, "reward_std": 0.20340022444725037, "rewards/helpfulness_reward/mean": 7.10546875, "rewards/helpfulness_reward/std": 0.5635243058204651, "rewards/safety_reward/mean": 8.3359375, "rewards/safety_reward/std": 0.4180137515068054, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 123.8828125, "completions/mean_terminated_length": 123.8828125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.5075539254213606, "frac_reward_zero_std": 0.0, "grad_norm": 0.17975351214408875, "kl": 3.83203125, "learning_rate": 5e-05, "loss": 0.035, "num_tokens": 32448075.0, "reward": 7.202880859375, "reward_std": 0.15995490550994873, "rewards/helpfulness_reward/mean": 7.202880859375, "rewards/helpfulness_reward/std": 0.48356661200523376, "rewards/safety_reward/mean": 8.543212890625, "rewards/safety_reward/std": 0.47566458582878113, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.7109375, "completions/mean_terminated_length": 123.7109375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.5079032398917125, "frac_reward_zero_std": 0.0, "grad_norm": 0.22118978202342987, "kl": 4.146484375, "learning_rate": 5e-05, "loss": 0.0397, "num_tokens": 32468062.0, "reward": 7.2857666015625, "reward_std": 0.32068151235580444, "rewards/helpfulness_reward/mean": 7.2857666015625, "rewards/helpfulness_reward/std": 0.6790459752082825, "rewards/safety_reward/mean": 8.4951171875, "rewards/safety_reward/std": 0.5657228827476501, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.796875, "completions/mean_terminated_length": 123.796875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.5082525543620644, "frac_reward_zero_std": 0.0, "grad_norm": 0.1856580227613449, "kl": 3.822265625, "learning_rate": 5e-05, "loss": 0.0358, "num_tokens": 32489372.0, "reward": 7.180419921875, "reward_std": 0.17519229650497437, "rewards/helpfulness_reward/mean": 7.180419921875, "rewards/helpfulness_reward/std": 0.6874911785125732, "rewards/safety_reward/mean": 8.3701171875, "rewards/safety_reward/std": 0.5648522973060608, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 123.859375, "completions/mean_terminated_length": 123.859375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.5086018688324164, "frac_reward_zero_std": 0.0, "grad_norm": 0.1752031445503235, "kl": 3.68359375, "learning_rate": 5e-05, "loss": 0.0359, "num_tokens": 32509586.0, "reward": 7.211181640625, "reward_std": 0.18980103731155396, "rewards/helpfulness_reward/mean": 7.211181640625, "rewards/helpfulness_reward/std": 0.595911979675293, "rewards/safety_reward/mean": 8.429443359375, "rewards/safety_reward/std": 0.5047639608383179, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 123.5703125, "completions/mean_terminated_length": 123.5703125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.5089511833027683, "frac_reward_zero_std": 0.0, "grad_norm": 0.22745561599731445, "kl": 3.865234375, "learning_rate": 5e-05, "loss": 0.0354, "num_tokens": 32529699.0, "reward": 7.064453125, "reward_std": 0.3326929211616516, "rewards/helpfulness_reward/mean": 7.064453125, "rewards/helpfulness_reward/std": 0.7366319298744202, "rewards/safety_reward/mean": 8.2841796875, "rewards/safety_reward/std": 0.7251399755477905, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.6796875, "completions/mean_terminated_length": 123.6796875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.5093004977731203, "frac_reward_zero_std": 0.0, "grad_norm": 0.3886975944042206, "kl": 4.08984375, "learning_rate": 5e-05, "loss": 0.0368, "num_tokens": 32549226.0, "reward": 7.179931640625, "reward_std": 0.24194703996181488, "rewards/helpfulness_reward/mean": 7.179931640625, "rewards/helpfulness_reward/std": 0.6622713208198547, "rewards/safety_reward/mean": 8.44775390625, "rewards/safety_reward/std": 0.5932914614677429, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 123.3203125, "completions/mean_terminated_length": 123.3203125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.5096498122434722, "frac_reward_zero_std": 0.0, "grad_norm": 0.20392578840255737, "kl": 3.796875, "learning_rate": 5e-05, "loss": 0.0265, "num_tokens": 32569779.0, "reward": 7.3736572265625, "reward_std": 0.2561473250389099, "rewards/helpfulness_reward/mean": 7.3736572265625, "rewards/helpfulness_reward/std": 0.6628199219703674, "rewards/safety_reward/mean": 8.501953125, "rewards/safety_reward/std": 0.5138155221939087, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 126.8671875, "completions/mean_terminated_length": 126.8671875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.5099991267138241, "frac_reward_zero_std": 0.0, "grad_norm": 1.1025716066360474, "kl": 3.828125, "learning_rate": 5e-05, "loss": 0.044, "num_tokens": 32592986.0, "reward": 6.8470306396484375, "reward_std": 0.2277841567993164, "rewards/helpfulness_reward/mean": 6.8470306396484375, "rewards/helpfulness_reward/std": 1.461776852607727, "rewards/safety_reward/mean": 8.18133544921875, "rewards/safety_reward/std": 1.5650986433029175, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 124.0390625, "completions/mean_terminated_length": 124.0390625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.5103484411841761, "frac_reward_zero_std": 0.0, "grad_norm": 0.2210046350955963, "kl": 3.791015625, "learning_rate": 5e-05, "loss": 0.0364, "num_tokens": 32613295.0, "reward": 7.2392578125, "reward_std": 0.23836834728717804, "rewards/helpfulness_reward/mean": 7.2392578125, "rewards/helpfulness_reward/std": 0.45700404047966003, "rewards/safety_reward/mean": 8.40478515625, "rewards/safety_reward/std": 0.612420916557312, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 124.1640625, "completions/mean_terminated_length": 124.1640625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.510697755654528, "frac_reward_zero_std": 0.0, "grad_norm": 0.2369532436132431, "kl": 3.91796875, "learning_rate": 5e-05, "loss": 0.0397, "num_tokens": 32634316.0, "reward": 7.205810546875, "reward_std": 0.2505693733692169, "rewards/helpfulness_reward/mean": 7.205810546875, "rewards/helpfulness_reward/std": 0.5503740906715393, "rewards/safety_reward/mean": 8.447265625, "rewards/safety_reward/std": 0.5497341752052307, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.734375, "completions/mean_terminated_length": 123.734375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.5110470701248799, "frac_reward_zero_std": 0.0, "grad_norm": 0.22885461151599884, "kl": 3.96875, "learning_rate": 5e-05, "loss": 0.0386, "num_tokens": 32655802.0, "reward": 6.927490234375, "reward_std": 0.3217088580131531, "rewards/helpfulness_reward/mean": 6.927490234375, "rewards/helpfulness_reward/std": 0.8475910425186157, "rewards/safety_reward/mean": 8.412841796875, "rewards/safety_reward/std": 0.4860038757324219, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 124.125, "completions/mean_terminated_length": 124.125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.5113963845952318, "frac_reward_zero_std": 0.0, "grad_norm": 0.20001383125782013, "kl": 3.9140625, "learning_rate": 5e-05, "loss": 0.0397, "num_tokens": 32675162.0, "reward": 7.26806640625, "reward_std": 0.2419528067111969, "rewards/helpfulness_reward/mean": 7.26806640625, "rewards/helpfulness_reward/std": 0.44820648431777954, "rewards/safety_reward/mean": 8.455078125, "rewards/safety_reward/std": 0.587700366973877, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 124.5, "completions/mean_terminated_length": 124.5, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.5117456990655838, "frac_reward_zero_std": 0.0, "grad_norm": 0.2728939652442932, "kl": 3.900390625, "learning_rate": 5e-05, "loss": 0.0431, "num_tokens": 32695018.0, "reward": 7.0146484375, "reward_std": 0.3608046770095825, "rewards/helpfulness_reward/mean": 7.0146484375, "rewards/helpfulness_reward/std": 0.6769835352897644, "rewards/safety_reward/mean": 8.19873046875, "rewards/safety_reward/std": 0.6499893069267273, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 124.484375, "completions/mean_terminated_length": 124.484375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.5120950135359357, "frac_reward_zero_std": 0.0, "grad_norm": 0.2544849216938019, "kl": 3.953125, "learning_rate": 5e-05, "loss": 0.0431, "num_tokens": 32715232.0, "reward": 6.91748046875, "reward_std": 0.1961509883403778, "rewards/helpfulness_reward/mean": 6.91748046875, "rewards/helpfulness_reward/std": 0.6370846033096313, "rewards/safety_reward/mean": 8.36669921875, "rewards/safety_reward/std": 0.5436541438102722, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 124.8125, "completions/mean_terminated_length": 124.8125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.5124443280062877, "frac_reward_zero_std": 0.0, "grad_norm": 0.2388661503791809, "kl": 3.7421875, "learning_rate": 5e-05, "loss": 0.0454, "num_tokens": 32736656.0, "reward": 6.834716796875, "reward_std": 0.4027039706707001, "rewards/helpfulness_reward/mean": 6.834716796875, "rewards/helpfulness_reward/std": 0.8233503699302673, "rewards/safety_reward/mean": 8.324462890625, "rewards/safety_reward/std": 0.6050598621368408, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 124.234375, "completions/mean_terminated_length": 124.234375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.5127936424766396, "frac_reward_zero_std": 0.0, "grad_norm": 0.2810717821121216, "kl": 3.849609375, "learning_rate": 5e-05, "loss": 0.0445, "num_tokens": 32757758.0, "reward": 7.00390625, "reward_std": 0.2993372082710266, "rewards/helpfulness_reward/mean": 7.00390625, "rewards/helpfulness_reward/std": 0.6646849513053894, "rewards/safety_reward/mean": 8.37939453125, "rewards/safety_reward/std": 0.49346327781677246, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 124.34375, "completions/mean_terminated_length": 124.34375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.5131429569469915, "frac_reward_zero_std": 0.0, "grad_norm": 0.2963526248931885, "kl": 3.87890625, "learning_rate": 5e-05, "loss": 0.0406, "num_tokens": 32777170.0, "reward": 7.288330078125, "reward_std": 0.2596050500869751, "rewards/helpfulness_reward/mean": 7.288330078125, "rewards/helpfulness_reward/std": 0.4785510003566742, "rewards/safety_reward/mean": 8.359619140625, "rewards/safety_reward/std": 0.5564249157905579, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 123.8515625, "completions/mean_terminated_length": 123.8515625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.5134922714173434, "frac_reward_zero_std": 0.0, "grad_norm": 0.2512401044368744, "kl": 4.076171875, "learning_rate": 5e-05, "loss": 0.0383, "num_tokens": 32796759.0, "reward": 6.78564453125, "reward_std": 0.3547089695930481, "rewards/helpfulness_reward/mean": 6.78564453125, "rewards/helpfulness_reward/std": 0.7305101156234741, "rewards/safety_reward/mean": 8.067626953125, "rewards/safety_reward/std": 0.5505173206329346, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 123.6015625, "completions/mean_terminated_length": 123.6015625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.5138415858876954, "frac_reward_zero_std": 0.0, "grad_norm": 0.21040232479572296, "kl": 3.767578125, "learning_rate": 5e-05, "loss": 0.035, "num_tokens": 32816956.0, "reward": 6.869873046875, "reward_std": 0.2869035005569458, "rewards/helpfulness_reward/mean": 6.869873046875, "rewards/helpfulness_reward/std": 0.7011635899543762, "rewards/safety_reward/mean": 8.344482421875, "rewards/safety_reward/std": 0.8363719582557678, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 124.0625, "completions/mean_terminated_length": 124.0625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.5141909003580474, "frac_reward_zero_std": 0.0, "grad_norm": 0.19866612553596497, "kl": 3.833984375, "learning_rate": 5e-05, "loss": 0.0402, "num_tokens": 32837196.0, "reward": 7.2529296875, "reward_std": 0.22343558073043823, "rewards/helpfulness_reward/mean": 7.2529296875, "rewards/helpfulness_reward/std": 0.563612163066864, "rewards/safety_reward/mean": 8.556884765625, "rewards/safety_reward/std": 0.5056799650192261, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 123.4921875, "completions/mean_terminated_length": 123.4921875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.5145402148283993, "frac_reward_zero_std": 0.0, "grad_norm": 0.2387045919895172, "kl": 3.927734375, "learning_rate": 5e-05, "loss": 0.0409, "num_tokens": 32857715.0, "reward": 7.096923828125, "reward_std": 0.27869725227355957, "rewards/helpfulness_reward/mean": 7.096923828125, "rewards/helpfulness_reward/std": 0.818720817565918, "rewards/safety_reward/mean": 8.2535400390625, "rewards/safety_reward/std": 0.8370275497436523, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 124.2734375, "completions/mean_terminated_length": 124.2734375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.5148895292987512, "frac_reward_zero_std": 0.0, "grad_norm": 0.35248804092407227, "kl": 3.912109375, "learning_rate": 5e-05, "loss": 0.0414, "num_tokens": 32877726.0, "reward": 7.28857421875, "reward_std": 0.1935107409954071, "rewards/helpfulness_reward/mean": 7.28857421875, "rewards/helpfulness_reward/std": 0.5900081992149353, "rewards/safety_reward/mean": 8.489990234375, "rewards/safety_reward/std": 0.5276836156845093, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 123.953125, "completions/mean_terminated_length": 123.953125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.5152388437691031, "frac_reward_zero_std": 0.0, "grad_norm": 0.5258616805076599, "kl": 4.029296875, "learning_rate": 5e-05, "loss": 0.0394, "num_tokens": 32897192.0, "reward": 7.138671875, "reward_std": 0.3065173029899597, "rewards/helpfulness_reward/mean": 7.138671875, "rewards/helpfulness_reward/std": 0.6281631588935852, "rewards/safety_reward/mean": 8.369384765625, "rewards/safety_reward/std": 0.5581144094467163, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.328125, "completions/mean_terminated_length": 123.328125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.515588158239455, "frac_reward_zero_std": 0.0, "grad_norm": 0.2307075560092926, "kl": 3.994140625, "learning_rate": 5e-05, "loss": 0.0291, "num_tokens": 32917242.0, "reward": 7.03125, "reward_std": 0.31102874875068665, "rewards/helpfulness_reward/mean": 7.03125, "rewards/helpfulness_reward/std": 0.6403864026069641, "rewards/safety_reward/mean": 8.410400390625, "rewards/safety_reward/std": 0.5878916382789612, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.1875, "completions/mean_terminated_length": 123.1875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.515937472709807, "frac_reward_zero_std": 0.0, "grad_norm": 0.19039411842823029, "kl": 3.935546875, "learning_rate": 5e-05, "loss": 0.0307, "num_tokens": 32937266.0, "reward": 7.0216064453125, "reward_std": 0.29521995782852173, "rewards/helpfulness_reward/mean": 7.0216064453125, "rewards/helpfulness_reward/std": 0.7636857032775879, "rewards/safety_reward/mean": 8.3525390625, "rewards/safety_reward/std": 0.5535005331039429, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.5, "completions/mean_terminated_length": 123.5, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.516286787180159, "frac_reward_zero_std": 0.0, "grad_norm": 0.21857130527496338, "kl": 3.7265625, "learning_rate": 5e-05, "loss": 0.0308, "num_tokens": 32959218.0, "reward": 6.78192138671875, "reward_std": 0.38451358675956726, "rewards/helpfulness_reward/mean": 6.78192138671875, "rewards/helpfulness_reward/std": 1.2641021013259888, "rewards/safety_reward/mean": 7.988525390625, "rewards/safety_reward/std": 1.0908029079437256, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 124.34375, "completions/mean_terminated_length": 124.34375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.5166361016505109, "frac_reward_zero_std": 0.0, "grad_norm": 0.2948480248451233, "kl": 3.994140625, "learning_rate": 5e-05, "loss": 0.0373, "num_tokens": 32979030.0, "reward": 7.052001953125, "reward_std": 0.2601251006126404, "rewards/helpfulness_reward/mean": 7.052001953125, "rewards/helpfulness_reward/std": 0.6323664784431458, "rewards/safety_reward/mean": 8.39599609375, "rewards/safety_reward/std": 0.49128273129463196, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 124.4296875, "completions/mean_terminated_length": 124.4296875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.5169854161208628, "frac_reward_zero_std": 0.0, "grad_norm": 0.272401362657547, "kl": 3.998046875, "learning_rate": 5e-05, "loss": 0.0384, "num_tokens": 32999837.0, "reward": 6.983154296875, "reward_std": 0.37069255113601685, "rewards/helpfulness_reward/mean": 6.983154296875, "rewards/helpfulness_reward/std": 0.6399407386779785, "rewards/safety_reward/mean": 8.3388671875, "rewards/safety_reward/std": 0.5414567589759827, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 124.2265625, "completions/mean_terminated_length": 124.2265625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.5173347305912147, "frac_reward_zero_std": 0.0, "grad_norm": 4.276041030883789, "kl": 5.806640625, "learning_rate": 5e-05, "loss": 0.0535, "num_tokens": 33020178.0, "reward": 6.950927734375, "reward_std": 0.37233850359916687, "rewards/helpfulness_reward/mean": 6.950927734375, "rewards/helpfulness_reward/std": 0.6421452760696411, "rewards/safety_reward/mean": 8.36865234375, "rewards/safety_reward/std": 0.49460920691490173, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.8515625, "completions/mean_terminated_length": 123.8515625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.5176840450615666, "frac_reward_zero_std": 0.0, "grad_norm": 0.286878377199173, "kl": 3.802734375, "learning_rate": 5e-05, "loss": 0.0367, "num_tokens": 33040559.0, "reward": 7.0478515625, "reward_std": 0.24276158213615417, "rewards/helpfulness_reward/mean": 7.0478515625, "rewards/helpfulness_reward/std": 0.5080693960189819, "rewards/safety_reward/mean": 8.333984375, "rewards/safety_reward/std": 0.4330970346927643, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 124.015625, "completions/mean_terminated_length": 124.015625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.5180333595319186, "frac_reward_zero_std": 0.0, "grad_norm": 0.2184865027666092, "kl": 3.953125, "learning_rate": 5e-05, "loss": 0.039, "num_tokens": 33061729.0, "reward": 6.87786865234375, "reward_std": 0.4576362371444702, "rewards/helpfulness_reward/mean": 6.87786865234375, "rewards/helpfulness_reward/std": 0.8152870535850525, "rewards/safety_reward/mean": 8.2392578125, "rewards/safety_reward/std": 0.6232435703277588, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 124.796875, "completions/mean_terminated_length": 124.796875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.5183826740022706, "frac_reward_zero_std": 0.0, "grad_norm": 0.23729655146598816, "kl": 3.865234375, "learning_rate": 5e-05, "loss": 0.042, "num_tokens": 33082551.0, "reward": 7.23388671875, "reward_std": 0.26521745324134827, "rewards/helpfulness_reward/mean": 7.23388671875, "rewards/helpfulness_reward/std": 0.4396737515926361, "rewards/safety_reward/mean": 8.599365234375, "rewards/safety_reward/std": 0.34493204951286316, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 123.9453125, "completions/mean_terminated_length": 123.9453125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.5187319884726225, "frac_reward_zero_std": 0.0, "grad_norm": 0.2557224929332733, "kl": 3.9375, "learning_rate": 5e-05, "loss": 0.0333, "num_tokens": 33103168.0, "reward": 6.820556640625, "reward_std": 0.337446391582489, "rewards/helpfulness_reward/mean": 6.820556640625, "rewards/helpfulness_reward/std": 0.8566157221794128, "rewards/safety_reward/mean": 8.12841796875, "rewards/safety_reward/std": 0.7836452722549438, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 124.4453125, "completions/mean_terminated_length": 124.4453125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.5190813029429744, "frac_reward_zero_std": 0.0, "grad_norm": 0.31180620193481445, "kl": 3.869140625, "learning_rate": 5e-05, "loss": 0.0381, "num_tokens": 33123529.0, "reward": 6.86083984375, "reward_std": 0.3064210116863251, "rewards/helpfulness_reward/mean": 6.86083984375, "rewards/helpfulness_reward/std": 0.6201814413070679, "rewards/safety_reward/mean": 8.4052734375, "rewards/safety_reward/std": 0.4443352520465851, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 124.1953125, "completions/mean_terminated_length": 124.1953125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.5194306174133263, "frac_reward_zero_std": 0.0, "grad_norm": 0.24026650190353394, "kl": 3.99609375, "learning_rate": 5e-05, "loss": 0.0373, "num_tokens": 33142866.0, "reward": 7.17626953125, "reward_std": 0.2559938132762909, "rewards/helpfulness_reward/mean": 7.17626953125, "rewards/helpfulness_reward/std": 0.4786396622657776, "rewards/safety_reward/mean": 8.275146484375, "rewards/safety_reward/std": 0.4080735743045807, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 124.3828125, "completions/mean_terminated_length": 124.3828125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.5197799318836783, "frac_reward_zero_std": 0.0, "grad_norm": 0.24569770693778992, "kl": 3.923828125, "learning_rate": 5e-05, "loss": 0.0382, "num_tokens": 33162563.0, "reward": 6.89208984375, "reward_std": 0.32526978850364685, "rewards/helpfulness_reward/mean": 6.89208984375, "rewards/helpfulness_reward/std": 0.6488896608352661, "rewards/safety_reward/mean": 8.325439453125, "rewards/safety_reward/std": 0.643889307975769, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 123.953125, "completions/mean_terminated_length": 123.953125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.5201292463540302, "frac_reward_zero_std": 0.0, "grad_norm": 0.25304853916168213, "kl": 3.900390625, "learning_rate": 5e-05, "loss": 0.0359, "num_tokens": 33182045.0, "reward": 7.0772705078125, "reward_std": 0.3969600796699524, "rewards/helpfulness_reward/mean": 7.0772705078125, "rewards/helpfulness_reward/std": 0.7470911145210266, "rewards/safety_reward/mean": 8.235107421875, "rewards/safety_reward/std": 0.6446584463119507, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.6953125, "completions/mean_terminated_length": 123.6953125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.5204785608243822, "frac_reward_zero_std": 0.0, "grad_norm": 0.2651577293872833, "kl": 3.8671875, "learning_rate": 5e-05, "loss": 0.0365, "num_tokens": 33202286.0, "reward": 6.899658203125, "reward_std": 0.24701546132564545, "rewards/helpfulness_reward/mean": 6.899658203125, "rewards/helpfulness_reward/std": 0.5757095813751221, "rewards/safety_reward/mean": 8.26025390625, "rewards/safety_reward/std": 0.5680379867553711, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 124.375, "completions/mean_terminated_length": 124.375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.5208278752947341, "frac_reward_zero_std": 0.0, "grad_norm": 0.35001638531684875, "kl": 4.0, "learning_rate": 5e-05, "loss": 0.0382, "num_tokens": 33221902.0, "reward": 7.005615234375, "reward_std": 0.3434785008430481, "rewards/helpfulness_reward/mean": 7.005615234375, "rewards/helpfulness_reward/std": 0.603409469127655, "rewards/safety_reward/mean": 8.373046875, "rewards/safety_reward/std": 0.5304279327392578, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 124.515625, "completions/mean_terminated_length": 124.515625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.521177189765086, "frac_reward_zero_std": 0.0, "grad_norm": 0.3699396252632141, "kl": 3.935546875, "learning_rate": 5e-05, "loss": 0.036, "num_tokens": 33242264.0, "reward": 7.1016845703125, "reward_std": 0.4336588680744171, "rewards/helpfulness_reward/mean": 7.1016845703125, "rewards/helpfulness_reward/std": 0.6896552443504333, "rewards/safety_reward/mean": 8.194580078125, "rewards/safety_reward/std": 0.5223565697669983, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 123.3046875, "completions/mean_terminated_length": 123.3046875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.521526504235438, "frac_reward_zero_std": 0.0, "grad_norm": 0.3187488913536072, "kl": 4.083984375, "learning_rate": 5e-05, "loss": 0.0321, "num_tokens": 33262495.0, "reward": 6.89013671875, "reward_std": 0.4245293438434601, "rewards/helpfulness_reward/mean": 6.89013671875, "rewards/helpfulness_reward/std": 0.8323285579681396, "rewards/safety_reward/mean": 8.30078125, "rewards/safety_reward/std": 0.5941060185432434, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.734375, "completions/mean_terminated_length": 123.734375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.5218758187057899, "frac_reward_zero_std": 0.0, "grad_norm": 0.23357897996902466, "kl": 3.927734375, "learning_rate": 5e-05, "loss": 0.0301, "num_tokens": 33282405.0, "reward": 7.062744140625, "reward_std": 0.34646034240722656, "rewards/helpfulness_reward/mean": 7.062744140625, "rewards/helpfulness_reward/std": 0.6379742622375488, "rewards/safety_reward/mean": 8.341552734375, "rewards/safety_reward/std": 0.5852147936820984, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 123.890625, "completions/mean_terminated_length": 123.890625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.5222251331761418, "frac_reward_zero_std": 0.0, "grad_norm": 0.3170640468597412, "kl": 4.046875, "learning_rate": 5e-05, "loss": 0.0377, "num_tokens": 33302367.0, "reward": 6.995849609375, "reward_std": 0.27511489391326904, "rewards/helpfulness_reward/mean": 6.995849609375, "rewards/helpfulness_reward/std": 0.7152631878852844, "rewards/safety_reward/mean": 8.363037109375, "rewards/safety_reward/std": 0.478292316198349, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 124.15625, "completions/mean_terminated_length": 124.15625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.5225744476464937, "frac_reward_zero_std": 0.0, "grad_norm": 0.25582584738731384, "kl": 4.09765625, "learning_rate": 5e-05, "loss": 0.0406, "num_tokens": 33321907.0, "reward": 7.1484375, "reward_std": 0.3192208409309387, "rewards/helpfulness_reward/mean": 7.1484375, "rewards/helpfulness_reward/std": 0.5978219509124756, "rewards/safety_reward/mean": 8.286865234375, "rewards/safety_reward/std": 0.6130430102348328, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 123.8515625, "completions/mean_terminated_length": 123.8515625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.5229237621168457, "frac_reward_zero_std": 0.0, "grad_norm": 0.21196913719177246, "kl": 3.955078125, "learning_rate": 5e-05, "loss": 0.0317, "num_tokens": 33341960.0, "reward": 7.0302734375, "reward_std": 0.2201385498046875, "rewards/helpfulness_reward/mean": 7.0302734375, "rewards/helpfulness_reward/std": 0.4473362863063812, "rewards/safety_reward/mean": 8.40771484375, "rewards/safety_reward/std": 0.3667242228984833, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 124.1484375, "completions/mean_terminated_length": 124.1484375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.5232730765871976, "frac_reward_zero_std": 0.0, "grad_norm": 0.2500542998313904, "kl": 3.873046875, "learning_rate": 5e-05, "loss": 0.037, "num_tokens": 33364227.0, "reward": 6.94189453125, "reward_std": 0.2820018529891968, "rewards/helpfulness_reward/mean": 6.94189453125, "rewards/helpfulness_reward/std": 0.6598984003067017, "rewards/safety_reward/mean": 8.361328125, "rewards/safety_reward/std": 0.5065052509307861, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 124.2421875, "completions/mean_terminated_length": 124.2421875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.5236223910575496, "frac_reward_zero_std": 0.0, "grad_norm": 0.21835115551948547, "kl": 3.890625, "learning_rate": 5e-05, "loss": 0.0363, "num_tokens": 33384538.0, "reward": 7.0858154296875, "reward_std": 0.37502336502075195, "rewards/helpfulness_reward/mean": 7.0858154296875, "rewards/helpfulness_reward/std": 0.7525537014007568, "rewards/safety_reward/mean": 8.32421875, "rewards/safety_reward/std": 0.5874026417732239, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 124.1796875, "completions/mean_terminated_length": 124.1796875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.5239717055279015, "frac_reward_zero_std": 0.0, "grad_norm": 0.2529344856739044, "kl": 3.9296875, "learning_rate": 5e-05, "loss": 0.0343, "num_tokens": 33404185.0, "reward": 7.07958984375, "reward_std": 0.3075680732727051, "rewards/helpfulness_reward/mean": 7.07958984375, "rewards/helpfulness_reward/std": 0.4846445620059967, "rewards/safety_reward/mean": 8.265625, "rewards/safety_reward/std": 0.5003997087478638, "step": 1500 } ], "logging_steps": 1, "max_steps": 2863, "num_input_tokens_seen": 33404185, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }