{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4369747899159664, "eval_steps": 500, "global_step": 260, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 2947.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3156.0, "completions/max_terminated_length": 3156.0, "completions/mean_length": 2947.5, "completions/mean_terminated_length": 2947.5, "completions/min_length": 2764.0, "completions/min_terminated_length": 2764.0, "epoch": 0.0016806722689075631, "frac_reward_zero_std": 0.0, "grad_norm": 0.22864551842212677, "kl": 0.41864094138145447, "learning_rate": 0.0, "loss": 0.0004, "num_tokens": 22462.0, "reward": 8.499990463256836, "reward_std": 0.3535599112510681, "rewards/reward_model/mean": 8.499990463256836, "rewards/reward_model/std": 0.3535599112510681, "step": 1 }, { "completion_length": 3534.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4533.0, "completions/max_terminated_length": 4533.0, "completions/mean_length": 3534.0, "completions/mean_terminated_length": 3534.0, "completions/min_length": 2558.0, "completions/min_terminated_length": 2558.0, "epoch": 0.0033613445378151263, "frac_reward_zero_std": 0.0, "grad_norm": 0.24098172783851624, "kl": 0.3827270567417145, "learning_rate": 5.0000000000000004e-08, "loss": 0.0004, "num_tokens": 47146.0, "reward": 7.874920845031738, "reward_std": 0.5204211473464966, "rewards/reward_model/mean": 7.874920845031738, "rewards/reward_model/std": 0.5204212665557861, "step": 2 }, { "completion_length": 2755.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3132.0, "completions/max_terminated_length": 3132.0, "completions/mean_length": 2755.75, "completions/mean_terminated_length": 2755.75, "completions/min_length": 2379.0, "completions/min_terminated_length": 2379.0, "epoch": 0.005042016806722689, "frac_reward_zero_std": 0.0, "grad_norm": 0.2558765113353729, "kl": 0.8935418128967285, "learning_rate": 1.0000000000000001e-07, "loss": 0.0009, "num_tokens": 66937.0, "reward": 9.562366485595703, "reward_std": 0.5153685212135315, "rewards/reward_model/mean": 9.562366485595703, "rewards/reward_model/std": 0.5153685212135315, "step": 3 }, { "completion_length": 3009.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3186.0, "completions/max_terminated_length": 3186.0, "completions/mean_length": 3009.0, "completions/mean_terminated_length": 3009.0, "completions/min_length": 2896.0, "completions/min_terminated_length": 2896.0, "epoch": 0.0067226890756302525, "frac_reward_zero_std": 0.0, "grad_norm": 0.23344005644321442, "kl": 0.4426734447479248, "learning_rate": 1.5000000000000002e-07, "loss": 0.0004, "num_tokens": 88921.0, "reward": 9.624762535095215, "reward_std": 0.43301600217819214, "rewards/reward_model/mean": 9.624762535095215, "rewards/reward_model/std": 0.43301627039909363, "step": 4 }, { "completion_length": 883.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1788.0, "completions/max_terminated_length": 1788.0, "completions/mean_length": 883.0, "completions/mean_terminated_length": 883.0, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.008403361344537815, "frac_reward_zero_std": 0.0, "grad_norm": 0.6023623943328857, "kl": 0.8841147422790527, "learning_rate": 2.0000000000000002e-07, "loss": 0.0009, "num_tokens": 102045.0, "reward": 6.124550819396973, "reward_std": 1.6142183542251587, "rewards/reward_model/mean": 6.124550819396973, "rewards/reward_model/std": 1.6142184734344482, "step": 5 }, { "completion_length": 3231.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4057.0, "completions/max_terminated_length": 4057.0, "completions/mean_length": 3231.0, "completions/mean_terminated_length": 3231.0, "completions/min_length": 2769.0, "completions/min_terminated_length": 2769.0, "epoch": 0.010084033613445379, "frac_reward_zero_std": 0.0, "grad_norm": 0.25101199746131897, "kl": 0.42379599809646606, "learning_rate": 2.5000000000000004e-07, "loss": 0.0004, "num_tokens": 124149.0, "reward": 8.374448776245117, "reward_std": 0.6291409134864807, "rewards/reward_model/mean": 8.374448776245117, "rewards/reward_model/std": 0.6291410326957703, "step": 6 }, { "completion_length": 2049.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2638.0, "completions/max_terminated_length": 2638.0, "completions/mean_length": 2049.25, "completions/mean_terminated_length": 2049.25, "completions/min_length": 1558.0, "completions/min_terminated_length": 1558.0, "epoch": 0.011764705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 0.34592828154563904, "kl": 0.3436116874217987, "learning_rate": 3.0000000000000004e-07, "loss": 0.0003, "num_tokens": 143242.0, "reward": 7.25, "reward_std": 3.4034295082092285, "rewards/reward_model/mean": 7.25, "rewards/reward_model/std": 3.4034297466278076, "step": 7 }, { "completion_length": 3440.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3849.0, "completions/max_terminated_length": 3849.0, "completions/mean_length": 3440.5, "completions/mean_terminated_length": 3440.5, "completions/min_length": 2937.0, "completions/min_terminated_length": 2937.0, "epoch": 0.013445378151260505, "frac_reward_zero_std": 0.0, "grad_norm": 0.231065571308136, "kl": 0.4249846041202545, "learning_rate": 3.5000000000000004e-07, "loss": 0.0004, "num_tokens": 167764.0, "reward": 6.87348747253418, "reward_std": 3.0664312839508057, "rewards/reward_model/mean": 6.87348747253418, "rewards/reward_model/std": 3.0664312839508057, "step": 8 }, { "completion_length": 1771.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2382.0, "completions/max_terminated_length": 2382.0, "completions/mean_length": 1771.0, "completions/mean_terminated_length": 1771.0, "completions/min_length": 1367.0, "completions/min_terminated_length": 1367.0, "epoch": 0.015126050420168067, "frac_reward_zero_std": 0.0, "grad_norm": 0.41485831141471863, "kl": 0.40487462282180786, "learning_rate": 4.0000000000000003e-07, "loss": 0.0004, "num_tokens": 183464.0, "reward": 6.125, "reward_std": 1.973786473274231, "rewards/reward_model/mean": 6.125, "rewards/reward_model/std": 1.9737865924835205, "step": 9 }, { "completion_length": 3359.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3600.0, "completions/max_terminated_length": 3600.0, "completions/mean_length": 3359.5, "completions/mean_terminated_length": 3359.5, "completions/min_length": 3058.0, "completions/min_terminated_length": 3058.0, "epoch": 0.01680672268907563, "frac_reward_zero_std": 0.0, "grad_norm": 0.2300678789615631, "kl": 0.40493836998939514, "learning_rate": 4.5000000000000003e-07, "loss": 0.0004, "num_tokens": 206786.0, "reward": 8.87358283996582, "reward_std": 0.14423945546150208, "rewards/reward_model/mean": 8.87358283996582, "rewards/reward_model/std": 0.14423947036266327, "step": 10 }, { "completion_length": 1176.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1360.0, "completions/max_terminated_length": 1360.0, "completions/mean_length": 1176.0, "completions/mean_terminated_length": 1176.0, "completions/min_length": 1029.0, "completions/min_terminated_length": 1029.0, "epoch": 0.018487394957983194, "frac_reward_zero_std": 0.0, "grad_norm": 0.5758852958679199, "kl": 0.7856903076171875, "learning_rate": 5.000000000000001e-07, "loss": 0.0008, "num_tokens": 220790.0, "reward": 7.059416770935059, "reward_std": 1.1248167753219604, "rewards/reward_model/mean": 7.059416770935059, "rewards/reward_model/std": 1.12481689453125, "step": 11 }, { "completion_length": 197.25, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 197.25, "completions/mean_terminated_length": 197.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.020168067226890758, "frac_reward_zero_std": 0.0, "grad_norm": 10.455142974853516, "kl": 1.7632454633712769, "learning_rate": 5.5e-07, "loss": 0.0018, "num_tokens": 229683.0, "reward": 6.749574661254883, "reward_std": 3.7753303050994873, "rewards/reward_model/mean": 6.749574661254883, "rewards/reward_model/std": 3.7753303050994873, "step": 12 }, { "completion_length": 3293.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3623.0, "completions/max_terminated_length": 3623.0, "completions/mean_length": 3293.5, "completions/mean_terminated_length": 3293.5, "completions/min_length": 2867.0, "completions/min_terminated_length": 2867.0, "epoch": 0.021848739495798318, "frac_reward_zero_std": 0.0, "grad_norm": 0.19915665686130524, "kl": 0.462253212928772, "learning_rate": 6.000000000000001e-07, "loss": 0.0005, "num_tokens": 251365.0, "reward": 7.060825347900391, "reward_std": 3.0564966201782227, "rewards/reward_model/mean": 7.060825347900391, "rewards/reward_model/std": 3.0564968585968018, "step": 13 }, { "completion_length": 3526.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3798.0, "completions/max_terminated_length": 3798.0, "completions/mean_length": 3526.5, "completions/mean_terminated_length": 3526.5, "completions/min_length": 3032.0, "completions/min_terminated_length": 3032.0, "epoch": 0.023529411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 0.23769311606884003, "kl": 0.39918115735054016, "learning_rate": 6.5e-07, "loss": 0.0004, "num_tokens": 275555.0, "reward": 7.184149742126465, "reward_std": 1.26445472240448, "rewards/reward_model/mean": 7.184149742126465, "rewards/reward_model/std": 1.26445472240448, "step": 14 }, { "completion_length": 3020.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3207.0, "completions/max_terminated_length": 3207.0, "completions/mean_length": 3020.25, "completions/mean_terminated_length": 3020.25, "completions/min_length": 2788.0, "completions/min_terminated_length": 2788.0, "epoch": 0.025210084033613446, "frac_reward_zero_std": 0.0, "grad_norm": 0.21623943746089935, "kl": 0.46406587958335876, "learning_rate": 7.000000000000001e-07, "loss": 0.0005, "num_tokens": 297480.0, "reward": 8.49774169921875, "reward_std": 1.0792028903961182, "rewards/reward_model/mean": 8.49774169921875, "rewards/reward_model/std": 1.0792030096054077, "step": 15 }, { "completion_length": 3226.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3620.0, "completions/max_terminated_length": 3620.0, "completions/mean_length": 3226.75, "completions/mean_terminated_length": 3226.75, "completions/min_length": 2546.0, "completions/min_terminated_length": 2546.0, "epoch": 0.02689075630252101, "frac_reward_zero_std": 0.0, "grad_norm": 0.24463801085948944, "kl": 0.49360811710357666, "learning_rate": 7.5e-07, "loss": 0.0005, "num_tokens": 321115.0, "reward": 6.808682441711426, "reward_std": 1.6747063398361206, "rewards/reward_model/mean": 6.808682441711426, "rewards/reward_model/std": 1.6747063398361206, "step": 16 }, { "completion_length": 3048.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3467.0, "completions/max_terminated_length": 3467.0, "completions/mean_length": 3048.75, "completions/mean_terminated_length": 3048.75, "completions/min_length": 2805.0, "completions/min_terminated_length": 2805.0, "epoch": 0.02857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2717757225036621, "kl": 0.4575347304344177, "learning_rate": 8.000000000000001e-07, "loss": 0.0005, "num_tokens": 344562.0, "reward": 8.246984481811523, "reward_std": 0.8661016821861267, "rewards/reward_model/mean": 8.246984481811523, "rewards/reward_model/std": 0.8661016821861267, "step": 17 }, { "completion_length": 2945.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3917.0, "completions/max_terminated_length": 3917.0, "completions/mean_length": 2945.0, "completions/mean_terminated_length": 2945.0, "completions/min_length": 2311.0, "completions/min_terminated_length": 2311.0, "epoch": 0.030252100840336135, "frac_reward_zero_std": 0.0, "grad_norm": 0.30351686477661133, "kl": 0.33760935068130493, "learning_rate": 8.500000000000001e-07, "loss": 0.0003, "num_tokens": 365458.0, "reward": 3.12009334564209, "reward_std": 3.325429677963257, "rewards/reward_model/mean": 3.12009334564209, "rewards/reward_model/std": 3.325429677963257, "step": 18 }, { "completion_length": 3071.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3501.0, "completions/max_terminated_length": 3501.0, "completions/mean_length": 3071.25, "completions/mean_terminated_length": 3071.25, "completions/min_length": 2716.0, "completions/min_terminated_length": 2716.0, "epoch": 0.031932773109243695, "frac_reward_zero_std": 0.0, "grad_norm": 0.21365933120250702, "kl": 0.44693294167518616, "learning_rate": 9.000000000000001e-07, "loss": 0.0004, "num_tokens": 388303.0, "reward": 7.433669567108154, "reward_std": 2.9736855030059814, "rewards/reward_model/mean": 7.433669567108154, "rewards/reward_model/std": 2.9736855030059814, "step": 19 }, { "completion_length": 3026.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3183.0, "completions/max_terminated_length": 3183.0, "completions/mean_length": 3026.0, "completions/mean_terminated_length": 3026.0, "completions/min_length": 2747.0, "completions/min_terminated_length": 2747.0, "epoch": 0.03361344537815126, "frac_reward_zero_std": 0.0, "grad_norm": 0.2290022075176239, "kl": 0.42688068747520447, "learning_rate": 9.500000000000001e-07, "loss": 0.0004, "num_tokens": 408863.0, "reward": 7.873417377471924, "reward_std": 1.7840297222137451, "rewards/reward_model/mean": 7.873417377471924, "rewards/reward_model/std": 1.7840297222137451, "step": 20 }, { "completion_length": 1102.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1387.0, "completions/max_terminated_length": 1387.0, "completions/mean_length": 1102.25, "completions/mean_terminated_length": 1102.25, "completions/min_length": 749.0, "completions/min_terminated_length": 749.0, "epoch": 0.03529411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 0.7857298851013184, "kl": 0.8577286005020142, "learning_rate": 1.0000000000000002e-06, "loss": 0.0009, "num_tokens": 423604.0, "reward": 6.423410415649414, "reward_std": 4.296779632568359, "rewards/reward_model/mean": 6.423410415649414, "rewards/reward_model/std": 4.296780109405518, "step": 21 }, { "completion_length": 3086.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3459.0, "completions/max_terminated_length": 3459.0, "completions/mean_length": 3086.75, "completions/mean_terminated_length": 3086.75, "completions/min_length": 2556.0, "completions/min_terminated_length": 2556.0, "epoch": 0.03697478991596639, "frac_reward_zero_std": 0.0, "grad_norm": 0.2390928417444229, "kl": 0.5277106761932373, "learning_rate": 1.0500000000000001e-06, "loss": 0.0005, "num_tokens": 446255.0, "reward": 7.993368148803711, "reward_std": 0.8412674069404602, "rewards/reward_model/mean": 7.993368148803711, "rewards/reward_model/std": 0.8412673473358154, "step": 22 }, { "completion_length": 1777.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2093.0, "completions/max_terminated_length": 2093.0, "completions/mean_length": 1777.75, "completions/mean_terminated_length": 1777.75, "completions/min_length": 1416.0, "completions/min_terminated_length": 1416.0, "epoch": 0.03865546218487395, "frac_reward_zero_std": 0.0, "grad_norm": 0.3648889660835266, "kl": 0.41789889335632324, "learning_rate": 1.1e-06, "loss": 0.0004, "num_tokens": 462262.0, "reward": 4.125, "reward_std": 1.4930394887924194, "rewards/reward_model/mean": 4.125, "rewards/reward_model/std": 1.4930394887924194, "step": 23 }, { "completion_length": 3006.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3322.0, "completions/max_terminated_length": 3322.0, "completions/mean_length": 3006.75, "completions/mean_terminated_length": 3006.75, "completions/min_length": 2838.0, "completions/min_terminated_length": 2838.0, "epoch": 0.040336134453781515, "frac_reward_zero_std": 0.0, "grad_norm": 0.24086372554302216, "kl": 0.45793622732162476, "learning_rate": 1.1500000000000002e-06, "loss": 0.0005, "num_tokens": 483125.0, "reward": 9.804398536682129, "reward_std": 0.12509065866470337, "rewards/reward_model/mean": 9.804398536682129, "rewards/reward_model/std": 0.12509065866470337, "step": 24 }, { "completion_length": 1440.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2084.0, "completions/max_terminated_length": 2084.0, "completions/mean_length": 1440.25, "completions/mean_terminated_length": 1440.25, "completions/min_length": 1091.0, "completions/min_terminated_length": 1091.0, "epoch": 0.04201680672268908, "frac_reward_zero_std": 0.0, "grad_norm": 0.49449586868286133, "kl": 0.7333781123161316, "learning_rate": 1.2000000000000002e-06, "loss": 0.0007, "num_tokens": 499202.0, "reward": 6.603575229644775, "reward_std": 2.1134190559387207, "rewards/reward_model/mean": 6.603575229644775, "rewards/reward_model/std": 2.1134190559387207, "step": 25 }, { "completion_length": 2997.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3468.0, "completions/max_terminated_length": 3468.0, "completions/mean_length": 2997.5, "completions/mean_terminated_length": 2997.5, "completions/min_length": 2723.0, "completions/min_terminated_length": 2723.0, "epoch": 0.043697478991596636, "frac_reward_zero_std": 0.0, "grad_norm": 0.22256870567798615, "kl": 0.4614196717739105, "learning_rate": 1.25e-06, "loss": 0.0005, "num_tokens": 520276.0, "reward": 7.620273590087891, "reward_std": 2.245389461517334, "rewards/reward_model/mean": 7.620273590087891, "rewards/reward_model/std": 2.245389461517334, "step": 26 }, { "completion_length": 1711.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2113.0, "completions/max_terminated_length": 2113.0, "completions/mean_length": 1711.5, "completions/mean_terminated_length": 1711.5, "completions/min_length": 1381.0, "completions/min_terminated_length": 1381.0, "epoch": 0.0453781512605042, "frac_reward_zero_std": 0.0, "grad_norm": 0.5169827938079834, "kl": 0.6935045123100281, "learning_rate": 1.3e-06, "loss": 0.0007, "num_tokens": 536426.0, "reward": 8.411685943603516, "reward_std": 0.920927107334137, "rewards/reward_model/mean": 8.411685943603516, "rewards/reward_model/std": 0.920927107334137, "step": 27 }, { "completion_length": 2390.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3224.0, "completions/max_terminated_length": 3224.0, "completions/mean_length": 2390.75, "completions/mean_terminated_length": 2390.75, "completions/min_length": 1817.0, "completions/min_terminated_length": 1817.0, "epoch": 0.047058823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 0.4216252863407135, "kl": 0.3447871506214142, "learning_rate": 1.3500000000000002e-06, "loss": 0.0003, "num_tokens": 557113.0, "reward": 6.5, "reward_std": 2.798809289932251, "rewards/reward_model/mean": 6.5, "rewards/reward_model/std": 2.798809289932251, "step": 28 }, { "completion_length": 3297.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3448.0, "completions/max_terminated_length": 3448.0, "completions/mean_length": 3297.0, "completions/mean_terminated_length": 3297.0, "completions/min_length": 3118.0, "completions/min_terminated_length": 3118.0, "epoch": 0.04873949579831933, "frac_reward_zero_std": 0.0, "grad_norm": 0.20905952155590057, "kl": 0.44418153166770935, "learning_rate": 1.4000000000000001e-06, "loss": 0.0004, "num_tokens": 581145.0, "reward": 7.86611270904541, "reward_std": 0.774669885635376, "rewards/reward_model/mean": 7.86611270904541, "rewards/reward_model/std": 0.7746700644493103, "step": 29 }, { "completion_length": 2944.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3491.0, "completions/max_terminated_length": 3491.0, "completions/mean_length": 2944.5, "completions/mean_terminated_length": 2944.5, "completions/min_length": 2070.0, "completions/min_terminated_length": 2070.0, "epoch": 0.05042016806722689, "frac_reward_zero_std": 0.0, "grad_norm": 0.21313747763633728, "kl": 0.4106956720352173, "learning_rate": 1.45e-06, "loss": 0.0004, "num_tokens": 603227.0, "reward": 8.306060791015625, "reward_std": 0.893286943435669, "rewards/reward_model/mean": 8.306060791015625, "rewards/reward_model/std": 0.893286943435669, "step": 30 }, { "completion_length": 2035.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2720.0, "completions/max_terminated_length": 2720.0, "completions/mean_length": 2035.25, "completions/mean_terminated_length": 2035.25, "completions/min_length": 1470.0, "completions/min_terminated_length": 1470.0, "epoch": 0.052100840336134456, "frac_reward_zero_std": 0.0, "grad_norm": 0.327402800321579, "kl": 0.3633970618247986, "learning_rate": 1.5e-06, "loss": 0.0004, "num_tokens": 620532.0, "reward": 6.75, "reward_std": 2.3273732662200928, "rewards/reward_model/mean": 6.75, "rewards/reward_model/std": 2.327373504638672, "step": 31 }, { "completion_length": 2783.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3118.0, "completions/max_terminated_length": 3118.0, "completions/mean_length": 2783.25, "completions/mean_terminated_length": 2783.25, "completions/min_length": 2585.0, "completions/min_terminated_length": 2585.0, "epoch": 0.05378151260504202, "frac_reward_zero_std": 0.0, "grad_norm": 0.2371380627155304, "kl": 0.5408622622489929, "learning_rate": 1.5500000000000002e-06, "loss": 0.0005, "num_tokens": 640929.0, "reward": 8.676490783691406, "reward_std": 0.23656955361366272, "rewards/reward_model/mean": 8.676490783691406, "rewards/reward_model/std": 0.23656955361366272, "step": 32 }, { "completion_length": 3145.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3334.0, "completions/max_terminated_length": 3334.0, "completions/mean_length": 3145.5, "completions/mean_terminated_length": 3145.5, "completions/min_length": 2748.0, "completions/min_terminated_length": 2748.0, "epoch": 0.05546218487394958, "frac_reward_zero_std": 0.0, "grad_norm": 0.23712949454784393, "kl": 0.4347061812877655, "learning_rate": 1.6000000000000001e-06, "loss": 0.0004, "num_tokens": 664079.0, "reward": -0.014399351552128792, "reward_std": 0.01094669010490179, "rewards/reward_model/mean": -0.014399351552128792, "rewards/reward_model/std": 0.01094669010490179, "step": 33 }, { "completion_length": 1896.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2315.0, "completions/max_terminated_length": 2315.0, "completions/mean_length": 1896.25, "completions/mean_terminated_length": 1896.25, "completions/min_length": 1386.0, "completions/min_terminated_length": 1386.0, "epoch": 0.05714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.48681309819221497, "kl": 0.40498724579811096, "learning_rate": 1.6500000000000003e-06, "loss": 0.0004, "num_tokens": 681572.0, "reward": 5.930726051330566, "reward_std": 4.415777206420898, "rewards/reward_model/mean": 5.930726051330566, "rewards/reward_model/std": 4.415777206420898, "step": 34 }, { "completion_length": 2924.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3041.0, "completions/max_terminated_length": 3041.0, "completions/mean_length": 2924.5, "completions/mean_terminated_length": 2924.5, "completions/min_length": 2808.0, "completions/min_terminated_length": 2808.0, "epoch": 0.058823529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 0.22173462808132172, "kl": 0.3978763818740845, "learning_rate": 1.7000000000000002e-06, "loss": 0.0004, "num_tokens": 702998.0, "reward": 8.046117782592773, "reward_std": 1.2211781740188599, "rewards/reward_model/mean": 8.046117782592773, "rewards/reward_model/std": 1.2211780548095703, "step": 35 }, { "completion_length": 3348.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3742.0, "completions/max_terminated_length": 3742.0, "completions/mean_length": 3348.5, "completions/mean_terminated_length": 3348.5, "completions/min_length": 3066.0, "completions/min_terminated_length": 3066.0, "epoch": 0.06050420168067227, "frac_reward_zero_std": 0.0, "grad_norm": 0.2196238934993744, "kl": 0.41847947239875793, "learning_rate": 1.75e-06, "loss": 0.0004, "num_tokens": 727604.0, "reward": 8.480682373046875, "reward_std": 0.6139712929725647, "rewards/reward_model/mean": 8.480682373046875, "rewards/reward_model/std": 0.6139712929725647, "step": 36 }, { "completion_length": 7.25, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 7.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.06218487394957983, "frac_reward_zero_std": 1.0, "grad_norm": 0.05685713142156601, "kl": 4.287390232086182, "learning_rate": 1.8000000000000001e-06, "loss": 0.0043, "num_tokens": 736097.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 37 }, { "completion_length": 3168.25, "completions/clipped_ratio": 0.0, "completions/max_length": 4236.0, "completions/max_terminated_length": 4236.0, "completions/mean_length": 3168.25, "completions/mean_terminated_length": 3168.25, "completions/min_length": 2421.0, "completions/min_terminated_length": 2421.0, "epoch": 0.06386554621848739, "frac_reward_zero_std": 0.0, "grad_norm": 0.30552753806114197, "kl": 0.274600625038147, "learning_rate": 1.85e-06, "loss": 0.0003, "num_tokens": 758170.0, "reward": 7.125, "reward_std": 1.5478479862213135, "rewards/reward_model/mean": 7.125, "rewards/reward_model/std": 1.5478479862213135, "step": 38 }, { "completion_length": 3031.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3205.0, "completions/max_terminated_length": 3205.0, "completions/mean_length": 3031.0, "completions/mean_terminated_length": 3031.0, "completions/min_length": 2839.0, "completions/min_terminated_length": 2839.0, "epoch": 0.06554621848739496, "frac_reward_zero_std": 0.0, "grad_norm": 0.2312554270029068, "kl": 0.4626840054988861, "learning_rate": 1.9000000000000002e-06, "loss": 0.0005, "num_tokens": 780238.0, "reward": 8.927217483520508, "reward_std": 1.1756690740585327, "rewards/reward_model/mean": 8.927217483520508, "rewards/reward_model/std": 1.1756691932678223, "step": 39 }, { "completion_length": 3206.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3607.0, "completions/max_terminated_length": 3607.0, "completions/mean_length": 3206.0, "completions/mean_terminated_length": 3206.0, "completions/min_length": 2840.0, "completions/min_terminated_length": 2840.0, "epoch": 0.06722689075630252, "frac_reward_zero_std": 0.0, "grad_norm": 0.27287694811820984, "kl": 0.42922842502593994, "learning_rate": 1.9500000000000004e-06, "loss": 0.0004, "num_tokens": 803578.0, "reward": 8.357658386230469, "reward_std": 1.2403758764266968, "rewards/reward_model/mean": 8.357658386230469, "rewards/reward_model/std": 1.2403758764266968, "step": 40 }, { "completion_length": 3249.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3387.0, "completions/max_terminated_length": 3387.0, "completions/mean_length": 3249.0, "completions/mean_terminated_length": 3249.0, "completions/min_length": 3084.0, "completions/min_terminated_length": 3084.0, "epoch": 0.06890756302521009, "frac_reward_zero_std": 0.0, "grad_norm": 0.21696928143501282, "kl": 0.43103882670402527, "learning_rate": 2.0000000000000003e-06, "loss": 0.0004, "num_tokens": 826502.0, "reward": 7.731931209564209, "reward_std": 0.6359418034553528, "rewards/reward_model/mean": 7.731931209564209, "rewards/reward_model/std": 0.635941743850708, "step": 41 }, { "completion_length": 3166.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3637.0, "completions/max_terminated_length": 3637.0, "completions/mean_length": 3166.0, "completions/mean_terminated_length": 3166.0, "completions/min_length": 2995.0, "completions/min_terminated_length": 2995.0, "epoch": 0.07058823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 0.2448631376028061, "kl": 0.4528976082801819, "learning_rate": 2.05e-06, "loss": 0.0005, "num_tokens": 849846.0, "reward": 8.172100067138672, "reward_std": 1.5137546062469482, "rewards/reward_model/mean": 8.172100067138672, "rewards/reward_model/std": 1.5137547254562378, "step": 42 }, { "completion_length": 1082.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1601.0, "completions/max_terminated_length": 1601.0, "completions/mean_length": 1082.5, "completions/mean_terminated_length": 1082.5, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.07226890756302522, "frac_reward_zero_std": 0.0, "grad_norm": 0.614413321018219, "kl": 0.9997281432151794, "learning_rate": 2.1000000000000002e-06, "loss": 0.001, "num_tokens": 865128.0, "reward": 5.758825302124023, "reward_std": 2.153346061706543, "rewards/reward_model/mean": 5.758825302124023, "rewards/reward_model/std": 2.153346061706543, "step": 43 }, { "completion_length": 3139.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3744.0, "completions/max_terminated_length": 3744.0, "completions/mean_length": 3139.25, "completions/mean_terminated_length": 3139.25, "completions/min_length": 2570.0, "completions/min_terminated_length": 2570.0, "epoch": 0.07394957983193277, "frac_reward_zero_std": 0.0, "grad_norm": 0.23637787997722626, "kl": 0.4812513291835785, "learning_rate": 2.15e-06, "loss": 0.0005, "num_tokens": 888913.0, "reward": 7.538500785827637, "reward_std": 1.2191288471221924, "rewards/reward_model/mean": 7.538500785827637, "rewards/reward_model/std": 1.2191288471221924, "step": 44 }, { "completion_length": 3443.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3698.0, "completions/max_terminated_length": 3698.0, "completions/mean_length": 3443.75, "completions/mean_terminated_length": 3443.75, "completions/min_length": 3105.0, "completions/min_terminated_length": 3105.0, "epoch": 0.07563025210084033, "frac_reward_zero_std": 0.0, "grad_norm": 0.19623233377933502, "kl": 0.3900257647037506, "learning_rate": 2.2e-06, "loss": 0.0004, "num_tokens": 912152.0, "reward": 8.34599781036377, "reward_std": 1.126673698425293, "rewards/reward_model/mean": 8.34599781036377, "rewards/reward_model/std": 1.1266734600067139, "step": 45 }, { "completion_length": 57.5, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 57.5, "completions/mean_terminated_length": 57.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0773109243697479, "frac_reward_zero_std": 0.0, "grad_norm": 5.90267276763916, "kl": 3.3936893939971924, "learning_rate": 2.25e-06, "loss": 0.0034, "num_tokens": 921438.0, "reward": 8.0, "reward_std": 4.0, "rewards/reward_model/mean": 8.0, "rewards/reward_model/std": 4.0, "step": 46 }, { "completion_length": 1558.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1778.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 1558.75, "completions/mean_terminated_length": 1558.75, "completions/min_length": 1377.0, "completions/min_terminated_length": 1377.0, "epoch": 0.07899159663865546, "frac_reward_zero_std": 0.0, "grad_norm": 0.5147756338119507, "kl": 0.6282870173454285, "learning_rate": 2.3000000000000004e-06, "loss": 0.0006, "num_tokens": 937877.0, "reward": 7.551456451416016, "reward_std": 1.0817488431930542, "rewards/reward_model/mean": 7.551456451416016, "rewards/reward_model/std": 1.0817489624023438, "step": 47 }, { "completion_length": 3149.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3280.0, "completions/max_terminated_length": 3280.0, "completions/mean_length": 3149.0, "completions/mean_terminated_length": 3149.0, "completions/min_length": 3075.0, "completions/min_terminated_length": 3075.0, "epoch": 0.08067226890756303, "frac_reward_zero_std": 0.0, "grad_norm": 0.21285971999168396, "kl": 0.4472464621067047, "learning_rate": 2.35e-06, "loss": 0.0004, "num_tokens": 960593.0, "reward": 8.15469741821289, "reward_std": 0.9880263209342957, "rewards/reward_model/mean": 8.15469741821289, "rewards/reward_model/std": 0.9880266189575195, "step": 48 }, { "completion_length": 2261.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3393.0, "completions/max_terminated_length": 3393.0, "completions/mean_length": 2261.5, "completions/mean_terminated_length": 2261.5, "completions/min_length": 1702.0, "completions/min_terminated_length": 1702.0, "epoch": 0.08235294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 0.4225335419178009, "kl": 0.3637184798717499, "learning_rate": 2.4000000000000003e-06, "loss": 0.0004, "num_tokens": 979171.0, "reward": 7.375, "reward_std": 2.780137777328491, "rewards/reward_model/mean": 7.375, "rewards/reward_model/std": 2.7801380157470703, "step": 49 }, { "completion_length": 3035.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3395.0, "completions/max_terminated_length": 3395.0, "completions/mean_length": 3035.0, "completions/mean_terminated_length": 3035.0, "completions/min_length": 2758.0, "completions/min_terminated_length": 2758.0, "epoch": 0.08403361344537816, "frac_reward_zero_std": 0.0, "grad_norm": 0.2415667027235031, "kl": 0.4912819564342499, "learning_rate": 2.4500000000000003e-06, "loss": 0.0005, "num_tokens": 999851.0, "reward": 8.715576171875, "reward_std": 0.45521172881126404, "rewards/reward_model/mean": 8.715576171875, "rewards/reward_model/std": 0.45521196722984314, "step": 50 }, { "completion_length": 4185.75, "completions/clipped_ratio": 0.25, "completions/max_length": 7232.0, "completions/max_terminated_length": 3633.0, "completions/mean_length": 4185.75, "completions/mean_terminated_length": 3170.33349609375, "completions/min_length": 2558.0, "completions/min_terminated_length": 2558.0, "epoch": 0.08571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.16737323999404907, "kl": 0.3826863169670105, "learning_rate": 2.5e-06, "loss": 0.0004, "num_tokens": 1026650.0, "reward": 7.64158821105957, "reward_std": 1.6326850652694702, "rewards/reward_model/mean": 7.64158821105957, "rewards/reward_model/std": 1.6326849460601807, "step": 51 }, { "completion_length": 1089.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1463.0, "completions/max_terminated_length": 1463.0, "completions/mean_length": 1089.25, "completions/mean_terminated_length": 1089.25, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.08739495798319327, "frac_reward_zero_std": 0.0, "grad_norm": 0.49092772603034973, "kl": 0.8803520798683167, "learning_rate": 2.55e-06, "loss": 0.0009, "num_tokens": 1041415.0, "reward": 7.310797691345215, "reward_std": 1.0026116371154785, "rewards/reward_model/mean": 7.310797691345215, "rewards/reward_model/std": 1.0026116371154785, "step": 52 }, { "completion_length": 3123.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3517.0, "completions/max_terminated_length": 3517.0, "completions/mean_length": 3123.25, "completions/mean_terminated_length": 3123.25, "completions/min_length": 2651.0, "completions/min_terminated_length": 2651.0, "epoch": 0.08907563025210084, "frac_reward_zero_std": 0.0, "grad_norm": 0.21318843960762024, "kl": 0.42238688468933105, "learning_rate": 2.6e-06, "loss": 0.0004, "num_tokens": 1063012.0, "reward": 9.053631782531738, "reward_std": 0.7274901270866394, "rewards/reward_model/mean": 9.053631782531738, "rewards/reward_model/std": 0.7274901270866394, "step": 53 }, { "completion_length": 2926.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3444.0, "completions/max_terminated_length": 3444.0, "completions/mean_length": 2926.0, "completions/mean_terminated_length": 2926.0, "completions/min_length": 2523.0, "completions/min_terminated_length": 2523.0, "epoch": 0.0907563025210084, "frac_reward_zero_std": 0.0, "grad_norm": 0.2534274160861969, "kl": 0.5139771103858948, "learning_rate": 2.6500000000000005e-06, "loss": 0.0005, "num_tokens": 1086172.0, "reward": 8.154108047485352, "reward_std": 0.9382317066192627, "rewards/reward_model/mean": 8.154108047485352, "rewards/reward_model/std": 0.9382315874099731, "step": 54 }, { "completion_length": 3260.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3541.0, "completions/max_terminated_length": 3541.0, "completions/mean_length": 3260.0, "completions/mean_terminated_length": 3260.0, "completions/min_length": 3041.0, "completions/min_terminated_length": 3041.0, "epoch": 0.09243697478991597, "frac_reward_zero_std": 0.0, "grad_norm": 0.219038188457489, "kl": 0.7034112215042114, "learning_rate": 2.7000000000000004e-06, "loss": 0.0007, "num_tokens": 1109548.0, "reward": 8.521720886230469, "reward_std": 1.0076866149902344, "rewards/reward_model/mean": 8.521720886230469, "rewards/reward_model/std": 1.0076866149902344, "step": 55 }, { "completion_length": 3062.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3473.0, "completions/max_terminated_length": 3473.0, "completions/mean_length": 3062.25, "completions/mean_terminated_length": 3062.25, "completions/min_length": 2484.0, "completions/min_terminated_length": 2484.0, "epoch": 0.09411764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.23915456235408783, "kl": 0.4395737946033478, "learning_rate": 2.7500000000000004e-06, "loss": 0.0004, "num_tokens": 1130981.0, "reward": 8.267683029174805, "reward_std": 0.7767911553382874, "rewards/reward_model/mean": 8.267683029174805, "rewards/reward_model/std": 0.7767910957336426, "step": 56 }, { "completion_length": 3322.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3483.0, "completions/max_terminated_length": 3483.0, "completions/mean_length": 3322.0, "completions/mean_terminated_length": 3322.0, "completions/min_length": 2998.0, "completions/min_terminated_length": 2998.0, "epoch": 0.0957983193277311, "frac_reward_zero_std": 0.0, "grad_norm": 0.20130489766597748, "kl": 0.4055849015712738, "learning_rate": 2.8000000000000003e-06, "loss": 0.0004, "num_tokens": 1154809.0, "reward": 8.213647842407227, "reward_std": 0.8999168276786804, "rewards/reward_model/mean": 8.213647842407227, "rewards/reward_model/std": 0.8999167680740356, "step": 57 }, { "completion_length": 1292.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1526.0, "completions/max_terminated_length": 1526.0, "completions/mean_length": 1292.25, "completions/mean_terminated_length": 1292.25, "completions/min_length": 1090.0, "completions/min_terminated_length": 1090.0, "epoch": 0.09747899159663866, "frac_reward_zero_std": 0.0, "grad_norm": 0.5870078802108765, "kl": 0.7187660932540894, "learning_rate": 2.85e-06, "loss": 0.0007, "num_tokens": 1169350.0, "reward": 7.023124694824219, "reward_std": 1.6435866355895996, "rewards/reward_model/mean": 7.023124694824219, "rewards/reward_model/std": 1.6435868740081787, "step": 58 }, { "completion_length": 2549.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3263.0, "completions/max_terminated_length": 3263.0, "completions/mean_length": 2549.25, "completions/mean_terminated_length": 2549.25, "completions/min_length": 2124.0, "completions/min_terminated_length": 2124.0, "epoch": 0.09915966386554621, "frac_reward_zero_std": 0.0, "grad_norm": 0.2910903990268707, "kl": 0.29700058698654175, "learning_rate": 2.9e-06, "loss": 0.0003, "num_tokens": 1189963.0, "reward": 5.5, "reward_std": 1.5811388492584229, "rewards/reward_model/mean": 5.5, "rewards/reward_model/std": 1.5811388492584229, "step": 59 }, { "completion_length": 2904.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3014.0, "completions/max_terminated_length": 3014.0, "completions/mean_length": 2904.0, "completions/mean_terminated_length": 2904.0, "completions/min_length": 2822.0, "completions/min_terminated_length": 2822.0, "epoch": 0.10084033613445378, "frac_reward_zero_std": 0.0, "grad_norm": 0.2662099599838257, "kl": 0.45383381843566895, "learning_rate": 2.95e-06, "loss": 0.0005, "num_tokens": 1212511.0, "reward": 8.575969696044922, "reward_std": 1.0328577756881714, "rewards/reward_model/mean": 8.575969696044922, "rewards/reward_model/std": 1.0328580141067505, "step": 60 }, { "completion_length": 2814.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3206.0, "completions/max_terminated_length": 3206.0, "completions/mean_length": 2814.75, "completions/mean_terminated_length": 2814.75, "completions/min_length": 2580.0, "completions/min_terminated_length": 2580.0, "epoch": 0.10252100840336134, "frac_reward_zero_std": 0.0, "grad_norm": 0.24718832969665527, "kl": 0.4411505460739136, "learning_rate": 3e-06, "loss": 0.0004, "num_tokens": 1234434.0, "reward": 8.411513328552246, "reward_std": 0.8315414786338806, "rewards/reward_model/mean": 8.411513328552246, "rewards/reward_model/std": 0.8315416574478149, "step": 61 }, { "completion_length": 3244.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3895.0, "completions/max_terminated_length": 3895.0, "completions/mean_length": 3244.5, "completions/mean_terminated_length": 3244.5, "completions/min_length": 2591.0, "completions/min_terminated_length": 2591.0, "epoch": 0.10420168067226891, "frac_reward_zero_std": 0.0, "grad_norm": 0.2538319230079651, "kl": 0.27741584181785583, "learning_rate": 3.05e-06, "loss": 0.0003, "num_tokens": 1256916.0, "reward": 6.0, "reward_std": 3.9370038509368896, "rewards/reward_model/mean": 6.0, "rewards/reward_model/std": 3.9370038509368896, "step": 62 }, { "completion_length": 2728.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2981.0, "completions/max_terminated_length": 2981.0, "completions/mean_length": 2728.75, "completions/mean_terminated_length": 2728.75, "completions/min_length": 2376.0, "completions/min_terminated_length": 2376.0, "epoch": 0.10588235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 0.25504007935523987, "kl": 0.46872344613075256, "learning_rate": 3.1000000000000004e-06, "loss": 0.0005, "num_tokens": 1276531.0, "reward": 8.205828666687012, "reward_std": 0.682498037815094, "rewards/reward_model/mean": 8.205828666687012, "rewards/reward_model/std": 0.6824979782104492, "step": 63 }, { "completion_length": 2017.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2290.0, "completions/max_terminated_length": 2290.0, "completions/mean_length": 2017.0, "completions/mean_terminated_length": 2017.0, "completions/min_length": 1666.0, "completions/min_terminated_length": 1666.0, "epoch": 0.10756302521008404, "frac_reward_zero_std": 0.0, "grad_norm": 0.3398452401161194, "kl": 0.3937748372554779, "learning_rate": 3.1500000000000003e-06, "loss": 0.0004, "num_tokens": 1294551.0, "reward": 5.75, "reward_std": 1.7078251838684082, "rewards/reward_model/mean": 5.75, "rewards/reward_model/std": 1.7078251838684082, "step": 64 }, { "completion_length": 3087.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3985.0, "completions/max_terminated_length": 3985.0, "completions/mean_length": 3087.0, "completions/mean_terminated_length": 3087.0, "completions/min_length": 2602.0, "completions/min_terminated_length": 2602.0, "epoch": 0.1092436974789916, "frac_reward_zero_std": 0.0, "grad_norm": 0.22506201267242432, "kl": 0.4327456057071686, "learning_rate": 3.2000000000000003e-06, "loss": 0.0004, "num_tokens": 1317007.0, "reward": 8.593668937683105, "reward_std": 1.6437653303146362, "rewards/reward_model/mean": 8.593668937683105, "rewards/reward_model/std": 1.6437653303146362, "step": 65 }, { "completion_length": 828.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1600.0, "completions/max_terminated_length": 1600.0, "completions/mean_length": 828.5, "completions/mean_terminated_length": 828.5, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.11092436974789915, "frac_reward_zero_std": 0.0, "grad_norm": 0.5670727491378784, "kl": 0.9830642342567444, "learning_rate": 3.2500000000000002e-06, "loss": 0.001, "num_tokens": 1330901.0, "reward": 6.8028154373168945, "reward_std": 1.10745370388031, "rewards/reward_model/mean": 6.8028154373168945, "rewards/reward_model/std": 1.10745370388031, "step": 66 }, { "completion_length": 2075.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2810.0, "completions/max_terminated_length": 2810.0, "completions/mean_length": 2075.25, "completions/mean_terminated_length": 2075.25, "completions/min_length": 1383.0, "completions/min_terminated_length": 1383.0, "epoch": 0.11260504201680673, "frac_reward_zero_std": 0.0, "grad_norm": 0.29757460951805115, "kl": 0.35559019446372986, "learning_rate": 3.3000000000000006e-06, "loss": 0.0004, "num_tokens": 1348938.0, "reward": 6.351768493652344, "reward_std": 4.791616439819336, "rewards/reward_model/mean": 6.351768493652344, "rewards/reward_model/std": 4.791616916656494, "step": 67 }, { "completion_length": 1801.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2303.0, "completions/max_terminated_length": 2303.0, "completions/mean_length": 1801.25, "completions/mean_terminated_length": 1801.25, "completions/min_length": 1360.0, "completions/min_terminated_length": 1360.0, "epoch": 0.11428571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.4349398910999298, "kl": 0.4391127824783325, "learning_rate": 3.3500000000000005e-06, "loss": 0.0004, "num_tokens": 1366391.0, "reward": 6.5406036376953125, "reward_std": 4.524318695068359, "rewards/reward_model/mean": 6.5406036376953125, "rewards/reward_model/std": 4.524318695068359, "step": 68 }, { "completion_length": 3342.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3956.0, "completions/max_terminated_length": 3956.0, "completions/mean_length": 3342.0, "completions/mean_terminated_length": 3342.0, "completions/min_length": 2952.0, "completions/min_terminated_length": 2952.0, "epoch": 0.11596638655462185, "frac_reward_zero_std": 0.0, "grad_norm": 0.20729507505893707, "kl": 0.4406834542751312, "learning_rate": 3.4000000000000005e-06, "loss": 0.0004, "num_tokens": 1388627.0, "reward": 9.244726181030273, "reward_std": 0.9476140141487122, "rewards/reward_model/mean": 9.244726181030273, "rewards/reward_model/std": 0.9476140141487122, "step": 69 }, { "completion_length": 2394.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2967.0, "completions/max_terminated_length": 2967.0, "completions/mean_length": 2394.0, "completions/mean_terminated_length": 2394.0, "completions/min_length": 1955.0, "completions/min_terminated_length": 1955.0, "epoch": 0.11764705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 0.3415325880050659, "kl": 0.7586065530776978, "learning_rate": 3.45e-06, "loss": 0.0008, "num_tokens": 1407775.0, "reward": 7.75, "reward_std": 2.629955530166626, "rewards/reward_model/mean": 7.75, "rewards/reward_model/std": 2.629955768585205, "step": 70 }, { "completion_length": 3043.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3499.0, "completions/max_terminated_length": 3499.0, "completions/mean_length": 3043.75, "completions/mean_terminated_length": 3043.75, "completions/min_length": 2632.0, "completions/min_terminated_length": 2632.0, "epoch": 0.11932773109243698, "frac_reward_zero_std": 0.0, "grad_norm": 0.2264113426208496, "kl": 0.43621689081192017, "learning_rate": 3.5e-06, "loss": 0.0004, "num_tokens": 1429114.0, "reward": 7.379914283752441, "reward_std": 1.2877236604690552, "rewards/reward_model/mean": 7.379914283752441, "rewards/reward_model/std": 1.2877237796783447, "step": 71 }, { "completion_length": 2935.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3168.0, "completions/max_terminated_length": 3168.0, "completions/mean_length": 2935.5, "completions/mean_terminated_length": 2935.5, "completions/min_length": 2622.0, "completions/min_terminated_length": 2622.0, "epoch": 0.12100840336134454, "frac_reward_zero_std": 0.0, "grad_norm": 0.21893373131752014, "kl": 0.5037636756896973, "learning_rate": 3.5500000000000003e-06, "loss": 0.0006, "num_tokens": 1450920.0, "reward": 8.926493644714355, "reward_std": 0.007464637514203787, "rewards/reward_model/mean": 8.926493644714355, "rewards/reward_model/std": 0.007464637979865074, "step": 72 }, { "completion_length": 909.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1270.0, "completions/max_terminated_length": 1270.0, "completions/mean_length": 909.25, "completions/mean_terminated_length": 909.25, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.1226890756302521, "frac_reward_zero_std": 0.0, "grad_norm": 0.6668774485588074, "kl": 1.02503502368927, "learning_rate": 3.6000000000000003e-06, "loss": 0.001, "num_tokens": 1464057.0, "reward": 6.830574989318848, "reward_std": 1.5993455648422241, "rewards/reward_model/mean": 6.830574989318848, "rewards/reward_model/std": 1.5993454456329346, "step": 73 }, { "completion_length": 1187.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1714.0, "completions/max_terminated_length": 1714.0, "completions/mean_length": 1187.25, "completions/mean_terminated_length": 1187.25, "completions/min_length": 864.0, "completions/min_terminated_length": 864.0, "epoch": 0.12436974789915967, "frac_reward_zero_std": 0.0, "grad_norm": 2.6230356693267822, "kl": 7.192581653594971, "learning_rate": 3.65e-06, "loss": 0.0072, "num_tokens": 1478994.0, "reward": 7.468416213989258, "reward_std": 0.5226073265075684, "rewards/reward_model/mean": 7.468416213989258, "rewards/reward_model/std": 0.5226073861122131, "step": 74 }, { "completion_length": 2722.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3341.0, "completions/max_terminated_length": 3341.0, "completions/mean_length": 2722.0, "completions/mean_terminated_length": 2722.0, "completions/min_length": 2369.0, "completions/min_terminated_length": 2369.0, "epoch": 0.12605042016806722, "frac_reward_zero_std": 0.0, "grad_norm": 0.2930119037628174, "kl": 0.525174081325531, "learning_rate": 3.7e-06, "loss": 0.0005, "num_tokens": 1498790.0, "reward": 9.866546630859375, "reward_std": 0.13535748422145844, "rewards/reward_model/mean": 9.866546630859375, "rewards/reward_model/std": 0.13535748422145844, "step": 75 }, { "completion_length": 2678.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2907.0, "completions/max_terminated_length": 2907.0, "completions/mean_length": 2678.25, "completions/mean_terminated_length": 2678.25, "completions/min_length": 2529.0, "completions/min_terminated_length": 2529.0, "epoch": 0.12773109243697478, "frac_reward_zero_std": 0.0, "grad_norm": 0.24343159794807434, "kl": 0.47763705253601074, "learning_rate": 3.7500000000000005e-06, "loss": 0.0005, "num_tokens": 1520331.0, "reward": 9.23454761505127, "reward_std": 0.8037624359130859, "rewards/reward_model/mean": 9.23454761505127, "rewards/reward_model/std": 0.8037623763084412, "step": 76 }, { "completion_length": 2961.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3192.0, "completions/max_terminated_length": 3192.0, "completions/mean_length": 2961.0, "completions/mean_terminated_length": 2961.0, "completions/min_length": 2866.0, "completions/min_terminated_length": 2866.0, "epoch": 0.12941176470588237, "frac_reward_zero_std": 0.0, "grad_norm": 0.27070072293281555, "kl": 0.4094032049179077, "learning_rate": 3.8000000000000005e-06, "loss": 0.0004, "num_tokens": 1541335.0, "reward": 8.187192916870117, "reward_std": 1.4074355363845825, "rewards/reward_model/mean": 8.187192916870117, "rewards/reward_model/std": 1.407435655593872, "step": 77 }, { "completion_length": 2904.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3201.0, "completions/max_terminated_length": 3201.0, "completions/mean_length": 2904.5, "completions/mean_terminated_length": 2904.5, "completions/min_length": 2665.0, "completions/min_terminated_length": 2665.0, "epoch": 0.13109243697478992, "frac_reward_zero_std": 0.0, "grad_norm": 0.22602877020835876, "kl": 0.4318145513534546, "learning_rate": 3.85e-06, "loss": 0.0004, "num_tokens": 1562753.0, "reward": 7.521336555480957, "reward_std": 2.3426802158355713, "rewards/reward_model/mean": 7.521336555480957, "rewards/reward_model/std": 2.3426802158355713, "step": 78 }, { "completion_length": 3219.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3297.0, "completions/max_terminated_length": 3297.0, "completions/mean_length": 3219.5, "completions/mean_terminated_length": 3219.5, "completions/min_length": 3121.0, "completions/min_terminated_length": 3121.0, "epoch": 0.13277310924369748, "frac_reward_zero_std": 0.0, "grad_norm": 0.20466728508472443, "kl": 0.40577566623687744, "learning_rate": 3.900000000000001e-06, "loss": 0.0004, "num_tokens": 1586335.0, "reward": 8.53989315032959, "reward_std": 1.2545560598373413, "rewards/reward_model/mean": 8.53989315032959, "rewards/reward_model/std": 1.2545561790466309, "step": 79 }, { "completion_length": 806.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1127.0, "completions/max_terminated_length": 1127.0, "completions/mean_length": 806.75, "completions/mean_terminated_length": 806.75, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.13445378151260504, "frac_reward_zero_std": 0.0, "grad_norm": 0.6143341660499573, "kl": 1.0517674684524536, "learning_rate": 3.95e-06, "loss": 0.0011, "num_tokens": 1600666.0, "reward": 7.633878707885742, "reward_std": 0.37605300545692444, "rewards/reward_model/mean": 7.633878707885742, "rewards/reward_model/std": 0.3760528862476349, "step": 80 }, { "completion_length": 3203.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4340.0, "completions/max_terminated_length": 4340.0, "completions/mean_length": 3203.5, "completions/mean_terminated_length": 3203.5, "completions/min_length": 2409.0, "completions/min_terminated_length": 2409.0, "epoch": 0.1361344537815126, "frac_reward_zero_std": 0.0, "grad_norm": 0.23666132986545563, "kl": 0.42324742674827576, "learning_rate": 4.000000000000001e-06, "loss": 0.0004, "num_tokens": 1622448.0, "reward": 7.318637847900391, "reward_std": 2.778326988220215, "rewards/reward_model/mean": 7.318637847900391, "rewards/reward_model/std": 2.778327226638794, "step": 81 }, { "completion_length": 3323.75, "completions/clipped_ratio": 0.0, "completions/max_length": 4202.0, "completions/max_terminated_length": 4202.0, "completions/mean_length": 3323.75, "completions/mean_terminated_length": 3323.75, "completions/min_length": 2512.0, "completions/min_terminated_length": 2512.0, "epoch": 0.13781512605042018, "frac_reward_zero_std": 0.0, "grad_norm": 0.24035634100437164, "kl": 0.4106868803501129, "learning_rate": 4.05e-06, "loss": 0.0004, "num_tokens": 1646043.0, "reward": 6.966048240661621, "reward_std": 1.1899865865707397, "rewards/reward_model/mean": 6.966048240661621, "rewards/reward_model/std": 1.1899867057800293, "step": 82 }, { "completion_length": 3115.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3221.0, "completions/max_terminated_length": 3221.0, "completions/mean_length": 3115.75, "completions/mean_terminated_length": 3115.75, "completions/min_length": 3007.0, "completions/min_terminated_length": 3007.0, "epoch": 0.13949579831932774, "frac_reward_zero_std": 0.0, "grad_norm": 0.21992234885692596, "kl": 0.44762954115867615, "learning_rate": 4.1e-06, "loss": 0.0004, "num_tokens": 1667814.0, "reward": 9.537565231323242, "reward_std": 0.4324139952659607, "rewards/reward_model/mean": 9.537565231323242, "rewards/reward_model/std": 0.4324139654636383, "step": 83 }, { "completion_length": 2971.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3360.0, "completions/max_terminated_length": 3360.0, "completions/mean_length": 2971.0, "completions/mean_terminated_length": 2971.0, "completions/min_length": 2768.0, "completions/min_terminated_length": 2768.0, "epoch": 0.1411764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.26085975766181946, "kl": 0.43445736169815063, "learning_rate": 4.15e-06, "loss": 0.0004, "num_tokens": 1690270.0, "reward": 8.511381149291992, "reward_std": 0.6435398459434509, "rewards/reward_model/mean": 8.511381149291992, "rewards/reward_model/std": 0.6435400247573853, "step": 84 }, { "completion_length": 2111.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2822.0, "completions/max_terminated_length": 2822.0, "completions/mean_length": 2111.0, "completions/mean_terminated_length": 2111.0, "completions/min_length": 575.0, "completions/min_terminated_length": 575.0, "epoch": 0.14285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.735354483127594, "kl": 1.131089687347412, "learning_rate": 4.2000000000000004e-06, "loss": 0.0011, "num_tokens": 1708290.0, "reward": 6.888094902038574, "reward_std": 4.362208366394043, "rewards/reward_model/mean": 6.888094902038574, "rewards/reward_model/std": 4.362208366394043, "step": 85 }, { "completion_length": 402.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1080.0, "completions/max_terminated_length": 1080.0, "completions/mean_length": 402.75, "completions/mean_terminated_length": 402.75, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.14453781512605043, "frac_reward_zero_std": 0.0, "grad_norm": 0.6006290316581726, "kl": 1.258028268814087, "learning_rate": 4.25e-06, "loss": 0.0013, "num_tokens": 1721241.0, "reward": 6.638233184814453, "reward_std": 2.5938193798065186, "rewards/reward_model/mean": 6.638233184814453, "rewards/reward_model/std": 2.5938196182250977, "step": 86 }, { "completion_length": 2114.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2257.0, "completions/max_terminated_length": 2257.0, "completions/mean_length": 2114.0, "completions/mean_terminated_length": 2114.0, "completions/min_length": 1862.0, "completions/min_terminated_length": 1862.0, "epoch": 0.146218487394958, "frac_reward_zero_std": 0.0, "grad_norm": 0.34364503622055054, "kl": 1.398358941078186, "learning_rate": 4.3e-06, "loss": 0.0014, "num_tokens": 1740013.0, "reward": 8.375, "reward_std": 1.3768926858901978, "rewards/reward_model/mean": 8.375, "rewards/reward_model/std": 1.3768926858901978, "step": 87 }, { "completion_length": 2752.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2924.0, "completions/max_terminated_length": 2924.0, "completions/mean_length": 2752.25, "completions/mean_terminated_length": 2752.25, "completions/min_length": 2575.0, "completions/min_terminated_length": 2575.0, "epoch": 0.14789915966386555, "frac_reward_zero_std": 0.0, "grad_norm": 0.27950361371040344, "kl": 0.49790701270103455, "learning_rate": 4.350000000000001e-06, "loss": 0.0005, "num_tokens": 1761918.0, "reward": 7.676429748535156, "reward_std": 1.4302712678909302, "rewards/reward_model/mean": 7.676429748535156, "rewards/reward_model/std": 1.4302712678909302, "step": 88 }, { "completion_length": 2853.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3089.0, "completions/max_terminated_length": 3089.0, "completions/mean_length": 2853.0, "completions/mean_terminated_length": 2853.0, "completions/min_length": 2605.0, "completions/min_terminated_length": 2605.0, "epoch": 0.1495798319327731, "frac_reward_zero_std": 0.0, "grad_norm": 0.2263125628232956, "kl": 0.4167507588863373, "learning_rate": 4.4e-06, "loss": 0.0004, "num_tokens": 1784366.0, "reward": 7.4489288330078125, "reward_std": 2.177258014678955, "rewards/reward_model/mean": 7.4489288330078125, "rewards/reward_model/std": 2.177258253097534, "step": 89 }, { "completion_length": 2805.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3072.0, "completions/max_terminated_length": 3072.0, "completions/mean_length": 2805.25, "completions/mean_terminated_length": 2805.25, "completions/min_length": 2340.0, "completions/min_terminated_length": 2340.0, "epoch": 0.15126050420168066, "frac_reward_zero_std": 0.0, "grad_norm": 0.25741341710090637, "kl": 0.43744996190071106, "learning_rate": 4.450000000000001e-06, "loss": 0.0004, "num_tokens": 1806855.0, "reward": 7.539546966552734, "reward_std": 3.0199856758117676, "rewards/reward_model/mean": 7.539546966552734, "rewards/reward_model/std": 3.0199856758117676, "step": 90 }, { "completion_length": 2687.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2977.0, "completions/max_terminated_length": 2977.0, "completions/mean_length": 2687.5, "completions/mean_terminated_length": 2687.5, "completions/min_length": 2398.0, "completions/min_terminated_length": 2398.0, "epoch": 0.15294117647058825, "frac_reward_zero_std": 0.0, "grad_norm": 0.24347051978111267, "kl": 0.4858957827091217, "learning_rate": 4.5e-06, "loss": 0.0005, "num_tokens": 1827277.0, "reward": 8.389686584472656, "reward_std": 0.6794203519821167, "rewards/reward_model/mean": 8.389686584472656, "rewards/reward_model/std": 0.679420530796051, "step": 91 }, { "completion_length": 2798.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3086.0, "completions/max_terminated_length": 3086.0, "completions/mean_length": 2798.25, "completions/mean_terminated_length": 2798.25, "completions/min_length": 2591.0, "completions/min_terminated_length": 2591.0, "epoch": 0.1546218487394958, "frac_reward_zero_std": 0.0, "grad_norm": 0.24916556477546692, "kl": 0.48482340574264526, "learning_rate": 4.5500000000000005e-06, "loss": 0.0005, "num_tokens": 1847102.0, "reward": 7.97796630859375, "reward_std": 0.8365663290023804, "rewards/reward_model/mean": 7.97796630859375, "rewards/reward_model/std": 0.8365663886070251, "step": 92 }, { "completion_length": 3056.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3159.0, "completions/max_terminated_length": 3159.0, "completions/mean_length": 3056.25, "completions/mean_terminated_length": 3056.25, "completions/min_length": 2902.0, "completions/min_terminated_length": 2902.0, "epoch": 0.15630252100840336, "frac_reward_zero_std": 0.0, "grad_norm": 0.22196516394615173, "kl": 0.4709712564945221, "learning_rate": 4.600000000000001e-06, "loss": 0.0005, "num_tokens": 1869175.0, "reward": 9.879180908203125, "reward_std": 0.16843675076961517, "rewards/reward_model/mean": 9.879180908203125, "rewards/reward_model/std": 0.16843685507774353, "step": 93 }, { "completion_length": 2674.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2908.0, "completions/max_terminated_length": 2908.0, "completions/mean_length": 2674.25, "completions/mean_terminated_length": 2674.25, "completions/min_length": 2532.0, "completions/min_terminated_length": 2532.0, "epoch": 0.15798319327731092, "frac_reward_zero_std": 0.0, "grad_norm": 0.3054737448692322, "kl": 0.7604443430900574, "learning_rate": 4.65e-06, "loss": 0.0008, "num_tokens": 1889516.0, "reward": 7.914120674133301, "reward_std": 1.0475715398788452, "rewards/reward_model/mean": 7.914120674133301, "rewards/reward_model/std": 1.0475715398788452, "step": 94 }, { "completion_length": 2791.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3596.0, "completions/max_terminated_length": 3596.0, "completions/mean_length": 2791.0, "completions/mean_terminated_length": 2791.0, "completions/min_length": 2346.0, "completions/min_terminated_length": 2346.0, "epoch": 0.15966386554621848, "frac_reward_zero_std": 0.0, "grad_norm": 0.240616574883461, "kl": 0.4514558017253876, "learning_rate": 4.7e-06, "loss": 0.0005, "num_tokens": 1911380.0, "reward": 8.27052116394043, "reward_std": 0.7082969546318054, "rewards/reward_model/mean": 8.27052116394043, "rewards/reward_model/std": 0.7082969546318054, "step": 95 }, { "completion_length": 3014.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3382.0, "completions/max_terminated_length": 3382.0, "completions/mean_length": 3014.25, "completions/mean_terminated_length": 3014.25, "completions/min_length": 2798.0, "completions/min_terminated_length": 2798.0, "epoch": 0.16134453781512606, "frac_reward_zero_std": 0.0, "grad_norm": 0.22918841242790222, "kl": 0.4786817729473114, "learning_rate": 4.75e-06, "loss": 0.0005, "num_tokens": 1933981.0, "reward": 7.595390319824219, "reward_std": 1.2992151975631714, "rewards/reward_model/mean": 7.595390319824219, "rewards/reward_model/std": 1.2992151975631714, "step": 96 }, { "completion_length": 3266.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3377.0, "completions/max_terminated_length": 3377.0, "completions/mean_length": 3266.5, "completions/mean_terminated_length": 3266.5, "completions/min_length": 3109.0, "completions/min_terminated_length": 3109.0, "epoch": 0.16302521008403362, "frac_reward_zero_std": 0.0, "grad_norm": 0.2017071694135666, "kl": 0.42354616522789, "learning_rate": 4.800000000000001e-06, "loss": 0.0004, "num_tokens": 1957023.0, "reward": 9.349321365356445, "reward_std": 0.6180247664451599, "rewards/reward_model/mean": 9.349321365356445, "rewards/reward_model/std": 0.6180248260498047, "step": 97 }, { "completion_length": 199.75, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 199.75, "completions/mean_terminated_length": 199.75, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.16470588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 2.11334228515625, "kl": 1.4292675256729126, "learning_rate": 4.85e-06, "loss": 0.0014, "num_tokens": 1965326.0, "reward": 8.5, "reward_std": 1.0, "rewards/reward_model/mean": 8.5, "rewards/reward_model/std": 1.0, "step": 98 }, { "completion_length": 2075.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2459.0, "completions/max_terminated_length": 2459.0, "completions/mean_length": 2075.0, "completions/mean_terminated_length": 2075.0, "completions/min_length": 1511.0, "completions/min_terminated_length": 1511.0, "epoch": 0.16638655462184873, "frac_reward_zero_std": 0.0, "grad_norm": 0.44360974431037903, "kl": 0.38844701647758484, "learning_rate": 4.9000000000000005e-06, "loss": 0.0004, "num_tokens": 1984098.0, "reward": 6.0625, "reward_std": 3.6307425498962402, "rewards/reward_model/mean": 6.0625, "rewards/reward_model/std": 3.6307425498962402, "step": 99 }, { "completion_length": 3006.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3300.0, "completions/max_terminated_length": 3300.0, "completions/mean_length": 3006.5, "completions/mean_terminated_length": 3006.5, "completions/min_length": 2651.0, "completions/min_terminated_length": 2651.0, "epoch": 0.16806722689075632, "frac_reward_zero_std": 0.0, "grad_norm": 0.21265709400177002, "kl": 0.4319321811199188, "learning_rate": 4.95e-06, "loss": 0.0004, "num_tokens": 2006040.0, "reward": 9.181880950927734, "reward_std": 0.6058098673820496, "rewards/reward_model/mean": 9.181880950927734, "rewards/reward_model/std": 0.6058098673820496, "step": 100 }, { "completion_length": 2721.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2922.0, "completions/max_terminated_length": 2922.0, "completions/mean_length": 2721.5, "completions/mean_terminated_length": 2721.5, "completions/min_length": 2631.0, "completions/min_terminated_length": 2631.0, "epoch": 0.16974789915966387, "frac_reward_zero_std": 0.0, "grad_norm": 0.2661150097846985, "kl": 0.49838021397590637, "learning_rate": 5e-06, "loss": 0.0005, "num_tokens": 2026218.0, "reward": 7.413733959197998, "reward_std": 1.5620050430297852, "rewards/reward_model/mean": 7.413733959197998, "rewards/reward_model/std": 1.5620051622390747, "step": 101 }, { "completion_length": 926.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1246.0, "completions/max_terminated_length": 1246.0, "completions/mean_length": 926.5, "completions/mean_terminated_length": 926.5, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.17142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.6446244120597839, "kl": 0.8280232548713684, "learning_rate": 4.994444444444445e-06, "loss": 0.0008, "num_tokens": 2039944.0, "reward": 6.553905487060547, "reward_std": 1.4130481481552124, "rewards/reward_model/mean": 6.553905487060547, "rewards/reward_model/std": 1.413048267364502, "step": 102 }, { "completion_length": 2172.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2470.0, "completions/max_terminated_length": 2470.0, "completions/mean_length": 2172.0, "completions/mean_terminated_length": 2172.0, "completions/min_length": 1860.0, "completions/min_terminated_length": 1860.0, "epoch": 0.173109243697479, "frac_reward_zero_std": 0.0, "grad_norm": 0.27262404561042786, "kl": 0.3252059817314148, "learning_rate": 4.988888888888889e-06, "loss": 0.0003, "num_tokens": 2058512.0, "reward": 8.375, "reward_std": 1.7969882488250732, "rewards/reward_model/mean": 8.375, "rewards/reward_model/std": 1.7969882488250732, "step": 103 }, { "completion_length": 2038.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2534.0, "completions/max_terminated_length": 2534.0, "completions/mean_length": 2038.25, "completions/mean_terminated_length": 2038.25, "completions/min_length": 1499.0, "completions/min_terminated_length": 1499.0, "epoch": 0.17478991596638654, "frac_reward_zero_std": 0.0, "grad_norm": 0.37813693284988403, "kl": 0.3322860896587372, "learning_rate": 4.983333333333334e-06, "loss": 0.0003, "num_tokens": 2077113.0, "reward": 9.625, "reward_std": 0.75, "rewards/reward_model/mean": 9.625, "rewards/reward_model/std": 0.75, "step": 104 }, { "completion_length": 1085.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1528.0, "completions/max_terminated_length": 1528.0, "completions/mean_length": 1085.0, "completions/mean_terminated_length": 1085.0, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.17647058823529413, "frac_reward_zero_std": 0.0, "grad_norm": 0.5497007966041565, "kl": 0.861971914768219, "learning_rate": 4.977777777777778e-06, "loss": 0.0009, "num_tokens": 2090677.0, "reward": 7.113149642944336, "reward_std": 1.5167663097381592, "rewards/reward_model/mean": 7.113149642944336, "rewards/reward_model/std": 1.5167663097381592, "step": 105 }, { "completion_length": 141.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 141.0, "completions/mean_terminated_length": 141.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.1781512605042017, "frac_reward_zero_std": 0.0, "grad_norm": 2.3327019214630127, "kl": 1.9802045822143555, "learning_rate": 4.9722222222222224e-06, "loss": 0.002, "num_tokens": 2098745.0, "reward": 9.25, "reward_std": 1.5, "rewards/reward_model/mean": 9.25, "rewards/reward_model/std": 1.5, "step": 106 }, { "completion_length": 2973.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3368.0, "completions/max_terminated_length": 3368.0, "completions/mean_length": 2973.5, "completions/mean_terminated_length": 2973.5, "completions/min_length": 2667.0, "completions/min_terminated_length": 2667.0, "epoch": 0.17983193277310924, "frac_reward_zero_std": 0.0, "grad_norm": 0.2415865957736969, "kl": 0.5100703835487366, "learning_rate": 4.966666666666667e-06, "loss": 0.0005, "num_tokens": 2120403.0, "reward": 7.290048599243164, "reward_std": 1.7958564758300781, "rewards/reward_model/mean": 7.290048599243164, "rewards/reward_model/std": 1.7958564758300781, "step": 107 }, { "completion_length": 3186.25, "completions/clipped_ratio": 0.0, "completions/max_length": 4318.0, "completions/max_terminated_length": 4318.0, "completions/mean_length": 3186.25, "completions/mean_terminated_length": 3186.25, "completions/min_length": 2722.0, "completions/min_terminated_length": 2722.0, "epoch": 0.1815126050420168, "frac_reward_zero_std": 0.0, "grad_norm": 0.2218141257762909, "kl": 0.4428662061691284, "learning_rate": 4.961111111111111e-06, "loss": 0.0004, "num_tokens": 2143188.0, "reward": 7.9481964111328125, "reward_std": 2.0744946002960205, "rewards/reward_model/mean": 7.9481964111328125, "rewards/reward_model/std": 2.0744946002960205, "step": 108 }, { "completion_length": 2968.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3096.0, "completions/max_terminated_length": 3096.0, "completions/mean_length": 2968.0, "completions/mean_terminated_length": 2968.0, "completions/min_length": 2650.0, "completions/min_terminated_length": 2650.0, "epoch": 0.18319327731092436, "frac_reward_zero_std": 0.0, "grad_norm": 0.23803484439849854, "kl": 0.48616155982017517, "learning_rate": 4.9555555555555565e-06, "loss": 0.0005, "num_tokens": 2164028.0, "reward": 8.134922981262207, "reward_std": 0.7836208343505859, "rewards/reward_model/mean": 8.134922981262207, "rewards/reward_model/std": 0.7836208939552307, "step": 109 }, { "completion_length": 2895.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3122.0, "completions/max_terminated_length": 3122.0, "completions/mean_length": 2895.0, "completions/mean_terminated_length": 2895.0, "completions/min_length": 2736.0, "completions/min_terminated_length": 2736.0, "epoch": 0.18487394957983194, "frac_reward_zero_std": 0.0, "grad_norm": 0.23772816359996796, "kl": 0.7086710929870605, "learning_rate": 4.95e-06, "loss": 0.0007, "num_tokens": 2184752.0, "reward": 9.1290922164917, "reward_std": 1.3770978450775146, "rewards/reward_model/mean": 9.1290922164917, "rewards/reward_model/std": 1.3770978450775146, "step": 110 }, { "completion_length": 3044.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3504.0, "completions/max_terminated_length": 3504.0, "completions/mean_length": 3044.0, "completions/mean_terminated_length": 3044.0, "completions/min_length": 2646.0, "completions/min_terminated_length": 2646.0, "epoch": 0.1865546218487395, "frac_reward_zero_std": 0.0, "grad_norm": 0.23825815320014954, "kl": 0.4379551112651825, "learning_rate": 4.944444444444445e-06, "loss": 0.0004, "num_tokens": 2208396.0, "reward": 8.048235893249512, "reward_std": 1.601423740386963, "rewards/reward_model/mean": 8.048235893249512, "rewards/reward_model/std": 1.601423740386963, "step": 111 }, { "completion_length": 2520.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2852.0, "completions/max_terminated_length": 2852.0, "completions/mean_length": 2520.0, "completions/mean_terminated_length": 2520.0, "completions/min_length": 2203.0, "completions/min_terminated_length": 2203.0, "epoch": 0.18823529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 0.2840467691421509, "kl": 0.4571273624897003, "learning_rate": 4.938888888888889e-06, "loss": 0.0005, "num_tokens": 2227528.0, "reward": 7.234651565551758, "reward_std": 4.369300842285156, "rewards/reward_model/mean": 7.234651565551758, "rewards/reward_model/std": 4.3693013191223145, "step": 112 }, { "completion_length": 2831.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3357.0, "completions/max_terminated_length": 3357.0, "completions/mean_length": 2831.75, "completions/mean_terminated_length": 2831.75, "completions/min_length": 2390.0, "completions/min_terminated_length": 2390.0, "epoch": 0.1899159663865546, "frac_reward_zero_std": 0.0, "grad_norm": 0.25559303164482117, "kl": 0.5501706004142761, "learning_rate": 4.933333333333334e-06, "loss": 0.0006, "num_tokens": 2249951.0, "reward": 8.183422088623047, "reward_std": 0.45960310101509094, "rewards/reward_model/mean": 8.183422088623047, "rewards/reward_model/std": 0.4596029818058014, "step": 113 }, { "completion_length": 2031.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2488.0, "completions/max_terminated_length": 2488.0, "completions/mean_length": 2031.0, "completions/mean_terminated_length": 2031.0, "completions/min_length": 1647.0, "completions/min_terminated_length": 1647.0, "epoch": 0.1915966386554622, "frac_reward_zero_std": 0.0, "grad_norm": 0.33318275213241577, "kl": 0.3922136127948761, "learning_rate": 4.927777777777778e-06, "loss": 0.0004, "num_tokens": 2267395.0, "reward": 6.75, "reward_std": 2.7537853717803955, "rewards/reward_model/mean": 6.75, "rewards/reward_model/std": 2.7537853717803955, "step": 114 }, { "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.19327731092436976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008913876954466105, "kl": 3.7003390789031982, "learning_rate": 4.922222222222223e-06, "loss": 0.0037, "num_tokens": 2275843.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 115 }, { "completion_length": 2963.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3328.0, "completions/max_terminated_length": 3328.0, "completions/mean_length": 2963.75, "completions/mean_terminated_length": 2963.75, "completions/min_length": 2585.0, "completions/min_terminated_length": 2585.0, "epoch": 0.1949579831932773, "frac_reward_zero_std": 0.0, "grad_norm": 0.28050631284713745, "kl": 0.5140309929847717, "learning_rate": 4.9166666666666665e-06, "loss": 0.0005, "num_tokens": 2298438.0, "reward": 8.040239334106445, "reward_std": 1.173266887664795, "rewards/reward_model/mean": 8.040239334106445, "rewards/reward_model/std": 1.1732672452926636, "step": 116 }, { "completion_length": 2095.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2667.0, "completions/max_terminated_length": 2667.0, "completions/mean_length": 2095.75, "completions/mean_terminated_length": 2095.75, "completions/min_length": 1826.0, "completions/min_terminated_length": 1826.0, "epoch": 0.19663865546218487, "frac_reward_zero_std": 0.0, "grad_norm": 0.352792888879776, "kl": 0.41261500120162964, "learning_rate": 4.911111111111112e-06, "loss": 0.0004, "num_tokens": 2316933.0, "reward": 8.625, "reward_std": 1.8874585628509521, "rewards/reward_model/mean": 8.625, "rewards/reward_model/std": 1.8874585628509521, "step": 117 }, { "completion_length": 1953.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2144.0, "completions/max_terminated_length": 2144.0, "completions/mean_length": 1953.25, "completions/mean_terminated_length": 1953.25, "completions/min_length": 1792.0, "completions/min_terminated_length": 1792.0, "epoch": 0.19831932773109243, "frac_reward_zero_std": 0.0, "grad_norm": 0.3218609094619751, "kl": 0.3694440424442291, "learning_rate": 4.905555555555556e-06, "loss": 0.0004, "num_tokens": 2334030.0, "reward": 9.0, "reward_std": 2.0, "rewards/reward_model/mean": 9.0, "rewards/reward_model/std": 2.0, "step": 118 }, { "completion_length": 1927.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2460.0, "completions/max_terminated_length": 2460.0, "completions/mean_length": 1927.0, "completions/mean_terminated_length": 1927.0, "completions/min_length": 1555.0, "completions/min_terminated_length": 1555.0, "epoch": 0.2, "frac_reward_zero_std": 0.0, "grad_norm": 0.36837753653526306, "kl": 0.3546859323978424, "learning_rate": 4.9000000000000005e-06, "loss": 0.0004, "num_tokens": 2351874.0, "reward": 7.625, "reward_std": 1.7017147541046143, "rewards/reward_model/mean": 7.625, "rewards/reward_model/std": 1.7017148733139038, "step": 119 }, { "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.20168067226890757, "frac_reward_zero_std": 1.0, "grad_norm": 0.005828255787491798, "kl": 3.5667221546173096, "learning_rate": 4.894444444444445e-06, "loss": 0.0036, "num_tokens": 2359778.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 120 }, { "completion_length": 1054.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1255.0, "completions/max_terminated_length": 1255.0, "completions/mean_length": 1054.25, "completions/mean_terminated_length": 1054.25, "completions/min_length": 853.0, "completions/min_terminated_length": 853.0, "epoch": 0.20336134453781513, "frac_reward_zero_std": 0.0, "grad_norm": 0.6091054677963257, "kl": 0.7709091305732727, "learning_rate": 4.888888888888889e-06, "loss": 0.0008, "num_tokens": 2375299.0, "reward": 7.903931617736816, "reward_std": 0.7135262489318848, "rewards/reward_model/mean": 7.903931617736816, "rewards/reward_model/std": 0.7135262489318848, "step": 121 }, { "completion_length": 2617.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2759.0, "completions/max_terminated_length": 2759.0, "completions/mean_length": 2617.5, "completions/mean_terminated_length": 2617.5, "completions/min_length": 2503.0, "completions/min_terminated_length": 2503.0, "epoch": 0.20504201680672268, "frac_reward_zero_std": 0.0, "grad_norm": 0.2652413547039032, "kl": 0.5015753507614136, "learning_rate": 4.883333333333334e-06, "loss": 0.0005, "num_tokens": 2395317.0, "reward": 8.609277725219727, "reward_std": 1.2459663152694702, "rewards/reward_model/mean": 8.609277725219727, "rewards/reward_model/std": 1.2459663152694702, "step": 122 }, { "completion_length": 1957.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2337.0, "completions/max_terminated_length": 2337.0, "completions/mean_length": 1957.25, "completions/mean_terminated_length": 1957.25, "completions/min_length": 1258.0, "completions/min_terminated_length": 1258.0, "epoch": 0.20672268907563024, "frac_reward_zero_std": 0.0, "grad_norm": 0.4272514283657074, "kl": 0.40108439326286316, "learning_rate": 4.877777777777778e-06, "loss": 0.0004, "num_tokens": 2414138.0, "reward": 6.875, "reward_std": 2.3228933811187744, "rewards/reward_model/mean": 6.875, "rewards/reward_model/std": 2.3228933811187744, "step": 123 }, { "completion_length": 1083.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1481.0, "completions/max_terminated_length": 1481.0, "completions/mean_length": 1083.0, "completions/mean_terminated_length": 1083.0, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.20840336134453782, "frac_reward_zero_std": 0.0, "grad_norm": 0.726239800453186, "kl": 1.0261199474334717, "learning_rate": 4.8722222222222225e-06, "loss": 0.001, "num_tokens": 2428382.0, "reward": 6.302136421203613, "reward_std": 2.2844040393829346, "rewards/reward_model/mean": 6.302136421203613, "rewards/reward_model/std": 2.2844040393829346, "step": 124 }, { "completion_length": 2840.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2960.0, "completions/max_terminated_length": 2960.0, "completions/mean_length": 2840.25, "completions/mean_terminated_length": 2840.25, "completions/min_length": 2633.0, "completions/min_terminated_length": 2633.0, "epoch": 0.21008403361344538, "frac_reward_zero_std": 0.0, "grad_norm": 0.25508490204811096, "kl": 0.4664878845214844, "learning_rate": 4.866666666666667e-06, "loss": 0.0005, "num_tokens": 2449003.0, "reward": 8.282108306884766, "reward_std": 0.6589659452438354, "rewards/reward_model/mean": 8.282108306884766, "rewards/reward_model/std": 0.6589656472206116, "step": 125 }, { "completion_length": 1021.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1632.0, "completions/max_terminated_length": 1632.0, "completions/mean_length": 1021.75, "completions/mean_terminated_length": 1021.75, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.21176470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 0.5321233868598938, "kl": 0.9480860829353333, "learning_rate": 4.861111111111111e-06, "loss": 0.0009, "num_tokens": 2462086.0, "reward": 7.718278884887695, "reward_std": 0.9998491406440735, "rewards/reward_model/mean": 7.718278884887695, "rewards/reward_model/std": 0.999849259853363, "step": 126 }, { "completion_length": 1045.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1353.0, "completions/max_terminated_length": 1353.0, "completions/mean_length": 1045.25, "completions/mean_terminated_length": 1045.25, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.2134453781512605, "frac_reward_zero_std": 0.0, "grad_norm": 0.5783733129501343, "kl": 0.9520083069801331, "learning_rate": 4.855555555555556e-06, "loss": 0.001, "num_tokens": 2475995.0, "reward": 7.649256706237793, "reward_std": 0.715004563331604, "rewards/reward_model/mean": 7.649256706237793, "rewards/reward_model/std": 0.7150046229362488, "step": 127 }, { "completion_length": 1338.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1888.0, "completions/max_terminated_length": 1888.0, "completions/mean_length": 1338.75, "completions/mean_terminated_length": 1338.75, "completions/min_length": 1130.0, "completions/min_terminated_length": 1130.0, "epoch": 0.21512605042016808, "frac_reward_zero_std": 0.0, "grad_norm": 0.6013649702072144, "kl": 0.7449133992195129, "learning_rate": 4.85e-06, "loss": 0.0007, "num_tokens": 2492062.0, "reward": 7.158843994140625, "reward_std": 0.819972574710846, "rewards/reward_model/mean": 7.158843994140625, "rewards/reward_model/std": 0.819972574710846, "step": 128 }, { "completion_length": 910.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1386.0, "completions/max_terminated_length": 1386.0, "completions/mean_length": 910.25, "completions/mean_terminated_length": 910.25, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.21680672268907564, "frac_reward_zero_std": 0.0, "grad_norm": 0.5794121026992798, "kl": 0.9318625926971436, "learning_rate": 4.8444444444444446e-06, "loss": 0.0009, "num_tokens": 2506207.0, "reward": 6.760138988494873, "reward_std": 0.8559876680374146, "rewards/reward_model/mean": 6.760138988494873, "rewards/reward_model/std": 0.8559876084327698, "step": 129 }, { "completion_length": 2844.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3094.0, "completions/max_terminated_length": 3094.0, "completions/mean_length": 2844.5, "completions/mean_terminated_length": 2844.5, "completions/min_length": 2625.0, "completions/min_terminated_length": 2625.0, "epoch": 0.2184873949579832, "frac_reward_zero_std": 0.0, "grad_norm": 0.23493915796279907, "kl": 0.46635761857032776, "learning_rate": 4.838888888888889e-06, "loss": 0.0005, "num_tokens": 2526885.0, "reward": 8.46530532836914, "reward_std": 1.7754273414611816, "rewards/reward_model/mean": 8.46530532836914, "rewards/reward_model/std": 1.7754271030426025, "step": 130 }, { "completion_length": 2762.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2971.0, "completions/max_terminated_length": 2971.0, "completions/mean_length": 2762.25, "completions/mean_terminated_length": 2762.25, "completions/min_length": 2651.0, "completions/min_terminated_length": 2651.0, "epoch": 0.22016806722689075, "frac_reward_zero_std": 0.0, "grad_norm": 0.24335390329360962, "kl": 0.43510404229164124, "learning_rate": 4.833333333333333e-06, "loss": 0.0004, "num_tokens": 2547110.0, "reward": 7.015050888061523, "reward_std": 0.8529961705207825, "rewards/reward_model/mean": 7.015050888061523, "rewards/reward_model/std": 0.8529962301254272, "step": 131 }, { "completion_length": 2942.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3428.0, "completions/max_terminated_length": 3428.0, "completions/mean_length": 2942.5, "completions/mean_terminated_length": 2942.5, "completions/min_length": 2555.0, "completions/min_terminated_length": 2555.0, "epoch": 0.2218487394957983, "frac_reward_zero_std": 0.0, "grad_norm": 0.22997471690177917, "kl": 0.461277037858963, "learning_rate": 4.827777777777778e-06, "loss": 0.0005, "num_tokens": 2568532.0, "reward": 7.938364028930664, "reward_std": 1.288348913192749, "rewards/reward_model/mean": 7.938364028930664, "rewards/reward_model/std": 1.2883487939834595, "step": 132 }, { "completion_length": 1835.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2406.0, "completions/max_terminated_length": 2406.0, "completions/mean_length": 1835.5, "completions/mean_terminated_length": 1835.5, "completions/min_length": 1467.0, "completions/min_terminated_length": 1467.0, "epoch": 0.2235294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 0.3291085660457611, "kl": 0.31088417768478394, "learning_rate": 4.822222222222222e-06, "loss": 0.0003, "num_tokens": 2584746.0, "reward": 9.25, "reward_std": 1.5, "rewards/reward_model/mean": 9.25, "rewards/reward_model/std": 1.5, "step": 133 }, { "completion_length": 2998.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3211.0, "completions/max_terminated_length": 3211.0, "completions/mean_length": 2998.5, "completions/mean_terminated_length": 2998.5, "completions/min_length": 2824.0, "completions/min_terminated_length": 2824.0, "epoch": 0.22521008403361345, "frac_reward_zero_std": 0.0, "grad_norm": 0.23152707517147064, "kl": 0.46097418665885925, "learning_rate": 4.816666666666667e-06, "loss": 0.0005, "num_tokens": 2606156.0, "reward": 8.750370025634766, "reward_std": 0.4992596507072449, "rewards/reward_model/mean": 8.750370025634766, "rewards/reward_model/std": 0.49925950169563293, "step": 134 }, { "completion_length": 1982.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2127.0, "completions/max_terminated_length": 2127.0, "completions/mean_length": 1982.75, "completions/mean_terminated_length": 1982.75, "completions/min_length": 1816.0, "completions/min_terminated_length": 1816.0, "epoch": 0.226890756302521, "frac_reward_zero_std": 0.0, "grad_norm": 0.3510342240333557, "kl": 0.34647998213768005, "learning_rate": 4.811111111111111e-06, "loss": 0.0003, "num_tokens": 2624427.0, "reward": 8.875, "reward_std": 1.9311050176620483, "rewards/reward_model/mean": 8.875, "rewards/reward_model/std": 1.9311050176620483, "step": 135 }, { "completion_length": 1114.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2058.0, "completions/max_terminated_length": 2058.0, "completions/mean_length": 1114.25, "completions/mean_terminated_length": 1114.25, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.22857142857142856, "frac_reward_zero_std": 0.0, "grad_norm": 0.5375041961669922, "kl": 0.9781880974769592, "learning_rate": 4.805555555555556e-06, "loss": 0.001, "num_tokens": 2638572.0, "reward": 6.138072490692139, "reward_std": 1.0945528745651245, "rewards/reward_model/mean": 6.138072490692139, "rewards/reward_model/std": 1.094552993774414, "step": 136 }, { "completion_length": 2201.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2541.0, "completions/max_terminated_length": 2541.0, "completions/mean_length": 2201.5, "completions/mean_terminated_length": 2201.5, "completions/min_length": 1675.0, "completions/min_terminated_length": 1675.0, "epoch": 0.23025210084033612, "frac_reward_zero_std": 0.0, "grad_norm": 0.3466634750366211, "kl": 0.33992475271224976, "learning_rate": 4.800000000000001e-06, "loss": 0.0003, "num_tokens": 2656674.0, "reward": 5.375, "reward_std": 2.2867372035980225, "rewards/reward_model/mean": 5.375, "rewards/reward_model/std": 2.2867372035980225, "step": 137 }, { "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2319327731092437, "frac_reward_zero_std": 1.0, "grad_norm": 0.02911699190735817, "kl": 1.452804446220398, "learning_rate": 4.794444444444445e-06, "loss": 0.0015, "num_tokens": 2664726.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 138 }, { "completion_length": 769.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1314.0, "completions/max_terminated_length": 1314.0, "completions/mean_length": 769.75, "completions/mean_terminated_length": 769.75, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.23361344537815126, "frac_reward_zero_std": 0.0, "grad_norm": 0.9029837846755981, "kl": 1.0116665363311768, "learning_rate": 4.7888888888888894e-06, "loss": 0.001, "num_tokens": 2677333.0, "reward": 7.218898773193359, "reward_std": 1.1574432849884033, "rewards/reward_model/mean": 7.218898773193359, "rewards/reward_model/std": 1.1574432849884033, "step": 139 }, { "completion_length": 3052.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4165.0, "completions/max_terminated_length": 4165.0, "completions/mean_length": 3052.5, "completions/mean_terminated_length": 3052.5, "completions/min_length": 2442.0, "completions/min_terminated_length": 2442.0, "epoch": 0.23529411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 0.25393086671829224, "kl": 0.4295288622379303, "learning_rate": 4.783333333333334e-06, "loss": 0.0004, "num_tokens": 2700003.0, "reward": 8.008160591125488, "reward_std": 0.7072951197624207, "rewards/reward_model/mean": 8.008160591125488, "rewards/reward_model/std": 0.7072951793670654, "step": 140 }, { "completion_length": 2434.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3146.0, "completions/max_terminated_length": 3146.0, "completions/mean_length": 2434.5, "completions/mean_terminated_length": 2434.5, "completions/min_length": 1822.0, "completions/min_terminated_length": 1822.0, "epoch": 0.23697478991596638, "frac_reward_zero_std": 0.0, "grad_norm": 0.3451744318008423, "kl": 0.36107468605041504, "learning_rate": 4.777777777777778e-06, "loss": 0.0004, "num_tokens": 2719029.0, "reward": 6.0, "reward_std": 1.7320507764816284, "rewards/reward_model/mean": 6.0, "rewards/reward_model/std": 1.7320507764816284, "step": 141 }, { "completion_length": 2533.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3518.0, "completions/max_terminated_length": 3518.0, "completions/mean_length": 2533.5, "completions/mean_terminated_length": 2533.5, "completions/min_length": 1936.0, "completions/min_terminated_length": 1936.0, "epoch": 0.23865546218487396, "frac_reward_zero_std": 0.0, "grad_norm": 0.22997406125068665, "kl": 0.27732178568840027, "learning_rate": 4.772222222222223e-06, "loss": 0.0003, "num_tokens": 2738123.0, "reward": 6.75, "reward_std": 2.362907886505127, "rewards/reward_model/mean": 6.75, "rewards/reward_model/std": 2.362907886505127, "step": 142 }, { "completion_length": 486.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1192.0, "completions/max_terminated_length": 1192.0, "completions/mean_length": 486.0, "completions/mean_terminated_length": 486.0, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.24033613445378152, "frac_reward_zero_std": 0.0, "grad_norm": 0.7206143736839294, "kl": 1.2204090356826782, "learning_rate": 4.766666666666667e-06, "loss": 0.0012, "num_tokens": 2749363.0, "reward": 7.29646110534668, "reward_std": 0.6988224983215332, "rewards/reward_model/mean": 7.29646110534668, "rewards/reward_model/std": 0.698822557926178, "step": 143 }, { "completion_length": 3001.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3150.0, "completions/max_terminated_length": 3150.0, "completions/mean_length": 3001.5, "completions/mean_terminated_length": 3001.5, "completions/min_length": 2846.0, "completions/min_terminated_length": 2846.0, "epoch": 0.24201680672268908, "frac_reward_zero_std": 0.0, "grad_norm": 0.23584291338920593, "kl": 0.4656028151512146, "learning_rate": 4.7611111111111115e-06, "loss": 0.0005, "num_tokens": 2771829.0, "reward": 7.873873710632324, "reward_std": 1.058119297027588, "rewards/reward_model/mean": 7.873873710632324, "rewards/reward_model/std": 1.0581194162368774, "step": 144 }, { "completion_length": 2899.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3358.0, "completions/max_terminated_length": 3358.0, "completions/mean_length": 2899.5, "completions/mean_terminated_length": 2899.5, "completions/min_length": 2521.0, "completions/min_terminated_length": 2521.0, "epoch": 0.24369747899159663, "frac_reward_zero_std": 0.0, "grad_norm": 0.222762793302536, "kl": 0.45958980917930603, "learning_rate": 4.755555555555556e-06, "loss": 0.0005, "num_tokens": 2792747.0, "reward": 8.856507301330566, "reward_std": 0.8436880707740784, "rewards/reward_model/mean": 8.856507301330566, "rewards/reward_model/std": 0.8436882495880127, "step": 145 }, { "completion_length": 2922.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3402.0, "completions/max_terminated_length": 3402.0, "completions/mean_length": 2922.5, "completions/mean_terminated_length": 2922.5, "completions/min_length": 2679.0, "completions/min_terminated_length": 2679.0, "epoch": 0.2453781512605042, "frac_reward_zero_std": 0.0, "grad_norm": 0.21581555902957916, "kl": 0.44578394293785095, "learning_rate": 4.75e-06, "loss": 0.0004, "num_tokens": 2813509.0, "reward": 8.549777030944824, "reward_std": 0.6061685681343079, "rewards/reward_model/mean": 8.549777030944824, "rewards/reward_model/std": 0.6061685085296631, "step": 146 }, { "completion_length": 1648.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1727.0, "completions/max_terminated_length": 1727.0, "completions/mean_length": 1648.25, "completions/mean_terminated_length": 1648.25, "completions/min_length": 1609.0, "completions/min_terminated_length": 1609.0, "epoch": 0.24705882352941178, "frac_reward_zero_std": 0.0, "grad_norm": 0.3748073875904083, "kl": 0.39842212200164795, "learning_rate": 4.744444444444445e-06, "loss": 0.0004, "num_tokens": 2829714.0, "reward": 8.25, "reward_std": 2.0615527629852295, "rewards/reward_model/mean": 8.25, "rewards/reward_model/std": 2.0615527629852295, "step": 147 }, { "completion_length": 2712.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2883.0, "completions/max_terminated_length": 2883.0, "completions/mean_length": 2712.25, "completions/mean_terminated_length": 2712.25, "completions/min_length": 2541.0, "completions/min_terminated_length": 2541.0, "epoch": 0.24873949579831933, "frac_reward_zero_std": 0.0, "grad_norm": 0.23967885971069336, "kl": 0.4646434783935547, "learning_rate": 4.73888888888889e-06, "loss": 0.0005, "num_tokens": 2849275.0, "reward": 8.54823112487793, "reward_std": 0.7377526760101318, "rewards/reward_model/mean": 8.54823112487793, "rewards/reward_model/std": 0.7377527952194214, "step": 148 }, { "completion_length": 2780.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3075.0, "completions/max_terminated_length": 3075.0, "completions/mean_length": 2780.0, "completions/mean_terminated_length": 2780.0, "completions/min_length": 2484.0, "completions/min_terminated_length": 2484.0, "epoch": 0.2504201680672269, "frac_reward_zero_std": 0.0, "grad_norm": 0.2209819257259369, "kl": 0.4562610387802124, "learning_rate": 4.7333333333333335e-06, "loss": 0.0005, "num_tokens": 2869639.0, "reward": 7.664841175079346, "reward_std": 1.110151767730713, "rewards/reward_model/mean": 7.664841175079346, "rewards/reward_model/std": 1.1101515293121338, "step": 149 }, { "completion_length": 1810.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2114.0, "completions/max_terminated_length": 2114.0, "completions/mean_length": 1810.0, "completions/mean_terminated_length": 1810.0, "completions/min_length": 1502.0, "completions/min_terminated_length": 1502.0, "epoch": 0.25210084033613445, "frac_reward_zero_std": 0.0, "grad_norm": 0.3472149968147278, "kl": 0.4088854193687439, "learning_rate": 4.727777777777779e-06, "loss": 0.0004, "num_tokens": 2886107.0, "reward": 6.125, "reward_std": 4.269562721252441, "rewards/reward_model/mean": 6.125, "rewards/reward_model/std": 4.2695631980896, "step": 150 }, { "completion_length": 2084.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2866.0, "completions/max_terminated_length": 2866.0, "completions/mean_length": 2084.75, "completions/mean_terminated_length": 2084.75, "completions/min_length": 1248.0, "completions/min_terminated_length": 1248.0, "epoch": 0.253781512605042, "frac_reward_zero_std": 0.0, "grad_norm": 0.462090402841568, "kl": 0.37683919072151184, "learning_rate": 4.722222222222222e-06, "loss": 0.0004, "num_tokens": 2904326.0, "reward": 5.75, "reward_std": 4.193248748779297, "rewards/reward_model/mean": 5.75, "rewards/reward_model/std": 4.193248748779297, "step": 151 }, { "completion_length": 3218.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3679.0, "completions/max_terminated_length": 3679.0, "completions/mean_length": 3218.25, "completions/mean_terminated_length": 3218.25, "completions/min_length": 2463.0, "completions/min_terminated_length": 2463.0, "epoch": 0.25546218487394956, "frac_reward_zero_std": 0.0, "grad_norm": 0.22532090544700623, "kl": 0.4902237057685852, "learning_rate": 4.7166666666666675e-06, "loss": 0.0005, "num_tokens": 2926371.0, "reward": 7.50351619720459, "reward_std": 1.5778533220291138, "rewards/reward_model/mean": 7.50351619720459, "rewards/reward_model/std": 1.5778533220291138, "step": 152 }, { "completion_length": 2763.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2868.0, "completions/max_terminated_length": 2868.0, "completions/mean_length": 2763.5, "completions/mean_terminated_length": 2763.5, "completions/min_length": 2580.0, "completions/min_terminated_length": 2580.0, "epoch": 0.2571428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.22987692058086395, "kl": 0.5656647682189941, "learning_rate": 4.711111111111111e-06, "loss": 0.0006, "num_tokens": 2946173.0, "reward": 8.71992301940918, "reward_std": 0.3809349834918976, "rewards/reward_model/mean": 8.71992301940918, "rewards/reward_model/std": 0.38093510270118713, "step": 153 }, { "completion_length": 527.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1341.0, "completions/max_terminated_length": 1341.0, "completions/mean_length": 527.25, "completions/mean_terminated_length": 527.25, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.25882352941176473, "frac_reward_zero_std": 0.0, "grad_norm": 0.5611518621444702, "kl": 1.0655821561813354, "learning_rate": 4.705555555555556e-06, "loss": 0.0011, "num_tokens": 2958790.0, "reward": 7.388204574584961, "reward_std": 0.7416150569915771, "rewards/reward_model/mean": 7.388204574584961, "rewards/reward_model/std": 0.7416151762008667, "step": 154 }, { "completion_length": 2629.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2822.0, "completions/max_terminated_length": 2822.0, "completions/mean_length": 2629.25, "completions/mean_terminated_length": 2629.25, "completions/min_length": 2321.0, "completions/min_terminated_length": 2321.0, "epoch": 0.2605042016806723, "frac_reward_zero_std": 0.0, "grad_norm": 0.21701310575008392, "kl": 0.4524945318698883, "learning_rate": 4.7e-06, "loss": 0.0005, "num_tokens": 2979667.0, "reward": 9.411394119262695, "reward_std": 0.45068293809890747, "rewards/reward_model/mean": 9.411394119262695, "rewards/reward_model/std": 0.450682669878006, "step": 155 }, { "completion_length": 806.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1536.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 806.5, "completions/mean_terminated_length": 806.5, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.26218487394957984, "frac_reward_zero_std": 0.0, "grad_norm": 1.1670176982879639, "kl": 1.115036129951477, "learning_rate": 4.694444444444445e-06, "loss": 0.0011, "num_tokens": 2992293.0, "reward": 7.309442043304443, "reward_std": 1.4336081743240356, "rewards/reward_model/mean": 7.309442043304443, "rewards/reward_model/std": 1.4336081743240356, "step": 156 }, { "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2638655462184874, "frac_reward_zero_std": 1.0, "grad_norm": 3.568586544133723e-05, "kl": 3.766537666320801, "learning_rate": 4.6888888888888895e-06, "loss": 0.0038, "num_tokens": 3001013.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 157 }, { "completion_length": 2871.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3163.0, "completions/max_terminated_length": 3163.0, "completions/mean_length": 2871.5, "completions/mean_terminated_length": 2871.5, "completions/min_length": 2385.0, "completions/min_terminated_length": 2385.0, "epoch": 0.26554621848739496, "frac_reward_zero_std": 0.0, "grad_norm": 0.20554177463054657, "kl": 0.474880188703537, "learning_rate": 4.683333333333334e-06, "loss": 0.0005, "num_tokens": 3022867.0, "reward": 7.569443702697754, "reward_std": 2.278564214706421, "rewards/reward_model/mean": 7.569443702697754, "rewards/reward_model/std": 2.278564214706421, "step": 158 }, { "completion_length": 2923.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3323.0, "completions/max_terminated_length": 3323.0, "completions/mean_length": 2923.0, "completions/mean_terminated_length": 2923.0, "completions/min_length": 2266.0, "completions/min_terminated_length": 2266.0, "epoch": 0.2672268907563025, "frac_reward_zero_std": 0.0, "grad_norm": 0.220077246427536, "kl": 0.43240219354629517, "learning_rate": 4.677777777777778e-06, "loss": 0.0004, "num_tokens": 3045187.0, "reward": 8.473838806152344, "reward_std": 1.3453700542449951, "rewards/reward_model/mean": 8.473838806152344, "rewards/reward_model/std": 1.3453701734542847, "step": 159 }, { "completion_length": 2675.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2963.0, "completions/max_terminated_length": 2963.0, "completions/mean_length": 2675.25, "completions/mean_terminated_length": 2675.25, "completions/min_length": 2503.0, "completions/min_terminated_length": 2503.0, "epoch": 0.2689075630252101, "frac_reward_zero_std": 0.0, "grad_norm": 0.24588997662067413, "kl": 0.5831613540649414, "learning_rate": 4.672222222222223e-06, "loss": 0.0006, "num_tokens": 3064848.0, "reward": 9.232027053833008, "reward_std": 0.19261541962623596, "rewards/reward_model/mean": 9.232027053833008, "rewards/reward_model/std": 0.1926155984401703, "step": 160 }, { "completion_length": 2873.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2931.0, "completions/max_terminated_length": 2931.0, "completions/mean_length": 2873.5, "completions/mean_terminated_length": 2873.5, "completions/min_length": 2782.0, "completions/min_terminated_length": 2782.0, "epoch": 0.27058823529411763, "frac_reward_zero_std": 0.0, "grad_norm": 0.22928255796432495, "kl": 0.7358444333076477, "learning_rate": 4.666666666666667e-06, "loss": 0.0007, "num_tokens": 3085830.0, "reward": 8.405526161193848, "reward_std": 1.475093960762024, "rewards/reward_model/mean": 8.405526161193848, "rewards/reward_model/std": 1.4750943183898926, "step": 161 }, { "completion_length": 3030.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3525.0, "completions/max_terminated_length": 3525.0, "completions/mean_length": 3030.0, "completions/mean_terminated_length": 3030.0, "completions/min_length": 2763.0, "completions/min_terminated_length": 2763.0, "epoch": 0.2722689075630252, "frac_reward_zero_std": 0.0, "grad_norm": 0.22029252350330353, "kl": 0.47707507014274597, "learning_rate": 4.6611111111111116e-06, "loss": 0.0005, "num_tokens": 3108074.0, "reward": 8.712586402893066, "reward_std": 0.9264523983001709, "rewards/reward_model/mean": 8.712586402893066, "rewards/reward_model/std": 0.9264521598815918, "step": 162 }, { "completion_length": 2786.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3248.0, "completions/max_terminated_length": 3248.0, "completions/mean_length": 2786.0, "completions/mean_terminated_length": 2786.0, "completions/min_length": 2233.0, "completions/min_terminated_length": 2233.0, "epoch": 0.2739495798319328, "frac_reward_zero_std": 0.0, "grad_norm": 0.29195111989974976, "kl": 0.4199967384338379, "learning_rate": 4.655555555555556e-06, "loss": 0.0004, "num_tokens": 3129658.0, "reward": 7.955666542053223, "reward_std": 2.5929222106933594, "rewards/reward_model/mean": 7.955666542053223, "rewards/reward_model/std": 2.5929222106933594, "step": 163 }, { "completion_length": 2653.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2879.0, "completions/max_terminated_length": 2879.0, "completions/mean_length": 2653.0, "completions/mean_terminated_length": 2653.0, "completions/min_length": 2464.0, "completions/min_terminated_length": 2464.0, "epoch": 0.27563025210084036, "frac_reward_zero_std": 0.0, "grad_norm": 0.2398880124092102, "kl": 0.6989083886146545, "learning_rate": 4.65e-06, "loss": 0.0007, "num_tokens": 3149762.0, "reward": 8.421257019042969, "reward_std": 2.498462677001953, "rewards/reward_model/mean": 8.421257019042969, "rewards/reward_model/std": 2.498462677001953, "step": 164 }, { "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2773109243697479, "frac_reward_zero_std": 1.0, "grad_norm": 0.11003076285123825, "kl": 3.142202377319336, "learning_rate": 4.644444444444445e-06, "loss": 0.0031, "num_tokens": 3158262.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 165 }, { "completion_length": 3022.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3399.0, "completions/max_terminated_length": 3399.0, "completions/mean_length": 3022.75, "completions/mean_terminated_length": 3022.75, "completions/min_length": 2646.0, "completions/min_terminated_length": 2646.0, "epoch": 0.27899159663865547, "frac_reward_zero_std": 0.0, "grad_norm": 0.22422586381435394, "kl": 0.44220367074012756, "learning_rate": 4.638888888888889e-06, "loss": 0.0004, "num_tokens": 3181801.0, "reward": 7.756802558898926, "reward_std": 1.6295182704925537, "rewards/reward_model/mean": 7.756802558898926, "rewards/reward_model/std": 1.6295182704925537, "step": 166 }, { "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.280672268907563, "frac_reward_zero_std": 1.0, "grad_norm": 0.094392791390419, "kl": 3.158621072769165, "learning_rate": 4.633333333333334e-06, "loss": 0.0032, "num_tokens": 3190213.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 167 }, { "completion_length": 2663.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3069.0, "completions/max_terminated_length": 3069.0, "completions/mean_length": 2663.5, "completions/mean_terminated_length": 2663.5, "completions/min_length": 2385.0, "completions/min_terminated_length": 2385.0, "epoch": 0.2823529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 0.2424425184726715, "kl": 0.5139824151992798, "learning_rate": 4.627777777777778e-06, "loss": 0.0005, "num_tokens": 3210791.0, "reward": 8.023183822631836, "reward_std": 0.60601407289505, "rewards/reward_model/mean": 8.023183822631836, "rewards/reward_model/std": 0.6060142517089844, "step": 168 }, { "completion_length": 2033.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2274.0, "completions/max_terminated_length": 2274.0, "completions/mean_length": 2033.25, "completions/mean_terminated_length": 2033.25, "completions/min_length": 1597.0, "completions/min_terminated_length": 1597.0, "epoch": 0.28403361344537814, "frac_reward_zero_std": 0.0, "grad_norm": 0.33141204714775085, "kl": 0.3702349364757538, "learning_rate": 4.622222222222222e-06, "loss": 0.0004, "num_tokens": 3228656.0, "reward": 6.625, "reward_std": 2.462214469909668, "rewards/reward_model/mean": 6.625, "rewards/reward_model/std": 2.462214469909668, "step": 169 }, { "completion_length": 2960.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3061.0, "completions/max_terminated_length": 3061.0, "completions/mean_length": 2960.75, "completions/mean_terminated_length": 2960.75, "completions/min_length": 2787.0, "completions/min_terminated_length": 2787.0, "epoch": 0.2857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.24120405316352844, "kl": 0.4731678366661072, "learning_rate": 4.616666666666667e-06, "loss": 0.0005, "num_tokens": 3250495.0, "reward": 8.096492767333984, "reward_std": 2.1864511966705322, "rewards/reward_model/mean": 8.096492767333984, "rewards/reward_model/std": 2.186450958251953, "step": 170 }, { "completion_length": 1669.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1895.0, "completions/max_terminated_length": 1895.0, "completions/mean_length": 1669.5, "completions/mean_terminated_length": 1669.5, "completions/min_length": 1364.0, "completions/min_terminated_length": 1364.0, "epoch": 0.28739495798319326, "frac_reward_zero_std": 0.0, "grad_norm": 0.3854559063911438, "kl": 0.4447607100009918, "learning_rate": 4.611111111111112e-06, "loss": 0.0004, "num_tokens": 3267873.0, "reward": 9.125, "reward_std": 1.4361406564712524, "rewards/reward_model/mean": 9.125, "rewards/reward_model/std": 1.4361406564712524, "step": 171 }, { "completion_length": 1177.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1740.0, "completions/max_terminated_length": 1740.0, "completions/mean_length": 1177.0, "completions/mean_terminated_length": 1177.0, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.28907563025210087, "frac_reward_zero_std": 0.0, "grad_norm": 0.5531060695648193, "kl": 0.856512725353241, "learning_rate": 4.605555555555556e-06, "loss": 0.0009, "num_tokens": 3282981.0, "reward": 5.437447547912598, "reward_std": 2.498600721359253, "rewards/reward_model/mean": 5.437447547912598, "rewards/reward_model/std": 2.498600721359253, "step": 172 }, { "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.2907563025210084, "frac_reward_zero_std": 1.0, "grad_norm": 0.0045571294613182545, "kl": 2.7890782356262207, "learning_rate": 4.600000000000001e-06, "loss": 0.0028, "num_tokens": 3290631.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 173 }, { "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.292436974789916, "frac_reward_zero_std": 1.0, "grad_norm": 0.07329612970352173, "kl": 0.8356922268867493, "learning_rate": 4.594444444444444e-06, "loss": 0.0008, "num_tokens": 3298979.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 174 }, { "completion_length": 1129.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2464.0, "completions/max_terminated_length": 2464.0, "completions/mean_length": 1129.25, "completions/mean_terminated_length": 1129.25, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.29411764705882354, "frac_reward_zero_std": 0.0, "grad_norm": 0.6764624118804932, "kl": 0.7676644921302795, "learning_rate": 4.58888888888889e-06, "loss": 0.0008, "num_tokens": 3313872.0, "reward": 6.710842132568359, "reward_std": 1.0217556953430176, "rewards/reward_model/mean": 6.710842132568359, "rewards/reward_model/std": 1.0217556953430176, "step": 175 }, { "completion_length": 1741.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2025.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1741.0, "completions/mean_terminated_length": 1741.0, "completions/min_length": 1512.0, "completions/min_terminated_length": 1512.0, "epoch": 0.2957983193277311, "frac_reward_zero_std": 0.0, "grad_norm": 0.3833245038986206, "kl": 0.408497154712677, "learning_rate": 4.583333333333333e-06, "loss": 0.0004, "num_tokens": 3329552.0, "reward": 4.75, "reward_std": 2.901149272918701, "rewards/reward_model/mean": 4.75, "rewards/reward_model/std": 2.901149272918701, "step": 176 }, { "completion_length": 822.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1241.0, "completions/max_terminated_length": 1241.0, "completions/mean_length": 822.5, "completions/mean_terminated_length": 822.5, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.29747899159663865, "frac_reward_zero_std": 0.0, "grad_norm": 0.7895044088363647, "kl": 0.9263789057731628, "learning_rate": 4.5777777777777785e-06, "loss": 0.0009, "num_tokens": 3342470.0, "reward": 6.625405311584473, "reward_std": 0.6329342126846313, "rewards/reward_model/mean": 6.625405311584473, "rewards/reward_model/std": 0.6329342126846313, "step": 177 }, { "completion_length": 2721.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2925.0, "completions/max_terminated_length": 2925.0, "completions/mean_length": 2721.25, "completions/mean_terminated_length": 2721.25, "completions/min_length": 2488.0, "completions/min_terminated_length": 2488.0, "epoch": 0.2991596638655462, "frac_reward_zero_std": 0.0, "grad_norm": 0.23519279062747955, "kl": 0.47075846791267395, "learning_rate": 4.572222222222222e-06, "loss": 0.0005, "num_tokens": 3362955.0, "reward": 8.75, "reward_std": 1.3994046449661255, "rewards/reward_model/mean": 8.75, "rewards/reward_model/std": 1.3994046449661255, "step": 178 }, { "completion_length": 2845.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3041.0, "completions/max_terminated_length": 3041.0, "completions/mean_length": 2845.5, "completions/mean_terminated_length": 2845.5, "completions/min_length": 2671.0, "completions/min_terminated_length": 2671.0, "epoch": 0.30084033613445377, "frac_reward_zero_std": 0.0, "grad_norm": 0.22595342993736267, "kl": 0.4554749131202698, "learning_rate": 4.566666666666667e-06, "loss": 0.0005, "num_tokens": 3383937.0, "reward": 8.375, "reward_std": 1.5612494945526123, "rewards/reward_model/mean": 8.375, "rewards/reward_model/std": 1.5612494945526123, "step": 179 }, { "completion_length": 2462.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2915.0, "completions/max_terminated_length": 2915.0, "completions/mean_length": 2462.25, "completions/mean_terminated_length": 2462.25, "completions/min_length": 2137.0, "completions/min_terminated_length": 2137.0, "epoch": 0.3025210084033613, "frac_reward_zero_std": 0.0, "grad_norm": 0.3311174511909485, "kl": 0.3128574788570404, "learning_rate": 4.561111111111112e-06, "loss": 0.0003, "num_tokens": 3403302.0, "reward": 8.5, "reward_std": 2.3804759979248047, "rewards/reward_model/mean": 8.5, "rewards/reward_model/std": 2.380476236343384, "step": 180 }, { "completion_length": 3018.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3157.0, "completions/max_terminated_length": 3157.0, "completions/mean_length": 3018.25, "completions/mean_terminated_length": 3018.25, "completions/min_length": 2856.0, "completions/min_terminated_length": 2856.0, "epoch": 0.3042016806722689, "frac_reward_zero_std": 0.0, "grad_norm": 0.23110154271125793, "kl": 0.42669227719306946, "learning_rate": 4.555555555555556e-06, "loss": 0.0004, "num_tokens": 3424899.0, "reward": 7.908246040344238, "reward_std": 1.5839347839355469, "rewards/reward_model/mean": 7.908246040344238, "rewards/reward_model/std": 1.5839349031448364, "step": 181 }, { "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.3058823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 5.533947387448279e-06, "kl": 0.6830087304115295, "learning_rate": 4.5500000000000005e-06, "loss": 0.0007, "num_tokens": 3433059.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 182 }, { "completion_length": 2384.75, "completions/clipped_ratio": 0.25, "completions/max_length": 7232.0, "completions/max_terminated_length": 1347.0, "completions/mean_length": 2384.75, "completions/mean_terminated_length": 769.0, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.30756302521008405, "frac_reward_zero_std": 0.0, "grad_norm": 0.3204618990421295, "kl": 0.7881150841712952, "learning_rate": 4.544444444444445e-06, "loss": 0.0008, "num_tokens": 3453478.0, "reward": 4.037667274475098, "reward_std": 3.387129306793213, "rewards/reward_model/mean": 4.037667274475098, "rewards/reward_model/std": 3.387129068374634, "step": 183 }, { "completion_length": 2932.75, "completions/clipped_ratio": 0.25, "completions/max_length": 7232.0, "completions/max_terminated_length": 1755.0, "completions/mean_length": 2932.75, "completions/mean_terminated_length": 1499.666748046875, "completions/min_length": 1372.0, "completions/min_terminated_length": 1372.0, "epoch": 0.3092436974789916, "frac_reward_zero_std": 0.0, "grad_norm": 0.22691690921783447, "kl": 0.32286691665649414, "learning_rate": 4.538888888888889e-06, "loss": 0.0003, "num_tokens": 3474185.0, "reward": 6.0, "reward_std": 3.535533905029297, "rewards/reward_model/mean": 6.0, "rewards/reward_model/std": 3.535533905029297, "step": 184 }, { "completion_length": 1219.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1704.0, "completions/max_terminated_length": 1704.0, "completions/mean_length": 1219.5, "completions/mean_terminated_length": 1219.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.31092436974789917, "frac_reward_zero_std": 0.0, "grad_norm": 12.985053062438965, "kl": 1.6388633251190186, "learning_rate": 4.533333333333334e-06, "loss": 0.0016, "num_tokens": 3489371.0, "reward": 5.125, "reward_std": 3.6142079830169678, "rewards/reward_model/mean": 5.125, "rewards/reward_model/std": 3.6142079830169678, "step": 185 }, { "completion_length": 2768.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2975.0, "completions/max_terminated_length": 2975.0, "completions/mean_length": 2768.75, "completions/mean_terminated_length": 2768.75, "completions/min_length": 2353.0, "completions/min_terminated_length": 2353.0, "epoch": 0.3126050420168067, "frac_reward_zero_std": 0.0, "grad_norm": 0.22867242991924286, "kl": 0.47679272294044495, "learning_rate": 4.527777777777778e-06, "loss": 0.0005, "num_tokens": 3509922.0, "reward": 8.5, "reward_std": 1.0801234245300293, "rewards/reward_model/mean": 8.5, "rewards/reward_model/std": 1.0801235437393188, "step": 186 }, { "completion_length": 515.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1392.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 515.5, "completions/mean_terminated_length": 515.5, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.3142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.772415816783905, "kl": 1.1174834966659546, "learning_rate": 4.5222222222222225e-06, "loss": 0.0011, "num_tokens": 3521276.0, "reward": 6.782945156097412, "reward_std": 1.331932783126831, "rewards/reward_model/mean": 6.782945156097412, "rewards/reward_model/std": 1.3319326639175415, "step": 187 }, { "completion_length": 3300.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4073.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 3300.5, "completions/mean_terminated_length": 3300.5, "completions/min_length": 2707.0, "completions/min_terminated_length": 2707.0, "epoch": 0.31596638655462184, "frac_reward_zero_std": 0.0, "grad_norm": 0.2294078916311264, "kl": 0.47780218720436096, "learning_rate": 4.516666666666667e-06, "loss": 0.0005, "num_tokens": 3544978.0, "reward": 8.62745475769043, "reward_std": 0.3291955590248108, "rewards/reward_model/mean": 8.62745475769043, "rewards/reward_model/std": 0.3291955590248108, "step": 188 }, { "completion_length": 1702.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1841.0, "completions/max_terminated_length": 1841.0, "completions/mean_length": 1702.5, "completions/mean_terminated_length": 1702.5, "completions/min_length": 1544.0, "completions/min_terminated_length": 1544.0, "epoch": 0.3176470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 0.3584645092487335, "kl": 0.38060808181762695, "learning_rate": 4.511111111111111e-06, "loss": 0.0004, "num_tokens": 3561500.0, "reward": 9.125, "reward_std": 1.75, "rewards/reward_model/mean": 9.125, "rewards/reward_model/std": 1.75, "step": 189 }, { "completion_length": 1480.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1595.0, "completions/max_terminated_length": 1595.0, "completions/mean_length": 1480.5, "completions/mean_terminated_length": 1480.5, "completions/min_length": 1308.0, "completions/min_terminated_length": 1308.0, "epoch": 0.31932773109243695, "frac_reward_zero_std": 0.0, "grad_norm": 0.4364623427391052, "kl": 0.4817902445793152, "learning_rate": 4.505555555555556e-06, "loss": 0.0005, "num_tokens": 3576722.0, "reward": 7.875, "reward_std": 2.3935678005218506, "rewards/reward_model/mean": 7.875, "rewards/reward_model/std": 2.3935678005218506, "step": 190 }, { "completion_length": 2473.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2725.0, "completions/max_terminated_length": 2725.0, "completions/mean_length": 2473.25, "completions/mean_terminated_length": 2473.25, "completions/min_length": 1974.0, "completions/min_terminated_length": 1974.0, "epoch": 0.32100840336134456, "frac_reward_zero_std": 0.0, "grad_norm": 0.25333359837532043, "kl": 0.5164632201194763, "learning_rate": 4.5e-06, "loss": 0.0005, "num_tokens": 3597439.0, "reward": 8.10063362121582, "reward_std": 0.573161780834198, "rewards/reward_model/mean": 8.10063362121582, "rewards/reward_model/std": 0.573161780834198, "step": 191 }, { "completion_length": 2716.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2976.0, "completions/max_terminated_length": 2976.0, "completions/mean_length": 2716.5, "completions/mean_terminated_length": 2716.5, "completions/min_length": 2586.0, "completions/min_terminated_length": 2586.0, "epoch": 0.3226890756302521, "frac_reward_zero_std": 0.0, "grad_norm": 0.22468847036361694, "kl": 0.7855070233345032, "learning_rate": 4.4944444444444445e-06, "loss": 0.0008, "num_tokens": 3618181.0, "reward": 8.293278694152832, "reward_std": 1.2478078603744507, "rewards/reward_model/mean": 8.293278694152832, "rewards/reward_model/std": 1.2478079795837402, "step": 192 }, { "completion_length": 1787.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1934.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 1787.25, "completions/mean_terminated_length": 1787.25, "completions/min_length": 1577.0, "completions/min_terminated_length": 1577.0, "epoch": 0.3243697478991597, "frac_reward_zero_std": 0.0, "grad_norm": 0.44078683853149414, "kl": 0.3985818028450012, "learning_rate": 4.488888888888889e-06, "loss": 0.0004, "num_tokens": 3635362.0, "reward": 8.875, "reward_std": 1.6520190238952637, "rewards/reward_model/mean": 8.875, "rewards/reward_model/std": 1.6520190238952637, "step": 193 }, { "completion_length": 2601.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2945.0, "completions/max_terminated_length": 2945.0, "completions/mean_length": 2601.0, "completions/mean_terminated_length": 2601.0, "completions/min_length": 2142.0, "completions/min_terminated_length": 2142.0, "epoch": 0.32605042016806723, "frac_reward_zero_std": 0.0, "grad_norm": 0.27554836869239807, "kl": 0.5283735394477844, "learning_rate": 4.483333333333333e-06, "loss": 0.0005, "num_tokens": 3655434.0, "reward": 8.392618179321289, "reward_std": 0.2633104622364044, "rewards/reward_model/mean": 8.392618179321289, "rewards/reward_model/std": 0.2633104622364044, "step": 194 }, { "completion_length": 1337.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1536.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 1337.25, "completions/mean_terminated_length": 1337.25, "completions/min_length": 1161.0, "completions/min_terminated_length": 1161.0, "epoch": 0.3277310924369748, "frac_reward_zero_std": 0.0, "grad_norm": 0.5729833841323853, "kl": 0.7496541142463684, "learning_rate": 4.477777777777778e-06, "loss": 0.0007, "num_tokens": 3670847.0, "reward": 5.385059833526611, "reward_std": 2.0218420028686523, "rewards/reward_model/mean": 5.385059833526611, "rewards/reward_model/std": 2.0218420028686523, "step": 195 }, { "completion_length": 1217.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1555.0, "completions/max_terminated_length": 1555.0, "completions/mean_length": 1217.5, "completions/mean_terminated_length": 1217.5, "completions/min_length": 922.0, "completions/min_terminated_length": 922.0, "epoch": 0.32941176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 0.54954594373703, "kl": 0.9338546395301819, "learning_rate": 4.472222222222223e-06, "loss": 0.0009, "num_tokens": 3685053.0, "reward": 6.616114139556885, "reward_std": 0.3764911890029907, "rewards/reward_model/mean": 6.616114139556885, "rewards/reward_model/std": 0.37649112939834595, "step": 196 }, { "completion_length": 1339.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1524.0, "completions/max_terminated_length": 1524.0, "completions/mean_length": 1339.75, "completions/mean_terminated_length": 1339.75, "completions/min_length": 1274.0, "completions/min_terminated_length": 1274.0, "epoch": 0.3310924369747899, "frac_reward_zero_std": 0.0, "grad_norm": 0.49502500891685486, "kl": 0.6899818181991577, "learning_rate": 4.4666666666666665e-06, "loss": 0.0007, "num_tokens": 3700456.0, "reward": 6.938961029052734, "reward_std": 0.47081485390663147, "rewards/reward_model/mean": 6.938961029052734, "rewards/reward_model/std": 0.470814973115921, "step": 197 }, { "completion_length": 2196.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2598.0, "completions/max_terminated_length": 2598.0, "completions/mean_length": 2196.25, "completions/mean_terminated_length": 2196.25, "completions/min_length": 1713.0, "completions/min_terminated_length": 1713.0, "epoch": 0.33277310924369746, "frac_reward_zero_std": 0.0, "grad_norm": 0.33635690808296204, "kl": 0.3528163731098175, "learning_rate": 4.461111111111112e-06, "loss": 0.0004, "num_tokens": 3717853.0, "reward": 8.75, "reward_std": 1.5545631647109985, "rewards/reward_model/mean": 8.75, "rewards/reward_model/std": 1.5545631647109985, "step": 198 }, { "completion_length": 1193.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1452.0, "completions/max_terminated_length": 1452.0, "completions/mean_length": 1193.75, "completions/mean_terminated_length": 1193.75, "completions/min_length": 968.0, "completions/min_terminated_length": 968.0, "epoch": 0.334453781512605, "frac_reward_zero_std": 0.0, "grad_norm": 0.6599255800247192, "kl": 0.8738446235656738, "learning_rate": 4.455555555555555e-06, "loss": 0.0009, "num_tokens": 3731124.0, "reward": 6.023212909698486, "reward_std": 1.539479374885559, "rewards/reward_model/mean": 6.023212909698486, "rewards/reward_model/std": 1.539479374885559, "step": 199 }, { "completion_length": 1154.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1301.0, "completions/max_terminated_length": 1301.0, "completions/mean_length": 1154.5, "completions/mean_terminated_length": 1154.5, "completions/min_length": 1033.0, "completions/min_terminated_length": 1033.0, "epoch": 0.33613445378151263, "frac_reward_zero_std": 0.0, "grad_norm": 0.628311812877655, "kl": 0.8193744421005249, "learning_rate": 4.450000000000001e-06, "loss": 0.0008, "num_tokens": 3745958.0, "reward": 6.5886006355285645, "reward_std": 1.5735419988632202, "rewards/reward_model/mean": 6.5886006355285645, "rewards/reward_model/std": 1.5735421180725098, "step": 200 }, { "completion_length": 2125.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2382.0, "completions/max_terminated_length": 2382.0, "completions/mean_length": 2125.25, "completions/mean_terminated_length": 2125.25, "completions/min_length": 1687.0, "completions/min_terminated_length": 1687.0, "epoch": 0.3378151260504202, "frac_reward_zero_std": 0.0, "grad_norm": 0.3583378195762634, "kl": 0.3288140296936035, "learning_rate": 4.444444444444444e-06, "loss": 0.0003, "num_tokens": 3763167.0, "reward": 6.5, "reward_std": 0.9128709435462952, "rewards/reward_model/mean": 6.5, "rewards/reward_model/std": 0.9128709435462952, "step": 201 }, { "completion_length": 2559.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2932.0, "completions/max_terminated_length": 2932.0, "completions/mean_length": 2559.25, "completions/mean_terminated_length": 2559.25, "completions/min_length": 2192.0, "completions/min_terminated_length": 2192.0, "epoch": 0.33949579831932775, "frac_reward_zero_std": 0.0, "grad_norm": 0.23530146479606628, "kl": 0.4598729908466339, "learning_rate": 4.438888888888889e-06, "loss": 0.0005, "num_tokens": 3784792.0, "reward": 8.130855560302734, "reward_std": 0.7685312628746033, "rewards/reward_model/mean": 8.130855560302734, "rewards/reward_model/std": 0.7685312628746033, "step": 202 }, { "completion_length": 2495.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2574.0, "completions/max_terminated_length": 2574.0, "completions/mean_length": 2495.25, "completions/mean_terminated_length": 2495.25, "completions/min_length": 2392.0, "completions/min_terminated_length": 2392.0, "epoch": 0.3411764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.23678502440452576, "kl": 0.4983557164669037, "learning_rate": 4.433333333333334e-06, "loss": 0.0005, "num_tokens": 3803773.0, "reward": 7.749725341796875, "reward_std": 1.274611234664917, "rewards/reward_model/mean": 7.749725341796875, "rewards/reward_model/std": 1.274611234664917, "step": 203 }, { "completion_length": 2991.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3169.0, "completions/max_terminated_length": 3169.0, "completions/mean_length": 2991.5, "completions/mean_terminated_length": 2991.5, "completions/min_length": 2786.0, "completions/min_terminated_length": 2786.0, "epoch": 0.34285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.21996529400348663, "kl": 0.44032400846481323, "learning_rate": 4.427777777777778e-06, "loss": 0.0004, "num_tokens": 3826407.0, "reward": 9.06251049041748, "reward_std": 0.13361211121082306, "rewards/reward_model/mean": 9.06251049041748, "rewards/reward_model/std": 0.13361230492591858, "step": 204 }, { "completion_length": 3041.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3546.0, "completions/max_terminated_length": 3546.0, "completions/mean_length": 3041.25, "completions/mean_terminated_length": 3041.25, "completions/min_length": 2768.0, "completions/min_terminated_length": 2768.0, "epoch": 0.3445378151260504, "frac_reward_zero_std": 0.0, "grad_norm": 0.22607754170894623, "kl": 0.4181973338127136, "learning_rate": 4.422222222222223e-06, "loss": 0.0004, "num_tokens": 3848652.0, "reward": 8.249590873718262, "reward_std": 1.1901236772537231, "rewards/reward_model/mean": 8.249590873718262, "rewards/reward_model/std": 1.1901236772537231, "step": 205 }, { "completion_length": 1240.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1490.0, "completions/max_terminated_length": 1490.0, "completions/mean_length": 1240.0, "completions/mean_terminated_length": 1240.0, "completions/min_length": 810.0, "completions/min_terminated_length": 810.0, "epoch": 0.346218487394958, "frac_reward_zero_std": 0.0, "grad_norm": 0.6906178593635559, "kl": 0.7297955751419067, "learning_rate": 4.416666666666667e-06, "loss": 0.0007, "num_tokens": 3864228.0, "reward": 6.978236198425293, "reward_std": 0.2300529032945633, "rewards/reward_model/mean": 6.978236198425293, "rewards/reward_model/std": 0.2300529032945633, "step": 206 }, { "completion_length": 2016.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2403.0, "completions/max_terminated_length": 2403.0, "completions/mean_length": 2016.25, "completions/mean_terminated_length": 2016.25, "completions/min_length": 1466.0, "completions/min_terminated_length": 1466.0, "epoch": 0.34789915966386553, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021035869140177965, "kl": 0.3609980344772339, "learning_rate": 4.411111111111111e-06, "loss": 0.0004, "num_tokens": 3881761.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 207 }, { "completion_length": 2906.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3030.0, "completions/max_terminated_length": 3030.0, "completions/mean_length": 2906.5, "completions/mean_terminated_length": 2906.5, "completions/min_length": 2797.0, "completions/min_terminated_length": 2797.0, "epoch": 0.3495798319327731, "frac_reward_zero_std": 0.0, "grad_norm": 0.2264556735754013, "kl": 0.431278795003891, "learning_rate": 4.405555555555556e-06, "loss": 0.0004, "num_tokens": 3903959.0, "reward": 9.686347007751465, "reward_std": 0.37577515840530396, "rewards/reward_model/mean": 9.686347007751465, "rewards/reward_model/std": 0.3757750988006592, "step": 208 }, { "completion_length": 2715.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2963.0, "completions/max_terminated_length": 2963.0, "completions/mean_length": 2715.5, "completions/mean_terminated_length": 2715.5, "completions/min_length": 2502.0, "completions/min_terminated_length": 2502.0, "epoch": 0.35126050420168065, "frac_reward_zero_std": 0.0, "grad_norm": 0.23651069402694702, "kl": 0.4715757966041565, "learning_rate": 4.4e-06, "loss": 0.0005, "num_tokens": 3924701.0, "reward": 9.56467056274414, "reward_std": 0.5904367566108704, "rewards/reward_model/mean": 9.56467056274414, "rewards/reward_model/std": 0.5904366970062256, "step": 209 }, { "completion_length": 1060.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1446.0, "completions/max_terminated_length": 1446.0, "completions/mean_length": 1060.5, "completions/mean_terminated_length": 1060.5, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.35294117647058826, "frac_reward_zero_std": 0.0, "grad_norm": 0.6156778931617737, "kl": 0.8022011518478394, "learning_rate": 4.3944444444444455e-06, "loss": 0.0008, "num_tokens": 3939579.0, "reward": 5.080005645751953, "reward_std": 3.199277877807617, "rewards/reward_model/mean": 5.080005645751953, "rewards/reward_model/std": 3.199277877807617, "step": 210 }, { "completion_length": 2800.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2967.0, "completions/max_terminated_length": 2967.0, "completions/mean_length": 2800.0, "completions/mean_terminated_length": 2800.0, "completions/min_length": 2555.0, "completions/min_terminated_length": 2555.0, "epoch": 0.3546218487394958, "frac_reward_zero_std": 0.0, "grad_norm": 0.2301899492740631, "kl": 0.4365086853504181, "learning_rate": 4.388888888888889e-06, "loss": 0.0004, "num_tokens": 3959559.0, "reward": 8.273713111877441, "reward_std": 1.4226092100143433, "rewards/reward_model/mean": 8.273713111877441, "rewards/reward_model/std": 1.4226092100143433, "step": 211 }, { "completion_length": 1130.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1334.0, "completions/max_terminated_length": 1334.0, "completions/mean_length": 1130.25, "completions/mean_terminated_length": 1130.25, "completions/min_length": 828.0, "completions/min_terminated_length": 828.0, "epoch": 0.3563025210084034, "frac_reward_zero_std": 0.0, "grad_norm": 0.6039474606513977, "kl": 1.3054817914962769, "learning_rate": 4.383333333333334e-06, "loss": 0.0013, "num_tokens": 3974288.0, "reward": 6.615111827850342, "reward_std": 1.1467223167419434, "rewards/reward_model/mean": 6.615111827850342, "rewards/reward_model/std": 1.1467223167419434, "step": 212 }, { "completion_length": 1182.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1593.0, "completions/max_terminated_length": 1593.0, "completions/mean_length": 1182.75, "completions/mean_terminated_length": 1182.75, "completions/min_length": 792.0, "completions/min_terminated_length": 792.0, "epoch": 0.35798319327731093, "frac_reward_zero_std": 0.0, "grad_norm": 0.563639223575592, "kl": 0.8061200976371765, "learning_rate": 4.377777777777778e-06, "loss": 0.0008, "num_tokens": 3989879.0, "reward": 7.473617076873779, "reward_std": 0.6156018376350403, "rewards/reward_model/mean": 7.473617076873779, "rewards/reward_model/std": 0.6156017184257507, "step": 213 }, { "completion_length": 2827.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3099.0, "completions/max_terminated_length": 3099.0, "completions/mean_length": 2827.25, "completions/mean_terminated_length": 2827.25, "completions/min_length": 2574.0, "completions/min_terminated_length": 2574.0, "epoch": 0.3596638655462185, "frac_reward_zero_std": 0.0, "grad_norm": 0.2129405289888382, "kl": 0.4732570946216583, "learning_rate": 4.372222222222223e-06, "loss": 0.0005, "num_tokens": 4010588.0, "reward": 8.651857376098633, "reward_std": 0.4093300700187683, "rewards/reward_model/mean": 8.651857376098633, "rewards/reward_model/std": 0.4093301296234131, "step": 214 }, { "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.36134453781512604, "frac_reward_zero_std": 1.0, "grad_norm": 3.541159685482853e-06, "kl": 0.7164430022239685, "learning_rate": 4.366666666666667e-06, "loss": 0.0007, "num_tokens": 4018988.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 215 }, { "completion_length": 2901.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3227.0, "completions/max_terminated_length": 3227.0, "completions/mean_length": 2901.25, "completions/mean_terminated_length": 2901.25, "completions/min_length": 2564.0, "completions/min_terminated_length": 2564.0, "epoch": 0.3630252100840336, "frac_reward_zero_std": 0.0, "grad_norm": 0.22036711871623993, "kl": 0.44254857301712036, "learning_rate": 4.361111111111112e-06, "loss": 0.0004, "num_tokens": 4040389.0, "reward": 8.36063003540039, "reward_std": 0.6233159303665161, "rewards/reward_model/mean": 8.36063003540039, "rewards/reward_model/std": 0.6233160495758057, "step": 216 }, { "completion_length": 498.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1579.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 498.75, "completions/mean_terminated_length": 498.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.36470588235294116, "frac_reward_zero_std": 0.0, "grad_norm": 7.46598482131958, "kl": 1.1445415019989014, "learning_rate": 4.3555555555555555e-06, "loss": 0.0011, "num_tokens": 4050540.0, "reward": 2.5, "reward_std": 2.886751174926758, "rewards/reward_model/mean": 2.5, "rewards/reward_model/std": 2.886751413345337, "step": 217 }, { "completion_length": 2650.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2959.0, "completions/max_terminated_length": 2959.0, "completions/mean_length": 2650.75, "completions/mean_terminated_length": 2650.75, "completions/min_length": 2241.0, "completions/min_terminated_length": 2241.0, "epoch": 0.3663865546218487, "frac_reward_zero_std": 0.0, "grad_norm": 0.2419486939907074, "kl": 0.4856666922569275, "learning_rate": 4.350000000000001e-06, "loss": 0.0005, "num_tokens": 4070671.0, "reward": 6.481883525848389, "reward_std": 1.6037975549697876, "rewards/reward_model/mean": 6.481883525848389, "rewards/reward_model/std": 1.6037975549697876, "step": 218 }, { "completion_length": 1812.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2159.0, "completions/max_terminated_length": 2159.0, "completions/mean_length": 1812.0, "completions/mean_terminated_length": 1812.0, "completions/min_length": 1453.0, "completions/min_terminated_length": 1453.0, "epoch": 0.3680672268907563, "frac_reward_zero_std": 0.0, "grad_norm": 0.3586513102054596, "kl": 0.4191110134124756, "learning_rate": 4.344444444444445e-06, "loss": 0.0004, "num_tokens": 4087115.0, "reward": 9.125, "reward_std": 0.6291528940200806, "rewards/reward_model/mean": 9.125, "rewards/reward_model/std": 0.6291528940200806, "step": 219 }, { "completion_length": 1374.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1685.0, "completions/max_terminated_length": 1685.0, "completions/mean_length": 1374.5, "completions/mean_terminated_length": 1374.5, "completions/min_length": 997.0, "completions/min_terminated_length": 997.0, "epoch": 0.3697478991596639, "frac_reward_zero_std": 0.0, "grad_norm": 0.5102481842041016, "kl": 0.7383792400360107, "learning_rate": 4.3388888888888895e-06, "loss": 0.0007, "num_tokens": 4101217.0, "reward": 6.679230213165283, "reward_std": 0.7687322497367859, "rewards/reward_model/mean": 6.679230213165283, "rewards/reward_model/std": 0.768732488155365, "step": 220 }, { "completion_length": 1784.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2015.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1784.5, "completions/mean_terminated_length": 1784.5, "completions/min_length": 1498.0, "completions/min_terminated_length": 1498.0, "epoch": 0.37142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.3449091911315918, "kl": 0.3592361807823181, "learning_rate": 4.333333333333334e-06, "loss": 0.0004, "num_tokens": 4118411.0, "reward": 6.0, "reward_std": 3.240370273590088, "rewards/reward_model/mean": 6.0, "rewards/reward_model/std": 3.240370273590088, "step": 221 }, { "completion_length": 1657.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1888.0, "completions/max_terminated_length": 1888.0, "completions/mean_length": 1657.0, "completions/mean_terminated_length": 1657.0, "completions/min_length": 1438.0, "completions/min_terminated_length": 1438.0, "epoch": 0.373109243697479, "frac_reward_zero_std": 0.0, "grad_norm": 0.3586861491203308, "kl": 0.44242116808891296, "learning_rate": 4.327777777777778e-06, "loss": 0.0004, "num_tokens": 4133891.0, "reward": 8.75, "reward_std": 1.8929693698883057, "rewards/reward_model/mean": 8.75, "rewards/reward_model/std": 1.8929694890975952, "step": 222 }, { "completion_length": 2416.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3223.0, "completions/max_terminated_length": 3223.0, "completions/mean_length": 2416.0, "completions/mean_terminated_length": 2416.0, "completions/min_length": 1638.0, "completions/min_terminated_length": 1638.0, "epoch": 0.37478991596638656, "frac_reward_zero_std": 0.0, "grad_norm": 0.31728434562683105, "kl": 0.3470199704170227, "learning_rate": 4.322222222222223e-06, "loss": 0.0003, "num_tokens": 4152903.0, "reward": 7.0, "reward_std": 3.488075017929077, "rewards/reward_model/mean": 7.0, "rewards/reward_model/std": 3.488075017929077, "step": 223 }, { "completion_length": 1478.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1824.0, "completions/max_terminated_length": 1824.0, "completions/mean_length": 1478.25, "completions/mean_terminated_length": 1478.25, "completions/min_length": 1267.0, "completions/min_terminated_length": 1267.0, "epoch": 0.3764705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 0.4029053747653961, "kl": 0.3796059489250183, "learning_rate": 4.316666666666667e-06, "loss": 0.0004, "num_tokens": 4168224.0, "reward": 8.375, "reward_std": 1.108677864074707, "rewards/reward_model/mean": 8.375, "rewards/reward_model/std": 1.1086779832839966, "step": 224 }, { "completion_length": 2548.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2685.0, "completions/max_terminated_length": 2685.0, "completions/mean_length": 2548.25, "completions/mean_terminated_length": 2548.25, "completions/min_length": 2442.0, "completions/min_terminated_length": 2442.0, "epoch": 0.37815126050420167, "frac_reward_zero_std": 0.0, "grad_norm": 0.23842692375183105, "kl": 0.5093043446540833, "learning_rate": 4.3111111111111115e-06, "loss": 0.0005, "num_tokens": 4187045.0, "reward": 8.74769115447998, "reward_std": 0.6377931833267212, "rewards/reward_model/mean": 8.74769115447998, "rewards/reward_model/std": 0.6377933025360107, "step": 225 }, { "completion_length": 789.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1215.0, "completions/max_terminated_length": 1215.0, "completions/mean_length": 789.75, "completions/mean_terminated_length": 789.75, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.3798319327731092, "frac_reward_zero_std": 0.0, "grad_norm": 0.6549173593521118, "kl": 1.0279666185379028, "learning_rate": 4.305555555555556e-06, "loss": 0.001, "num_tokens": 4199356.0, "reward": 7.8533034324646, "reward_std": 0.7746135592460632, "rewards/reward_model/mean": 7.8533034324646, "rewards/reward_model/std": 0.7746136784553528, "step": 226 }, { "completion_length": 1766.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2141.0, "completions/max_terminated_length": 2141.0, "completions/mean_length": 1766.5, "completions/mean_terminated_length": 1766.5, "completions/min_length": 1432.0, "completions/min_terminated_length": 1432.0, "epoch": 0.3815126050420168, "frac_reward_zero_std": 0.0, "grad_norm": 0.3704773783683777, "kl": 0.37874269485473633, "learning_rate": 4.3e-06, "loss": 0.0004, "num_tokens": 4215918.0, "reward": 5.75, "reward_std": 2.217355728149414, "rewards/reward_model/mean": 5.75, "rewards/reward_model/std": 2.217355966567993, "step": 227 }, { "completion_length": 3109.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3472.0, "completions/max_terminated_length": 3472.0, "completions/mean_length": 3109.5, "completions/mean_terminated_length": 3109.5, "completions/min_length": 2585.0, "completions/min_terminated_length": 2585.0, "epoch": 0.3831932773109244, "frac_reward_zero_std": 0.0, "grad_norm": 0.23839071393013, "kl": 0.47008052468299866, "learning_rate": 4.294444444444445e-06, "loss": 0.0005, "num_tokens": 4239080.0, "reward": 7.694772243499756, "reward_std": 1.1449562311172485, "rewards/reward_model/mean": 7.694772243499756, "rewards/reward_model/std": 1.1449562311172485, "step": 228 }, { "completion_length": 1801.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2275.0, "completions/max_terminated_length": 2275.0, "completions/mean_length": 1801.0, "completions/mean_terminated_length": 1801.0, "completions/min_length": 1559.0, "completions/min_terminated_length": 1559.0, "epoch": 0.38487394957983195, "frac_reward_zero_std": 0.0, "grad_norm": 0.3541783094406128, "kl": 0.4009455740451813, "learning_rate": 4.288888888888889e-06, "loss": 0.0004, "num_tokens": 4255632.0, "reward": 8.0, "reward_std": 0.8164966106414795, "rewards/reward_model/mean": 8.0, "rewards/reward_model/std": 0.8164966106414795, "step": 229 }, { "completion_length": 1486.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2014.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1486.25, "completions/mean_terminated_length": 1486.25, "completions/min_length": 836.0, "completions/min_terminated_length": 836.0, "epoch": 0.3865546218487395, "frac_reward_zero_std": 0.0, "grad_norm": 0.6752094626426697, "kl": 0.42911961674690247, "learning_rate": 4.2833333333333335e-06, "loss": 0.0004, "num_tokens": 4270469.0, "reward": 6.8125, "reward_std": 4.160203456878662, "rewards/reward_model/mean": 6.8125, "rewards/reward_model/std": 4.160203456878662, "step": 230 }, { "completion_length": 2945.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3142.0, "completions/max_terminated_length": 3142.0, "completions/mean_length": 2945.5, "completions/mean_terminated_length": 2945.5, "completions/min_length": 2688.0, "completions/min_terminated_length": 2688.0, "epoch": 0.38823529411764707, "frac_reward_zero_std": 0.0, "grad_norm": 0.2245272994041443, "kl": 0.5455446839332581, "learning_rate": 4.277777777777778e-06, "loss": 0.0005, "num_tokens": 4292147.0, "reward": 7.7876482009887695, "reward_std": 1.4192252159118652, "rewards/reward_model/mean": 7.7876482009887695, "rewards/reward_model/std": 1.4192253351211548, "step": 231 }, { "completion_length": 1662.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1796.0, "completions/max_terminated_length": 1796.0, "completions/mean_length": 1662.0, "completions/mean_terminated_length": 1662.0, "completions/min_length": 1529.0, "completions/min_terminated_length": 1529.0, "epoch": 0.3899159663865546, "frac_reward_zero_std": 0.0, "grad_norm": 0.3757660388946533, "kl": 0.42322346568107605, "learning_rate": 4.272222222222222e-06, "loss": 0.0004, "num_tokens": 4307431.0, "reward": 7.25, "reward_std": 0.8660253882408142, "rewards/reward_model/mean": 7.25, "rewards/reward_model/std": 0.8660253882408142, "step": 232 }, { "completion_length": 2795.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3066.0, "completions/max_terminated_length": 3066.0, "completions/mean_length": 2795.75, "completions/mean_terminated_length": 2795.75, "completions/min_length": 2561.0, "completions/min_terminated_length": 2561.0, "epoch": 0.3915966386554622, "frac_reward_zero_std": 0.0, "grad_norm": 0.22974319756031036, "kl": 0.4618850648403168, "learning_rate": 4.266666666666668e-06, "loss": 0.0005, "num_tokens": 4328122.0, "reward": 7.258394241333008, "reward_std": 0.5371370911598206, "rewards/reward_model/mean": 7.258394241333008, "rewards/reward_model/std": 0.5371370911598206, "step": 233 }, { "completion_length": 1628.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2026.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1628.25, "completions/mean_terminated_length": 1628.25, "completions/min_length": 1252.0, "completions/min_terminated_length": 1252.0, "epoch": 0.39327731092436974, "frac_reward_zero_std": 0.0, "grad_norm": 0.34619399905204773, "kl": 0.46718090772628784, "learning_rate": 4.261111111111111e-06, "loss": 0.0005, "num_tokens": 4344839.0, "reward": 9.125, "reward_std": 1.1814539432525635, "rewards/reward_model/mean": 9.125, "rewards/reward_model/std": 1.1814539432525635, "step": 234 }, { "completion_length": 1105.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1232.0, "completions/max_terminated_length": 1232.0, "completions/mean_length": 1105.5, "completions/mean_terminated_length": 1105.5, "completions/min_length": 996.0, "completions/min_terminated_length": 996.0, "epoch": 0.3949579831932773, "frac_reward_zero_std": 0.0, "grad_norm": 0.6756569743156433, "kl": 0.8630207180976868, "learning_rate": 4.255555555555556e-06, "loss": 0.0009, "num_tokens": 4358041.0, "reward": 5.431639671325684, "reward_std": 1.4380779266357422, "rewards/reward_model/mean": 5.431639671325684, "rewards/reward_model/std": 1.4380780458450317, "step": 235 }, { "completion_length": 2920.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3273.0, "completions/max_terminated_length": 3273.0, "completions/mean_length": 2920.75, "completions/mean_terminated_length": 2920.75, "completions/min_length": 2707.0, "completions/min_terminated_length": 2707.0, "epoch": 0.39663865546218485, "frac_reward_zero_std": 0.0, "grad_norm": 0.21619361639022827, "kl": 0.4627906084060669, "learning_rate": 4.25e-06, "loss": 0.0005, "num_tokens": 4381184.0, "reward": 8.5, "reward_std": 0.6123724579811096, "rewards/reward_model/mean": 8.5, "rewards/reward_model/std": 0.6123724579811096, "step": 236 }, { "completion_length": 3077.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3252.0, "completions/max_terminated_length": 3252.0, "completions/mean_length": 3077.0, "completions/mean_terminated_length": 3077.0, "completions/min_length": 2915.0, "completions/min_terminated_length": 2915.0, "epoch": 0.3983193277310924, "frac_reward_zero_std": 0.0, "grad_norm": 0.2036733329296112, "kl": 0.4635200798511505, "learning_rate": 4.244444444444445e-06, "loss": 0.0005, "num_tokens": 4402932.0, "reward": 9.433502197265625, "reward_std": 0.46453145146369934, "rewards/reward_model/mean": 9.433502197265625, "rewards/reward_model/std": 0.46453163027763367, "step": 237 }, { "completion_length": 2770.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2872.0, "completions/max_terminated_length": 2872.0, "completions/mean_length": 2770.25, "completions/mean_terminated_length": 2770.25, "completions/min_length": 2621.0, "completions/min_terminated_length": 2621.0, "epoch": 0.4, "frac_reward_zero_std": 0.0, "grad_norm": 0.24756017327308655, "kl": 0.4903179109096527, "learning_rate": 4.238888888888889e-06, "loss": 0.0005, "num_tokens": 4424409.0, "reward": 8.543237686157227, "reward_std": 1.2686576843261719, "rewards/reward_model/mean": 8.543237686157227, "rewards/reward_model/std": 1.268657922744751, "step": 238 }, { "completion_length": 320.25, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 320.25, "completions/mean_terminated_length": 320.25, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.4016806722689076, "frac_reward_zero_std": 0.0, "grad_norm": 0.5235669016838074, "kl": 0.9405966997146606, "learning_rate": 4.233333333333334e-06, "loss": 0.0009, "num_tokens": 4436042.0, "reward": 8.375, "reward_std": 0.25, "rewards/reward_model/mean": 8.375, "rewards/reward_model/std": 0.25, "step": 239 }, { "completion_length": 3967.75, "completions/clipped_ratio": 0.25, "completions/max_length": 7232.0, "completions/max_terminated_length": 3252.0, "completions/mean_length": 3967.75, "completions/mean_terminated_length": 2879.666748046875, "completions/min_length": 2402.0, "completions/min_terminated_length": 2402.0, "epoch": 0.40336134453781514, "frac_reward_zero_std": 0.0, "grad_norm": 0.15598049759864807, "kl": 0.3935525715351105, "learning_rate": 4.227777777777778e-06, "loss": 0.0004, "num_tokens": 4462557.0, "reward": 7.573015213012695, "reward_std": 2.0017759799957275, "rewards/reward_model/mean": 7.573015213012695, "rewards/reward_model/std": 2.0017759799957275, "step": 240 }, { "completion_length": 796.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1453.0, "completions/max_terminated_length": 1453.0, "completions/mean_length": 796.75, "completions/mean_terminated_length": 796.75, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.4050420168067227, "frac_reward_zero_std": 0.0, "grad_norm": 0.5736768841743469, "kl": 0.8851484656333923, "learning_rate": 4.222222222222223e-06, "loss": 0.0009, "num_tokens": 4474704.0, "reward": 6.755120754241943, "reward_std": 2.419196128845215, "rewards/reward_model/mean": 6.755120754241943, "rewards/reward_model/std": 2.419196128845215, "step": 241 }, { "completion_length": 1553.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1958.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 1553.75, "completions/mean_terminated_length": 1553.75, "completions/min_length": 1263.0, "completions/min_terminated_length": 1263.0, "epoch": 0.40672268907563025, "frac_reward_zero_std": 0.0, "grad_norm": 0.40061724185943604, "kl": 0.36847835779190063, "learning_rate": 4.216666666666667e-06, "loss": 0.0004, "num_tokens": 4490207.0, "reward": 9.25, "reward_std": 0.9574271440505981, "rewards/reward_model/mean": 9.25, "rewards/reward_model/std": 0.9574271440505981, "step": 242 }, { "completion_length": 2808.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3275.0, "completions/max_terminated_length": 3275.0, "completions/mean_length": 2808.0, "completions/mean_terminated_length": 2808.0, "completions/min_length": 2385.0, "completions/min_terminated_length": 2385.0, "epoch": 0.4084033613445378, "frac_reward_zero_std": 0.0, "grad_norm": 0.2375236451625824, "kl": 0.5209259986877441, "learning_rate": 4.211111111111112e-06, "loss": 0.0005, "num_tokens": 4511323.0, "reward": 8.539529800415039, "reward_std": 1.2907954454421997, "rewards/reward_model/mean": 8.539529800415039, "rewards/reward_model/std": 1.2907954454421997, "step": 243 }, { "completion_length": 2845.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3201.0, "completions/max_terminated_length": 3201.0, "completions/mean_length": 2845.5, "completions/mean_terminated_length": 2845.5, "completions/min_length": 2438.0, "completions/min_terminated_length": 2438.0, "epoch": 0.41008403361344536, "frac_reward_zero_std": 0.0, "grad_norm": 0.26784712076187134, "kl": 0.5034636855125427, "learning_rate": 4.205555555555556e-06, "loss": 0.0005, "num_tokens": 4533309.0, "reward": 8.882152557373047, "reward_std": 0.8950045704841614, "rewards/reward_model/mean": 8.882152557373047, "rewards/reward_model/std": 0.8950047492980957, "step": 244 }, { "completion_length": 471.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 471.0, "completions/mean_terminated_length": 471.0, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.4117647058823529, "frac_reward_zero_std": 0.0, "grad_norm": 1.0807291269302368, "kl": 0.9026118516921997, "learning_rate": 4.2000000000000004e-06, "loss": 0.0009, "num_tokens": 4545037.0, "reward": 5.626047134399414, "reward_std": 4.62348747253418, "rewards/reward_model/mean": 5.626047134399414, "rewards/reward_model/std": 4.62348747253418, "step": 245 }, { "completion_length": 3169.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3654.0, "completions/max_terminated_length": 3654.0, "completions/mean_length": 3169.5, "completions/mean_terminated_length": 3169.5, "completions/min_length": 2712.0, "completions/min_terminated_length": 2712.0, "epoch": 0.4134453781512605, "frac_reward_zero_std": 0.0, "grad_norm": 0.22497378289699554, "kl": 0.4505349397659302, "learning_rate": 4.194444444444445e-06, "loss": 0.0005, "num_tokens": 4568679.0, "reward": 7.199137210845947, "reward_std": 1.1360905170440674, "rewards/reward_model/mean": 7.199137210845947, "rewards/reward_model/std": 1.1360902786254883, "step": 246 }, { "completion_length": 2979.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3353.0, "completions/max_terminated_length": 3353.0, "completions/mean_length": 2979.0, "completions/mean_terminated_length": 2979.0, "completions/min_length": 2461.0, "completions/min_terminated_length": 2461.0, "epoch": 0.4151260504201681, "frac_reward_zero_std": 0.0, "grad_norm": 0.1978806108236313, "kl": 0.4465804100036621, "learning_rate": 4.188888888888889e-06, "loss": 0.0004, "num_tokens": 4591319.0, "reward": 9.045299530029297, "reward_std": 0.32406604290008545, "rewards/reward_model/mean": 9.045299530029297, "rewards/reward_model/std": 0.32406583428382874, "step": 247 }, { "completion_length": 2383.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2678.0, "completions/max_terminated_length": 2678.0, "completions/mean_length": 2383.25, "completions/mean_terminated_length": 2383.25, "completions/min_length": 1829.0, "completions/min_terminated_length": 1829.0, "epoch": 0.41680672268907565, "frac_reward_zero_std": 0.0, "grad_norm": 0.2815382778644562, "kl": 0.8800508379936218, "learning_rate": 4.183333333333334e-06, "loss": 0.0009, "num_tokens": 4609576.0, "reward": 7.960483074188232, "reward_std": 1.1536815166473389, "rewards/reward_model/mean": 7.960483074188232, "rewards/reward_model/std": 1.1536816358566284, "step": 248 }, { "completion_length": 1926.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2080.0, "completions/max_terminated_length": 2080.0, "completions/mean_length": 1926.0, "completions/mean_terminated_length": 1926.0, "completions/min_length": 1749.0, "completions/min_terminated_length": 1749.0, "epoch": 0.4184873949579832, "frac_reward_zero_std": 0.0, "grad_norm": 0.33348986506462097, "kl": 0.42607924342155457, "learning_rate": 4.177777777777778e-06, "loss": 0.0004, "num_tokens": 4627812.0, "reward": 6.625, "reward_std": 2.4958298206329346, "rewards/reward_model/mean": 6.625, "rewards/reward_model/std": 2.4958298206329346, "step": 249 }, { "completion_length": 635.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 635.0, "completions/mean_terminated_length": 635.0, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.42016806722689076, "frac_reward_zero_std": 0.0, "grad_norm": 0.5603328347206116, "kl": 1.105303406715393, "learning_rate": 4.1722222222222225e-06, "loss": 0.0011, "num_tokens": 4639708.0, "reward": 7.454676151275635, "reward_std": 1.3379886150360107, "rewards/reward_model/mean": 7.454676151275635, "rewards/reward_model/std": 1.3379884958267212, "step": 250 }, { "completion_length": 2519.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3053.0, "completions/max_terminated_length": 3053.0, "completions/mean_length": 2519.0, "completions/mean_terminated_length": 2519.0, "completions/min_length": 2101.0, "completions/min_terminated_length": 2101.0, "epoch": 0.4218487394957983, "frac_reward_zero_std": 0.0, "grad_norm": 0.23313941061496735, "kl": 0.5020186305046082, "learning_rate": 4.166666666666667e-06, "loss": 0.0005, "num_tokens": 4659764.0, "reward": 7.716615676879883, "reward_std": 2.483187198638916, "rewards/reward_model/mean": 7.716615676879883, "rewards/reward_model/std": 2.483187198638916, "step": 251 }, { "completion_length": 239.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 239.0, "completions/mean_terminated_length": 239.0, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.4235294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006444022292271256, "kl": 1.3417521715164185, "learning_rate": 4.161111111111111e-06, "loss": 0.0013, "num_tokens": 4670236.0, "reward": 9.25, "reward_std": 0.0, "rewards/reward_model/mean": 9.25, "rewards/reward_model/std": 0.0, "step": 252 }, { "completion_length": 2820.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3126.0, "completions/max_terminated_length": 3126.0, "completions/mean_length": 2820.75, "completions/mean_terminated_length": 2820.75, "completions/min_length": 2446.0, "completions/min_terminated_length": 2446.0, "epoch": 0.42521008403361343, "frac_reward_zero_std": 0.0, "grad_norm": 0.23993265628814697, "kl": 0.4522344172000885, "learning_rate": 4.155555555555556e-06, "loss": 0.0005, "num_tokens": 4691631.0, "reward": 8.778319358825684, "reward_std": 0.8046744465827942, "rewards/reward_model/mean": 8.778319358825684, "rewards/reward_model/std": 0.8046746253967285, "step": 253 }, { "completion_length": 1701.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2052.0, "completions/max_terminated_length": 2052.0, "completions/mean_length": 1701.0, "completions/mean_terminated_length": 1701.0, "completions/min_length": 1419.0, "completions/min_terminated_length": 1419.0, "epoch": 0.426890756302521, "frac_reward_zero_std": 0.0, "grad_norm": 0.3598572015762329, "kl": 0.4365740120410919, "learning_rate": 4.15e-06, "loss": 0.0004, "num_tokens": 4707115.0, "reward": 7.375, "reward_std": 1.6520190238952637, "rewards/reward_model/mean": 7.375, "rewards/reward_model/std": 1.6520190238952637, "step": 254 }, { "completion_length": 255.5, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 255.5, "completions/mean_terminated_length": 255.5, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.42857142857142855, "frac_reward_zero_std": 0.0, "grad_norm": 0.5156990885734558, "kl": 1.2908028364181519, "learning_rate": 4.1444444444444445e-06, "loss": 0.0013, "num_tokens": 4716985.0, "reward": 8.8125, "reward_std": 0.3145764470100403, "rewards/reward_model/mean": 8.8125, "rewards/reward_model/std": 0.3145764470100403, "step": 255 }, { "completion_length": 2111.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2612.0, "completions/max_terminated_length": 2612.0, "completions/mean_length": 2111.5, "completions/mean_terminated_length": 2111.5, "completions/min_length": 1738.0, "completions/min_terminated_length": 1738.0, "epoch": 0.43025210084033616, "frac_reward_zero_std": 0.0, "grad_norm": 0.3176546096801758, "kl": 0.31987547874450684, "learning_rate": 4.138888888888889e-06, "loss": 0.0003, "num_tokens": 4735275.0, "reward": 9.375, "reward_std": 0.9464846849441528, "rewards/reward_model/mean": 9.375, "rewards/reward_model/std": 0.9464847445487976, "step": 256 }, { "completion_length": 1851.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2513.0, "completions/max_terminated_length": 2513.0, "completions/mean_length": 1851.25, "completions/mean_terminated_length": 1851.25, "completions/min_length": 1591.0, "completions/min_terminated_length": 1591.0, "epoch": 0.4319327731092437, "frac_reward_zero_std": 0.0, "grad_norm": 0.349301278591156, "kl": 0.343892365694046, "learning_rate": 4.133333333333333e-06, "loss": 0.0003, "num_tokens": 4752660.0, "reward": 7.625, "reward_std": 2.3228933811187744, "rewards/reward_model/mean": 7.625, "rewards/reward_model/std": 2.3228933811187744, "step": 257 }, { "completion_length": 2430.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2653.0, "completions/max_terminated_length": 2653.0, "completions/mean_length": 2430.5, "completions/mean_terminated_length": 2430.5, "completions/min_length": 2252.0, "completions/min_terminated_length": 2252.0, "epoch": 0.4336134453781513, "frac_reward_zero_std": 0.0, "grad_norm": 0.26822173595428467, "kl": 0.5344579219818115, "learning_rate": 4.1277777777777785e-06, "loss": 0.0005, "num_tokens": 4771654.0, "reward": -0.3647910952568054, "reward_std": 0.4221853017807007, "rewards/reward_model/mean": -0.3647910952568054, "rewards/reward_model/std": 0.42218533158302307, "step": 258 }, { "completion_length": 1812.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2814.0, "completions/max_terminated_length": 2814.0, "completions/mean_length": 1812.5, "completions/mean_terminated_length": 1812.5, "completions/min_length": 1364.0, "completions/min_terminated_length": 1364.0, "epoch": 0.43529411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 0.346161425113678, "kl": 0.40465983748435974, "learning_rate": 4.122222222222222e-06, "loss": 0.0004, "num_tokens": 4787404.0, "reward": 6.094305038452148, "reward_std": 5.060488700866699, "rewards/reward_model/mean": 6.094305038452148, "rewards/reward_model/std": 5.060488700866699, "step": 259 }, { "completion_length": 1989.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2346.0, "completions/max_terminated_length": 2346.0, "completions/mean_length": 1989.25, "completions/mean_terminated_length": 1989.25, "completions/min_length": 1700.0, "completions/min_terminated_length": 1700.0, "epoch": 0.4369747899159664, "frac_reward_zero_std": 0.0, "grad_norm": 0.3061082065105438, "kl": 0.36512595415115356, "learning_rate": 4.116666666666667e-06, "loss": 0.0004, "num_tokens": 4804709.0, "reward": 9.875, "reward_std": 0.25, "rewards/reward_model/mean": 9.875, "rewards/reward_model/std": 0.25, "step": 260 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 4804709, "num_train_epochs": 2, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }