{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.05, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1251.0, "completions/mean_length": 519.09765625, "completions/mean_terminated_length": 418.0929260253906, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0005, "frac_reward_zero_std": 0.015625, "grad_norm": 0.338766485452652, "learning_rate": 0.0, "loss": 0.078, "num_tokens": 345050.0, "reward": 0.3798828125, "reward_std": 0.28941643238067627, "rewards/correctness_reward_func/mean": 0.185546875, "rewards/correctness_reward_func/std": 0.38912075757980347, "rewards/strict_format_reward_func/mean": 0.57421875, "rewards/strict_format_reward_func/std": 0.4949444830417633, "step": 1 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1264.0, "completions/mean_length": 492.201171875, "completions/mean_terminated_length": 421.8021240234375, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.001, "frac_reward_zero_std": 0.0, "grad_norm": 0.35604149103164673, "learning_rate": 1.6666666666666667e-08, "loss": 0.0504, "num_tokens": 677193.0, "reward": 0.34765625, "reward_std": 0.26424503326416016, "rewards/correctness_reward_func/mean": 0.099609375, "rewards/correctness_reward_func/std": 0.29977133870124817, "rewards/strict_format_reward_func/mean": 0.595703125, "rewards/strict_format_reward_func/std": 0.4912354052066803, "step": 2 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.107421875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 517.6953125, "completions/mean_terminated_length": 425.9518737792969, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.0015, "frac_reward_zero_std": 0.015625, "grad_norm": 0.332509309053421, "learning_rate": 3.3333333333333334e-08, "loss": 0.0796, "num_tokens": 1019981.0, "reward": 0.345703125, "reward_std": 0.2651558816432953, "rewards/correctness_reward_func/mean": 0.125, "rewards/correctness_reward_func/std": 0.3310423493385315, "rewards/strict_format_reward_func/mean": 0.56640625, "rewards/strict_format_reward_func/std": 0.4960552453994751, "step": 3 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1273.0, "completions/mean_length": 495.77734375, "completions/mean_terminated_length": 422.0470275878906, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.002, "frac_reward_zero_std": 0.03125, "grad_norm": 0.32658830285072327, "learning_rate": 5e-08, "loss": 0.0471, "num_tokens": 1354419.0, "reward": 0.373046875, "reward_std": 0.26736605167388916, "rewards/correctness_reward_func/mean": 0.1484375, "rewards/correctness_reward_func/std": 0.35588082671165466, "rewards/strict_format_reward_func/mean": 0.59765625, "rewards/strict_format_reward_func/std": 0.4908501207828522, "step": 4 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.11328125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1262.0, "completions/mean_length": 530.775390625, "completions/mean_terminated_length": 435.0594482421875, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0025, "frac_reward_zero_std": 0.03125, "grad_norm": 0.32245033979415894, "learning_rate": 6.666666666666667e-08, "loss": 0.0435, "num_tokens": 1705264.0, "reward": 0.318359375, "reward_std": 0.25092875957489014, "rewards/correctness_reward_func/mean": 0.103515625, "rewards/correctness_reward_func/std": 0.30492907762527466, "rewards/strict_format_reward_func/mean": 0.533203125, "rewards/strict_format_reward_func/std": 0.4993842542171478, "step": 5 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1279.0, "completions/mean_length": 504.80859375, "completions/mean_terminated_length": 417.1782531738281, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.003, "frac_reward_zero_std": 0.046875, "grad_norm": 0.34851861000061035, "learning_rate": 8.333333333333333e-08, "loss": -0.001, "num_tokens": 2046694.0, "reward": 0.33203125, "reward_std": 0.2571216821670532, "rewards/correctness_reward_func/mean": 0.126953125, "rewards/correctness_reward_func/std": 0.33324605226516724, "rewards/strict_format_reward_func/mean": 0.537109375, "rewards/strict_format_reward_func/std": 0.4991086423397064, "step": 6 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.111328125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1276.0, "completions/mean_length": 515.822265625, "completions/mean_terminated_length": 420.0901184082031, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.0035, "frac_reward_zero_std": 0.03125, "grad_norm": 0.3351835608482361, "learning_rate": 1e-07, "loss": 0.0657, "num_tokens": 2392451.0, "reward": 0.33984375, "reward_std": 0.258058100938797, "rewards/correctness_reward_func/mean": 0.12109375, "rewards/correctness_reward_func/std": 0.3265552520751953, "rewards/strict_format_reward_func/mean": 0.55859375, "rewards/strict_format_reward_func/std": 0.4970405399799347, "step": 7 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.099609375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1229.0, "completions/mean_length": 486.931640625, "completions/mean_terminated_length": 399.1952209472656, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.004, "frac_reward_zero_std": 0.03125, "grad_norm": 0.3637154996395111, "learning_rate": 1.1666666666666667e-07, "loss": 0.1004, "num_tokens": 2722008.0, "reward": 0.345703125, "reward_std": 0.2512314021587372, "rewards/correctness_reward_func/mean": 0.12109375, "rewards/correctness_reward_func/std": 0.3265552520751953, "rewards/strict_format_reward_func/mean": 0.5703125, "rewards/strict_format_reward_func/std": 0.4955156147480011, "step": 8 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1268.0, "completions/mean_length": 499.8046875, "completions/mean_terminated_length": 407.81658935546875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.0045, "frac_reward_zero_std": 0.0625, "grad_norm": 0.31052860617637634, "learning_rate": 1.3333333333333334e-07, "loss": 0.0588, "num_tokens": 3059220.0, "reward": 0.337890625, "reward_std": 0.24443647265434265, "rewards/correctness_reward_func/mean": 0.123046875, "rewards/correctness_reward_func/std": 0.32881227135658264, "rewards/strict_format_reward_func/mean": 0.552734375, "rewards/strict_format_reward_func/std": 0.4976975917816162, "step": 9 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.083984375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1271.0, "completions/mean_length": 482.73046875, "completions/mean_terminated_length": 409.6332702636719, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.005, "frac_reward_zero_std": 0.078125, "grad_norm": 0.3554303050041199, "learning_rate": 1.5e-07, "loss": 0.044, "num_tokens": 3386450.0, "reward": 0.31640625, "reward_std": 0.25127753615379333, "rewards/correctness_reward_func/mean": 0.09765625, "rewards/correctness_reward_func/std": 0.29713961482048035, "rewards/strict_format_reward_func/mean": 0.53515625, "rewards/strict_format_reward_func/std": 0.49925029277801514, "step": 10 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.099609375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 470.484375, "completions/mean_terminated_length": 380.9284362792969, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.0055, "frac_reward_zero_std": 0.0625, "grad_norm": 0.36456865072250366, "learning_rate": 1.6666666666666665e-07, "loss": 0.066, "num_tokens": 3707954.0, "reward": 0.3544921875, "reward_std": 0.2562474310398102, "rewards/correctness_reward_func/mean": 0.1484375, "rewards/correctness_reward_func/std": 0.35588082671165466, "rewards/strict_format_reward_func/mean": 0.560546875, "rewards/strict_format_reward_func/std": 0.49680593609809875, "step": 11 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 480.06640625, "completions/mean_terminated_length": 412.2754211425781, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.006, "frac_reward_zero_std": 0.046875, "grad_norm": 0.36434271931648254, "learning_rate": 1.833333333333333e-07, "loss": 0.0494, "num_tokens": 4033044.0, "reward": 0.3251953125, "reward_std": 0.2645646929740906, "rewards/correctness_reward_func/mean": 0.11328125, "rewards/correctness_reward_func/std": 0.3172462284564972, "rewards/strict_format_reward_func/mean": 0.537109375, "rewards/strict_format_reward_func/std": 0.4991086423397064, "step": 12 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1279.0, "completions/mean_length": 468.708984375, "completions/mean_terminated_length": 396.21063232421875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.0065, "frac_reward_zero_std": 0.015625, "grad_norm": 0.36773481965065, "learning_rate": 2e-07, "loss": 0.053, "num_tokens": 4354615.0, "reward": 0.3193359375, "reward_std": 0.2689654529094696, "rewards/correctness_reward_func/mean": 0.111328125, "rewards/correctness_reward_func/std": 0.31484565138816833, "rewards/strict_format_reward_func/mean": 0.52734375, "rewards/strict_format_reward_func/std": 0.49974003434181213, "step": 13 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.111328125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1260.0, "completions/mean_length": 485.958984375, "completions/mean_terminated_length": 386.4857177734375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.007, "frac_reward_zero_std": 0.046875, "grad_norm": 0.33340662717819214, "learning_rate": 2.1666666666666667e-07, "loss": 0.0758, "num_tokens": 4685202.0, "reward": 0.361328125, "reward_std": 0.2693173885345459, "rewards/correctness_reward_func/mean": 0.154296875, "rewards/correctness_reward_func/std": 0.36158639192581177, "rewards/strict_format_reward_func/mean": 0.568359375, "rewards/strict_format_reward_func/std": 0.4957893490791321, "step": 14 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.115234375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 496.302734375, "completions/mean_terminated_length": 394.2317810058594, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.0075, "frac_reward_zero_std": 0.046875, "grad_norm": 0.31166115403175354, "learning_rate": 2.3333333333333333e-07, "loss": 0.0588, "num_tokens": 5018365.0, "reward": 0.3857421875, "reward_std": 0.25771355628967285, "rewards/correctness_reward_func/mean": 0.15234375, "rewards/correctness_reward_func/std": 0.35970520973205566, "rewards/strict_format_reward_func/mean": 0.619140625, "rewards/strict_format_reward_func/std": 0.48607301712036133, "step": 15 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.095703125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 486.75390625, "completions/mean_terminated_length": 402.8034362792969, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.008, "frac_reward_zero_std": 0.046875, "grad_norm": 0.36958619952201843, "learning_rate": 2.5e-07, "loss": 0.1103, "num_tokens": 5350215.0, "reward": 0.3349609375, "reward_std": 0.24944864213466644, "rewards/correctness_reward_func/mean": 0.083984375, "rewards/correctness_reward_func/std": 0.2776356339454651, "rewards/strict_format_reward_func/mean": 0.5859375, "rewards/strict_format_reward_func/std": 0.49304109811782837, "step": 16 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.083984375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1276.0, "completions/mean_length": 487.0859375, "completions/mean_terminated_length": 414.3880615234375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0085, "frac_reward_zero_std": 0.015625, "grad_norm": 0.3691529631614685, "learning_rate": 2.6666666666666667e-07, "loss": 0.0438, "num_tokens": 5679539.0, "reward": 0.365234375, "reward_std": 0.2718730568885803, "rewards/correctness_reward_func/mean": 0.134765625, "rewards/correctness_reward_func/std": 0.3418070077896118, "rewards/strict_format_reward_func/mean": 0.595703125, "rewards/strict_format_reward_func/std": 0.4912354052066803, "step": 17 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.146484375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1249.0, "completions/mean_length": 539.380859375, "completions/mean_terminated_length": 412.27227783203125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.009, "frac_reward_zero_std": 0.0625, "grad_norm": 0.2965885400772095, "learning_rate": 2.833333333333333e-07, "loss": 0.0626, "num_tokens": 6042406.0, "reward": 0.3232421875, "reward_std": 0.25863024592399597, "rewards/correctness_reward_func/mean": 0.13671875, "rewards/correctness_reward_func/std": 0.3438861668109894, "rewards/strict_format_reward_func/mean": 0.509765625, "rewards/strict_format_reward_func/std": 0.5003935098648071, "step": 18 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1275.0, "completions/mean_length": 485.21875, "completions/mean_terminated_length": 403.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.0095, "frac_reward_zero_std": 0.046875, "grad_norm": 0.38167980313301086, "learning_rate": 3e-07, "loss": 0.0318, "num_tokens": 6369486.0, "reward": 0.3876953125, "reward_std": 0.262542188167572, "rewards/correctness_reward_func/mean": 0.15625, "rewards/correctness_reward_func/std": 0.36344730854034424, "rewards/strict_format_reward_func/mean": 0.619140625, "rewards/strict_format_reward_func/std": 0.48607301712036133, "step": 19 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.107421875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1277.0, "completions/mean_length": 508.498046875, "completions/mean_terminated_length": 415.647705078125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.01, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3474893867969513, "learning_rate": 3.166666666666666e-07, "loss": 0.0142, "num_tokens": 6712861.0, "reward": 0.3330078125, "reward_std": 0.24650248885154724, "rewards/correctness_reward_func/mean": 0.099609375, "rewards/correctness_reward_func/std": 0.29977133870124817, "rewards/strict_format_reward_func/mean": 0.56640625, "rewards/strict_format_reward_func/std": 0.4960552453994751, "step": 20 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1272.0, "completions/mean_length": 514.66015625, "completions/mean_terminated_length": 420.6710510253906, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.0105, "frac_reward_zero_std": 0.03125, "grad_norm": 0.3415084779262543, "learning_rate": 3.333333333333333e-07, "loss": 0.1265, "num_tokens": 7054527.0, "reward": 0.375, "reward_std": 0.26221993565559387, "rewards/correctness_reward_func/mean": 0.162109375, "rewards/correctness_reward_func/std": 0.3689115643501282, "rewards/strict_format_reward_func/mean": 0.587890625, "rewards/strict_format_reward_func/std": 0.49269601702690125, "step": 21 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.111328125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 525.611328125, "completions/mean_terminated_length": 431.1054992675781, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.011, "frac_reward_zero_std": 0.015625, "grad_norm": 0.31360530853271484, "learning_rate": 3.5e-07, "loss": 0.0387, "num_tokens": 7407728.0, "reward": 0.326171875, "reward_std": 0.24292460083961487, "rewards/correctness_reward_func/mean": 0.068359375, "rewards/correctness_reward_func/std": 0.25260838866233826, "rewards/strict_format_reward_func/mean": 0.583984375, "rewards/strict_format_reward_func/std": 0.493378221988678, "step": 22 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.11328125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 530.44140625, "completions/mean_terminated_length": 434.68280029296875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.0115, "frac_reward_zero_std": 0.03125, "grad_norm": 0.3227068781852722, "learning_rate": 3.666666666666666e-07, "loss": 0.0178, "num_tokens": 7758946.0, "reward": 0.3701171875, "reward_std": 0.25999653339385986, "rewards/correctness_reward_func/mean": 0.11328125, "rewards/correctness_reward_func/std": 0.3172462284564972, "rewards/strict_format_reward_func/mean": 0.626953125, "rewards/strict_format_reward_func/std": 0.48408737778663635, "step": 23 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.119140625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1267.0, "completions/mean_length": 489.134765625, "completions/mean_terminated_length": 382.1662902832031, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.012, "frac_reward_zero_std": 0.046875, "grad_norm": 0.3339863121509552, "learning_rate": 3.8333333333333335e-07, "loss": 0.0657, "num_tokens": 8089551.0, "reward": 0.3837890625, "reward_std": 0.2371073067188263, "rewards/correctness_reward_func/mean": 0.150390625, "rewards/correctness_reward_func/std": 0.35780346393585205, "rewards/strict_format_reward_func/mean": 0.6171875, "rewards/strict_format_reward_func/std": 0.486548513174057, "step": 24 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1273.0, "completions/mean_length": 478.623046875, "completions/mean_terminated_length": 391.8939514160156, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.0125, "frac_reward_zero_std": 0.0, "grad_norm": 0.3568263351917267, "learning_rate": 4e-07, "loss": 0.0492, "num_tokens": 8414726.0, "reward": 0.373046875, "reward_std": 0.27422550320625305, "rewards/correctness_reward_func/mean": 0.125, "rewards/correctness_reward_func/std": 0.3310423493385315, "rewards/strict_format_reward_func/mean": 0.62109375, "rewards/strict_format_reward_func/std": 0.4855891764163971, "step": 25 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 480.068359375, "completions/mean_terminated_length": 381.8311462402344, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.013, "frac_reward_zero_std": 0.09375, "grad_norm": 0.31916946172714233, "learning_rate": 4.1666666666666667e-07, "loss": 0.0776, "num_tokens": 8744809.0, "reward": 0.3720703125, "reward_std": 0.26208120584487915, "rewards/correctness_reward_func/mean": 0.162109375, "rewards/correctness_reward_func/std": 0.3689115643501282, "rewards/strict_format_reward_func/mean": 0.58203125, "rewards/strict_format_reward_func/std": 0.4937073290348053, "step": 26 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1276.0, "completions/mean_length": 495.048828125, "completions/mean_terminated_length": 410.097412109375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.0135, "frac_reward_zero_std": 0.03125, "grad_norm": 0.3349536955356598, "learning_rate": 4.3333333333333335e-07, "loss": 0.07, "num_tokens": 9081114.0, "reward": 0.3876953125, "reward_std": 0.2574464678764343, "rewards/correctness_reward_func/mean": 0.1328125, "rewards/correctness_reward_func/std": 0.33970388770103455, "rewards/strict_format_reward_func/mean": 0.642578125, "rewards/strict_format_reward_func/std": 0.4797092080116272, "step": 27 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1269.0, "completions/mean_length": 476.123046875, "completions/mean_terminated_length": 400.5448913574219, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.014, "frac_reward_zero_std": 0.046875, "grad_norm": 0.36770424246788025, "learning_rate": 4.5e-07, "loss": 0.0595, "num_tokens": 9407537.0, "reward": 0.3681640625, "reward_std": 0.2513667941093445, "rewards/correctness_reward_func/mean": 0.10546875, "rewards/correctness_reward_func/std": 0.3074568510055542, "rewards/strict_format_reward_func/mean": 0.630859375, "rewards/strict_format_reward_func/std": 0.4830440282821655, "step": 28 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1262.0, "completions/mean_length": 478.572265625, "completions/mean_terminated_length": 395.66595458984375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.0145, "frac_reward_zero_std": 0.078125, "grad_norm": 0.34928959608078003, "learning_rate": 4.6666666666666666e-07, "loss": 0.0713, "num_tokens": 9730062.0, "reward": 0.4130859375, "reward_std": 0.245479553937912, "rewards/correctness_reward_func/mean": 0.12890625, "rewards/correctness_reward_func/std": 0.33542385697364807, "rewards/strict_format_reward_func/mean": 0.697265625, "rewards/strict_format_reward_func/std": 0.45989060401916504, "step": 29 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1273.0, "completions/mean_length": 475.310546875, "completions/mean_terminated_length": 395.877685546875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.015, "frac_reward_zero_std": 0.046875, "grad_norm": 0.3654455542564392, "learning_rate": 4.833333333333333e-07, "loss": 0.0367, "num_tokens": 10054341.0, "reward": 0.4169921875, "reward_std": 0.25523918867111206, "rewards/correctness_reward_func/mean": 0.14453125, "rewards/correctness_reward_func/std": 0.35197147727012634, "rewards/strict_format_reward_func/mean": 0.689453125, "rewards/strict_format_reward_func/std": 0.46317005157470703, "step": 30 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.095703125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1262.0, "completions/mean_length": 478.076171875, "completions/mean_terminated_length": 393.20733642578125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.0155, "frac_reward_zero_std": 0.015625, "grad_norm": 0.3683274984359741, "learning_rate": 5e-07, "loss": 0.0631, "num_tokens": 10379580.0, "reward": 0.4091796875, "reward_std": 0.2527580261230469, "rewards/correctness_reward_func/mean": 0.115234375, "rewards/correctness_reward_func/std": 0.3196168541908264, "rewards/strict_format_reward_func/mean": 0.703125, "rewards/strict_format_reward_func/std": 0.45732781291007996, "step": 31 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.107421875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1233.0, "completions/mean_length": 510.697265625, "completions/mean_terminated_length": 418.1116027832031, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.016, "frac_reward_zero_std": 0.140625, "grad_norm": 0.309796541929245, "learning_rate": 5.166666666666667e-07, "loss": 0.0752, "num_tokens": 10719009.0, "reward": 0.4140625, "reward_std": 0.21483345329761505, "rewards/correctness_reward_func/mean": 0.1171875, "rewards/correctness_reward_func/std": 0.32195815443992615, "rewards/strict_format_reward_func/mean": 0.7109375, "rewards/strict_format_reward_func/std": 0.45377036929130554, "step": 32 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1253.0, "completions/mean_length": 463.578125, "completions/mean_terminated_length": 375.22076416015625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.0165, "frac_reward_zero_std": 0.078125, "grad_norm": 0.3456345796585083, "learning_rate": 5.333333333333333e-07, "loss": 0.0511, "num_tokens": 11033809.0, "reward": 0.4267578125, "reward_std": 0.24420523643493652, "rewards/correctness_reward_func/mean": 0.13671875, "rewards/correctness_reward_func/std": 0.3438861668109894, "rewards/strict_format_reward_func/mean": 0.716796875, "rewards/strict_format_reward_func/std": 0.4509948492050171, "step": 33 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1268.0, "completions/mean_length": 433.3203125, "completions/mean_terminated_length": 349.74249267578125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.017, "frac_reward_zero_std": 0.0625, "grad_norm": 0.39212310314178467, "learning_rate": 5.5e-07, "loss": 0.0611, "num_tokens": 11337629.0, "reward": 0.4404296875, "reward_std": 0.2442634403705597, "rewards/correctness_reward_func/mean": 0.125, "rewards/correctness_reward_func/std": 0.3310423493385315, "rewards/strict_format_reward_func/mean": 0.755859375, "rewards/strict_format_reward_func/std": 0.42999663949012756, "step": 34 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.123046875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1279.0, "completions/mean_length": 509.75, "completions/mean_terminated_length": 401.6748352050781, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.0175, "frac_reward_zero_std": 0.078125, "grad_norm": 0.3314379155635834, "learning_rate": 5.666666666666666e-07, "loss": 0.1008, "num_tokens": 11679389.0, "reward": 0.4677734375, "reward_std": 0.2392577975988388, "rewards/correctness_reward_func/mean": 0.138671875, "rewards/correctness_reward_func/std": 0.34594178199768066, "rewards/strict_format_reward_func/mean": 0.796875, "rewards/strict_format_reward_func/std": 0.4027182459831238, "step": 35 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.126953125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1277.0, "completions/mean_length": 522.14453125, "completions/mean_terminated_length": 411.94183349609375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.018, "frac_reward_zero_std": 0.125, "grad_norm": 0.3079322576522827, "learning_rate": 5.833333333333334e-07, "loss": 0.0629, "num_tokens": 12026983.0, "reward": 0.453125, "reward_std": 0.20784910023212433, "rewards/correctness_reward_func/mean": 0.095703125, "rewards/correctness_reward_func/std": 0.2944713830947876, "rewards/strict_format_reward_func/mean": 0.810546875, "rewards/strict_format_reward_func/std": 0.3922513723373413, "step": 36 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1254.0, "completions/mean_length": 434.671875, "completions/mean_terminated_length": 363.0339050292969, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.0185, "frac_reward_zero_std": 0.171875, "grad_norm": 0.34872007369995117, "learning_rate": 6e-07, "loss": 0.1292, "num_tokens": 12325079.0, "reward": 0.5009765625, "reward_std": 0.204152449965477, "rewards/correctness_reward_func/mean": 0.1484375, "rewards/correctness_reward_func/std": 0.35588082671165466, "rewards/strict_format_reward_func/mean": 0.853515625, "rewards/strict_format_reward_func/std": 0.35393697023391724, "step": 37 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1271.0, "completions/mean_length": 433.001953125, "completions/mean_terminated_length": 353.36968994140625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.019, "frac_reward_zero_std": 0.171875, "grad_norm": 0.31309986114501953, "learning_rate": 6.166666666666667e-07, "loss": 0.0508, "num_tokens": 12627696.0, "reward": 0.48828125, "reward_std": 0.1888417899608612, "rewards/correctness_reward_func/mean": 0.109375, "rewards/correctness_reward_func/std": 0.31241437792778015, "rewards/strict_format_reward_func/mean": 0.8671875, "rewards/strict_format_reward_func/std": 0.33970388770103455, "step": 38 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1270.0, "completions/mean_length": 424.388671875, "completions/mean_terminated_length": 359.6785888671875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.0195, "frac_reward_zero_std": 0.09375, "grad_norm": 0.3900044560432434, "learning_rate": 6.333333333333332e-07, "loss": 0.1074, "num_tokens": 12926895.0, "reward": 0.4833984375, "reward_std": 0.2132570892572403, "rewards/correctness_reward_func/mean": 0.134765625, "rewards/correctness_reward_func/std": 0.3418070077896118, "rewards/strict_format_reward_func/mean": 0.83203125, "rewards/strict_format_reward_func/std": 0.374204158782959, "step": 39 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.103515625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1254.0, "completions/mean_length": 464.607421875, "completions/mean_terminated_length": 370.455322265625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.02, "frac_reward_zero_std": 0.140625, "grad_norm": 0.3177374601364136, "learning_rate": 6.5e-07, "loss": 0.0991, "num_tokens": 13245982.0, "reward": 0.501953125, "reward_std": 0.20007643103599548, "rewards/correctness_reward_func/mean": 0.126953125, "rewards/correctness_reward_func/std": 0.33324605226516724, "rewards/strict_format_reward_func/mean": 0.876953125, "rewards/strict_format_reward_func/std": 0.32881227135658264, "step": 40 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.095703125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 448.64453125, "completions/mean_terminated_length": 360.660888671875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.0205, "frac_reward_zero_std": 0.140625, "grad_norm": 0.31568291783332825, "learning_rate": 6.666666666666666e-07, "loss": 0.106, "num_tokens": 13554360.0, "reward": 0.4990234375, "reward_std": 0.19699305295944214, "rewards/correctness_reward_func/mean": 0.12890625, "rewards/correctness_reward_func/std": 0.33542385697364807, "rewards/strict_format_reward_func/mean": 0.869140625, "rewards/strict_format_reward_func/std": 0.33757632970809937, "step": 41 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1269.0, "completions/mean_length": 443.26171875, "completions/mean_terminated_length": 383.7447509765625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.021, "frac_reward_zero_std": 0.234375, "grad_norm": 0.28603100776672363, "learning_rate": 6.833333333333333e-07, "loss": 0.0884, "num_tokens": 13860294.0, "reward": 0.50390625, "reward_std": 0.17007258534431458, "rewards/correctness_reward_func/mean": 0.107421875, "rewards/correctness_reward_func/std": 0.30995169281959534, "rewards/strict_format_reward_func/mean": 0.900390625, "rewards/strict_format_reward_func/std": 0.29977133870124817, "step": 42 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1261.0, "completions/mean_length": 410.166015625, "completions/mean_terminated_length": 348.29498291015625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0215, "frac_reward_zero_std": 0.234375, "grad_norm": 0.3239445388317108, "learning_rate": 7e-07, "loss": 0.1053, "num_tokens": 14150835.0, "reward": 0.505859375, "reward_std": 0.17652790248394012, "rewards/correctness_reward_func/mean": 0.1484375, "rewards/correctness_reward_func/std": 0.35588082671165466, "rewards/strict_format_reward_func/mean": 0.86328125, "rewards/strict_format_reward_func/std": 0.3438861668109894, "step": 43 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.064453125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1249.0, "completions/mean_length": 396.798828125, "completions/mean_terminated_length": 335.9519958496094, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.022, "frac_reward_zero_std": 0.1875, "grad_norm": 0.3426041603088379, "learning_rate": 7.166666666666667e-07, "loss": 0.1241, "num_tokens": 14434852.0, "reward": 0.5283203125, "reward_std": 0.18634288012981415, "rewards/correctness_reward_func/mean": 0.1484375, "rewards/correctness_reward_func/std": 0.35588082671165466, "rewards/strict_format_reward_func/mean": 0.908203125, "rewards/strict_format_reward_func/std": 0.289021372795105, "step": 44 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1273.0, "completions/mean_length": 400.2890625, "completions/mean_terminated_length": 337.7154846191406, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.0225, "frac_reward_zero_std": 0.25, "grad_norm": 0.3275441527366638, "learning_rate": 7.333333333333332e-07, "loss": 0.0845, "num_tokens": 14721088.0, "reward": 0.5400390625, "reward_std": 0.16695258021354675, "rewards/correctness_reward_func/mean": 0.162109375, "rewards/correctness_reward_func/std": 0.3689115643501282, "rewards/strict_format_reward_func/mean": 0.91796875, "rewards/strict_format_reward_func/std": 0.2746807038784027, "step": 45 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.083984375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1259.0, "completions/mean_length": 395.701171875, "completions/mean_terminated_length": 314.624755859375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.023, "frac_reward_zero_std": 0.328125, "grad_norm": 0.3019014894962311, "learning_rate": 7.5e-07, "loss": 0.072, "num_tokens": 15002687.0, "reward": 0.5380859375, "reward_std": 0.14848747849464417, "rewards/correctness_reward_func/mean": 0.1484375, "rewards/correctness_reward_func/std": 0.35588082671165466, "rewards/strict_format_reward_func/mean": 0.927734375, "rewards/strict_format_reward_func/std": 0.2591804563999176, "step": 46 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1275.0, "completions/mean_length": 409.96875, "completions/mean_terminated_length": 351.9666748046875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.0235, "frac_reward_zero_std": 0.25, "grad_norm": 0.3243018686771393, "learning_rate": 7.666666666666667e-07, "loss": 0.1358, "num_tokens": 15292767.0, "reward": 0.5087890625, "reward_std": 0.16159790754318237, "rewards/correctness_reward_func/mean": 0.103515625, "rewards/correctness_reward_func/std": 0.30492907762527466, "rewards/strict_format_reward_func/mean": 0.9140625, "rewards/strict_format_reward_func/std": 0.28054583072662354, "step": 47 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.068359375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 377.39453125, "completions/mean_terminated_length": 311.1656188964844, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.024, "frac_reward_zero_std": 0.296875, "grad_norm": 0.3503972291946411, "learning_rate": 7.833333333333333e-07, "loss": 0.0887, "num_tokens": 15570681.0, "reward": 0.51953125, "reward_std": 0.15567830204963684, "rewards/correctness_reward_func/mean": 0.12890625, "rewards/correctness_reward_func/std": 0.33542385697364807, "rewards/strict_format_reward_func/mean": 0.91015625, "rewards/strict_format_reward_func/std": 0.2862374484539032, "step": 48 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.044921875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1260.0, "completions/mean_length": 310.064453125, "completions/mean_terminated_length": 264.4437561035156, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.0245, "frac_reward_zero_std": 0.375, "grad_norm": 0.32057884335517883, "learning_rate": 8e-07, "loss": 0.0729, "num_tokens": 15808418.0, "reward": 0.580078125, "reward_std": 0.15646106004714966, "rewards/correctness_reward_func/mean": 0.22265625, "rewards/correctness_reward_func/std": 0.41643625497817993, "rewards/strict_format_reward_func/mean": 0.9375, "rewards/strict_format_reward_func/std": 0.2422981858253479, "step": 49 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1209.0, "completions/mean_length": 296.234375, "completions/mean_terminated_length": 252.06529235839844, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.025, "frac_reward_zero_std": 0.359375, "grad_norm": 0.3681601881980896, "learning_rate": 8.166666666666666e-07, "loss": 0.1429, "num_tokens": 16038850.0, "reward": 0.56640625, "reward_std": 0.14544308185577393, "rewards/correctness_reward_func/mean": 0.169921875, "rewards/correctness_reward_func/std": 0.3759314715862274, "rewards/strict_format_reward_func/mean": 0.962890625, "rewards/strict_format_reward_func/std": 0.18921469151973724, "step": 50 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1263.0, "completions/mean_length": 335.375, "completions/mean_terminated_length": 296.9755859375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.0255, "frac_reward_zero_std": 0.359375, "grad_norm": 0.3586282730102539, "learning_rate": 8.333333333333333e-07, "loss": 0.1065, "num_tokens": 16293666.0, "reward": 0.5302734375, "reward_std": 0.13623127341270447, "rewards/correctness_reward_func/mean": 0.115234375, "rewards/correctness_reward_func/std": 0.3196168541908264, "rewards/strict_format_reward_func/mean": 0.9453125, "rewards/strict_format_reward_func/std": 0.2275916188955307, "step": 51 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1215.0, "completions/mean_length": 269.9765625, "completions/mean_terminated_length": 243.663330078125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.026, "frac_reward_zero_std": 0.328125, "grad_norm": 0.3876008689403534, "learning_rate": 8.499999999999999e-07, "loss": 0.0597, "num_tokens": 16510590.0, "reward": 0.6083984375, "reward_std": 0.15176597237586975, "rewards/correctness_reward_func/mean": 0.244140625, "rewards/correctness_reward_func/std": 0.42999663949012756, "rewards/strict_format_reward_func/mean": 0.97265625, "rewards/strict_format_reward_func/std": 0.16324250400066376, "step": 52 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.041015625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1277.0, "completions/mean_length": 297.580078125, "completions/mean_terminated_length": 255.5621337890625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.0265, "frac_reward_zero_std": 0.3125, "grad_norm": 0.42054447531700134, "learning_rate": 8.666666666666667e-07, "loss": 0.1005, "num_tokens": 16743239.0, "reward": 0.5927734375, "reward_std": 0.16221949458122253, "rewards/correctness_reward_func/mean": 0.248046875, "rewards/correctness_reward_func/std": 0.4323015511035919, "rewards/strict_format_reward_func/mean": 0.9375, "rewards/strict_format_reward_func/std": 0.2422981858253479, "step": 53 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1115.0, "completions/mean_length": 242.103515625, "completions/mean_terminated_length": 225.6289825439453, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.027, "frac_reward_zero_std": 0.34375, "grad_norm": 0.4196186661720276, "learning_rate": 8.833333333333333e-07, "loss": 0.0946, "num_tokens": 16944868.0, "reward": 0.591796875, "reward_std": 0.14952966570854187, "rewards/correctness_reward_func/mean": 0.2109375, "rewards/correctness_reward_func/std": 0.4083731174468994, "rewards/strict_format_reward_func/mean": 0.97265625, "rewards/strict_format_reward_func/std": 0.16324250400066376, "step": 54 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1132.0, "completions/mean_length": 271.400390625, "completions/mean_terminated_length": 251.30877685546875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.0275, "frac_reward_zero_std": 0.359375, "grad_norm": 0.36686980724334717, "learning_rate": 9e-07, "loss": 0.0488, "num_tokens": 17166681.0, "reward": 0.5830078125, "reward_std": 0.14440418779850006, "rewards/correctness_reward_func/mean": 0.1875, "rewards/correctness_reward_func/std": 0.39069411158561707, "rewards/strict_format_reward_func/mean": 0.978515625, "rewards/strict_format_reward_func/std": 0.14513419568538666, "step": 55 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1196.0, "completions/mean_length": 268.162109375, "completions/mean_terminated_length": 245.94610595703125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.028, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3979751765727997, "learning_rate": 9.166666666666665e-07, "loss": 0.062, "num_tokens": 17384156.0, "reward": 0.580078125, "reward_std": 0.1604653298854828, "rewards/correctness_reward_func/mean": 0.19140625, "rewards/correctness_reward_func/std": 0.3937928080558777, "rewards/strict_format_reward_func/mean": 0.96875, "rewards/strict_format_reward_func/std": 0.17416280508041382, "step": 56 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1272.0, "completions/mean_length": 244.208984375, "completions/mean_terminated_length": 231.9268798828125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.0285, "frac_reward_zero_std": 0.515625, "grad_norm": 0.3326975703239441, "learning_rate": 9.333333333333333e-07, "loss": -0.0184, "num_tokens": 17589991.0, "reward": 0.5966796875, "reward_std": 0.10863800346851349, "rewards/correctness_reward_func/mean": 0.2109375, "rewards/correctness_reward_func/std": 0.4083731174468994, "rewards/strict_format_reward_func/mean": 0.982421875, "rewards/strict_format_reward_func/std": 0.13154059648513794, "step": 57 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1200.0, "completions/mean_length": 241.818359375, "completions/mean_terminated_length": 221.137451171875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.029, "frac_reward_zero_std": 0.484375, "grad_norm": 0.41635575890541077, "learning_rate": 9.499999999999999e-07, "loss": 0.0759, "num_tokens": 17792898.0, "reward": 0.5869140625, "reward_std": 0.1135234534740448, "rewards/correctness_reward_func/mean": 0.1875, "rewards/correctness_reward_func/std": 0.39069411158561707, "rewards/strict_format_reward_func/mean": 0.986328125, "rewards/strict_format_reward_func/std": 0.1162383034825325, "step": 58 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1252.0, "completions/mean_length": 265.177734375, "completions/mean_terminated_length": 240.82200622558594, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.0295, "frac_reward_zero_std": 0.4375, "grad_norm": 0.3422642946243286, "learning_rate": 9.666666666666666e-07, "loss": 0.0449, "num_tokens": 18008573.0, "reward": 0.6044921875, "reward_std": 0.1342127025127411, "rewards/correctness_reward_func/mean": 0.220703125, "rewards/correctness_reward_func/std": 0.4151262938976288, "rewards/strict_format_reward_func/mean": 0.98828125, "rewards/strict_format_reward_func/std": 0.10772226005792618, "step": 59 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1263.0, "completions/mean_length": 228.244140625, "completions/mean_terminated_length": 219.9626007080078, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.03, "frac_reward_zero_std": 0.46875, "grad_norm": 0.32586634159088135, "learning_rate": 9.833333333333332e-07, "loss": 0.0218, "num_tokens": 18202290.0, "reward": 0.5986328125, "reward_std": 0.12520861625671387, "rewards/correctness_reward_func/mean": 0.228515625, "rewards/correctness_reward_func/std": 0.4202871024608612, "rewards/strict_format_reward_func/mean": 0.96875, "rewards/strict_format_reward_func/std": 0.17416280508041382, "step": 60 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1086.0, "completions/mean_length": 235.298828125, "completions/mean_terminated_length": 222.91107177734375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.0305, "frac_reward_zero_std": 0.375, "grad_norm": 0.37623488903045654, "learning_rate": 1e-06, "loss": 0.0071, "num_tokens": 18404795.0, "reward": 0.58203125, "reward_std": 0.14406809210777283, "rewards/correctness_reward_func/mean": 0.18359375, "rewards/correctness_reward_func/std": 0.3875311613082886, "rewards/strict_format_reward_func/mean": 0.98046875, "rewards/strict_format_reward_func/std": 0.1385180652141571, "step": 61 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 223.65234375, "completions/mean_terminated_length": 217.42633056640625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.031, "frac_reward_zero_std": 0.46875, "grad_norm": 0.3536680042743683, "learning_rate": 9.999993444041445e-07, "loss": 0.017, "num_tokens": 18598601.0, "reward": 0.64453125, "reward_std": 0.12280106544494629, "rewards/correctness_reward_func/mean": 0.30078125, "rewards/correctness_reward_func/std": 0.45904624462127686, "rewards/strict_format_reward_func/mean": 0.98828125, "rewards/strict_format_reward_func/std": 0.10772226005792618, "step": 62 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1246.0, "completions/mean_length": 248.73828125, "completions/mean_terminated_length": 236.5098876953125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.0315, "frac_reward_zero_std": 0.34375, "grad_norm": 0.4249816834926605, "learning_rate": 9.99997377618298e-07, "loss": 0.0152, "num_tokens": 18805451.0, "reward": 0.611328125, "reward_std": 0.13816098868846893, "rewards/correctness_reward_func/mean": 0.25390625, "rewards/correctness_reward_func/std": 0.43567025661468506, "rewards/strict_format_reward_func/mean": 0.96875, "rewards/strict_format_reward_func/std": 0.17416280508041382, "step": 63 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1268.0, "completions/mean_length": 290.15234375, "completions/mean_terminated_length": 268.4191589355469, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.032, "frac_reward_zero_std": 0.484375, "grad_norm": 0.3467140793800354, "learning_rate": 9.999940996476175e-07, "loss": 0.0502, "num_tokens": 19040769.0, "reward": 0.5927734375, "reward_std": 0.12330609560012817, "rewards/correctness_reward_func/mean": 0.216796875, "rewards/correctness_reward_func/std": 0.4124660789966583, "rewards/strict_format_reward_func/mean": 0.96875, "rewards/strict_format_reward_func/std": 0.17416280508041382, "step": 64 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1221.0, "completions/mean_length": 279.16015625, "completions/mean_terminated_length": 265.2871398925781, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0325, "frac_reward_zero_std": 0.359375, "grad_norm": 0.3559761047363281, "learning_rate": 9.999895105006994e-07, "loss": 0.0552, "num_tokens": 19265571.0, "reward": 0.623046875, "reward_std": 0.14418897032737732, "rewards/correctness_reward_func/mean": 0.275390625, "rewards/correctness_reward_func/std": 0.44714778661727905, "rewards/strict_format_reward_func/mean": 0.970703125, "rewards/strict_format_reward_func/std": 0.16880230605602264, "step": 65 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1210.0, "completions/mean_length": 246.076171875, "completions/mean_terminated_length": 229.66470336914062, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.033, "frac_reward_zero_std": 0.375, "grad_norm": 0.3859092593193054, "learning_rate": 9.99983610189578e-07, "loss": 0.0579, "num_tokens": 19470018.0, "reward": 0.6279296875, "reward_std": 0.13720150291919708, "rewards/correctness_reward_func/mean": 0.26953125, "rewards/correctness_reward_func/std": 0.44415023922920227, "rewards/strict_format_reward_func/mean": 0.986328125, "rewards/strict_format_reward_func/std": 0.1162383034825325, "step": 66 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1112.0, "completions/mean_length": 251.421875, "completions/mean_terminated_length": 228.83831787109375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.0335, "frac_reward_zero_std": 0.484375, "grad_norm": 0.36630764603614807, "learning_rate": 9.999763987297264e-07, "loss": 0.0476, "num_tokens": 19677810.0, "reward": 0.619140625, "reward_std": 0.1067921370267868, "rewards/correctness_reward_func/mean": 0.26953125, "rewards/correctness_reward_func/std": 0.44415023922920227, "rewards/strict_format_reward_func/mean": 0.96875, "rewards/strict_format_reward_func/std": 0.17416280508041382, "step": 67 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1172.0, "completions/mean_length": 240.859375, "completions/mean_terminated_length": 222.26638793945312, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.034, "frac_reward_zero_std": 0.328125, "grad_norm": 0.38380587100982666, "learning_rate": 9.999678761400562e-07, "loss": 0.0692, "num_tokens": 19883234.0, "reward": 0.630859375, "reward_std": 0.15035656094551086, "rewards/correctness_reward_func/mean": 0.2890625, "rewards/correctness_reward_func/std": 0.45377036929130554, "rewards/strict_format_reward_func/mean": 0.97265625, "rewards/strict_format_reward_func/std": 0.16324250400066376, "step": 68 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1162.0, "completions/mean_length": 234.337890625, "completions/mean_terminated_length": 224.02565002441406, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.0345, "frac_reward_zero_std": 0.390625, "grad_norm": 0.35500645637512207, "learning_rate": 9.999580424429159e-07, "loss": 0.0483, "num_tokens": 20083247.0, "reward": 0.6640625, "reward_std": 0.1348668485879898, "rewards/correctness_reward_func/mean": 0.33203125, "rewards/correctness_reward_func/std": 0.47140273451805115, "rewards/strict_format_reward_func/mean": 0.99609375, "rewards/strict_format_reward_func/std": 0.06243881583213806, "step": 69 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1073.0, "completions/mean_length": 265.732421875, "completions/mean_terminated_length": 259.7544250488281, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.035, "frac_reward_zero_std": 0.578125, "grad_norm": 0.27369487285614014, "learning_rate": 9.999468976640939e-07, "loss": 0.0461, "num_tokens": 20300998.0, "reward": 0.5791015625, "reward_std": 0.09376809000968933, "rewards/correctness_reward_func/mean": 0.18359375, "rewards/correctness_reward_func/std": 0.3875311613082886, "rewards/strict_format_reward_func/mean": 0.974609375, "rewards/strict_format_reward_func/std": 0.15746226906776428, "step": 70 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1039.0, "completions/mean_length": 233.96875, "completions/mean_terminated_length": 215.25247192382812, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.0355, "frac_reward_zero_std": 0.4375, "grad_norm": 0.4125555753707886, "learning_rate": 9.99934441832816e-07, "loss": 0.0956, "num_tokens": 20497414.0, "reward": 0.625, "reward_std": 0.12506619095802307, "rewards/correctness_reward_func/mean": 0.263671875, "rewards/correctness_reward_func/std": 0.4410543739795685, "rewards/strict_format_reward_func/mean": 0.986328125, "rewards/strict_format_reward_func/std": 0.1162383034825325, "step": 71 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 271.978515625, "completions/mean_terminated_length": 247.7860107421875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.036, "frac_reward_zero_std": 0.453125, "grad_norm": 0.34837380051612854, "learning_rate": 9.99920674981746e-07, "loss": 0.1279, "num_tokens": 20720507.0, "reward": 0.6103515625, "reward_std": 0.11888537555932999, "rewards/correctness_reward_func/mean": 0.24609375, "rewards/correctness_reward_func/std": 0.4311550557613373, "rewards/strict_format_reward_func/mean": 0.974609375, "rewards/strict_format_reward_func/std": 0.15746226906776428, "step": 72 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1210.0, "completions/mean_length": 228.228515625, "completions/mean_terminated_length": 224.1039276123047, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.0365, "frac_reward_zero_std": 0.390625, "grad_norm": 0.39599257707595825, "learning_rate": 9.999055971469863e-07, "loss": 0.0224, "num_tokens": 20915896.0, "reward": 0.673828125, "reward_std": 0.13572058081626892, "rewards/correctness_reward_func/mean": 0.357421875, "rewards/correctness_reward_func/std": 0.4797092080116272, "rewards/strict_format_reward_func/mean": 0.990234375, "rewards/strict_format_reward_func/std": 0.09843364357948303, "step": 73 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1256.0, "completions/mean_length": 271.587890625, "completions/mean_terminated_length": 261.64300537109375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.037, "frac_reward_zero_std": 0.515625, "grad_norm": 0.35442784428596497, "learning_rate": 9.998892083680762e-07, "loss": 0.0349, "num_tokens": 21138349.0, "reward": 0.6015625, "reward_std": 0.11092907190322876, "rewards/correctness_reward_func/mean": 0.23828125, "rewards/correctness_reward_func/std": 0.42644867300987244, "rewards/strict_format_reward_func/mean": 0.96484375, "rewards/strict_format_reward_func/std": 0.1843547374010086, "step": 74 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1141.0, "completions/mean_length": 218.25, "completions/mean_terminated_length": 216.17221069335938, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.0375, "frac_reward_zero_std": 0.34375, "grad_norm": 0.4667811393737793, "learning_rate": 9.998715086879935e-07, "loss": 0.0262, "num_tokens": 21327421.0, "reward": 0.66796875, "reward_std": 0.1432858556509018, "rewards/correctness_reward_func/mean": 0.341796875, "rewards/correctness_reward_func/std": 0.4747757613658905, "rewards/strict_format_reward_func/mean": 0.994140625, "rewards/strict_format_reward_func/std": 0.07639661431312561, "step": 75 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1202.0, "completions/mean_length": 227.681640625, "completions/mean_terminated_length": 215.2035675048828, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.038, "frac_reward_zero_std": 0.546875, "grad_norm": 0.3435099124908447, "learning_rate": 9.99852498153154e-07, "loss": 0.041, "num_tokens": 21521426.0, "reward": 0.6953125, "reward_std": 0.10652374476194382, "rewards/correctness_reward_func/mean": 0.400390625, "rewards/correctness_reward_func/std": 0.4904567301273346, "rewards/strict_format_reward_func/mean": 0.990234375, "rewards/strict_format_reward_func/std": 0.09843364357948303, "step": 76 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1182.0, "completions/mean_length": 241.912109375, "completions/mean_terminated_length": 237.8411865234375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.0385, "frac_reward_zero_std": 0.359375, "grad_norm": 0.39670509099960327, "learning_rate": 9.9983217681341e-07, "loss": 0.0428, "num_tokens": 21724693.0, "reward": 0.6630859375, "reward_std": 0.14423537254333496, "rewards/correctness_reward_func/mean": 0.33203125, "rewards/correctness_reward_func/std": 0.47140273451805115, "rewards/strict_format_reward_func/mean": 0.994140625, "rewards/strict_format_reward_func/std": 0.07639661431312561, "step": 77 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1259.0, "completions/mean_length": 243.982421875, "completions/mean_terminated_length": 237.87623596191406, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.039, "frac_reward_zero_std": 0.484375, "grad_norm": 0.4161688983440399, "learning_rate": 9.998105447220522e-07, "loss": 0.0338, "num_tokens": 21929988.0, "reward": 0.619140625, "reward_std": 0.11164377629756927, "rewards/correctness_reward_func/mean": 0.265625, "rewards/correctness_reward_func/std": 0.44209739565849304, "rewards/strict_format_reward_func/mean": 0.97265625, "rewards/strict_format_reward_func/std": 0.16324250400066376, "step": 78 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1210.0, "completions/max_terminated_length": 1210.0, "completions/mean_length": 213.919921875, "completions/mean_terminated_length": 213.919921875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.0395, "frac_reward_zero_std": 0.390625, "grad_norm": 0.4172995388507843, "learning_rate": 9.997876019358083e-07, "loss": 0.0083, "num_tokens": 22114003.0, "reward": 0.72265625, "reward_std": 0.1379173994064331, "rewards/correctness_reward_func/mean": 0.451171875, "rewards/correctness_reward_func/std": 0.498096764087677, "rewards/strict_format_reward_func/mean": 0.994140625, "rewards/strict_format_reward_func/std": 0.07639661431312561, "step": 79 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1144.0, "completions/mean_length": 244.953125, "completions/mean_terminated_length": 240.89413452148438, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.04, "frac_reward_zero_std": 0.4375, "grad_norm": 0.37673693895339966, "learning_rate": 9.997633485148427e-07, "loss": 0.0219, "num_tokens": 22321795.0, "reward": 0.6494140625, "reward_std": 0.12516599893569946, "rewards/correctness_reward_func/mean": 0.302734375, "rewards/correctness_reward_func/std": 0.45989060401916504, "rewards/strict_format_reward_func/mean": 0.99609375, "rewards/strict_format_reward_func/std": 0.06243881583213806, "step": 80 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1247.0, "completions/max_terminated_length": 1247.0, "completions/mean_length": 230.265625, "completions/mean_terminated_length": 230.265625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.0405, "frac_reward_zero_std": 0.4375, "grad_norm": 0.37114495038986206, "learning_rate": 9.997377845227574e-07, "loss": -0.0031, "num_tokens": 22516963.0, "reward": 0.6845703125, "reward_std": 0.12628617882728577, "rewards/correctness_reward_func/mean": 0.37109375, "rewards/correctness_reward_func/std": 0.4835699498653412, "rewards/strict_format_reward_func/mean": 0.998046875, "rewards/strict_format_reward_func/std": 0.04419417306780815, "step": 81 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 253.953125, "completions/mean_terminated_length": 247.90570068359375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.041, "frac_reward_zero_std": 0.453125, "grad_norm": 0.3662205934524536, "learning_rate": 9.99710910026591e-07, "loss": 0.0094, "num_tokens": 22728275.0, "reward": 0.6787109375, "reward_std": 0.12296395003795624, "rewards/correctness_reward_func/mean": 0.375, "rewards/correctness_reward_func/std": 0.4845963716506958, "rewards/strict_format_reward_func/mean": 0.982421875, "rewards/strict_format_reward_func/std": 0.13154059648513794, "step": 82 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 252.30859375, "completions/mean_terminated_length": 242.17356872558594, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.0415, "frac_reward_zero_std": 0.46875, "grad_norm": 0.3793325424194336, "learning_rate": 9.996827250968189e-07, "loss": 0.0223, "num_tokens": 22937537.0, "reward": 0.6748046875, "reward_std": 0.11382539570331573, "rewards/correctness_reward_func/mean": 0.37890625, "rewards/correctness_reward_func/std": 0.4855891764163971, "rewards/strict_format_reward_func/mean": 0.970703125, "rewards/strict_format_reward_func/std": 0.16880230605602264, "step": 83 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1214.0, "completions/mean_length": 271.107421875, "completions/mean_terminated_length": 257.1227722167969, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.042, "frac_reward_zero_std": 0.546875, "grad_norm": 0.3364124596118927, "learning_rate": 9.996532298073524e-07, "loss": 0.0469, "num_tokens": 23159544.0, "reward": 0.6025390625, "reward_std": 0.10351093113422394, "rewards/correctness_reward_func/mean": 0.21484375, "rewards/correctness_reward_func/std": 0.4111155867576599, "rewards/strict_format_reward_func/mean": 0.990234375, "rewards/strict_format_reward_func/std": 0.09843364357948303, "step": 84 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1216.0, "completions/mean_length": 259.109375, "completions/mean_terminated_length": 247.00396728515625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.0425, "frac_reward_zero_std": 0.546875, "grad_norm": 0.33089637756347656, "learning_rate": 9.996224242355397e-07, "loss": 0.0306, "num_tokens": 23373448.0, "reward": 0.63671875, "reward_std": 0.10616335272789001, "rewards/correctness_reward_func/mean": 0.27734375, "rewards/correctness_reward_func/std": 0.4481254518032074, "rewards/strict_format_reward_func/mean": 0.99609375, "rewards/strict_format_reward_func/std": 0.06243881583213806, "step": 85 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1249.0, "completions/mean_length": 279.59765625, "completions/mean_terminated_length": 267.7351989746094, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.043, "frac_reward_zero_std": 0.46875, "grad_norm": 0.3235888183116913, "learning_rate": 9.99590308462165e-07, "loss": 0.0283, "num_tokens": 23599442.0, "reward": 0.6181640625, "reward_std": 0.12362537533044815, "rewards/correctness_reward_func/mean": 0.2734375, "rewards/correctness_reward_func/std": 0.4461594223976135, "rewards/strict_format_reward_func/mean": 0.962890625, "rewards/strict_format_reward_func/std": 0.18921469151973724, "step": 86 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1255.0, "completions/mean_length": 278.6015625, "completions/mean_terminated_length": 258.65338134765625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.0435, "frac_reward_zero_std": 0.453125, "grad_norm": 0.3776208460330963, "learning_rate": 9.995568825714478e-07, "loss": 0.0546, "num_tokens": 23823638.0, "reward": 0.65234375, "reward_std": 0.12013693153858185, "rewards/correctness_reward_func/mean": 0.310546875, "rewards/correctness_reward_func/std": 0.46317005157470703, "rewards/strict_format_reward_func/mean": 0.994140625, "rewards/strict_format_reward_func/std": 0.07639661431312561, "step": 87 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1127.0, "completions/mean_length": 261.638671875, "completions/mean_terminated_length": 243.41748046875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.044, "frac_reward_zero_std": 0.46875, "grad_norm": 0.3642917573451996, "learning_rate": 9.995221466510437e-07, "loss": 0.0622, "num_tokens": 24034997.0, "reward": 0.6865234375, "reward_std": 0.11267617344856262, "rewards/correctness_reward_func/mean": 0.37890625, "rewards/correctness_reward_func/std": 0.4855891764163971, "rewards/strict_format_reward_func/mean": 0.994140625, "rewards/strict_format_reward_func/std": 0.07639661431312561, "step": 88 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1257.0, "completions/max_terminated_length": 1257.0, "completions/mean_length": 234.8359375, "completions/mean_terminated_length": 234.8359375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.0445, "frac_reward_zero_std": 0.515625, "grad_norm": 0.37823545932769775, "learning_rate": 9.994861007920439e-07, "loss": -0.0068, "num_tokens": 24231833.0, "reward": 0.662109375, "reward_std": 0.10682021081447601, "rewards/correctness_reward_func/mean": 0.32421875, "rewards/correctness_reward_func/std": 0.4685399830341339, "rewards/strict_format_reward_func/mean": 1.0, "rewards/strict_format_reward_func/std": 0.0, "step": 89 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1192.0, "completions/mean_length": 265.517578125, "completions/mean_terminated_length": 251.45545959472656, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.045, "frac_reward_zero_std": 0.453125, "grad_norm": 0.38415420055389404, "learning_rate": 9.99448745088974e-07, "loss": 0.0367, "num_tokens": 24448530.0, "reward": 0.6865234375, "reward_std": 0.12411317974328995, "rewards/correctness_reward_func/mean": 0.376953125, "rewards/correctness_reward_func/std": 0.4850969910621643, "rewards/strict_format_reward_func/mean": 0.99609375, "rewards/strict_format_reward_func/std": 0.06243881583213806, "step": 90 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1095.0, "completions/mean_length": 246.599609375, "completions/mean_terminated_length": 240.50885009765625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.0455, "frac_reward_zero_std": 0.484375, "grad_norm": 0.4248853325843811, "learning_rate": 9.994100796397953e-07, "loss": 0.004, "num_tokens": 24654973.0, "reward": 0.6787109375, "reward_std": 0.11741062998771667, "rewards/correctness_reward_func/mean": 0.375, "rewards/correctness_reward_func/std": 0.4845963716506958, "rewards/strict_format_reward_func/mean": 0.982421875, "rewards/strict_format_reward_func/std": 0.13154059648513794, "step": 91 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1154.0, "completions/mean_length": 286.89453125, "completions/mean_terminated_length": 279.0747985839844, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.046, "frac_reward_zero_std": 0.53125, "grad_norm": 0.3524153232574463, "learning_rate": 9.993701045459033e-07, "loss": 0.0237, "num_tokens": 24884767.0, "reward": 0.6708984375, "reward_std": 0.10362257063388824, "rewards/correctness_reward_func/mean": 0.34765625, "rewards/correctness_reward_func/std": 0.47669193148612976, "rewards/strict_format_reward_func/mean": 0.994140625, "rewards/strict_format_reward_func/std": 0.07639661431312561, "step": 92 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1280.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 242.515625, "completions/mean_terminated_length": 238.4470672607422, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.0465, "frac_reward_zero_std": 0.53125, "grad_norm": 0.34622839093208313, "learning_rate": 9.993288199121282e-07, "loss": 0.001, "num_tokens": 25087511.0, "reward": 0.6669921875, "reward_std": 0.09893198311328888, "rewards/correctness_reward_func/mean": 0.3359375, "rewards/correctness_reward_func/std": 0.4727790653705597, "rewards/strict_format_reward_func/mean": 0.998046875, "rewards/strict_format_reward_func/std": 0.04419417306780815, "step": 93 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1165.0, "completions/mean_length": 271.091796875, "completions/mean_terminated_length": 269.1174011230469, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.047, "frac_reward_zero_std": 0.5, "grad_norm": 0.41750165820121765, "learning_rate": 9.992862258467337e-07, "loss": 0.0405, "num_tokens": 25306838.0, "reward": 0.6875, "reward_std": 0.11129043251276016, "rewards/correctness_reward_func/mean": 0.37890625, "rewards/correctness_reward_func/std": 0.4855891764163971, "rewards/strict_format_reward_func/mean": 0.99609375, "rewards/strict_format_reward_func/std": 0.06243881583213806, "step": 94 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1246.0, "completions/mean_length": 281.06640625, "completions/mean_terminated_length": 273.2007751464844, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.0475, "frac_reward_zero_std": 0.640625, "grad_norm": 0.3369617164134979, "learning_rate": 9.992423224614183e-07, "loss": 0.0388, "num_tokens": 25532536.0, "reward": 0.6162109375, "reward_std": 0.08216007798910141, "rewards/correctness_reward_func/mean": 0.25390625, "rewards/correctness_reward_func/std": 0.43567025661468506, "rewards/strict_format_reward_func/mean": 0.978515625, "rewards/strict_format_reward_func/std": 0.14513419568538666, "step": 95 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1171.0, "completions/mean_length": 284.396484375, "completions/mean_terminated_length": 274.5779113769531, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.048, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3071320950984955, "learning_rate": 9.991971098713135e-07, "loss": 0.0062, "num_tokens": 25758363.0, "reward": 0.6513671875, "reward_std": 0.09823831915855408, "rewards/correctness_reward_func/mean": 0.328125, "rewards/correctness_reward_func/std": 0.4699897766113281, "rewards/strict_format_reward_func/mean": 0.974609375, "rewards/strict_format_reward_func/std": 0.15746226906776428, "step": 96 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1060.0, "completions/mean_length": 287.841796875, "completions/mean_terminated_length": 283.95098876953125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.0485, "frac_reward_zero_std": 0.46875, "grad_norm": 0.3856940269470215, "learning_rate": 9.991505881949836e-07, "loss": 0.0094, "num_tokens": 25986954.0, "reward": 0.6591796875, "reward_std": 0.10984852910041809, "rewards/correctness_reward_func/mean": 0.318359375, "rewards/correctness_reward_func/std": 0.46629536151885986, "rewards/strict_format_reward_func/mean": 1.0, "rewards/strict_format_reward_func/std": 0.0, "step": 97 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1237.0, "completions/mean_length": 271.947265625, "completions/mean_terminated_length": 259.99407958984375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.049, "frac_reward_zero_std": 0.453125, "grad_norm": 0.40509501099586487, "learning_rate": 9.991027575544265e-07, "loss": 0.0416, "num_tokens": 26208991.0, "reward": 0.65234375, "reward_std": 0.12657982110977173, "rewards/correctness_reward_func/mean": 0.330078125, "rewards/correctness_reward_func/std": 0.47070086002349854, "rewards/strict_format_reward_func/mean": 0.974609375, "rewards/strict_format_reward_func/std": 0.15746226906776428, "step": 98 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 280.7890625, "completions/mean_terminated_length": 268.94073486328125, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.0495, "frac_reward_zero_std": 0.578125, "grad_norm": 0.33009451627731323, "learning_rate": 9.990536180750723e-07, "loss": 0.0195, "num_tokens": 26434115.0, "reward": 0.662109375, "reward_std": 0.09571787714958191, "rewards/correctness_reward_func/mean": 0.361328125, "rewards/correctness_reward_func/std": 0.48085519671440125, "rewards/strict_format_reward_func/mean": 0.962890625, "rewards/strict_format_reward_func/std": 0.18921469151973724, "step": 99 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 242.962890625, "completions/mean_terminated_length": 230.666015625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.05, "frac_reward_zero_std": 0.453125, "grad_norm": 0.41944050788879395, "learning_rate": 9.990031698857841e-07, "loss": 0.028, "num_tokens": 26637352.0, "reward": 0.7265625, "reward_std": 0.12141813337802887, "rewards/correctness_reward_func/mean": 0.45703125, "rewards/correctness_reward_func/std": 0.49863746762275696, "rewards/strict_format_reward_func/mean": 0.99609375, "rewards/strict_format_reward_func/std": 0.06243881583213806, "step": 100 } ], "logging_steps": 1, "max_steps": 2000, "num_input_tokens_seen": 26637352, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }