{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1251.0, "completions/mean_length": 519.09765625, "completions/mean_terminated_length": 418.0929260253906, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0005, "frac_reward_zero_std": 0.015625, "grad_norm": 0.338766485452652, "learning_rate": 0.0, "loss": 0.078, "num_tokens": 345050.0, "reward": 0.3798828125, "reward_std": 0.28941643238067627, "rewards/correctness_reward_func/mean": 0.185546875, "rewards/correctness_reward_func/std": 0.38912075757980347, "rewards/strict_format_reward_func/mean": 0.57421875, "rewards/strict_format_reward_func/std": 0.4949444830417633, "step": 1 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1264.0, "completions/mean_length": 492.201171875, "completions/mean_terminated_length": 421.8021240234375, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.001, "frac_reward_zero_std": 0.0, "grad_norm": 0.35604149103164673, "learning_rate": 1.6666666666666667e-08, "loss": 0.0504, "num_tokens": 677193.0, "reward": 0.34765625, "reward_std": 0.26424503326416016, "rewards/correctness_reward_func/mean": 0.099609375, "rewards/correctness_reward_func/std": 0.29977133870124817, "rewards/strict_format_reward_func/mean": 0.595703125, "rewards/strict_format_reward_func/std": 0.4912354052066803, "step": 2 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.107421875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 517.6953125, "completions/mean_terminated_length": 425.9518737792969, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.0015, "frac_reward_zero_std": 0.015625, "grad_norm": 0.332509309053421, "learning_rate": 3.3333333333333334e-08, "loss": 0.0796, "num_tokens": 1019981.0, "reward": 0.345703125, "reward_std": 0.2651558816432953, "rewards/correctness_reward_func/mean": 0.125, "rewards/correctness_reward_func/std": 0.3310423493385315, "rewards/strict_format_reward_func/mean": 0.56640625, "rewards/strict_format_reward_func/std": 0.4960552453994751, "step": 3 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1273.0, "completions/mean_length": 495.77734375, "completions/mean_terminated_length": 422.0470275878906, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.002, "frac_reward_zero_std": 0.03125, "grad_norm": 0.32658830285072327, "learning_rate": 5e-08, "loss": 0.0471, "num_tokens": 1354419.0, "reward": 0.373046875, "reward_std": 0.26736605167388916, "rewards/correctness_reward_func/mean": 0.1484375, "rewards/correctness_reward_func/std": 0.35588082671165466, "rewards/strict_format_reward_func/mean": 0.59765625, "rewards/strict_format_reward_func/std": 0.4908501207828522, "step": 4 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.11328125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1262.0, "completions/mean_length": 530.775390625, "completions/mean_terminated_length": 435.0594482421875, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0025, "frac_reward_zero_std": 0.03125, "grad_norm": 0.32245033979415894, "learning_rate": 6.666666666666667e-08, "loss": 0.0435, "num_tokens": 1705264.0, "reward": 0.318359375, "reward_std": 0.25092875957489014, "rewards/correctness_reward_func/mean": 0.103515625, "rewards/correctness_reward_func/std": 0.30492907762527466, "rewards/strict_format_reward_func/mean": 0.533203125, "rewards/strict_format_reward_func/std": 0.4993842542171478, "step": 5 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1279.0, "completions/mean_length": 504.80859375, "completions/mean_terminated_length": 417.1782531738281, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.003, "frac_reward_zero_std": 0.046875, "grad_norm": 0.34851861000061035, "learning_rate": 8.333333333333333e-08, "loss": -0.001, "num_tokens": 2046694.0, "reward": 0.33203125, "reward_std": 0.2571216821670532, "rewards/correctness_reward_func/mean": 0.126953125, "rewards/correctness_reward_func/std": 0.33324605226516724, "rewards/strict_format_reward_func/mean": 0.537109375, "rewards/strict_format_reward_func/std": 0.4991086423397064, "step": 6 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.111328125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1276.0, "completions/mean_length": 515.822265625, "completions/mean_terminated_length": 420.0901184082031, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.0035, "frac_reward_zero_std": 0.03125, "grad_norm": 0.3351835608482361, "learning_rate": 1e-07, "loss": 0.0657, "num_tokens": 2392451.0, "reward": 0.33984375, "reward_std": 0.258058100938797, "rewards/correctness_reward_func/mean": 0.12109375, "rewards/correctness_reward_func/std": 0.3265552520751953, "rewards/strict_format_reward_func/mean": 0.55859375, "rewards/strict_format_reward_func/std": 0.4970405399799347, "step": 7 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.099609375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1229.0, "completions/mean_length": 486.931640625, "completions/mean_terminated_length": 399.1952209472656, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.004, "frac_reward_zero_std": 0.03125, "grad_norm": 0.3637154996395111, "learning_rate": 1.1666666666666667e-07, "loss": 0.1004, "num_tokens": 2722008.0, "reward": 0.345703125, "reward_std": 0.2512314021587372, "rewards/correctness_reward_func/mean": 0.12109375, "rewards/correctness_reward_func/std": 0.3265552520751953, "rewards/strict_format_reward_func/mean": 0.5703125, "rewards/strict_format_reward_func/std": 0.4955156147480011, "step": 8 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1268.0, "completions/mean_length": 499.8046875, "completions/mean_terminated_length": 407.81658935546875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.0045, "frac_reward_zero_std": 0.0625, "grad_norm": 0.31052860617637634, "learning_rate": 1.3333333333333334e-07, "loss": 0.0588, "num_tokens": 3059220.0, "reward": 0.337890625, "reward_std": 0.24443647265434265, "rewards/correctness_reward_func/mean": 0.123046875, "rewards/correctness_reward_func/std": 0.32881227135658264, "rewards/strict_format_reward_func/mean": 0.552734375, "rewards/strict_format_reward_func/std": 0.4976975917816162, "step": 9 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.083984375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1271.0, "completions/mean_length": 482.73046875, "completions/mean_terminated_length": 409.6332702636719, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.005, "frac_reward_zero_std": 0.078125, "grad_norm": 0.3554303050041199, "learning_rate": 1.5e-07, "loss": 0.044, "num_tokens": 3386450.0, "reward": 0.31640625, "reward_std": 0.25127753615379333, "rewards/correctness_reward_func/mean": 0.09765625, "rewards/correctness_reward_func/std": 0.29713961482048035, "rewards/strict_format_reward_func/mean": 0.53515625, "rewards/strict_format_reward_func/std": 0.49925029277801514, "step": 10 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.099609375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 470.484375, "completions/mean_terminated_length": 380.9284362792969, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.0055, "frac_reward_zero_std": 0.0625, "grad_norm": 0.36456865072250366, "learning_rate": 1.6666666666666665e-07, "loss": 0.066, "num_tokens": 3707954.0, "reward": 0.3544921875, "reward_std": 0.2562474310398102, "rewards/correctness_reward_func/mean": 0.1484375, "rewards/correctness_reward_func/std": 0.35588082671165466, "rewards/strict_format_reward_func/mean": 0.560546875, "rewards/strict_format_reward_func/std": 0.49680593609809875, "step": 11 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 480.06640625, "completions/mean_terminated_length": 412.2754211425781, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.006, "frac_reward_zero_std": 0.046875, "grad_norm": 0.36434271931648254, "learning_rate": 1.833333333333333e-07, "loss": 0.0494, "num_tokens": 4033044.0, "reward": 0.3251953125, "reward_std": 0.2645646929740906, "rewards/correctness_reward_func/mean": 0.11328125, "rewards/correctness_reward_func/std": 0.3172462284564972, "rewards/strict_format_reward_func/mean": 0.537109375, "rewards/strict_format_reward_func/std": 0.4991086423397064, "step": 12 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1279.0, "completions/mean_length": 468.708984375, "completions/mean_terminated_length": 396.21063232421875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.0065, "frac_reward_zero_std": 0.015625, "grad_norm": 0.36773481965065, "learning_rate": 2e-07, "loss": 0.053, "num_tokens": 4354615.0, "reward": 0.3193359375, "reward_std": 0.2689654529094696, "rewards/correctness_reward_func/mean": 0.111328125, "rewards/correctness_reward_func/std": 0.31484565138816833, "rewards/strict_format_reward_func/mean": 0.52734375, "rewards/strict_format_reward_func/std": 0.49974003434181213, "step": 13 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.111328125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1260.0, "completions/mean_length": 485.958984375, "completions/mean_terminated_length": 386.4857177734375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.007, "frac_reward_zero_std": 0.046875, "grad_norm": 0.33340662717819214, "learning_rate": 2.1666666666666667e-07, "loss": 0.0758, "num_tokens": 4685202.0, "reward": 0.361328125, "reward_std": 0.2693173885345459, "rewards/correctness_reward_func/mean": 0.154296875, "rewards/correctness_reward_func/std": 0.36158639192581177, "rewards/strict_format_reward_func/mean": 0.568359375, "rewards/strict_format_reward_func/std": 0.4957893490791321, "step": 14 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.115234375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 496.302734375, "completions/mean_terminated_length": 394.2317810058594, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.0075, "frac_reward_zero_std": 0.046875, "grad_norm": 0.31166115403175354, "learning_rate": 2.3333333333333333e-07, "loss": 0.0588, "num_tokens": 5018365.0, "reward": 0.3857421875, "reward_std": 0.25771355628967285, "rewards/correctness_reward_func/mean": 0.15234375, "rewards/correctness_reward_func/std": 0.35970520973205566, "rewards/strict_format_reward_func/mean": 0.619140625, "rewards/strict_format_reward_func/std": 0.48607301712036133, "step": 15 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.095703125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 486.75390625, "completions/mean_terminated_length": 402.8034362792969, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.008, "frac_reward_zero_std": 0.046875, "grad_norm": 0.36958619952201843, "learning_rate": 2.5e-07, "loss": 0.1103, "num_tokens": 5350215.0, "reward": 0.3349609375, "reward_std": 0.24944864213466644, "rewards/correctness_reward_func/mean": 0.083984375, "rewards/correctness_reward_func/std": 0.2776356339454651, "rewards/strict_format_reward_func/mean": 0.5859375, "rewards/strict_format_reward_func/std": 0.49304109811782837, "step": 16 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.083984375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1276.0, "completions/mean_length": 487.0859375, "completions/mean_terminated_length": 414.3880615234375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0085, "frac_reward_zero_std": 0.015625, "grad_norm": 0.3691529631614685, "learning_rate": 2.6666666666666667e-07, "loss": 0.0438, "num_tokens": 5679539.0, "reward": 0.365234375, "reward_std": 0.2718730568885803, "rewards/correctness_reward_func/mean": 0.134765625, "rewards/correctness_reward_func/std": 0.3418070077896118, "rewards/strict_format_reward_func/mean": 0.595703125, "rewards/strict_format_reward_func/std": 0.4912354052066803, "step": 17 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.146484375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1249.0, "completions/mean_length": 539.380859375, "completions/mean_terminated_length": 412.27227783203125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.009, "frac_reward_zero_std": 0.0625, "grad_norm": 0.2965885400772095, "learning_rate": 2.833333333333333e-07, "loss": 0.0626, "num_tokens": 6042406.0, "reward": 0.3232421875, "reward_std": 0.25863024592399597, "rewards/correctness_reward_func/mean": 0.13671875, "rewards/correctness_reward_func/std": 0.3438861668109894, "rewards/strict_format_reward_func/mean": 0.509765625, "rewards/strict_format_reward_func/std": 0.5003935098648071, "step": 18 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1275.0, "completions/mean_length": 485.21875, "completions/mean_terminated_length": 403.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.0095, "frac_reward_zero_std": 0.046875, "grad_norm": 0.38167980313301086, "learning_rate": 3e-07, "loss": 0.0318, "num_tokens": 6369486.0, "reward": 0.3876953125, "reward_std": 0.262542188167572, "rewards/correctness_reward_func/mean": 0.15625, "rewards/correctness_reward_func/std": 0.36344730854034424, "rewards/strict_format_reward_func/mean": 0.619140625, "rewards/strict_format_reward_func/std": 0.48607301712036133, "step": 19 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.107421875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1277.0, "completions/mean_length": 508.498046875, "completions/mean_terminated_length": 415.647705078125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.01, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3474893867969513, "learning_rate": 3.166666666666666e-07, "loss": 0.0142, "num_tokens": 6712861.0, "reward": 0.3330078125, "reward_std": 0.24650248885154724, "rewards/correctness_reward_func/mean": 0.099609375, "rewards/correctness_reward_func/std": 0.29977133870124817, "rewards/strict_format_reward_func/mean": 0.56640625, "rewards/strict_format_reward_func/std": 0.4960552453994751, "step": 20 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1272.0, "completions/mean_length": 514.66015625, "completions/mean_terminated_length": 420.6710510253906, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.0105, "frac_reward_zero_std": 0.03125, "grad_norm": 0.3415084779262543, "learning_rate": 3.333333333333333e-07, "loss": 0.1265, "num_tokens": 7054527.0, "reward": 0.375, "reward_std": 0.26221993565559387, "rewards/correctness_reward_func/mean": 0.162109375, "rewards/correctness_reward_func/std": 0.3689115643501282, "rewards/strict_format_reward_func/mean": 0.587890625, "rewards/strict_format_reward_func/std": 0.49269601702690125, "step": 21 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.111328125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 525.611328125, "completions/mean_terminated_length": 431.1054992675781, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.011, "frac_reward_zero_std": 0.015625, "grad_norm": 0.31360530853271484, "learning_rate": 3.5e-07, "loss": 0.0387, "num_tokens": 7407728.0, "reward": 0.326171875, "reward_std": 0.24292460083961487, "rewards/correctness_reward_func/mean": 0.068359375, "rewards/correctness_reward_func/std": 0.25260838866233826, "rewards/strict_format_reward_func/mean": 0.583984375, "rewards/strict_format_reward_func/std": 0.493378221988678, "step": 22 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.11328125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 530.44140625, "completions/mean_terminated_length": 434.68280029296875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.0115, "frac_reward_zero_std": 0.03125, "grad_norm": 0.3227068781852722, "learning_rate": 3.666666666666666e-07, "loss": 0.0178, "num_tokens": 7758946.0, "reward": 0.3701171875, "reward_std": 0.25999653339385986, "rewards/correctness_reward_func/mean": 0.11328125, "rewards/correctness_reward_func/std": 0.3172462284564972, "rewards/strict_format_reward_func/mean": 0.626953125, "rewards/strict_format_reward_func/std": 0.48408737778663635, "step": 23 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.119140625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1267.0, "completions/mean_length": 489.134765625, "completions/mean_terminated_length": 382.1662902832031, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.012, "frac_reward_zero_std": 0.046875, "grad_norm": 0.3339863121509552, "learning_rate": 3.8333333333333335e-07, "loss": 0.0657, "num_tokens": 8089551.0, "reward": 0.3837890625, "reward_std": 0.2371073067188263, "rewards/correctness_reward_func/mean": 0.150390625, "rewards/correctness_reward_func/std": 0.35780346393585205, "rewards/strict_format_reward_func/mean": 0.6171875, "rewards/strict_format_reward_func/std": 0.486548513174057, "step": 24 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1273.0, "completions/mean_length": 478.623046875, "completions/mean_terminated_length": 391.8939514160156, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.0125, "frac_reward_zero_std": 0.0, "grad_norm": 0.3568263351917267, "learning_rate": 4e-07, "loss": 0.0492, "num_tokens": 8414726.0, "reward": 0.373046875, "reward_std": 0.27422550320625305, "rewards/correctness_reward_func/mean": 0.125, "rewards/correctness_reward_func/std": 0.3310423493385315, "rewards/strict_format_reward_func/mean": 0.62109375, "rewards/strict_format_reward_func/std": 0.4855891764163971, "step": 25 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 480.068359375, "completions/mean_terminated_length": 381.8311462402344, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.013, "frac_reward_zero_std": 0.09375, "grad_norm": 0.31916946172714233, "learning_rate": 4.1666666666666667e-07, "loss": 0.0776, "num_tokens": 8744809.0, "reward": 0.3720703125, "reward_std": 0.26208120584487915, "rewards/correctness_reward_func/mean": 0.162109375, "rewards/correctness_reward_func/std": 0.3689115643501282, "rewards/strict_format_reward_func/mean": 0.58203125, "rewards/strict_format_reward_func/std": 0.4937073290348053, "step": 26 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1276.0, "completions/mean_length": 495.048828125, "completions/mean_terminated_length": 410.097412109375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.0135, "frac_reward_zero_std": 0.03125, "grad_norm": 0.3349536955356598, "learning_rate": 4.3333333333333335e-07, "loss": 0.07, "num_tokens": 9081114.0, "reward": 0.3876953125, "reward_std": 0.2574464678764343, "rewards/correctness_reward_func/mean": 0.1328125, "rewards/correctness_reward_func/std": 0.33970388770103455, "rewards/strict_format_reward_func/mean": 0.642578125, "rewards/strict_format_reward_func/std": 0.4797092080116272, "step": 27 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1269.0, "completions/mean_length": 476.123046875, "completions/mean_terminated_length": 400.5448913574219, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.014, "frac_reward_zero_std": 0.046875, "grad_norm": 0.36770424246788025, "learning_rate": 4.5e-07, "loss": 0.0595, "num_tokens": 9407537.0, "reward": 0.3681640625, "reward_std": 0.2513667941093445, "rewards/correctness_reward_func/mean": 0.10546875, "rewards/correctness_reward_func/std": 0.3074568510055542, "rewards/strict_format_reward_func/mean": 0.630859375, "rewards/strict_format_reward_func/std": 0.4830440282821655, "step": 28 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1262.0, "completions/mean_length": 478.572265625, "completions/mean_terminated_length": 395.66595458984375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.0145, "frac_reward_zero_std": 0.078125, "grad_norm": 0.34928959608078003, "learning_rate": 4.6666666666666666e-07, "loss": 0.0713, "num_tokens": 9730062.0, "reward": 0.4130859375, "reward_std": 0.245479553937912, "rewards/correctness_reward_func/mean": 0.12890625, "rewards/correctness_reward_func/std": 0.33542385697364807, "rewards/strict_format_reward_func/mean": 0.697265625, "rewards/strict_format_reward_func/std": 0.45989060401916504, "step": 29 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1273.0, "completions/mean_length": 475.310546875, "completions/mean_terminated_length": 395.877685546875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.015, "frac_reward_zero_std": 0.046875, "grad_norm": 0.3654455542564392, "learning_rate": 4.833333333333333e-07, "loss": 0.0367, "num_tokens": 10054341.0, "reward": 0.4169921875, "reward_std": 0.25523918867111206, "rewards/correctness_reward_func/mean": 0.14453125, "rewards/correctness_reward_func/std": 0.35197147727012634, "rewards/strict_format_reward_func/mean": 0.689453125, "rewards/strict_format_reward_func/std": 0.46317005157470703, "step": 30 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.095703125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1262.0, "completions/mean_length": 478.076171875, "completions/mean_terminated_length": 393.20733642578125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.0155, "frac_reward_zero_std": 0.015625, "grad_norm": 0.3683274984359741, "learning_rate": 5e-07, "loss": 0.0631, "num_tokens": 10379580.0, "reward": 0.4091796875, "reward_std": 0.2527580261230469, "rewards/correctness_reward_func/mean": 0.115234375, "rewards/correctness_reward_func/std": 0.3196168541908264, "rewards/strict_format_reward_func/mean": 0.703125, "rewards/strict_format_reward_func/std": 0.45732781291007996, "step": 31 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.107421875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1233.0, "completions/mean_length": 510.697265625, "completions/mean_terminated_length": 418.1116027832031, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.016, "frac_reward_zero_std": 0.140625, "grad_norm": 0.309796541929245, "learning_rate": 5.166666666666667e-07, "loss": 0.0752, "num_tokens": 10719009.0, "reward": 0.4140625, "reward_std": 0.21483345329761505, "rewards/correctness_reward_func/mean": 0.1171875, "rewards/correctness_reward_func/std": 0.32195815443992615, "rewards/strict_format_reward_func/mean": 0.7109375, "rewards/strict_format_reward_func/std": 0.45377036929130554, "step": 32 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1253.0, "completions/mean_length": 463.578125, "completions/mean_terminated_length": 375.22076416015625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.0165, "frac_reward_zero_std": 0.078125, "grad_norm": 0.3456345796585083, "learning_rate": 5.333333333333333e-07, "loss": 0.0511, "num_tokens": 11033809.0, "reward": 0.4267578125, "reward_std": 0.24420523643493652, "rewards/correctness_reward_func/mean": 0.13671875, "rewards/correctness_reward_func/std": 0.3438861668109894, "rewards/strict_format_reward_func/mean": 0.716796875, "rewards/strict_format_reward_func/std": 0.4509948492050171, "step": 33 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1268.0, "completions/mean_length": 433.3203125, "completions/mean_terminated_length": 349.74249267578125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.017, "frac_reward_zero_std": 0.0625, "grad_norm": 0.39212310314178467, "learning_rate": 5.5e-07, "loss": 0.0611, "num_tokens": 11337629.0, "reward": 0.4404296875, "reward_std": 0.2442634403705597, "rewards/correctness_reward_func/mean": 0.125, "rewards/correctness_reward_func/std": 0.3310423493385315, "rewards/strict_format_reward_func/mean": 0.755859375, "rewards/strict_format_reward_func/std": 0.42999663949012756, "step": 34 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.123046875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1279.0, "completions/mean_length": 509.75, "completions/mean_terminated_length": 401.6748352050781, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.0175, "frac_reward_zero_std": 0.078125, "grad_norm": 0.3314379155635834, "learning_rate": 5.666666666666666e-07, "loss": 0.1008, "num_tokens": 11679389.0, "reward": 0.4677734375, "reward_std": 0.2392577975988388, "rewards/correctness_reward_func/mean": 0.138671875, "rewards/correctness_reward_func/std": 0.34594178199768066, "rewards/strict_format_reward_func/mean": 0.796875, "rewards/strict_format_reward_func/std": 0.4027182459831238, "step": 35 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.126953125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1277.0, "completions/mean_length": 522.14453125, "completions/mean_terminated_length": 411.94183349609375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.018, "frac_reward_zero_std": 0.125, "grad_norm": 0.3079322576522827, "learning_rate": 5.833333333333334e-07, "loss": 0.0629, "num_tokens": 12026983.0, "reward": 0.453125, "reward_std": 0.20784910023212433, "rewards/correctness_reward_func/mean": 0.095703125, "rewards/correctness_reward_func/std": 0.2944713830947876, "rewards/strict_format_reward_func/mean": 0.810546875, "rewards/strict_format_reward_func/std": 0.3922513723373413, "step": 36 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1254.0, "completions/mean_length": 434.671875, "completions/mean_terminated_length": 363.0339050292969, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.0185, "frac_reward_zero_std": 0.171875, "grad_norm": 0.34872007369995117, "learning_rate": 6e-07, "loss": 0.1292, "num_tokens": 12325079.0, "reward": 0.5009765625, "reward_std": 0.204152449965477, "rewards/correctness_reward_func/mean": 0.1484375, "rewards/correctness_reward_func/std": 0.35588082671165466, "rewards/strict_format_reward_func/mean": 0.853515625, "rewards/strict_format_reward_func/std": 0.35393697023391724, "step": 37 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1271.0, "completions/mean_length": 433.001953125, "completions/mean_terminated_length": 353.36968994140625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.019, "frac_reward_zero_std": 0.171875, "grad_norm": 0.31309986114501953, "learning_rate": 6.166666666666667e-07, "loss": 0.0508, "num_tokens": 12627696.0, "reward": 0.48828125, "reward_std": 0.1888417899608612, "rewards/correctness_reward_func/mean": 0.109375, "rewards/correctness_reward_func/std": 0.31241437792778015, "rewards/strict_format_reward_func/mean": 0.8671875, "rewards/strict_format_reward_func/std": 0.33970388770103455, "step": 38 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1270.0, "completions/mean_length": 424.388671875, "completions/mean_terminated_length": 359.6785888671875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.0195, "frac_reward_zero_std": 0.09375, "grad_norm": 0.3900044560432434, "learning_rate": 6.333333333333332e-07, "loss": 0.1074, "num_tokens": 12926895.0, "reward": 0.4833984375, "reward_std": 0.2132570892572403, "rewards/correctness_reward_func/mean": 0.134765625, "rewards/correctness_reward_func/std": 0.3418070077896118, "rewards/strict_format_reward_func/mean": 0.83203125, "rewards/strict_format_reward_func/std": 0.374204158782959, "step": 39 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.103515625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1254.0, "completions/mean_length": 464.607421875, "completions/mean_terminated_length": 370.455322265625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.02, "frac_reward_zero_std": 0.140625, "grad_norm": 0.3177374601364136, "learning_rate": 6.5e-07, "loss": 0.0991, "num_tokens": 13245982.0, "reward": 0.501953125, "reward_std": 0.20007643103599548, "rewards/correctness_reward_func/mean": 0.126953125, "rewards/correctness_reward_func/std": 0.33324605226516724, "rewards/strict_format_reward_func/mean": 0.876953125, "rewards/strict_format_reward_func/std": 0.32881227135658264, "step": 40 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.095703125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 448.64453125, "completions/mean_terminated_length": 360.660888671875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.0205, "frac_reward_zero_std": 0.140625, "grad_norm": 0.31568291783332825, "learning_rate": 6.666666666666666e-07, "loss": 0.106, "num_tokens": 13554360.0, "reward": 0.4990234375, "reward_std": 0.19699305295944214, "rewards/correctness_reward_func/mean": 0.12890625, "rewards/correctness_reward_func/std": 0.33542385697364807, "rewards/strict_format_reward_func/mean": 0.869140625, "rewards/strict_format_reward_func/std": 0.33757632970809937, "step": 41 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1269.0, "completions/mean_length": 443.26171875, "completions/mean_terminated_length": 383.7447509765625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.021, "frac_reward_zero_std": 0.234375, "grad_norm": 0.28603100776672363, "learning_rate": 6.833333333333333e-07, "loss": 0.0884, "num_tokens": 13860294.0, "reward": 0.50390625, "reward_std": 0.17007258534431458, "rewards/correctness_reward_func/mean": 0.107421875, "rewards/correctness_reward_func/std": 0.30995169281959534, "rewards/strict_format_reward_func/mean": 0.900390625, "rewards/strict_format_reward_func/std": 0.29977133870124817, "step": 42 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1261.0, "completions/mean_length": 410.166015625, "completions/mean_terminated_length": 348.29498291015625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0215, "frac_reward_zero_std": 0.234375, "grad_norm": 0.3239445388317108, "learning_rate": 7e-07, "loss": 0.1053, "num_tokens": 14150835.0, "reward": 0.505859375, "reward_std": 0.17652790248394012, "rewards/correctness_reward_func/mean": 0.1484375, "rewards/correctness_reward_func/std": 0.35588082671165466, "rewards/strict_format_reward_func/mean": 0.86328125, "rewards/strict_format_reward_func/std": 0.3438861668109894, "step": 43 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.064453125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1249.0, "completions/mean_length": 396.798828125, "completions/mean_terminated_length": 335.9519958496094, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.022, "frac_reward_zero_std": 0.1875, "grad_norm": 0.3426041603088379, "learning_rate": 7.166666666666667e-07, "loss": 0.1241, "num_tokens": 14434852.0, "reward": 0.5283203125, "reward_std": 0.18634288012981415, "rewards/correctness_reward_func/mean": 0.1484375, "rewards/correctness_reward_func/std": 0.35588082671165466, "rewards/strict_format_reward_func/mean": 0.908203125, "rewards/strict_format_reward_func/std": 0.289021372795105, "step": 44 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1273.0, "completions/mean_length": 400.2890625, "completions/mean_terminated_length": 337.7154846191406, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.0225, "frac_reward_zero_std": 0.25, "grad_norm": 0.3275441527366638, "learning_rate": 7.333333333333332e-07, "loss": 0.0845, "num_tokens": 14721088.0, "reward": 0.5400390625, "reward_std": 0.16695258021354675, "rewards/correctness_reward_func/mean": 0.162109375, "rewards/correctness_reward_func/std": 0.3689115643501282, "rewards/strict_format_reward_func/mean": 0.91796875, "rewards/strict_format_reward_func/std": 0.2746807038784027, "step": 45 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.083984375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1259.0, "completions/mean_length": 395.701171875, "completions/mean_terminated_length": 314.624755859375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.023, "frac_reward_zero_std": 0.328125, "grad_norm": 0.3019014894962311, "learning_rate": 7.5e-07, "loss": 0.072, "num_tokens": 15002687.0, "reward": 0.5380859375, "reward_std": 0.14848747849464417, "rewards/correctness_reward_func/mean": 0.1484375, "rewards/correctness_reward_func/std": 0.35588082671165466, "rewards/strict_format_reward_func/mean": 0.927734375, "rewards/strict_format_reward_func/std": 0.2591804563999176, "step": 46 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1275.0, "completions/mean_length": 409.96875, "completions/mean_terminated_length": 351.9666748046875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.0235, "frac_reward_zero_std": 0.25, "grad_norm": 0.3243018686771393, "learning_rate": 7.666666666666667e-07, "loss": 0.1358, "num_tokens": 15292767.0, "reward": 0.5087890625, "reward_std": 0.16159790754318237, "rewards/correctness_reward_func/mean": 0.103515625, "rewards/correctness_reward_func/std": 0.30492907762527466, "rewards/strict_format_reward_func/mean": 0.9140625, "rewards/strict_format_reward_func/std": 0.28054583072662354, "step": 47 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.068359375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 377.39453125, "completions/mean_terminated_length": 311.1656188964844, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.024, "frac_reward_zero_std": 0.296875, "grad_norm": 0.3503972291946411, "learning_rate": 7.833333333333333e-07, "loss": 0.0887, "num_tokens": 15570681.0, "reward": 0.51953125, "reward_std": 0.15567830204963684, "rewards/correctness_reward_func/mean": 0.12890625, "rewards/correctness_reward_func/std": 0.33542385697364807, "rewards/strict_format_reward_func/mean": 0.91015625, "rewards/strict_format_reward_func/std": 0.2862374484539032, "step": 48 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.044921875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1260.0, "completions/mean_length": 310.064453125, "completions/mean_terminated_length": 264.4437561035156, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.0245, "frac_reward_zero_std": 0.375, "grad_norm": 0.32057884335517883, "learning_rate": 8e-07, "loss": 0.0729, "num_tokens": 15808418.0, "reward": 0.580078125, "reward_std": 0.15646106004714966, "rewards/correctness_reward_func/mean": 0.22265625, "rewards/correctness_reward_func/std": 0.41643625497817993, "rewards/strict_format_reward_func/mean": 0.9375, "rewards/strict_format_reward_func/std": 0.2422981858253479, "step": 49 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1209.0, "completions/mean_length": 296.234375, "completions/mean_terminated_length": 252.06529235839844, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.025, "frac_reward_zero_std": 0.359375, "grad_norm": 0.3681601881980896, "learning_rate": 8.166666666666666e-07, "loss": 0.1429, "num_tokens": 16038850.0, "reward": 0.56640625, "reward_std": 0.14544308185577393, "rewards/correctness_reward_func/mean": 0.169921875, "rewards/correctness_reward_func/std": 0.3759314715862274, "rewards/strict_format_reward_func/mean": 0.962890625, "rewards/strict_format_reward_func/std": 0.18921469151973724, "step": 50 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1263.0, "completions/mean_length": 335.375, "completions/mean_terminated_length": 296.9755859375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.0255, "frac_reward_zero_std": 0.359375, "grad_norm": 0.3586282730102539, "learning_rate": 8.333333333333333e-07, "loss": 0.1065, "num_tokens": 16293666.0, "reward": 0.5302734375, "reward_std": 0.13623127341270447, "rewards/correctness_reward_func/mean": 0.115234375, "rewards/correctness_reward_func/std": 0.3196168541908264, "rewards/strict_format_reward_func/mean": 0.9453125, "rewards/strict_format_reward_func/std": 0.2275916188955307, "step": 51 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1215.0, "completions/mean_length": 269.9765625, "completions/mean_terminated_length": 243.663330078125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.026, "frac_reward_zero_std": 0.328125, "grad_norm": 0.3876008689403534, "learning_rate": 8.499999999999999e-07, "loss": 0.0597, "num_tokens": 16510590.0, "reward": 0.6083984375, "reward_std": 0.15176597237586975, "rewards/correctness_reward_func/mean": 0.244140625, "rewards/correctness_reward_func/std": 0.42999663949012756, "rewards/strict_format_reward_func/mean": 0.97265625, "rewards/strict_format_reward_func/std": 0.16324250400066376, "step": 52 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.041015625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1277.0, "completions/mean_length": 297.580078125, "completions/mean_terminated_length": 255.5621337890625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.0265, "frac_reward_zero_std": 0.3125, "grad_norm": 0.42054447531700134, "learning_rate": 8.666666666666667e-07, "loss": 0.1005, "num_tokens": 16743239.0, "reward": 0.5927734375, "reward_std": 0.16221949458122253, "rewards/correctness_reward_func/mean": 0.248046875, "rewards/correctness_reward_func/std": 0.4323015511035919, "rewards/strict_format_reward_func/mean": 0.9375, "rewards/strict_format_reward_func/std": 0.2422981858253479, "step": 53 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1115.0, "completions/mean_length": 242.103515625, "completions/mean_terminated_length": 225.6289825439453, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.027, "frac_reward_zero_std": 0.34375, "grad_norm": 0.4196186661720276, "learning_rate": 8.833333333333333e-07, "loss": 0.0946, "num_tokens": 16944868.0, "reward": 0.591796875, "reward_std": 0.14952966570854187, "rewards/correctness_reward_func/mean": 0.2109375, "rewards/correctness_reward_func/std": 0.4083731174468994, "rewards/strict_format_reward_func/mean": 0.97265625, "rewards/strict_format_reward_func/std": 0.16324250400066376, "step": 54 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1132.0, "completions/mean_length": 271.400390625, "completions/mean_terminated_length": 251.30877685546875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.0275, "frac_reward_zero_std": 0.359375, "grad_norm": 0.36686980724334717, "learning_rate": 9e-07, "loss": 0.0488, "num_tokens": 17166681.0, "reward": 0.5830078125, "reward_std": 0.14440418779850006, "rewards/correctness_reward_func/mean": 0.1875, "rewards/correctness_reward_func/std": 0.39069411158561707, "rewards/strict_format_reward_func/mean": 0.978515625, "rewards/strict_format_reward_func/std": 0.14513419568538666, "step": 55 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1196.0, "completions/mean_length": 268.162109375, "completions/mean_terminated_length": 245.94610595703125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.028, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3979751765727997, "learning_rate": 9.166666666666665e-07, "loss": 0.062, "num_tokens": 17384156.0, "reward": 0.580078125, "reward_std": 0.1604653298854828, "rewards/correctness_reward_func/mean": 0.19140625, "rewards/correctness_reward_func/std": 0.3937928080558777, "rewards/strict_format_reward_func/mean": 0.96875, "rewards/strict_format_reward_func/std": 0.17416280508041382, "step": 56 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1272.0, "completions/mean_length": 244.208984375, "completions/mean_terminated_length": 231.9268798828125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.0285, "frac_reward_zero_std": 0.515625, "grad_norm": 0.3326975703239441, "learning_rate": 9.333333333333333e-07, "loss": -0.0184, "num_tokens": 17589991.0, "reward": 0.5966796875, "reward_std": 0.10863800346851349, "rewards/correctness_reward_func/mean": 0.2109375, "rewards/correctness_reward_func/std": 0.4083731174468994, "rewards/strict_format_reward_func/mean": 0.982421875, "rewards/strict_format_reward_func/std": 0.13154059648513794, "step": 57 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1200.0, "completions/mean_length": 241.818359375, "completions/mean_terminated_length": 221.137451171875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.029, "frac_reward_zero_std": 0.484375, "grad_norm": 0.41635575890541077, "learning_rate": 9.499999999999999e-07, "loss": 0.0759, "num_tokens": 17792898.0, "reward": 0.5869140625, "reward_std": 0.1135234534740448, "rewards/correctness_reward_func/mean": 0.1875, "rewards/correctness_reward_func/std": 0.39069411158561707, "rewards/strict_format_reward_func/mean": 0.986328125, "rewards/strict_format_reward_func/std": 0.1162383034825325, "step": 58 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1252.0, "completions/mean_length": 265.177734375, "completions/mean_terminated_length": 240.82200622558594, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.0295, "frac_reward_zero_std": 0.4375, "grad_norm": 0.3422642946243286, "learning_rate": 9.666666666666666e-07, "loss": 0.0449, "num_tokens": 18008573.0, "reward": 0.6044921875, "reward_std": 0.1342127025127411, "rewards/correctness_reward_func/mean": 0.220703125, "rewards/correctness_reward_func/std": 0.4151262938976288, "rewards/strict_format_reward_func/mean": 0.98828125, "rewards/strict_format_reward_func/std": 0.10772226005792618, "step": 59 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1263.0, "completions/mean_length": 228.244140625, "completions/mean_terminated_length": 219.9626007080078, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.03, "frac_reward_zero_std": 0.46875, "grad_norm": 0.32586634159088135, "learning_rate": 9.833333333333332e-07, "loss": 0.0218, "num_tokens": 18202290.0, "reward": 0.5986328125, "reward_std": 0.12520861625671387, "rewards/correctness_reward_func/mean": 0.228515625, "rewards/correctness_reward_func/std": 0.4202871024608612, "rewards/strict_format_reward_func/mean": 0.96875, "rewards/strict_format_reward_func/std": 0.17416280508041382, "step": 60 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1086.0, "completions/mean_length": 235.298828125, "completions/mean_terminated_length": 222.91107177734375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.0305, "frac_reward_zero_std": 0.375, "grad_norm": 0.37623488903045654, "learning_rate": 1e-06, "loss": 0.0071, "num_tokens": 18404795.0, "reward": 0.58203125, "reward_std": 0.14406809210777283, "rewards/correctness_reward_func/mean": 0.18359375, "rewards/correctness_reward_func/std": 0.3875311613082886, "rewards/strict_format_reward_func/mean": 0.98046875, "rewards/strict_format_reward_func/std": 0.1385180652141571, "step": 61 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 223.65234375, "completions/mean_terminated_length": 217.42633056640625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.031, "frac_reward_zero_std": 0.46875, "grad_norm": 0.3536680042743683, "learning_rate": 9.999993444041445e-07, "loss": 0.017, "num_tokens": 18598601.0, "reward": 0.64453125, "reward_std": 0.12280106544494629, "rewards/correctness_reward_func/mean": 0.30078125, "rewards/correctness_reward_func/std": 0.45904624462127686, "rewards/strict_format_reward_func/mean": 0.98828125, "rewards/strict_format_reward_func/std": 0.10772226005792618, "step": 62 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1246.0, "completions/mean_length": 248.73828125, "completions/mean_terminated_length": 236.5098876953125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.0315, "frac_reward_zero_std": 0.34375, "grad_norm": 0.4249816834926605, "learning_rate": 9.99997377618298e-07, "loss": 0.0152, "num_tokens": 18805451.0, "reward": 0.611328125, "reward_std": 0.13816098868846893, "rewards/correctness_reward_func/mean": 0.25390625, "rewards/correctness_reward_func/std": 0.43567025661468506, "rewards/strict_format_reward_func/mean": 0.96875, "rewards/strict_format_reward_func/std": 0.17416280508041382, "step": 63 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1268.0, "completions/mean_length": 290.15234375, "completions/mean_terminated_length": 268.4191589355469, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.032, "frac_reward_zero_std": 0.484375, "grad_norm": 0.3467140793800354, "learning_rate": 9.999940996476175e-07, "loss": 0.0502, "num_tokens": 19040769.0, "reward": 0.5927734375, "reward_std": 0.12330609560012817, "rewards/correctness_reward_func/mean": 0.216796875, "rewards/correctness_reward_func/std": 0.4124660789966583, "rewards/strict_format_reward_func/mean": 0.96875, "rewards/strict_format_reward_func/std": 0.17416280508041382, "step": 64 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1221.0, "completions/mean_length": 279.16015625, "completions/mean_terminated_length": 265.2871398925781, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0325, "frac_reward_zero_std": 0.359375, "grad_norm": 0.3559761047363281, "learning_rate": 9.999895105006994e-07, "loss": 0.0552, "num_tokens": 19265571.0, "reward": 0.623046875, "reward_std": 0.14418897032737732, "rewards/correctness_reward_func/mean": 0.275390625, "rewards/correctness_reward_func/std": 0.44714778661727905, "rewards/strict_format_reward_func/mean": 0.970703125, "rewards/strict_format_reward_func/std": 0.16880230605602264, "step": 65 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1210.0, "completions/mean_length": 246.076171875, "completions/mean_terminated_length": 229.66470336914062, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.033, "frac_reward_zero_std": 0.375, "grad_norm": 0.3859092593193054, "learning_rate": 9.99983610189578e-07, "loss": 0.0579, "num_tokens": 19470018.0, "reward": 0.6279296875, "reward_std": 0.13720150291919708, "rewards/correctness_reward_func/mean": 0.26953125, "rewards/correctness_reward_func/std": 0.44415023922920227, "rewards/strict_format_reward_func/mean": 0.986328125, "rewards/strict_format_reward_func/std": 0.1162383034825325, "step": 66 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1112.0, "completions/mean_length": 251.421875, "completions/mean_terminated_length": 228.83831787109375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.0335, "frac_reward_zero_std": 0.484375, "grad_norm": 0.36630764603614807, "learning_rate": 9.999763987297264e-07, "loss": 0.0476, "num_tokens": 19677810.0, "reward": 0.619140625, "reward_std": 0.1067921370267868, "rewards/correctness_reward_func/mean": 0.26953125, "rewards/correctness_reward_func/std": 0.44415023922920227, "rewards/strict_format_reward_func/mean": 0.96875, "rewards/strict_format_reward_func/std": 0.17416280508041382, "step": 67 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1172.0, "completions/mean_length": 240.859375, "completions/mean_terminated_length": 222.26638793945312, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.034, "frac_reward_zero_std": 0.328125, "grad_norm": 0.38380587100982666, "learning_rate": 9.999678761400562e-07, "loss": 0.0692, "num_tokens": 19883234.0, "reward": 0.630859375, "reward_std": 0.15035656094551086, "rewards/correctness_reward_func/mean": 0.2890625, "rewards/correctness_reward_func/std": 0.45377036929130554, "rewards/strict_format_reward_func/mean": 0.97265625, "rewards/strict_format_reward_func/std": 0.16324250400066376, "step": 68 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1162.0, "completions/mean_length": 234.337890625, "completions/mean_terminated_length": 224.02565002441406, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.0345, "frac_reward_zero_std": 0.390625, "grad_norm": 0.35500645637512207, "learning_rate": 9.999580424429159e-07, "loss": 0.0483, "num_tokens": 20083247.0, "reward": 0.6640625, "reward_std": 0.1348668485879898, "rewards/correctness_reward_func/mean": 0.33203125, "rewards/correctness_reward_func/std": 0.47140273451805115, "rewards/strict_format_reward_func/mean": 0.99609375, "rewards/strict_format_reward_func/std": 0.06243881583213806, "step": 69 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1073.0, "completions/mean_length": 265.732421875, "completions/mean_terminated_length": 259.7544250488281, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.035, "frac_reward_zero_std": 0.578125, "grad_norm": 0.27369487285614014, "learning_rate": 9.999468976640939e-07, "loss": 0.0461, "num_tokens": 20300998.0, "reward": 0.5791015625, "reward_std": 0.09376809000968933, "rewards/correctness_reward_func/mean": 0.18359375, "rewards/correctness_reward_func/std": 0.3875311613082886, "rewards/strict_format_reward_func/mean": 0.974609375, "rewards/strict_format_reward_func/std": 0.15746226906776428, "step": 70 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1039.0, "completions/mean_length": 233.96875, "completions/mean_terminated_length": 215.25247192382812, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.0355, "frac_reward_zero_std": 0.4375, "grad_norm": 0.4125555753707886, "learning_rate": 9.99934441832816e-07, "loss": 0.0956, "num_tokens": 20497414.0, "reward": 0.625, "reward_std": 0.12506619095802307, "rewards/correctness_reward_func/mean": 0.263671875, "rewards/correctness_reward_func/std": 0.4410543739795685, "rewards/strict_format_reward_func/mean": 0.986328125, "rewards/strict_format_reward_func/std": 0.1162383034825325, "step": 71 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 271.978515625, "completions/mean_terminated_length": 247.7860107421875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.036, "frac_reward_zero_std": 0.453125, "grad_norm": 0.34837380051612854, "learning_rate": 9.99920674981746e-07, "loss": 0.1279, "num_tokens": 20720507.0, "reward": 0.6103515625, "reward_std": 0.11888537555932999, "rewards/correctness_reward_func/mean": 0.24609375, "rewards/correctness_reward_func/std": 0.4311550557613373, "rewards/strict_format_reward_func/mean": 0.974609375, "rewards/strict_format_reward_func/std": 0.15746226906776428, "step": 72 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1210.0, "completions/mean_length": 228.228515625, "completions/mean_terminated_length": 224.1039276123047, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.0365, "frac_reward_zero_std": 0.390625, "grad_norm": 0.39599257707595825, "learning_rate": 9.999055971469863e-07, "loss": 0.0224, "num_tokens": 20915896.0, "reward": 0.673828125, "reward_std": 0.13572058081626892, "rewards/correctness_reward_func/mean": 0.357421875, "rewards/correctness_reward_func/std": 0.4797092080116272, "rewards/strict_format_reward_func/mean": 0.990234375, "rewards/strict_format_reward_func/std": 0.09843364357948303, "step": 73 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1256.0, "completions/mean_length": 271.587890625, "completions/mean_terminated_length": 261.64300537109375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.037, "frac_reward_zero_std": 0.515625, "grad_norm": 0.35442784428596497, "learning_rate": 9.998892083680762e-07, "loss": 0.0349, "num_tokens": 21138349.0, "reward": 0.6015625, "reward_std": 0.11092907190322876, "rewards/correctness_reward_func/mean": 0.23828125, "rewards/correctness_reward_func/std": 0.42644867300987244, "rewards/strict_format_reward_func/mean": 0.96484375, "rewards/strict_format_reward_func/std": 0.1843547374010086, "step": 74 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1141.0, "completions/mean_length": 218.25, "completions/mean_terminated_length": 216.17221069335938, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.0375, "frac_reward_zero_std": 0.34375, "grad_norm": 0.4667811393737793, "learning_rate": 9.998715086879935e-07, "loss": 0.0262, "num_tokens": 21327421.0, "reward": 0.66796875, "reward_std": 0.1432858556509018, "rewards/correctness_reward_func/mean": 0.341796875, "rewards/correctness_reward_func/std": 0.4747757613658905, "rewards/strict_format_reward_func/mean": 0.994140625, "rewards/strict_format_reward_func/std": 0.07639661431312561, "step": 75 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1202.0, "completions/mean_length": 227.681640625, "completions/mean_terminated_length": 215.2035675048828, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.038, "frac_reward_zero_std": 0.546875, "grad_norm": 0.3435099124908447, "learning_rate": 9.99852498153154e-07, "loss": 0.041, "num_tokens": 21521426.0, "reward": 0.6953125, "reward_std": 0.10652374476194382, "rewards/correctness_reward_func/mean": 0.400390625, "rewards/correctness_reward_func/std": 0.4904567301273346, "rewards/strict_format_reward_func/mean": 0.990234375, "rewards/strict_format_reward_func/std": 0.09843364357948303, "step": 76 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1182.0, "completions/mean_length": 241.912109375, "completions/mean_terminated_length": 237.8411865234375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.0385, "frac_reward_zero_std": 0.359375, "grad_norm": 0.39670509099960327, "learning_rate": 9.9983217681341e-07, "loss": 0.0428, "num_tokens": 21724693.0, "reward": 0.6630859375, "reward_std": 0.14423537254333496, "rewards/correctness_reward_func/mean": 0.33203125, "rewards/correctness_reward_func/std": 0.47140273451805115, "rewards/strict_format_reward_func/mean": 0.994140625, "rewards/strict_format_reward_func/std": 0.07639661431312561, "step": 77 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1259.0, "completions/mean_length": 243.982421875, "completions/mean_terminated_length": 237.87623596191406, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.039, "frac_reward_zero_std": 0.484375, "grad_norm": 0.4161688983440399, "learning_rate": 9.998105447220522e-07, "loss": 0.0338, "num_tokens": 21929988.0, "reward": 0.619140625, "reward_std": 0.11164377629756927, "rewards/correctness_reward_func/mean": 0.265625, "rewards/correctness_reward_func/std": 0.44209739565849304, "rewards/strict_format_reward_func/mean": 0.97265625, "rewards/strict_format_reward_func/std": 0.16324250400066376, "step": 78 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1210.0, "completions/max_terminated_length": 1210.0, "completions/mean_length": 213.919921875, "completions/mean_terminated_length": 213.919921875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.0395, "frac_reward_zero_std": 0.390625, "grad_norm": 0.4172995388507843, "learning_rate": 9.997876019358083e-07, "loss": 0.0083, "num_tokens": 22114003.0, "reward": 0.72265625, "reward_std": 0.1379173994064331, "rewards/correctness_reward_func/mean": 0.451171875, "rewards/correctness_reward_func/std": 0.498096764087677, "rewards/strict_format_reward_func/mean": 0.994140625, "rewards/strict_format_reward_func/std": 0.07639661431312561, "step": 79 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1144.0, "completions/mean_length": 244.953125, "completions/mean_terminated_length": 240.89413452148438, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.04, "frac_reward_zero_std": 0.4375, "grad_norm": 0.37673693895339966, "learning_rate": 9.997633485148427e-07, "loss": 0.0219, "num_tokens": 22321795.0, "reward": 0.6494140625, "reward_std": 0.12516599893569946, "rewards/correctness_reward_func/mean": 0.302734375, "rewards/correctness_reward_func/std": 0.45989060401916504, "rewards/strict_format_reward_func/mean": 0.99609375, "rewards/strict_format_reward_func/std": 0.06243881583213806, "step": 80 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1247.0, "completions/max_terminated_length": 1247.0, "completions/mean_length": 230.265625, "completions/mean_terminated_length": 230.265625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.0405, "frac_reward_zero_std": 0.4375, "grad_norm": 0.37114495038986206, "learning_rate": 9.997377845227574e-07, "loss": -0.0031, "num_tokens": 22516963.0, "reward": 0.6845703125, "reward_std": 0.12628617882728577, "rewards/correctness_reward_func/mean": 0.37109375, "rewards/correctness_reward_func/std": 0.4835699498653412, "rewards/strict_format_reward_func/mean": 0.998046875, "rewards/strict_format_reward_func/std": 0.04419417306780815, "step": 81 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 253.953125, "completions/mean_terminated_length": 247.90570068359375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.041, "frac_reward_zero_std": 0.453125, "grad_norm": 0.3662205934524536, "learning_rate": 9.99710910026591e-07, "loss": 0.0094, "num_tokens": 22728275.0, "reward": 0.6787109375, "reward_std": 0.12296395003795624, "rewards/correctness_reward_func/mean": 0.375, "rewards/correctness_reward_func/std": 0.4845963716506958, "rewards/strict_format_reward_func/mean": 0.982421875, "rewards/strict_format_reward_func/std": 0.13154059648513794, "step": 82 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 252.30859375, "completions/mean_terminated_length": 242.17356872558594, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.0415, "frac_reward_zero_std": 0.46875, "grad_norm": 0.3793325424194336, "learning_rate": 9.996827250968189e-07, "loss": 0.0223, "num_tokens": 22937537.0, "reward": 0.6748046875, "reward_std": 0.11382539570331573, "rewards/correctness_reward_func/mean": 0.37890625, "rewards/correctness_reward_func/std": 0.4855891764163971, "rewards/strict_format_reward_func/mean": 0.970703125, "rewards/strict_format_reward_func/std": 0.16880230605602264, "step": 83 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1214.0, "completions/mean_length": 271.107421875, "completions/mean_terminated_length": 257.1227722167969, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.042, "frac_reward_zero_std": 0.546875, "grad_norm": 0.3364124596118927, "learning_rate": 9.996532298073524e-07, "loss": 0.0469, "num_tokens": 23159544.0, "reward": 0.6025390625, "reward_std": 0.10351093113422394, "rewards/correctness_reward_func/mean": 0.21484375, "rewards/correctness_reward_func/std": 0.4111155867576599, "rewards/strict_format_reward_func/mean": 0.990234375, "rewards/strict_format_reward_func/std": 0.09843364357948303, "step": 84 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1216.0, "completions/mean_length": 259.109375, "completions/mean_terminated_length": 247.00396728515625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.0425, "frac_reward_zero_std": 0.546875, "grad_norm": 0.33089637756347656, "learning_rate": 9.996224242355397e-07, "loss": 0.0306, "num_tokens": 23373448.0, "reward": 0.63671875, "reward_std": 0.10616335272789001, "rewards/correctness_reward_func/mean": 0.27734375, "rewards/correctness_reward_func/std": 0.4481254518032074, "rewards/strict_format_reward_func/mean": 0.99609375, "rewards/strict_format_reward_func/std": 0.06243881583213806, "step": 85 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1249.0, "completions/mean_length": 279.59765625, "completions/mean_terminated_length": 267.7351989746094, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.043, "frac_reward_zero_std": 0.46875, "grad_norm": 0.3235888183116913, "learning_rate": 9.99590308462165e-07, "loss": 0.0283, "num_tokens": 23599442.0, "reward": 0.6181640625, "reward_std": 0.12362537533044815, "rewards/correctness_reward_func/mean": 0.2734375, "rewards/correctness_reward_func/std": 0.4461594223976135, "rewards/strict_format_reward_func/mean": 0.962890625, "rewards/strict_format_reward_func/std": 0.18921469151973724, "step": 86 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1255.0, "completions/mean_length": 278.6015625, "completions/mean_terminated_length": 258.65338134765625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.0435, "frac_reward_zero_std": 0.453125, "grad_norm": 0.3776208460330963, "learning_rate": 9.995568825714478e-07, "loss": 0.0546, "num_tokens": 23823638.0, "reward": 0.65234375, "reward_std": 0.12013693153858185, "rewards/correctness_reward_func/mean": 0.310546875, "rewards/correctness_reward_func/std": 0.46317005157470703, "rewards/strict_format_reward_func/mean": 0.994140625, "rewards/strict_format_reward_func/std": 0.07639661431312561, "step": 87 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1127.0, "completions/mean_length": 261.638671875, "completions/mean_terminated_length": 243.41748046875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.044, "frac_reward_zero_std": 0.46875, "grad_norm": 0.3642917573451996, "learning_rate": 9.995221466510437e-07, "loss": 0.0622, "num_tokens": 24034997.0, "reward": 0.6865234375, "reward_std": 0.11267617344856262, "rewards/correctness_reward_func/mean": 0.37890625, "rewards/correctness_reward_func/std": 0.4855891764163971, "rewards/strict_format_reward_func/mean": 0.994140625, "rewards/strict_format_reward_func/std": 0.07639661431312561, "step": 88 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1257.0, "completions/max_terminated_length": 1257.0, "completions/mean_length": 234.8359375, "completions/mean_terminated_length": 234.8359375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.0445, "frac_reward_zero_std": 0.515625, "grad_norm": 0.37823545932769775, "learning_rate": 9.994861007920439e-07, "loss": -0.0068, "num_tokens": 24231833.0, "reward": 0.662109375, "reward_std": 0.10682021081447601, "rewards/correctness_reward_func/mean": 0.32421875, "rewards/correctness_reward_func/std": 0.4685399830341339, "rewards/strict_format_reward_func/mean": 1.0, "rewards/strict_format_reward_func/std": 0.0, "step": 89 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1192.0, "completions/mean_length": 265.517578125, "completions/mean_terminated_length": 251.45545959472656, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.045, "frac_reward_zero_std": 0.453125, "grad_norm": 0.38415420055389404, "learning_rate": 9.99448745088974e-07, "loss": 0.0367, "num_tokens": 24448530.0, "reward": 0.6865234375, "reward_std": 0.12411317974328995, "rewards/correctness_reward_func/mean": 0.376953125, "rewards/correctness_reward_func/std": 0.4850969910621643, "rewards/strict_format_reward_func/mean": 0.99609375, "rewards/strict_format_reward_func/std": 0.06243881583213806, "step": 90 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1095.0, "completions/mean_length": 246.599609375, "completions/mean_terminated_length": 240.50885009765625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.0455, "frac_reward_zero_std": 0.484375, "grad_norm": 0.4248853325843811, "learning_rate": 9.994100796397953e-07, "loss": 0.004, "num_tokens": 24654973.0, "reward": 0.6787109375, "reward_std": 0.11741062998771667, "rewards/correctness_reward_func/mean": 0.375, "rewards/correctness_reward_func/std": 0.4845963716506958, "rewards/strict_format_reward_func/mean": 0.982421875, "rewards/strict_format_reward_func/std": 0.13154059648513794, "step": 91 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1154.0, "completions/mean_length": 286.89453125, "completions/mean_terminated_length": 279.0747985839844, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.046, "frac_reward_zero_std": 0.53125, "grad_norm": 0.3524153232574463, "learning_rate": 9.993701045459033e-07, "loss": 0.0237, "num_tokens": 24884767.0, "reward": 0.6708984375, "reward_std": 0.10362257063388824, "rewards/correctness_reward_func/mean": 0.34765625, "rewards/correctness_reward_func/std": 0.47669193148612976, "rewards/strict_format_reward_func/mean": 0.994140625, "rewards/strict_format_reward_func/std": 0.07639661431312561, "step": 92 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1280.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 242.515625, "completions/mean_terminated_length": 238.4470672607422, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.0465, "frac_reward_zero_std": 0.53125, "grad_norm": 0.34622839093208313, "learning_rate": 9.993288199121282e-07, "loss": 0.001, "num_tokens": 25087511.0, "reward": 0.6669921875, "reward_std": 0.09893198311328888, "rewards/correctness_reward_func/mean": 0.3359375, "rewards/correctness_reward_func/std": 0.4727790653705597, "rewards/strict_format_reward_func/mean": 0.998046875, "rewards/strict_format_reward_func/std": 0.04419417306780815, "step": 93 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1165.0, "completions/mean_length": 271.091796875, "completions/mean_terminated_length": 269.1174011230469, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.047, "frac_reward_zero_std": 0.5, "grad_norm": 0.41750165820121765, "learning_rate": 9.992862258467337e-07, "loss": 0.0405, "num_tokens": 25306838.0, "reward": 0.6875, "reward_std": 0.11129043251276016, "rewards/correctness_reward_func/mean": 0.37890625, "rewards/correctness_reward_func/std": 0.4855891764163971, "rewards/strict_format_reward_func/mean": 0.99609375, "rewards/strict_format_reward_func/std": 0.06243881583213806, "step": 94 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1246.0, "completions/mean_length": 281.06640625, "completions/mean_terminated_length": 273.2007751464844, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.0475, "frac_reward_zero_std": 0.640625, "grad_norm": 0.3369617164134979, "learning_rate": 9.992423224614183e-07, "loss": 0.0388, "num_tokens": 25532536.0, "reward": 0.6162109375, "reward_std": 0.08216007798910141, "rewards/correctness_reward_func/mean": 0.25390625, "rewards/correctness_reward_func/std": 0.43567025661468506, "rewards/strict_format_reward_func/mean": 0.978515625, "rewards/strict_format_reward_func/std": 0.14513419568538666, "step": 95 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1171.0, "completions/mean_length": 284.396484375, "completions/mean_terminated_length": 274.5779113769531, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.048, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3071320950984955, "learning_rate": 9.991971098713135e-07, "loss": 0.0062, "num_tokens": 25758363.0, "reward": 0.6513671875, "reward_std": 0.09823831915855408, "rewards/correctness_reward_func/mean": 0.328125, "rewards/correctness_reward_func/std": 0.4699897766113281, "rewards/strict_format_reward_func/mean": 0.974609375, "rewards/strict_format_reward_func/std": 0.15746226906776428, "step": 96 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1060.0, "completions/mean_length": 287.841796875, "completions/mean_terminated_length": 283.95098876953125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.0485, "frac_reward_zero_std": 0.46875, "grad_norm": 0.3856940269470215, "learning_rate": 9.991505881949836e-07, "loss": 0.0094, "num_tokens": 25986954.0, "reward": 0.6591796875, "reward_std": 0.10984852910041809, "rewards/correctness_reward_func/mean": 0.318359375, "rewards/correctness_reward_func/std": 0.46629536151885986, "rewards/strict_format_reward_func/mean": 1.0, "rewards/strict_format_reward_func/std": 0.0, "step": 97 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1237.0, "completions/mean_length": 271.947265625, "completions/mean_terminated_length": 259.99407958984375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.049, "frac_reward_zero_std": 0.453125, "grad_norm": 0.40509501099586487, "learning_rate": 9.991027575544265e-07, "loss": 0.0416, "num_tokens": 26208991.0, "reward": 0.65234375, "reward_std": 0.12657982110977173, "rewards/correctness_reward_func/mean": 0.330078125, "rewards/correctness_reward_func/std": 0.47070086002349854, "rewards/strict_format_reward_func/mean": 0.974609375, "rewards/strict_format_reward_func/std": 0.15746226906776428, "step": 98 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 280.7890625, "completions/mean_terminated_length": 268.94073486328125, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.0495, "frac_reward_zero_std": 0.578125, "grad_norm": 0.33009451627731323, "learning_rate": 9.990536180750723e-07, "loss": 0.0195, "num_tokens": 26434115.0, "reward": 0.662109375, "reward_std": 0.09571787714958191, "rewards/correctness_reward_func/mean": 0.361328125, "rewards/correctness_reward_func/std": 0.48085519671440125, "rewards/strict_format_reward_func/mean": 0.962890625, "rewards/strict_format_reward_func/std": 0.18921469151973724, "step": 99 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 242.962890625, "completions/mean_terminated_length": 230.666015625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.05, "frac_reward_zero_std": 0.453125, "grad_norm": 0.41944050788879395, "learning_rate": 9.990031698857841e-07, "loss": 0.028, "num_tokens": 26637352.0, "reward": 0.7265625, "reward_std": 0.12141813337802887, "rewards/correctness_reward_func/mean": 0.45703125, "rewards/correctness_reward_func/std": 0.49863746762275696, "rewards/strict_format_reward_func/mean": 0.99609375, "rewards/strict_format_reward_func/std": 0.06243881583213806, "step": 100 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1214.0, "completions/mean_length": 317.193359375, "completions/mean_terminated_length": 305.7767028808594, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.0505, "frac_reward_zero_std": 0.53125, "grad_norm": 0.3403419256210327, "learning_rate": 9.989514131188558e-07, "loss": 0.0018, "num_tokens": 26877219.0, "reward": 0.6357421875, "reward_std": 0.1002131998538971, "rewards/correctness_reward_func/mean": 0.279296875, "rewards/correctness_reward_func/std": 0.44909247756004333, "rewards/strict_format_reward_func/mean": 0.9921875, "rewards/strict_format_reward_func/std": 0.08812850713729858, "step": 101 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 274.77734375, "completions/mean_terminated_length": 264.8639221191406, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.051, "frac_reward_zero_std": 0.421875, "grad_norm": 0.4247595965862274, "learning_rate": 9.988983479100138e-07, "loss": 0.0445, "num_tokens": 27096441.0, "reward": 0.6767578125, "reward_std": 0.12336335331201553, "rewards/correctness_reward_func/mean": 0.361328125, "rewards/correctness_reward_func/std": 0.48085519671440125, "rewards/strict_format_reward_func/mean": 0.9921875, "rewards/strict_format_reward_func/std": 0.08812850713729858, "step": 102 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1026.0, "completions/mean_length": 290.76953125, "completions/mean_terminated_length": 281.0138244628906, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.0515, "frac_reward_zero_std": 0.46875, "grad_norm": 0.376570463180542, "learning_rate": 9.988439743984152e-07, "loss": 0.0118, "num_tokens": 27325419.0, "reward": 0.6689453125, "reward_std": 0.1276625692844391, "rewards/correctness_reward_func/mean": 0.36328125, "rewards/correctness_reward_func/std": 0.4814152419567108, "rewards/strict_format_reward_func/mean": 0.974609375, "rewards/strict_format_reward_func/std": 0.15746226906776428, "step": 103 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1170.0, "completions/mean_length": 281.7734375, "completions/mean_terminated_length": 279.8199462890625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.052, "frac_reward_zero_std": 0.515625, "grad_norm": 0.38942837715148926, "learning_rate": 9.987882927266486e-07, "loss": 0.033, "num_tokens": 27550783.0, "reward": 0.6962890625, "reward_std": 0.1096472442150116, "rewards/correctness_reward_func/mean": 0.39453125, "rewards/correctness_reward_func/std": 0.4892277717590332, "rewards/strict_format_reward_func/mean": 0.998046875, "rewards/strict_format_reward_func/std": 0.04419417306780815, "step": 104 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 340.18359375, "completions/mean_terminated_length": 325.2658996582031, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.0525, "frac_reward_zero_std": 0.4375, "grad_norm": 0.33258897066116333, "learning_rate": 9.987313030407323e-07, "loss": 0.0367, "num_tokens": 27810173.0, "reward": 0.6240234375, "reward_std": 0.12276707589626312, "rewards/correctness_reward_func/mean": 0.298828125, "rewards/correctness_reward_func/std": 0.45819199085235596, "rewards/strict_format_reward_func/mean": 0.94921875, "rewards/strict_format_reward_func/std": 0.21976542472839355, "step": 105 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1157.0, "completions/mean_length": 291.203125, "completions/mean_terminated_length": 285.375244140625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.053, "frac_reward_zero_std": 0.5, "grad_norm": 0.3699053227901459, "learning_rate": 9.986730054901152e-07, "loss": 0.0202, "num_tokens": 28039605.0, "reward": 0.68359375, "reward_std": 0.10830052196979523, "rewards/correctness_reward_func/mean": 0.38671875, "rewards/correctness_reward_func/std": 0.48747459053993225, "rewards/strict_format_reward_func/mean": 0.98046875, "rewards/strict_format_reward_func/std": 0.1385180652141571, "step": 106 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1252.0, "completions/mean_length": 309.080078125, "completions/mean_terminated_length": 291.7077331542969, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.0535, "frac_reward_zero_std": 0.515625, "grad_norm": 0.352000892162323, "learning_rate": 9.986134002276759e-07, "loss": 0.0478, "num_tokens": 28281262.0, "reward": 0.642578125, "reward_std": 0.10818085074424744, "rewards/correctness_reward_func/mean": 0.322265625, "rewards/correctness_reward_func/std": 0.46780112385749817, "rewards/strict_format_reward_func/mean": 0.962890625, "rewards/strict_format_reward_func/std": 0.18921469151973724, "step": 107 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1220.0, "completions/mean_length": 296.845703125, "completions/mean_terminated_length": 289.1043395996094, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.054, "frac_reward_zero_std": 0.40625, "grad_norm": 0.37620407342910767, "learning_rate": 9.985524874097223e-07, "loss": 0.0485, "num_tokens": 28512487.0, "reward": 0.71484375, "reward_std": 0.13464301824569702, "rewards/correctness_reward_func/mean": 0.44140625, "rewards/correctness_reward_func/std": 0.4970405399799347, "rewards/strict_format_reward_func/mean": 0.98828125, "rewards/strict_format_reward_func/std": 0.10772226005792618, "step": 108 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1229.0, "completions/mean_length": 312.244140625, "completions/mean_terminated_length": 304.6240234375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.0545, "frac_reward_zero_std": 0.515625, "grad_norm": 0.328682541847229, "learning_rate": 9.98490267195991e-07, "loss": -0.0099, "num_tokens": 28757084.0, "reward": 0.6767578125, "reward_std": 0.10340342670679092, "rewards/correctness_reward_func/mean": 0.35546875, "rewards/correctness_reward_func/std": 0.47912323474884033, "rewards/strict_format_reward_func/mean": 0.998046875, "rewards/strict_format_reward_func/std": 0.04419417306780815, "step": 109 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1190.0, "completions/mean_length": 284.455078125, "completions/mean_terminated_length": 280.5509948730469, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.055, "frac_reward_zero_std": 0.4375, "grad_norm": 0.3380143344402313, "learning_rate": 9.984267397496474e-07, "loss": 0.0211, "num_tokens": 28979253.0, "reward": 0.6962890625, "reward_std": 0.1271398961544037, "rewards/correctness_reward_func/mean": 0.396484375, "rewards/correctness_reward_func/std": 0.4896455705165863, "rewards/strict_format_reward_func/mean": 0.99609375, "rewards/strict_format_reward_func/std": 0.06243881583213806, "step": 110 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1216.0, "completions/mean_length": 310.7890625, "completions/mean_terminated_length": 299.29644775390625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.0555, "frac_reward_zero_std": 0.46875, "grad_norm": 0.3332838714122772, "learning_rate": 9.983619052372847e-07, "loss": 0.0308, "num_tokens": 29220033.0, "reward": 0.6796875, "reward_std": 0.11766843497753143, "rewards/correctness_reward_func/mean": 0.37109375, "rewards/correctness_reward_func/std": 0.4835699498653412, "rewards/strict_format_reward_func/mean": 0.98828125, "rewards/strict_format_reward_func/std": 0.10772226005792618, "step": 111 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1276.0, "completions/mean_length": 319.283203125, "completions/mean_terminated_length": 311.718505859375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.056, "frac_reward_zero_std": 0.515625, "grad_norm": 0.3073104918003082, "learning_rate": 9.982957638289238e-07, "loss": 0.0259, "num_tokens": 29464242.0, "reward": 0.6826171875, "reward_std": 0.1096472442150116, "rewards/correctness_reward_func/mean": 0.38671875, "rewards/correctness_reward_func/std": 0.48747459053993225, "rewards/strict_format_reward_func/mean": 0.978515625, "rewards/strict_format_reward_func/std": 0.14513419568538666, "step": 112 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1268.0, "completions/mean_length": 309.068359375, "completions/mean_terminated_length": 301.4232177734375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.0565, "frac_reward_zero_std": 0.46875, "grad_norm": 0.3485553562641144, "learning_rate": 9.98228315698013e-07, "loss": 0.0007, "num_tokens": 29704661.0, "reward": 0.65234375, "reward_std": 0.12013046443462372, "rewards/correctness_reward_func/mean": 0.30859375, "rewards/correctness_reward_func/std": 0.4623647928237915, "rewards/strict_format_reward_func/mean": 0.99609375, "rewards/strict_format_reward_func/std": 0.06243881583213806, "step": 113 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1172.0, "completions/mean_length": 331.8671875, "completions/mean_terminated_length": 311.0498962402344, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.057, "frac_reward_zero_std": 0.453125, "grad_norm": 0.30490702390670776, "learning_rate": 9.981595610214274e-07, "loss": 0.0156, "num_tokens": 29956513.0, "reward": 0.650390625, "reward_std": 0.12339082360267639, "rewards/correctness_reward_func/mean": 0.33984375, "rewards/correctness_reward_func/std": 0.4741191864013672, "rewards/strict_format_reward_func/mean": 0.9609375, "rewards/strict_format_reward_func/std": 0.1939331740140915, "step": 114 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1214.0, "completions/mean_length": 331.599609375, "completions/mean_terminated_length": 329.74365234375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.0575, "frac_reward_zero_std": 0.46875, "grad_norm": 0.31770405173301697, "learning_rate": 9.980894999794678e-07, "loss": 0.01, "num_tokens": 30207324.0, "reward": 0.6689453125, "reward_std": 0.11708052456378937, "rewards/correctness_reward_func/mean": 0.33984375, "rewards/correctness_reward_func/std": 0.4741191864013672, "rewards/strict_format_reward_func/mean": 0.998046875, "rewards/strict_format_reward_func/std": 0.04419417306780815, "step": 115 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1135.0, "completions/mean_length": 291.490234375, "completions/mean_terminated_length": 283.7066955566406, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.058, "frac_reward_zero_std": 0.5, "grad_norm": 0.37668558955192566, "learning_rate": 9.980181327558608e-07, "loss": 0.0447, "num_tokens": 30437999.0, "reward": 0.693359375, "reward_std": 0.11040641367435455, "rewards/correctness_reward_func/mean": 0.39453125, "rewards/correctness_reward_func/std": 0.4892277717590332, "rewards/strict_format_reward_func/mean": 0.9921875, "rewards/strict_format_reward_func/std": 0.08812850713729858, "step": 116 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1234.0, "completions/mean_length": 272.30078125, "completions/mean_terminated_length": 264.36614990234375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.0585, "frac_reward_zero_std": 0.4375, "grad_norm": 0.3976609408855438, "learning_rate": 9.979454595377593e-07, "loss": 0.0347, "num_tokens": 30656337.0, "reward": 0.748046875, "reward_std": 0.12540830671787262, "rewards/correctness_reward_func/mean": 0.501953125, "rewards/correctness_reward_func/std": 0.5004851818084717, "rewards/strict_format_reward_func/mean": 0.994140625, "rewards/strict_format_reward_func/std": 0.07639661431312561, "step": 117 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1190.0, "completions/mean_length": 286.828125, "completions/mean_terminated_length": 280.9744567871094, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.059, "frac_reward_zero_std": 0.453125, "grad_norm": 0.37348294258117676, "learning_rate": 9.978714805157398e-07, "loss": 0.0214, "num_tokens": 30885913.0, "reward": 0.7001953125, "reward_std": 0.12218964099884033, "rewards/correctness_reward_func/mean": 0.404296875, "rewards/correctness_reward_func/std": 0.4912354052066803, "rewards/strict_format_reward_func/mean": 0.99609375, "rewards/strict_format_reward_func/std": 0.06243881583213806, "step": 118 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1259.0, "completions/mean_length": 278.900390625, "completions/mean_terminated_length": 274.9745178222656, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.0595, "frac_reward_zero_std": 0.359375, "grad_norm": 0.41844749450683594, "learning_rate": 9.97796195883804e-07, "loss": 0.0428, "num_tokens": 31105718.0, "reward": 0.7080078125, "reward_std": 0.14895838499069214, "rewards/correctness_reward_func/mean": 0.419921875, "rewards/correctness_reward_func/std": 0.4940285086631775, "rewards/strict_format_reward_func/mean": 0.99609375, "rewards/strict_format_reward_func/std": 0.06243881583213806, "step": 119 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 288.63671875, "completions/mean_terminated_length": 278.8599548339844, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.06, "frac_reward_zero_std": 0.515625, "grad_norm": 0.4099017381668091, "learning_rate": 9.977196058393769e-07, "loss": 0.0324, "num_tokens": 31331660.0, "reward": 0.7109375, "reward_std": 0.10908715426921844, "rewards/correctness_reward_func/mean": 0.4296875, "rewards/correctness_reward_func/std": 0.4955156147480011, "rewards/strict_format_reward_func/mean": 0.9921875, "rewards/strict_format_reward_func/std": 0.08812850713729858, "step": 120 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 336.83984375, "completions/mean_terminated_length": 331.28094482421875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.0605, "frac_reward_zero_std": 0.359375, "grad_norm": 0.3835758566856384, "learning_rate": 9.97641710583307e-07, "loss": 0.0261, "num_tokens": 31583962.0, "reward": 0.6767578125, "reward_std": 0.14295417070388794, "rewards/correctness_reward_func/mean": 0.359375, "rewards/correctness_reward_func/std": 0.48028653860092163, "rewards/strict_format_reward_func/mean": 0.994140625, "rewards/strict_format_reward_func/std": 0.07639661431312561, "step": 121 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1128.0, "completions/mean_length": 285.509765625, "completions/mean_terminated_length": 275.7021789550781, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.061, "frac_reward_zero_std": 0.375, "grad_norm": 0.4575214982032776, "learning_rate": 9.975625103198654e-07, "loss": 0.0767, "num_tokens": 31809903.0, "reward": 0.7431640625, "reward_std": 0.13756409287452698, "rewards/correctness_reward_func/mean": 0.49609375, "rewards/correctness_reward_func/std": 0.5004737377166748, "rewards/strict_format_reward_func/mean": 0.990234375, "rewards/strict_format_reward_func/std": 0.09843364357948303, "step": 122 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1193.0, "completions/mean_length": 297.978515625, "completions/mean_terminated_length": 274.4100036621094, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.0615, "frac_reward_zero_std": 0.453125, "grad_norm": 0.4273553788661957, "learning_rate": 9.974820052567459e-07, "loss": 0.0543, "num_tokens": 32042436.0, "reward": 0.7216796875, "reward_std": 0.11543769389390945, "rewards/correctness_reward_func/mean": 0.462890625, "rewards/correctness_reward_func/std": 0.4991086423397064, "rewards/strict_format_reward_func/mean": 0.98046875, "rewards/strict_format_reward_func/std": 0.1385180652141571, "step": 123 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1254.0, "completions/mean_length": 319.01953125, "completions/mean_terminated_length": 311.4527587890625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.062, "frac_reward_zero_std": 0.421875, "grad_norm": 0.39186662435531616, "learning_rate": 9.974001956050635e-07, "loss": -0.0186, "num_tokens": 32289126.0, "reward": 0.708984375, "reward_std": 0.13475368916988373, "rewards/correctness_reward_func/mean": 0.423828125, "rewards/correctness_reward_func/std": 0.4946470856666565, "rewards/strict_format_reward_func/mean": 0.994140625, "rewards/strict_format_reward_func/std": 0.07639661431312561, "step": 124 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1138.0, "completions/mean_length": 312.427734375, "completions/mean_terminated_length": 306.7249755859375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.0625, "frac_reward_zero_std": 0.5, "grad_norm": 0.35031789541244507, "learning_rate": 9.973170815793542e-07, "loss": 0.0228, "num_tokens": 32528433.0, "reward": 0.6875, "reward_std": 0.1170354038476944, "rewards/correctness_reward_func/mean": 0.37890625, "rewards/correctness_reward_func/std": 0.4855891764163971, "rewards/strict_format_reward_func/mean": 0.99609375, "rewards/strict_format_reward_func/std": 0.06243881583213806, "step": 125 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1257.0, "completions/mean_length": 326.7109375, "completions/mean_terminated_length": 315.4071350097656, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.063, "frac_reward_zero_std": 0.53125, "grad_norm": 0.33133941888809204, "learning_rate": 9.972326633975752e-07, "loss": -0.0113, "num_tokens": 32779333.0, "reward": 0.6748046875, "reward_std": 0.10773944854736328, "rewards/correctness_reward_func/mean": 0.365234375, "rewards/correctness_reward_func/std": 0.4819667339324951, "rewards/strict_format_reward_func/mean": 0.984375, "rewards/strict_format_reward_func/std": 0.12414088100194931, "step": 126 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1203.0, "completions/mean_length": 289.03515625, "completions/mean_terminated_length": 287.09588623046875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.0635, "frac_reward_zero_std": 0.453125, "grad_norm": 0.44464367628097534, "learning_rate": 9.971469412811032e-07, "loss": -0.0007, "num_tokens": 33009039.0, "reward": 0.66796875, "reward_std": 0.11655725538730621, "rewards/correctness_reward_func/mean": 0.3359375, "rewards/correctness_reward_func/std": 0.4727790653705597, "rewards/strict_format_reward_func/mean": 1.0, "rewards/strict_format_reward_func/std": 0.0, "step": 127 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1221.0, "completions/mean_length": 345.41015625, "completions/mean_terminated_length": 324.8902282714844, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.064, "frac_reward_zero_std": 0.40625, "grad_norm": 0.3842817544937134, "learning_rate": 9.970599154547344e-07, "loss": 0.0311, "num_tokens": 33271201.0, "reward": 0.69140625, "reward_std": 0.1338273286819458, "rewards/correctness_reward_func/mean": 0.41015625, "rewards/correctness_reward_func/std": 0.49234291911125183, "rewards/strict_format_reward_func/mean": 0.97265625, "rewards/strict_format_reward_func/std": 0.16324250400066376, "step": 128 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1119.0, "completions/mean_length": 332.306640625, "completions/mean_terminated_length": 321.0691833496094, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.0645, "frac_reward_zero_std": 0.5625, "grad_norm": 0.29276326298713684, "learning_rate": 9.969715861466839e-07, "loss": 0.0519, "num_tokens": 33521310.0, "reward": 0.6787109375, "reward_std": 0.09630244970321655, "rewards/correctness_reward_func/mean": 0.369140625, "rewards/correctness_reward_func/std": 0.4830440282821655, "rewards/strict_format_reward_func/mean": 0.98828125, "rewards/strict_format_reward_func/std": 0.10772226005792618, "step": 129 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1257.0, "completions/mean_length": 324.552734375, "completions/mean_terminated_length": 322.6829833984375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.065, "frac_reward_zero_std": 0.453125, "grad_norm": 0.37196236848831177, "learning_rate": 9.96881953588585e-07, "loss": 0.0367, "num_tokens": 33766665.0, "reward": 0.689453125, "reward_std": 0.12194855511188507, "rewards/correctness_reward_func/mean": 0.380859375, "rewards/correctness_reward_func/std": 0.48607301712036133, "rewards/strict_format_reward_func/mean": 0.998046875, "rewards/strict_format_reward_func/std": 0.04419417306780815, "step": 130 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 368.708984375, "completions/mean_terminated_length": 352.403564453125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.0655, "frac_reward_zero_std": 0.484375, "grad_norm": 0.33307549357414246, "learning_rate": 9.967910180154888e-07, "loss": 0.0504, "num_tokens": 34041764.0, "reward": 0.6376953125, "reward_std": 0.1138189435005188, "rewards/correctness_reward_func/mean": 0.30078125, "rewards/correctness_reward_func/std": 0.45904624462127686, "rewards/strict_format_reward_func/mean": 0.974609375, "rewards/strict_format_reward_func/std": 0.15746226906776428, "step": 131 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1263.0, "completions/mean_length": 319.814453125, "completions/mean_terminated_length": 306.5049743652344, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.066, "frac_reward_zero_std": 0.53125, "grad_norm": 0.36741000413894653, "learning_rate": 9.96698779665863e-07, "loss": 0.0432, "num_tokens": 34290317.0, "reward": 0.6806640625, "reward_std": 0.10530900955200195, "rewards/correctness_reward_func/mean": 0.3828125, "rewards/correctness_reward_func/std": 0.486548513174057, "rewards/strict_format_reward_func/mean": 0.978515625, "rewards/strict_format_reward_func/std": 0.14513419568538666, "step": 132 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1277.0, "completions/mean_length": 306.5078125, "completions/mean_terminated_length": 298.842529296875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.0665, "frac_reward_zero_std": 0.484375, "grad_norm": 0.355952650308609, "learning_rate": 9.96605238781592e-07, "loss": 0.0165, "num_tokens": 34526649.0, "reward": 0.720703125, "reward_std": 0.11906169354915619, "rewards/correctness_reward_func/mean": 0.447265625, "rewards/correctness_reward_func/std": 0.4976975917816162, "rewards/strict_format_reward_func/mean": 0.994140625, "rewards/strict_format_reward_func/std": 0.07639661431312561, "step": 133 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1233.0, "completions/mean_length": 331.890625, "completions/mean_terminated_length": 320.6482238769531, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.067, "frac_reward_zero_std": 0.34375, "grad_norm": 0.40900564193725586, "learning_rate": 9.965103956079763e-07, "loss": 0.0417, "num_tokens": 34776817.0, "reward": 0.703125, "reward_std": 0.1474318504333496, "rewards/correctness_reward_func/mean": 0.416015625, "rewards/correctness_reward_func/std": 0.493378221988678, "rewards/strict_format_reward_func/mean": 0.990234375, "rewards/strict_format_reward_func/std": 0.09843364357948303, "step": 134 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1245.0, "completions/mean_length": 323.171875, "completions/mean_terminated_length": 309.908935546875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.0675, "frac_reward_zero_std": 0.5, "grad_norm": 0.3504819869995117, "learning_rate": 9.964142503937305e-07, "loss": 0.0377, "num_tokens": 35022641.0, "reward": 0.73046875, "reward_std": 0.11839006841182709, "rewards/correctness_reward_func/mean": 0.48828125, "rewards/correctness_reward_func/std": 0.5003514885902405, "rewards/strict_format_reward_func/mean": 0.97265625, "rewards/strict_format_reward_func/std": 0.16324250400066376, "step": 135 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1257.0, "completions/mean_length": 341.70703125, "completions/mean_terminated_length": 326.8135070800781, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.068, "frac_reward_zero_std": 0.59375, "grad_norm": 0.3507496118545532, "learning_rate": 9.96316803390984e-07, "loss": -0.0049, "num_tokens": 35280587.0, "reward": 0.6787109375, "reward_std": 0.09252271056175232, "rewards/correctness_reward_func/mean": 0.37890625, "rewards/correctness_reward_func/std": 0.4855891764163971, "rewards/strict_format_reward_func/mean": 0.978515625, "rewards/strict_format_reward_func/std": 0.14513419568538666, "step": 136 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1167.0, "completions/mean_length": 315.8828125, "completions/mean_terminated_length": 302.5188293457031, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.0685, "frac_reward_zero_std": 0.34375, "grad_norm": 0.41387316584587097, "learning_rate": 9.96218054855281e-07, "loss": 0.0158, "num_tokens": 35523623.0, "reward": 0.7373046875, "reward_std": 0.1472911685705185, "rewards/correctness_reward_func/mean": 0.484375, "rewards/correctness_reward_func/std": 0.5002445578575134, "rewards/strict_format_reward_func/mean": 0.990234375, "rewards/strict_format_reward_func/std": 0.09843364357948303, "step": 137 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1226.0, "completions/mean_length": 332.626953125, "completions/mean_terminated_length": 319.49505615234375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.069, "frac_reward_zero_std": 0.515625, "grad_norm": 0.39417994022369385, "learning_rate": 9.961180050455774e-07, "loss": 0.0484, "num_tokens": 35779000.0, "reward": 0.701171875, "reward_std": 0.1087215393781662, "rewards/correctness_reward_func/mean": 0.4140625, "rewards/correctness_reward_func/std": 0.49304109811782837, "rewards/strict_format_reward_func/mean": 0.98828125, "rewards/strict_format_reward_func/std": 0.10772226005792618, "step": 138 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1130.0, "completions/mean_length": 294.974609375, "completions/mean_terminated_length": 287.218505859375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.0695, "frac_reward_zero_std": 0.5625, "grad_norm": 0.4120825231075287, "learning_rate": 9.960166542242428e-07, "loss": 0.023, "num_tokens": 36007547.0, "reward": 0.7265625, "reward_std": 0.10926302522420883, "rewards/correctness_reward_func/mean": 0.47265625, "rewards/correctness_reward_func/std": 0.49974003434181213, "rewards/strict_format_reward_func/mean": 0.98046875, "rewards/strict_format_reward_func/std": 0.1385180652141571, "step": 139 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1262.0, "completions/mean_length": 351.033203125, "completions/mean_terminated_length": 332.52789306640625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.07, "frac_reward_zero_std": 0.515625, "grad_norm": 0.3772619366645813, "learning_rate": 9.95914002657057e-07, "loss": 0.0311, "num_tokens": 36270980.0, "reward": 0.6484375, "reward_std": 0.11300899088382721, "rewards/correctness_reward_func/mean": 0.314453125, "rewards/correctness_reward_func/std": 0.4647517800331116, "rewards/strict_format_reward_func/mean": 0.982421875, "rewards/strict_format_reward_func/std": 0.13154059648513794, "step": 140 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1210.0, "completions/mean_length": 316.1328125, "completions/mean_terminated_length": 306.6272277832031, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.0705, "frac_reward_zero_std": 0.484375, "grad_norm": 0.41429778933525085, "learning_rate": 9.958100506132126e-07, "loss": 0.0094, "num_tokens": 36515904.0, "reward": 0.662109375, "reward_std": 0.11911870539188385, "rewards/correctness_reward_func/mean": 0.349609375, "rewards/correctness_reward_func/std": 0.47731292247772217, "rewards/strict_format_reward_func/mean": 0.974609375, "rewards/strict_format_reward_func/std": 0.15746226906776428, "step": 141 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1242.0, "completions/mean_length": 356.990234375, "completions/mean_terminated_length": 329.1327819824219, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.071, "frac_reward_zero_std": 0.40625, "grad_norm": 0.41644033789634705, "learning_rate": 9.95704798365311e-07, "loss": 0.0608, "num_tokens": 36781971.0, "reward": 0.673828125, "reward_std": 0.14013952016830444, "rewards/correctness_reward_func/mean": 0.3671875, "rewards/correctness_reward_func/std": 0.48250964283943176, "rewards/strict_format_reward_func/mean": 0.98046875, "rewards/strict_format_reward_func/std": 0.1385180652141571, "step": 142 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1256.0, "completions/mean_length": 331.8984375, "completions/mean_terminated_length": 330.0430603027344, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.0715, "frac_reward_zero_std": 0.34375, "grad_norm": 0.480290949344635, "learning_rate": 9.955982461893646e-07, "loss": -0.0065, "num_tokens": 37032839.0, "reward": 0.7353515625, "reward_std": 0.14598089456558228, "rewards/correctness_reward_func/mean": 0.474609375, "rewards/correctness_reward_func/std": 0.4998432695865631, "rewards/strict_format_reward_func/mean": 0.99609375, "rewards/strict_format_reward_func/std": 0.06243881583213806, "step": 143 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 329.5234375, "completions/mean_terminated_length": 316.3485107421875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.072, "frac_reward_zero_std": 0.421875, "grad_norm": 0.46903443336486816, "learning_rate": 9.95490394364794e-07, "loss": 0.032, "num_tokens": 37280259.0, "reward": 0.720703125, "reward_std": 0.13132202625274658, "rewards/correctness_reward_func/mean": 0.4609375, "rewards/correctness_reward_func/std": 0.4989593029022217, "rewards/strict_format_reward_func/mean": 0.98046875, "rewards/strict_format_reward_func/std": 0.1385180652141571, "step": 144 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1162.0, "completions/mean_length": 308.98046875, "completions/mean_terminated_length": 295.52081298828125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.0725, "frac_reward_zero_std": 0.46875, "grad_norm": 0.46370160579681396, "learning_rate": 9.953812431744274e-07, "loss": 0.0261, "num_tokens": 37516353.0, "reward": 0.76171875, "reward_std": 0.11604039371013641, "rewards/correctness_reward_func/mean": 0.53125, "rewards/correctness_reward_func/std": 0.4995105266571045, "rewards/strict_format_reward_func/mean": 0.9921875, "rewards/strict_format_reward_func/std": 0.08812850713729858, "step": 145 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1271.0, "completions/mean_length": 357.921875, "completions/mean_terminated_length": 341.4234313964844, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.073, "frac_reward_zero_std": 0.46875, "grad_norm": 0.4102765619754791, "learning_rate": 9.952707929045018e-07, "loss": 0.009, "num_tokens": 37780129.0, "reward": 0.6904296875, "reward_std": 0.12260813266038895, "rewards/correctness_reward_func/mean": 0.3984375, "rewards/correctness_reward_func/std": 0.4900552034378052, "rewards/strict_format_reward_func/mean": 0.982421875, "rewards/strict_format_reward_func/std": 0.13154059648513794, "step": 146 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1248.0, "completions/mean_length": 309.810546875, "completions/mean_terminated_length": 307.91192626953125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.0735, "frac_reward_zero_std": 0.453125, "grad_norm": 0.4716207981109619, "learning_rate": 9.951590438446596e-07, "loss": -0.0008, "num_tokens": 38017552.0, "reward": 0.767578125, "reward_std": 0.11757326126098633, "rewards/correctness_reward_func/mean": 0.541015625, "rewards/correctness_reward_func/std": 0.49880221486091614, "rewards/strict_format_reward_func/mean": 0.994140625, "rewards/strict_format_reward_func/std": 0.07639661431312561, "step": 147 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1275.0, "completions/mean_length": 337.72265625, "completions/mean_terminated_length": 324.6614074707031, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.074, "frac_reward_zero_std": 0.46875, "grad_norm": 0.4467242360115051, "learning_rate": 9.9504599628795e-07, "loss": 0.0249, "num_tokens": 38272346.0, "reward": 0.71875, "reward_std": 0.11920684576034546, "rewards/correctness_reward_func/mean": 0.4609375, "rewards/correctness_reward_func/std": 0.4989593029022217, "rewards/strict_format_reward_func/mean": 0.9765625, "rewards/strict_format_reward_func/std": 0.15143637359142303, "step": 148 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1270.0, "completions/mean_length": 385.158203125, "completions/mean_terminated_length": 376.3333435058594, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.0745, "frac_reward_zero_std": 0.390625, "grad_norm": 0.40453091263771057, "learning_rate": 9.94931650530827e-07, "loss": 0.0326, "num_tokens": 38551019.0, "reward": 0.705078125, "reward_std": 0.1426570862531662, "rewards/correctness_reward_func/mean": 0.41796875, "rewards/correctness_reward_func/std": 0.4937073290348053, "rewards/strict_format_reward_func/mean": 0.9921875, "rewards/strict_format_reward_func/std": 0.08812850713729858, "step": 149 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1229.0, "completions/mean_length": 372.548828125, "completions/mean_terminated_length": 350.77001953125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.075, "frac_reward_zero_std": 0.53125, "grad_norm": 0.3817805349826813, "learning_rate": 9.948160068731491e-07, "loss": 0.0353, "num_tokens": 38821844.0, "reward": 0.693359375, "reward_std": 0.11664407700300217, "rewards/correctness_reward_func/mean": 0.400390625, "rewards/correctness_reward_func/std": 0.4904567301273346, "rewards/strict_format_reward_func/mean": 0.986328125, "rewards/strict_format_reward_func/std": 0.1162383034825325, "step": 150 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1272.0, "completions/mean_length": 345.74609375, "completions/mean_terminated_length": 338.3897705078125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.0755, "frac_reward_zero_std": 0.546875, "grad_norm": 0.41571244597435, "learning_rate": 9.946990656181779e-07, "loss": 0.0327, "num_tokens": 39078066.0, "reward": 0.70703125, "reward_std": 0.10186225175857544, "rewards/correctness_reward_func/mean": 0.421875, "rewards/correctness_reward_func/std": 0.49434176087379456, "rewards/strict_format_reward_func/mean": 0.9921875, "rewards/strict_format_reward_func/std": 0.08812850713729858, "step": 151 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1199.0, "completions/mean_length": 341.287109375, "completions/mean_terminated_length": 339.4501037597656, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.076, "frac_reward_zero_std": 0.515625, "grad_norm": 0.41722550988197327, "learning_rate": 9.945808270725789e-07, "loss": 0.0098, "num_tokens": 39331645.0, "reward": 0.7265625, "reward_std": 0.10665793716907501, "rewards/correctness_reward_func/mean": 0.45703125, "rewards/correctness_reward_func/std": 0.49863746762275696, "rewards/strict_format_reward_func/mean": 0.99609375, "rewards/strict_format_reward_func/std": 0.06243881583213806, "step": 152 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1144.0, "completions/mean_length": 352.1796875, "completions/mean_terminated_length": 339.3188171386719, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.0765, "frac_reward_zero_std": 0.53125, "grad_norm": 0.3978036940097809, "learning_rate": 9.94461291546418e-07, "loss": 0.0413, "num_tokens": 39594281.0, "reward": 0.7294921875, "reward_std": 0.1095801368355751, "rewards/correctness_reward_func/mean": 0.482421875, "rewards/correctness_reward_func/std": 0.5001795887947083, "rewards/strict_format_reward_func/mean": 0.9765625, "rewards/strict_format_reward_func/std": 0.15143637359142303, "step": 153 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1229.0, "completions/mean_length": 331.8515625, "completions/mean_terminated_length": 326.2632751464844, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.077, "frac_reward_zero_std": 0.390625, "grad_norm": 0.4544813930988312, "learning_rate": 9.943404593531641e-07, "loss": 0.022, "num_tokens": 39842997.0, "reward": 0.728515625, "reward_std": 0.142686128616333, "rewards/correctness_reward_func/mean": 0.462890625, "rewards/correctness_reward_func/std": 0.4991086423397064, "rewards/strict_format_reward_func/mean": 0.994140625, "rewards/strict_format_reward_func/std": 0.07639661431312561, "step": 154 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1263.0, "completions/mean_length": 350.720703125, "completions/mean_terminated_length": 341.55621337890625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.0775, "frac_reward_zero_std": 0.578125, "grad_norm": 0.3213302791118622, "learning_rate": 9.942183308096853e-07, "loss": 0.0282, "num_tokens": 40106102.0, "reward": 0.7138671875, "reward_std": 0.09759382158517838, "rewards/correctness_reward_func/mean": 0.45703125, "rewards/correctness_reward_func/std": 0.49863746762275696, "rewards/strict_format_reward_func/mean": 0.970703125, "rewards/strict_format_reward_func/std": 0.16880230605602264, "step": 155 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1224.0, "completions/mean_length": 384.44921875, "completions/mean_terminated_length": 366.60955810546875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.078, "frac_reward_zero_std": 0.390625, "grad_norm": 0.38836321234703064, "learning_rate": 9.94094906236249e-07, "loss": 0.0144, "num_tokens": 40381964.0, "reward": 0.7001953125, "reward_std": 0.14046339690685272, "rewards/correctness_reward_func/mean": 0.4296875, "rewards/correctness_reward_func/std": 0.4955156147480011, "rewards/strict_format_reward_func/mean": 0.970703125, "rewards/strict_format_reward_func/std": 0.16880230605602264, "step": 156 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1140.0, "completions/mean_length": 321.892578125, "completions/mean_terminated_length": 318.13531494140625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.0785, "frac_reward_zero_std": 0.5625, "grad_norm": 0.47672128677368164, "learning_rate": 9.93970185956522e-07, "loss": 0.022, "num_tokens": 40623309.0, "reward": 0.751953125, "reward_std": 0.09210360050201416, "rewards/correctness_reward_func/mean": 0.5078125, "rewards/correctness_reward_func/std": 0.5004279017448425, "rewards/strict_format_reward_func/mean": 0.99609375, "rewards/strict_format_reward_func/std": 0.06243881583213806, "step": 157 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1259.0, "completions/mean_length": 380.896484375, "completions/mean_terminated_length": 372.02960205078125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.079, "frac_reward_zero_std": 0.609375, "grad_norm": 0.30757391452789307, "learning_rate": 9.938441702975689e-07, "loss": 0.012, "num_tokens": 40898376.0, "reward": 0.71484375, "reward_std": 0.0859459787607193, "rewards/correctness_reward_func/mean": 0.43359375, "rewards/correctness_reward_func/std": 0.4960552453994751, "rewards/strict_format_reward_func/mean": 0.99609375, "rewards/strict_format_reward_func/std": 0.06243881583213806, "step": 158 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1257.0, "completions/mean_length": 401.5390625, "completions/mean_terminated_length": 387.5952453613281, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.0795, "frac_reward_zero_std": 0.546875, "grad_norm": 0.36408668756484985, "learning_rate": 9.937168595898508e-07, "loss": 0.027, "num_tokens": 41184868.0, "reward": 0.7197265625, "reward_std": 0.09851805865764618, "rewards/correctness_reward_func/mean": 0.478515625, "rewards/correctness_reward_func/std": 0.5000267624855042, "rewards/strict_format_reward_func/mean": 0.9609375, "rewards/strict_format_reward_func/std": 0.1939331740140915, "step": 159 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1249.0, "completions/mean_length": 393.591796875, "completions/mean_terminated_length": 379.5218505859375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.08, "frac_reward_zero_std": 0.46875, "grad_norm": 0.4031688868999481, "learning_rate": 9.935882541672253e-07, "loss": 0.0445, "num_tokens": 41466827.0, "reward": 0.7216796875, "reward_std": 0.11392833292484283, "rewards/correctness_reward_func/mean": 0.44921875, "rewards/correctness_reward_func/std": 0.497901052236557, "rewards/strict_format_reward_func/mean": 0.994140625, "rewards/strict_format_reward_func/std": 0.07639661431312561, "step": 160 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1269.0, "completions/mean_length": 428.3515625, "completions/mean_terminated_length": 407.9120178222656, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0805, "frac_reward_zero_std": 0.421875, "grad_norm": 0.36778077483177185, "learning_rate": 9.934583543669453e-07, "loss": 0.0241, "num_tokens": 41766359.0, "reward": 0.712890625, "reward_std": 0.13032951951026917, "rewards/correctness_reward_func/mean": 0.45703125, "rewards/correctness_reward_func/std": 0.49863746762275696, "rewards/strict_format_reward_func/mean": 0.96875, "rewards/strict_format_reward_func/std": 0.17416280508041382, "step": 161 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1125.0, "completions/mean_length": 389.283203125, "completions/mean_terminated_length": 366.0781555175781, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.081, "frac_reward_zero_std": 0.5, "grad_norm": 0.34888869524002075, "learning_rate": 9.933271605296577e-07, "loss": 0.0333, "num_tokens": 42045808.0, "reward": 0.7197265625, "reward_std": 0.11474403738975525, "rewards/correctness_reward_func/mean": 0.478515625, "rewards/correctness_reward_func/std": 0.5000267624855042, "rewards/strict_format_reward_func/mean": 0.9609375, "rewards/strict_format_reward_func/std": 0.1939331740140915, "step": 162 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1248.0, "completions/mean_length": 440.572265625, "completions/mean_terminated_length": 415.2373962402344, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.0815, "frac_reward_zero_std": 0.390625, "grad_norm": 0.37559840083122253, "learning_rate": 9.93194672999403e-07, "loss": 0.0416, "num_tokens": 42354653.0, "reward": 0.6865234375, "reward_std": 0.14560376107692719, "rewards/correctness_reward_func/mean": 0.42578125, "rewards/correctness_reward_func/std": 0.4949444830417633, "rewards/strict_format_reward_func/mean": 0.947265625, "rewards/strict_format_reward_func/std": 0.22372129559516907, "step": 163 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1259.0, "completions/mean_length": 396.716796875, "completions/mean_terminated_length": 386.24310302734375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.082, "frac_reward_zero_std": 0.4375, "grad_norm": 0.39976930618286133, "learning_rate": 9.930608921236144e-07, "loss": 0.0031, "num_tokens": 42638268.0, "reward": 0.7216796875, "reward_std": 0.13179604709148407, "rewards/correctness_reward_func/mean": 0.4609375, "rewards/correctness_reward_func/std": 0.4989593029022217, "rewards/strict_format_reward_func/mean": 0.982421875, "rewards/strict_format_reward_func/std": 0.13154059648513794, "step": 164 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1249.0, "completions/mean_length": 407.828125, "completions/mean_terminated_length": 397.4861755371094, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.0825, "frac_reward_zero_std": 0.5, "grad_norm": 0.34297263622283936, "learning_rate": 9.929258182531166e-07, "loss": 0.0108, "num_tokens": 42928428.0, "reward": 0.708984375, "reward_std": 0.11530442535877228, "rewards/correctness_reward_func/mean": 0.458984375, "rewards/correctness_reward_func/std": 0.49880221486091614, "rewards/strict_format_reward_func/mean": 0.958984375, "rewards/strict_format_reward_func/std": 0.19852031767368317, "step": 165 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1241.0, "completions/mean_length": 446.0625, "completions/mean_terminated_length": 419.1612854003906, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.083, "frac_reward_zero_std": 0.484375, "grad_norm": 0.34740379452705383, "learning_rate": 9.927894517421252e-07, "loss": 0.025, "num_tokens": 43239252.0, "reward": 0.6865234375, "reward_std": 0.11644323170185089, "rewards/correctness_reward_func/mean": 0.41015625, "rewards/correctness_reward_func/std": 0.49234291911125183, "rewards/strict_format_reward_func/mean": 0.962890625, "rewards/strict_format_reward_func/std": 0.18921469151973724, "step": 166 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1277.0, "completions/mean_length": 446.63671875, "completions/mean_terminated_length": 438.41815185546875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.0835, "frac_reward_zero_std": 0.390625, "grad_norm": 0.34829995036125183, "learning_rate": 9.926517929482452e-07, "loss": 0.0208, "num_tokens": 43546234.0, "reward": 0.771484375, "reward_std": 0.13678140938282013, "rewards/correctness_reward_func/mean": 0.552734375, "rewards/correctness_reward_func/std": 0.4976975917816162, "rewards/strict_format_reward_func/mean": 0.990234375, "rewards/strict_format_reward_func/std": 0.09843364357948303, "step": 167 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1269.0, "completions/mean_length": 427.0703125, "completions/mean_terminated_length": 406.6000061035156, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.084, "frac_reward_zero_std": 0.53125, "grad_norm": 0.3564009666442871, "learning_rate": 9.92512842232471e-07, "loss": 0.0611, "num_tokens": 43843686.0, "reward": 0.720703125, "reward_std": 0.11196675896644592, "rewards/correctness_reward_func/mean": 0.46484375, "rewards/correctness_reward_func/std": 0.49925029277801514, "rewards/strict_format_reward_func/mean": 0.9765625, "rewards/strict_format_reward_func/std": 0.15143637359142303, "step": 168 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1267.0, "completions/mean_length": 414.095703125, "completions/mean_terminated_length": 396.84661865234375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.0845, "frac_reward_zero_std": 0.453125, "grad_norm": 0.34615087509155273, "learning_rate": 9.923725999591846e-07, "loss": 0.0088, "num_tokens": 44133599.0, "reward": 0.7392578125, "reward_std": 0.1200411394238472, "rewards/correctness_reward_func/mean": 0.505859375, "rewards/correctness_reward_func/std": 0.5004546642303467, "rewards/strict_format_reward_func/mean": 0.97265625, "rewards/strict_format_reward_func/std": 0.16324250400066376, "step": 169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1220.0, "completions/mean_length": 423.25, "completions/mean_terminated_length": 409.65081787109375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.085, "frac_reward_zero_std": 0.40625, "grad_norm": 0.38962292671203613, "learning_rate": 9.922310664961549e-07, "loss": 0.0605, "num_tokens": 44431423.0, "reward": 0.77734375, "reward_std": 0.14031630754470825, "rewards/correctness_reward_func/mean": 0.572265625, "rewards/correctness_reward_func/std": 0.4952339828014374, "rewards/strict_format_reward_func/mean": 0.982421875, "rewards/strict_format_reward_func/std": 0.13154059648513794, "step": 170 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1277.0, "completions/mean_length": 414.103515625, "completions/mean_terminated_length": 405.5641174316406, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.0855, "frac_reward_zero_std": 0.53125, "grad_norm": 0.33015012741088867, "learning_rate": 9.92088242214537e-07, "loss": 0.0091, "num_tokens": 44723324.0, "reward": 0.7431640625, "reward_std": 0.1013030856847763, "rewards/correctness_reward_func/mean": 0.537109375, "rewards/correctness_reward_func/std": 0.4991086423397064, "rewards/strict_format_reward_func/mean": 0.94921875, "rewards/strict_format_reward_func/std": 0.21976542472839355, "step": 171 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1235.0, "completions/mean_length": 430.6953125, "completions/mean_terminated_length": 415.4989929199219, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.086, "frac_reward_zero_std": 0.40625, "grad_norm": 0.3435264527797699, "learning_rate": 9.91944127488871e-07, "loss": 0.0256, "num_tokens": 45022504.0, "reward": 0.7431640625, "reward_std": 0.13365474343299866, "rewards/correctness_reward_func/mean": 0.5234375, "rewards/correctness_reward_func/std": 0.49993884563446045, "rewards/strict_format_reward_func/mean": 0.962890625, "rewards/strict_format_reward_func/std": 0.18921469151973724, "step": 172 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1250.0, "completions/mean_length": 365.103515625, "completions/mean_terminated_length": 357.89959716796875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.0865, "frac_reward_zero_std": 0.625, "grad_norm": 0.3298518657684326, "learning_rate": 9.91798722697081e-07, "loss": 0.0069, "num_tokens": 45290685.0, "reward": 0.7197265625, "reward_std": 0.08781109750270844, "rewards/correctness_reward_func/mean": 0.474609375, "rewards/correctness_reward_func/std": 0.4998432695865631, "rewards/strict_format_reward_func/mean": 0.96484375, "rewards/strict_format_reward_func/std": 0.1843547374010086, "step": 173 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 405.525390625, "completions/mean_terminated_length": 386.3253479003906, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.087, "frac_reward_zero_std": 0.484375, "grad_norm": 0.30965280532836914, "learning_rate": 9.916520282204738e-07, "loss": 0.0269, "num_tokens": 45575506.0, "reward": 0.8173828125, "reward_std": 0.119574636220932, "rewards/correctness_reward_func/mean": 0.65234375, "rewards/correctness_reward_func/std": 0.47669193148612976, "rewards/strict_format_reward_func/mean": 0.982421875, "rewards/strict_format_reward_func/std": 0.13154059648513794, "step": 174 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1202.0, "completions/mean_length": 441.138671875, "completions/mean_terminated_length": 412.3293151855469, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.0875, "frac_reward_zero_std": 0.515625, "grad_norm": 0.30991971492767334, "learning_rate": 9.915040444437388e-07, "loss": 0.0676, "num_tokens": 45881017.0, "reward": 0.724609375, "reward_std": 0.1126285195350647, "rewards/correctness_reward_func/mean": 0.478515625, "rewards/correctness_reward_func/std": 0.5000267624855042, "rewards/strict_format_reward_func/mean": 0.970703125, "rewards/strict_format_reward_func/std": 0.16880230605602264, "step": 175 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1230.0, "completions/mean_length": 424.384765625, "completions/mean_terminated_length": 403.8500061035156, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.088, "frac_reward_zero_std": 0.453125, "grad_norm": 0.3026152551174164, "learning_rate": 9.913547717549462e-07, "loss": 0.0236, "num_tokens": 46180358.0, "reward": 0.71484375, "reward_std": 0.131489560008049, "rewards/correctness_reward_func/mean": 0.466796875, "rewards/correctness_reward_func/std": 0.4993842542171478, "rewards/strict_format_reward_func/mean": 0.962890625, "rewards/strict_format_reward_func/std": 0.18921469151973724, "step": 176 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1251.0, "completions/mean_length": 423.875, "completions/mean_terminated_length": 399.8072204589844, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.0885, "frac_reward_zero_std": 0.5, "grad_norm": 0.3119610548019409, "learning_rate": 9.912042105455461e-07, "loss": 0.0275, "num_tokens": 46475958.0, "reward": 0.7939453125, "reward_std": 0.11265266686677933, "rewards/correctness_reward_func/mean": 0.611328125, "rewards/correctness_reward_func/std": 0.4879252314567566, "rewards/strict_format_reward_func/mean": 0.9765625, "rewards/strict_format_reward_func/std": 0.15143637359142303, "step": 177 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1184.0, "completions/mean_length": 406.939453125, "completions/mean_terminated_length": 400.0649719238281, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.089, "frac_reward_zero_std": 0.5, "grad_norm": 0.32366088032722473, "learning_rate": 9.910523612103678e-07, "loss": 0.0268, "num_tokens": 46762327.0, "reward": 0.771484375, "reward_std": 0.11418548226356506, "rewards/correctness_reward_func/mean": 0.552734375, "rewards/correctness_reward_func/std": 0.4976975917816162, "rewards/strict_format_reward_func/mean": 0.990234375, "rewards/strict_format_reward_func/std": 0.09843364357948303, "step": 178 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1237.0, "completions/mean_length": 380.923828125, "completions/mean_terminated_length": 373.844482421875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.0895, "frac_reward_zero_std": 0.5, "grad_norm": 0.3355458378791809, "learning_rate": 9.908992241476186e-07, "loss": 0.0385, "num_tokens": 47036968.0, "reward": 0.7236328125, "reward_std": 0.11382319033145905, "rewards/correctness_reward_func/mean": 0.45703125, "rewards/correctness_reward_func/std": 0.49863746762275696, "rewards/strict_format_reward_func/mean": 0.990234375, "rewards/strict_format_reward_func/std": 0.09843364357948303, "step": 179 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1173.0, "completions/mean_length": 381.072265625, "completions/mean_terminated_length": 370.4130554199219, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.09, "frac_reward_zero_std": 0.625, "grad_norm": 0.27785810828208923, "learning_rate": 9.907447997588825e-07, "loss": 0.0184, "num_tokens": 47311333.0, "reward": 0.7509765625, "reward_std": 0.08035620301961899, "rewards/correctness_reward_func/mean": 0.509765625, "rewards/correctness_reward_func/std": 0.5003935098648071, "rewards/strict_format_reward_func/mean": 0.9921875, "rewards/strict_format_reward_func/std": 0.08812850713729858, "step": 180 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1132.0, "completions/mean_length": 452.4609375, "completions/mean_terminated_length": 440.9901123046875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.0905, "frac_reward_zero_std": 0.59375, "grad_norm": 0.26823991537094116, "learning_rate": 9.905890884491194e-07, "loss": 0.0248, "num_tokens": 47624697.0, "reward": 0.732421875, "reward_std": 0.08801726996898651, "rewards/correctness_reward_func/mean": 0.5078125, "rewards/correctness_reward_func/std": 0.5004279017448425, "rewards/strict_format_reward_func/mean": 0.95703125, "rewards/strict_format_reward_func/std": 0.2029850035905838, "step": 181 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 428.19140625, "completions/mean_terminated_length": 416.3841857910156, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.091, "frac_reward_zero_std": 0.578125, "grad_norm": 0.27205950021743774, "learning_rate": 9.90432090626664e-07, "loss": 0.0255, "num_tokens": 47927315.0, "reward": 0.7041015625, "reward_std": 0.09131413698196411, "rewards/correctness_reward_func/mean": 0.439453125, "rewards/correctness_reward_func/std": 0.49680593609809875, "rewards/strict_format_reward_func/mean": 0.96875, "rewards/strict_format_reward_func/std": 0.17416280508041382, "step": 182 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1168.0, "completions/mean_length": 394.248046875, "completions/mean_terminated_length": 389.02752685546875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.0915, "frac_reward_zero_std": 0.5625, "grad_norm": 0.29022783041000366, "learning_rate": 9.902738067032253e-07, "loss": 0.0232, "num_tokens": 48208362.0, "reward": 0.796875, "reward_std": 0.09622794389724731, "rewards/correctness_reward_func/mean": 0.6015625, "rewards/correctness_reward_func/std": 0.4900552034378052, "rewards/strict_format_reward_func/mean": 0.9921875, "rewards/strict_format_reward_func/std": 0.08812850713729858, "step": 183 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1216.0, "completions/mean_length": 469.810546875, "completions/mean_terminated_length": 447.03411865234375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.092, "frac_reward_zero_std": 0.5, "grad_norm": 0.26838362216949463, "learning_rate": 9.901142370938836e-07, "loss": 0.0395, "num_tokens": 48529521.0, "reward": 0.744140625, "reward_std": 0.11906169354915619, "rewards/correctness_reward_func/mean": 0.5078125, "rewards/correctness_reward_func/std": 0.5004279017448425, "rewards/strict_format_reward_func/mean": 0.98046875, "rewards/strict_format_reward_func/std": 0.1385180652141571, "step": 184 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1259.0, "completions/mean_length": 440.087890625, "completions/mean_terminated_length": 428.445556640625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.0925, "frac_reward_zero_std": 0.515625, "grad_norm": 0.27794212102890015, "learning_rate": 9.899533822170921e-07, "loss": 0.0314, "num_tokens": 48833798.0, "reward": 0.7158203125, "reward_std": 0.1196797713637352, "rewards/correctness_reward_func/mean": 0.44921875, "rewards/correctness_reward_func/std": 0.497901052236557, "rewards/strict_format_reward_func/mean": 0.982421875, "rewards/strict_format_reward_func/std": 0.13154059648513794, "step": 185 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1231.0, "completions/mean_length": 457.375, "completions/mean_terminated_length": 439.3133850097656, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.093, "frac_reward_zero_std": 0.484375, "grad_norm": 0.26072224974632263, "learning_rate": 9.897912424946738e-07, "loss": 0.0296, "num_tokens": 49150118.0, "reward": 0.7490234375, "reward_std": 0.12023578584194183, "rewards/correctness_reward_func/mean": 0.517578125, "rewards/correctness_reward_func/std": 0.5001795887947083, "rewards/strict_format_reward_func/mean": 0.98046875, "rewards/strict_format_reward_func/std": 0.1385180652141571, "step": 186 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1264.0, "completions/mean_length": 439.544921875, "completions/mean_terminated_length": 424.5069274902344, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.0935, "frac_reward_zero_std": 0.515625, "grad_norm": 0.26937440037727356, "learning_rate": 9.896278183518216e-07, "loss": 0.0186, "num_tokens": 49458557.0, "reward": 0.740234375, "reward_std": 0.10539775341749191, "rewards/correctness_reward_func/mean": 0.498046875, "rewards/correctness_reward_func/std": 0.5004851818084717, "rewards/strict_format_reward_func/mean": 0.982421875, "rewards/strict_format_reward_func/std": 0.13154059648513794, "step": 187 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1226.0, "completions/mean_length": 422.125, "completions/mean_terminated_length": 411.95257568359375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.094, "frac_reward_zero_std": 0.484375, "grad_norm": 0.27661341428756714, "learning_rate": 9.894631102170956e-07, "loss": 0.0225, "num_tokens": 49754045.0, "reward": 0.7578125, "reward_std": 0.1238473504781723, "rewards/correctness_reward_func/mean": 0.53125, "rewards/correctness_reward_func/std": 0.4995105266571045, "rewards/strict_format_reward_func/mean": 0.984375, "rewards/strict_format_reward_func/std": 0.12414088100194931, "step": 188 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1178.0, "completions/mean_length": 426.634765625, "completions/mean_terminated_length": 423.28826904296875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.0945, "frac_reward_zero_std": 0.46875, "grad_norm": 0.2838056981563568, "learning_rate": 9.892971185224244e-07, "loss": 0.0314, "num_tokens": 50051402.0, "reward": 0.734375, "reward_std": 0.11945655941963196, "rewards/correctness_reward_func/mean": 0.49609375, "rewards/correctness_reward_func/std": 0.5004737377166748, "rewards/strict_format_reward_func/mean": 0.97265625, "rewards/strict_format_reward_func/std": 0.16324250400066376, "step": 189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 454.34765625, "completions/mean_terminated_length": 437.9004211425781, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.095, "frac_reward_zero_std": 0.390625, "grad_norm": 0.2756107449531555, "learning_rate": 9.891298437031012e-07, "loss": -0.0039, "num_tokens": 50365404.0, "reward": 0.705078125, "reward_std": 0.13564766943454742, "rewards/correctness_reward_func/mean": 0.455078125, "rewards/correctness_reward_func/std": 0.4984649419784546, "rewards/strict_format_reward_func/mean": 0.955078125, "rewards/strict_format_reward_func/std": 0.20733514428138733, "step": 190 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1276.0, "completions/mean_length": 426.978515625, "completions/mean_terminated_length": 411.7156982421875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.0955, "frac_reward_zero_std": 0.5, "grad_norm": 0.25797563791275024, "learning_rate": 9.889612861977853e-07, "loss": 0.0239, "num_tokens": 50662273.0, "reward": 0.7548828125, "reward_std": 0.11666762083768845, "rewards/correctness_reward_func/mean": 0.541015625, "rewards/correctness_reward_func/std": 0.49880221486091614, "rewards/strict_format_reward_func/mean": 0.96875, "rewards/strict_format_reward_func/std": 0.17416280508041382, "step": 191 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1250.0, "completions/mean_length": 445.33984375, "completions/mean_terminated_length": 425.3080139160156, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.096, "frac_reward_zero_std": 0.390625, "grad_norm": 0.291076123714447, "learning_rate": 9.887914464484987e-07, "loss": 0.035, "num_tokens": 50970343.0, "reward": 0.7265625, "reward_std": 0.13782770931720734, "rewards/correctness_reward_func/mean": 0.498046875, "rewards/correctness_reward_func/std": 0.5004851818084717, "rewards/strict_format_reward_func/mean": 0.955078125, "rewards/strict_format_reward_func/std": 0.20733514428138733, "step": 192 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 377.708984375, "completions/mean_terminated_length": 368.8106689453125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.0965, "frac_reward_zero_std": 0.5, "grad_norm": 0.30128878355026245, "learning_rate": 9.886203249006264e-07, "loss": 0.053, "num_tokens": 51237506.0, "reward": 0.8125, "reward_std": 0.11813852190971375, "rewards/correctness_reward_func/mean": 0.634765625, "rewards/correctness_reward_func/std": 0.4819667339324951, "rewards/strict_format_reward_func/mean": 0.990234375, "rewards/strict_format_reward_func/std": 0.09843364357948303, "step": 193 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1280.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 464.87890625, "completions/mean_terminated_length": 450.2942199707031, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.097, "frac_reward_zero_std": 0.59375, "grad_norm": 0.24394026398658752, "learning_rate": 9.88447922002915e-07, "loss": 0.0262, "num_tokens": 51557444.0, "reward": 0.7265625, "reward_std": 0.09223461151123047, "rewards/correctness_reward_func/mean": 0.47265625, "rewards/correctness_reward_func/std": 0.49974003434181213, "rewards/strict_format_reward_func/mean": 0.98046875, "rewards/strict_format_reward_func/std": 0.1385180652141571, "step": 194 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1220.0, "completions/mean_length": 443.154296875, "completions/mean_terminated_length": 434.9013977050781, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.0975, "frac_reward_zero_std": 0.5, "grad_norm": 0.27368584275245667, "learning_rate": 9.882742382074706e-07, "loss": -0.005, "num_tokens": 51865003.0, "reward": 0.728515625, "reward_std": 0.11139891296625137, "rewards/correctness_reward_func/mean": 0.4765625, "rewards/correctness_reward_func/std": 0.49993884563446045, "rewards/strict_format_reward_func/mean": 0.98046875, "rewards/strict_format_reward_func/std": 0.1385180652141571, "step": 195 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 413.12109375, "completions/mean_terminated_length": 402.8419189453125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.098, "frac_reward_zero_std": 0.5625, "grad_norm": 0.24915266036987305, "learning_rate": 9.880992739697588e-07, "loss": 0.0197, "num_tokens": 52156585.0, "reward": 0.724609375, "reward_std": 0.1030588150024414, "rewards/correctness_reward_func/mean": 0.490234375, "rewards/correctness_reward_func/std": 0.5003935098648071, "rewards/strict_format_reward_func/mean": 0.958984375, "rewards/strict_format_reward_func/std": 0.19852031767368317, "step": 196 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1204.0, "completions/mean_length": 431.666015625, "completions/mean_terminated_length": 423.2998046875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.0985, "frac_reward_zero_std": 0.46875, "grad_norm": 0.25503963232040405, "learning_rate": 9.879230297486034e-07, "loss": 0.0141, "num_tokens": 52460126.0, "reward": 0.7236328125, "reward_std": 0.11672013998031616, "rewards/correctness_reward_func/mean": 0.453125, "rewards/correctness_reward_func/std": 0.4982847273349762, "rewards/strict_format_reward_func/mean": 0.994140625, "rewards/strict_format_reward_func/std": 0.07639661431312561, "step": 197 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1280.0, "completions/max_terminated_length": 1250.0, "completions/mean_length": 464.04296875, "completions/mean_terminated_length": 439.4164733886719, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.099, "frac_reward_zero_std": 0.484375, "grad_norm": 0.2505626678466797, "learning_rate": 9.877455060061838e-07, "loss": 0.0417, "num_tokens": 52776668.0, "reward": 0.732421875, "reward_std": 0.11660541594028473, "rewards/correctness_reward_func/mean": 0.521484375, "rewards/correctness_reward_func/std": 0.5000267624855042, "rewards/strict_format_reward_func/mean": 0.943359375, "rewards/strict_format_reward_func/std": 0.23138070106506348, "step": 198 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1270.0, "completions/mean_length": 454.27734375, "completions/mean_terminated_length": 434.46002197265625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0995, "frac_reward_zero_std": 0.5, "grad_norm": 0.2546021342277527, "learning_rate": 9.875667032080352e-07, "loss": 0.0281, "num_tokens": 53087450.0, "reward": 0.744140625, "reward_std": 0.10927945375442505, "rewards/correctness_reward_func/mean": 0.525390625, "rewards/correctness_reward_func/std": 0.4998432695865631, "rewards/strict_format_reward_func/mean": 0.962890625, "rewards/strict_format_reward_func/std": 0.18921469151973724, "step": 199 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1280.0, "completions/max_terminated_length": 1183.0, "completions/mean_length": 473.017578125, "completions/mean_terminated_length": 455.2994079589844, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.1, "frac_reward_zero_std": 0.5625, "grad_norm": 0.22602103650569916, "learning_rate": 9.873866218230476e-07, "loss": 0.0308, "num_tokens": 53410371.0, "reward": 0.7041015625, "reward_std": 0.09856415539979935, "rewards/correctness_reward_func/mean": 0.4453125, "rewards/correctness_reward_func/std": 0.49748632311820984, "rewards/strict_format_reward_func/mean": 0.962890625, "rewards/strict_format_reward_func/std": 0.18921469151973724, "step": 200 } ], "logging_steps": 1, "max_steps": 2000, "num_input_tokens_seen": 53410371, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }