{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.78125, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 914.375, "completions/clipped_ratio": 0.0, "completions/max_length": 1183.0, "completions/max_terminated_length": 1183.0, "completions/mean_length": 914.375, "completions/mean_terminated_length": 914.375, "completions/min_length": 538.0, "completions/min_terminated_length": 538.0, "epoch": 0.00390625, "frac_reward_zero_std": 0.0, "grad_norm": 0.5925750732421875, "kl": 0.0005806067929370329, "learning_rate": 0.0, "loss": 0.0, "num_tokens": 48251.0, "reward": 0.17561426758766174, "reward_std": 0.5293996334075928, "rewards/reward_function/mean": 0.17561426758766174, "rewards/reward_function/std": 0.5293996930122375, "step": 1 }, { "completion_length": 939.875, "completions/clipped_ratio": 0.0, "completions/max_length": 1141.0, "completions/max_terminated_length": 1141.0, "completions/mean_length": 939.875, "completions/mean_terminated_length": 939.875, "completions/min_length": 781.0, "completions/min_terminated_length": 781.0, "epoch": 0.0078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.4351307153701782, "kl": 0.00043600249045994133, "learning_rate": 5.000000000000001e-07, "loss": 0.0, "num_tokens": 96706.0, "reward": 0.07213374972343445, "reward_std": 0.6617448329925537, "rewards/reward_function/mean": 0.07213374972343445, "rewards/reward_function/std": 0.6617448925971985, "step": 2 }, { "completion_length": 957.625, "completions/clipped_ratio": 0.0, "completions/max_length": 1241.0, "completions/max_terminated_length": 1241.0, "completions/mean_length": 957.625, "completions/mean_terminated_length": 957.625, "completions/min_length": 684.0, "completions/min_terminated_length": 684.0, "epoch": 0.01171875, "frac_reward_zero_std": 0.0, "grad_norm": 0.274860680103302, "kl": 0.0005422348549473099, "learning_rate": 1.0000000000000002e-06, "loss": 0.0, "num_tokens": 145303.0, "reward": 0.1806439459323883, "reward_std": 0.5124807357788086, "rewards/reward_function/mean": 0.1806439459323883, "rewards/reward_function/std": 0.5124807357788086, "step": 3 }, { "completion_length": 918.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1079.0, "completions/max_terminated_length": 1079.0, "completions/mean_length": 918.0, "completions/mean_terminated_length": 918.0, "completions/min_length": 628.0, "completions/min_terminated_length": 628.0, "epoch": 0.015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.33157476782798767, "kl": 0.00042391348688397557, "learning_rate": 1.5e-06, "loss": 0.0, "num_tokens": 193583.0, "reward": 0.07469659298658371, "reward_std": 0.6634011268615723, "rewards/reward_function/mean": 0.07469659298658371, "rewards/reward_function/std": 0.6634011268615723, "step": 4 }, { "completion_length": 1034.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1189.0, "completions/max_terminated_length": 1189.0, "completions/mean_length": 1034.5, "completions/mean_terminated_length": 1034.5, "completions/min_length": 887.0, "completions/min_terminated_length": 887.0, "epoch": 0.01953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2924421429634094, "kl": 0.00045540891733253375, "learning_rate": 2.0000000000000003e-06, "loss": 0.0, "num_tokens": 249859.0, "reward": 0.016167104244232178, "reward_std": 0.6689337491989136, "rewards/reward_function/mean": 0.016167104244232178, "rewards/reward_function/std": 0.6689338088035583, "step": 5 }, { "completion_length": 876.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1272.0, "completions/max_terminated_length": 1272.0, "completions/mean_length": 876.0, "completions/mean_terminated_length": 876.0, "completions/min_length": 645.0, "completions/min_terminated_length": 645.0, "epoch": 0.0234375, "frac_reward_zero_std": 0.0, "grad_norm": 0.4938257038593292, "kl": 0.0005481051848619245, "learning_rate": 2.5e-06, "loss": 0.0, "num_tokens": 304867.0, "reward": 0.0005262494087219238, "reward_std": 0.6533803343772888, "rewards/reward_function/mean": 0.0005262494087219238, "rewards/reward_function/std": 0.6533803343772888, "step": 6 }, { "completion_length": 1123.625, "completions/clipped_ratio": 0.0, "completions/max_length": 1490.0, "completions/max_terminated_length": 1490.0, "completions/mean_length": 1123.625, "completions/mean_terminated_length": 1123.625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.02734375, "frac_reward_zero_std": 0.0, "grad_norm": 0.5299341082572937, "kl": 0.0005529591653612442, "learning_rate": 3e-06, "loss": 0.0, "num_tokens": 361856.0, "reward": -0.2599995732307434, "reward_std": 0.7911489009857178, "rewards/reward_function/mean": -0.2599995732307434, "rewards/reward_function/std": 0.7911489605903625, "step": 7 }, { "completion_length": 1054.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1215.0, "completions/max_terminated_length": 1215.0, "completions/mean_length": 1054.5, "completions/mean_terminated_length": 1054.5, "completions/min_length": 935.0, "completions/min_terminated_length": 935.0, "epoch": 0.03125, "frac_reward_zero_std": 0.0, "grad_norm": 0.29235586524009705, "kl": 0.0004442112476681359, "learning_rate": 3.5e-06, "loss": 0.0, "num_tokens": 418292.0, "reward": -0.27060431241989136, "reward_std": 0.7798133492469788, "rewards/reward_function/mean": -0.27060431241989136, "rewards/reward_function/std": 0.7798133492469788, "step": 8 }, { "completion_length": 1041.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1649.0, "completions/max_terminated_length": 1649.0, "completions/mean_length": 1041.25, "completions/mean_terminated_length": 1041.25, "completions/min_length": 694.0, "completions/min_terminated_length": 694.0, "epoch": 0.03515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.29960209131240845, "kl": 0.0004553622711682692, "learning_rate": 4.000000000000001e-06, "loss": 0.0, "num_tokens": 463294.0, "reward": -0.09699174016714096, "reward_std": 0.7479231357574463, "rewards/reward_function/mean": -0.09699174016714096, "rewards/reward_function/std": 0.7479231357574463, "step": 9 }, { "completion_length": 986.375, "completions/clipped_ratio": 0.0, "completions/max_length": 1359.0, "completions/max_terminated_length": 1359.0, "completions/mean_length": 986.375, "completions/mean_terminated_length": 986.375, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.0390625, "frac_reward_zero_std": 0.0, "grad_norm": 0.44665637612342834, "kl": 0.0005360567811294459, "learning_rate": 4.5e-06, "loss": 0.0, "num_tokens": 507857.0, "reward": 0.07969788461923599, "reward_std": 0.6664658784866333, "rewards/reward_function/mean": 0.07969788461923599, "rewards/reward_function/std": 0.6664658784866333, "step": 10 }, { "completion_length": 925.875, "completions/clipped_ratio": 0.0, "completions/max_length": 1388.0, "completions/max_terminated_length": 1388.0, "completions/mean_length": 925.875, "completions/mean_terminated_length": 925.875, "completions/min_length": 598.0, "completions/min_terminated_length": 598.0, "epoch": 0.04296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.4459737241268158, "kl": 0.0005316538081387989, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 551936.0, "reward": -0.1024932935833931, "reward_std": 0.7433716058731079, "rewards/reward_function/mean": -0.1024932935833931, "rewards/reward_function/std": 0.7433716058731079, "step": 11 }, { "completion_length": 1036.875, "completions/clipped_ratio": 0.0, "completions/max_length": 1326.0, "completions/max_terminated_length": 1326.0, "completions/mean_length": 1036.875, "completions/mean_terminated_length": 1036.875, "completions/min_length": 856.0, "completions/min_terminated_length": 856.0, "epoch": 0.046875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2756291329860687, "kl": 0.00041996776417363435, "learning_rate": 4.99847706754774e-06, "loss": 0.0, "num_tokens": 596903.0, "reward": 0.36005687713623047, "reward_std": 0.2220204770565033, "rewards/reward_function/mean": 0.36005687713623047, "rewards/reward_function/std": 0.2220204472541809, "step": 12 }, { "completion_length": 1167.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1468.0, "completions/max_terminated_length": 1468.0, "completions/mean_length": 1167.25, "completions/mean_terminated_length": 1167.25, "completions/min_length": 614.0, "completions/min_terminated_length": 614.0, "epoch": 0.05078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.3536391854286194, "kl": 0.0005555427414947189, "learning_rate": 4.993910125649561e-06, "loss": 0.0, "num_tokens": 650185.0, "reward": -0.10552466660737991, "reward_std": 0.7408891320228577, "rewards/reward_function/mean": -0.10552466660737991, "rewards/reward_function/std": 0.7408891320228577, "step": 13 }, { "completion_length": 978.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1308.0, "completions/max_terminated_length": 1308.0, "completions/mean_length": 978.75, "completions/mean_terminated_length": 978.75, "completions/min_length": 612.0, "completions/min_terminated_length": 612.0, "epoch": 0.0546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.3985886871814728, "kl": 0.000563694593438413, "learning_rate": 4.986304738420684e-06, "loss": 0.0, "num_tokens": 701959.0, "reward": -0.11333049833774567, "reward_std": 0.7342555522918701, "rewards/reward_function/mean": -0.11333049833774567, "rewards/reward_function/std": 0.7342556118965149, "step": 14 }, { "completion_length": 1253.875, "completions/clipped_ratio": 0.0, "completions/max_length": 1684.0, "completions/max_terminated_length": 1684.0, "completions/mean_length": 1253.875, "completions/mean_terminated_length": 1253.875, "completions/min_length": 1103.0, "completions/min_terminated_length": 1103.0, "epoch": 0.05859375, "frac_reward_zero_std": 0.0, "grad_norm": 0.29394710063934326, "kl": 0.0005726947129005566, "learning_rate": 4.975670171853926e-06, "loss": 0.0, "num_tokens": 755934.0, "reward": 0.19782808423042297, "reward_std": 0.5165873169898987, "rewards/reward_function/mean": 0.19782808423042297, "rewards/reward_function/std": 0.5165873169898987, "step": 15 }, { "completion_length": 1230.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1535.0, "completions/max_terminated_length": 1535.0, "completions/mean_length": 1230.0, "completions/mean_terminated_length": 1230.0, "completions/min_length": 934.0, "completions/min_terminated_length": 934.0, "epoch": 0.0625, "frac_reward_zero_std": 0.0, "grad_norm": 0.282123863697052, "kl": 0.000479752998217009, "learning_rate": 4.962019382530521e-06, "loss": 0.0, "num_tokens": 809718.0, "reward": -0.1809864193201065, "reward_std": 0.7100936770439148, "rewards/reward_function/mean": -0.1809864193201065, "rewards/reward_function/std": 0.7100936770439148, "step": 16 }, { "completion_length": 1231.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2350.0, "completions/max_terminated_length": 2350.0, "completions/mean_length": 1231.0, "completions/mean_terminated_length": 1231.0, "completions/min_length": 907.0, "completions/min_terminated_length": 907.0, "epoch": 0.06640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2648545503616333, "kl": 0.0004779834271175787, "learning_rate": 4.9453690018345144e-06, "loss": 0.0, "num_tokens": 868222.0, "reward": -0.18590302765369415, "reward_std": 0.7067751884460449, "rewards/reward_function/mean": -0.18590302765369415, "rewards/reward_function/std": 0.7067751884460449, "step": 17 }, { "completion_length": 1052.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1399.0, "completions/max_terminated_length": 1399.0, "completions/mean_length": 1052.5, "completions/mean_terminated_length": 1052.5, "completions/min_length": 626.0, "completions/min_terminated_length": 626.0, "epoch": 0.0703125, "frac_reward_zero_std": 0.0, "grad_norm": 0.30565333366394043, "kl": 0.0004614779900293797, "learning_rate": 4.925739315689991e-06, "loss": 0.0, "num_tokens": 925298.0, "reward": -0.2848556339740753, "reward_std": 0.764528214931488, "rewards/reward_function/mean": -0.2848556339740753, "rewards/reward_function/std": 0.7645282745361328, "step": 18 }, { "completion_length": 1201.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1488.0, "completions/max_terminated_length": 1488.0, "completions/mean_length": 1201.75, "completions/mean_terminated_length": 1201.75, "completions/min_length": 985.0, "completions/min_terminated_length": 985.0, "epoch": 0.07421875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2542005181312561, "kl": 0.00046427045890595764, "learning_rate": 4.903154239845798e-06, "loss": 0.0, "num_tokens": 983568.0, "reward": 0.26851925253868103, "reward_std": 0.292377233505249, "rewards/reward_function/mean": 0.26851925253868103, "rewards/reward_function/std": 0.29237720370292664, "step": 19 }, { "completion_length": 1270.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1647.0, "completions/max_terminated_length": 1647.0, "completions/mean_length": 1270.0, "completions/mean_terminated_length": 1270.0, "completions/min_length": 930.0, "completions/min_terminated_length": 930.0, "epoch": 0.078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.26307252049446106, "kl": 0.0004189292449154891, "learning_rate": 4.8776412907378845e-06, "loss": 0.0, "num_tokens": 1042384.0, "reward": 0.07484199106693268, "reward_std": 0.6634507179260254, "rewards/reward_function/mean": 0.07484199106693268, "rewards/reward_function/std": 0.6634507179260254, "step": 20 }, { "completion_length": 997.375, "completions/clipped_ratio": 0.0, "completions/max_length": 1256.0, "completions/max_terminated_length": 1256.0, "completions/mean_length": 997.375, "completions/mean_terminated_length": 997.375, "completions/min_length": 807.0, "completions/min_terminated_length": 807.0, "epoch": 0.08203125, "frac_reward_zero_std": 0.0, "grad_norm": 0.35027605295181274, "kl": 0.0005483183849719353, "learning_rate": 4.849231551964771e-06, "loss": 0.0, "num_tokens": 1099571.0, "reward": 0.2563348412513733, "reward_std": 0.5079569220542908, "rewards/reward_function/mean": 0.2563348412513733, "rewards/reward_function/std": 0.5079569220542908, "step": 21 }, { "completion_length": 947.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1186.0, "completions/max_terminated_length": 1186.0, "completions/mean_length": 947.25, "completions/mean_terminated_length": 947.25, "completions/min_length": 756.0, "completions/min_terminated_length": 756.0, "epoch": 0.0859375, "frac_reward_zero_std": 0.0, "grad_norm": 0.3029741048812866, "kl": 0.0004957405471941456, "learning_rate": 4.817959636416969e-06, "loss": 0.0, "num_tokens": 1156357.0, "reward": -0.1654960811138153, "reward_std": 0.7127606272697449, "rewards/reward_function/mean": -0.1654960811138153, "rewards/reward_function/std": 0.7127606272697449, "step": 22 }, { "completion_length": 1005.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1239.0, "completions/max_terminated_length": 1239.0, "completions/mean_length": 1005.5, "completions/mean_terminated_length": 1005.5, "completions/min_length": 750.0, "completions/min_terminated_length": 750.0, "epoch": 0.08984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.33245348930358887, "kl": 0.0005613294633803889, "learning_rate": 4.783863644106502e-06, "loss": 0.0, "num_tokens": 1213609.0, "reward": 0.19226206839084625, "reward_std": 0.5118025541305542, "rewards/reward_function/mean": 0.19226206839084625, "rewards/reward_function/std": 0.5118025541305542, "step": 23 }, { "completion_length": 888.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1100.0, "completions/max_terminated_length": 1100.0, "completions/mean_length": 888.25, "completions/mean_terminated_length": 888.25, "completions/min_length": 629.0, "completions/min_terminated_length": 629.0, "epoch": 0.09375, "frac_reward_zero_std": 0.0, "grad_norm": 0.34459948539733887, "kl": 0.0005078868562122807, "learning_rate": 4.746985115747918e-06, "loss": 0.0, "num_tokens": 1269923.0, "reward": 0.006138928234577179, "reward_std": 0.6435980200767517, "rewards/reward_function/mean": 0.006138928234577179, "rewards/reward_function/std": 0.6435979604721069, "step": 24 }, { "completion_length": 988.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1209.0, "completions/max_terminated_length": 1209.0, "completions/mean_length": 988.5, "completions/mean_terminated_length": 988.5, "completions/min_length": 650.0, "completions/min_terminated_length": 650.0, "epoch": 0.09765625, "frac_reward_zero_std": 0.0, "grad_norm": 0.3348124027252197, "kl": 0.0005601778539130464, "learning_rate": 4.707368982147318e-06, "loss": 0.0, "num_tokens": 1316151.0, "reward": 0.01586177945137024, "reward_std": 0.6504648923873901, "rewards/reward_function/mean": 0.01586177945137024, "rewards/reward_function/std": 0.6504649519920349, "step": 25 }, { "completion_length": 1129.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1498.0, "completions/max_terminated_length": 1498.0, "completions/mean_length": 1129.25, "completions/mean_terminated_length": 1129.25, "completions/min_length": 867.0, "completions/min_terminated_length": 867.0, "epoch": 0.1015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.3254358470439911, "kl": 0.0005182483946555294, "learning_rate": 4.665063509461098e-06, "loss": 0.0, "num_tokens": 1363505.0, "reward": 0.10026979446411133, "reward_std": 0.679513156414032, "rewards/reward_function/mean": 0.10026979446411133, "rewards/reward_function/std": 0.6795132160186768, "step": 26 }, { "completion_length": 1122.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1480.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 1122.25, "completions/mean_terminated_length": 1122.25, "completions/min_length": 855.0, "completions/min_terminated_length": 855.0, "epoch": 0.10546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.27222707867622375, "kl": 0.0005421710084192455, "learning_rate": 4.620120240391065e-06, "loss": 0.0, "num_tokens": 1410803.0, "reward": 0.018940787762403488, "reward_std": 0.6555877923965454, "rewards/reward_function/mean": 0.018940787762403488, "rewards/reward_function/std": 0.6555877923965454, "step": 27 }, { "completion_length": 956.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1243.0, "completions/max_terminated_length": 1243.0, "completions/mean_length": 956.5, "completions/mean_terminated_length": 956.5, "completions/min_length": 805.0, "completions/min_terminated_length": 805.0, "epoch": 0.109375, "frac_reward_zero_std": 0.0, "grad_norm": 0.3711608648300171, "kl": 0.0004645055887522176, "learning_rate": 4.572593931387604e-06, "loss": 0.0, "num_tokens": 1456775.0, "reward": 0.3047725558280945, "reward_std": 0.2678278088569641, "rewards/reward_function/mean": 0.3047725558280945, "rewards/reward_function/std": 0.2678278088569641, "step": 28 }, { "completion_length": 998.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1578.0, "completions/max_terminated_length": 1578.0, "completions/mean_length": 998.0, "completions/mean_terminated_length": 998.0, "completions/min_length": 723.0, "completions/min_terminated_length": 723.0, "epoch": 0.11328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.3207095265388489, "kl": 0.0005635232664644718, "learning_rate": 4.522542485937369e-06, "loss": 0.0, "num_tokens": 1513687.0, "reward": -0.11509008705615997, "reward_std": 0.7328100800514221, "rewards/reward_function/mean": -0.11509008705615997, "rewards/reward_function/std": 0.7328100800514221, "step": 29 }, { "completion_length": 1042.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1424.0, "completions/max_terminated_length": 1424.0, "completions/mean_length": 1042.5, "completions/mean_terminated_length": 1042.5, "completions/min_length": 839.0, "completions/min_terminated_length": 839.0, "epoch": 0.1171875, "frac_reward_zero_std": 0.0, "grad_norm": 0.3066699504852295, "kl": 0.0005217162106418982, "learning_rate": 4.470026884016805e-06, "loss": 0.0, "num_tokens": 1570955.0, "reward": -0.1107390969991684, "reward_std": 0.7364593148231506, "rewards/reward_function/mean": -0.1107390969991684, "rewards/reward_function/std": 0.7364593148231506, "step": 30 }, { "completion_length": 1029.875, "completions/clipped_ratio": 0.0, "completions/max_length": 1752.0, "completions/max_terminated_length": 1752.0, "completions/mean_length": 1029.875, "completions/mean_terminated_length": 1029.875, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 0.12109375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2864665985107422, "kl": 0.0004613926066667773, "learning_rate": 4.415111107797445e-06, "loss": 0.0, "num_tokens": 1628122.0, "reward": 0.24289178848266602, "reward_std": 0.5025007128715515, "rewards/reward_function/mean": 0.24289178848266602, "rewards/reward_function/std": 0.5025007128715515, "step": 31 }, { "completion_length": 904.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1088.0, "completions/max_terminated_length": 1088.0, "completions/mean_length": 904.5, "completions/mean_terminated_length": 904.5, "completions/min_length": 605.0, "completions/min_terminated_length": 605.0, "epoch": 0.125, "frac_reward_zero_std": 0.0, "grad_norm": 0.34721776843070984, "kl": 0.00046865284093655646, "learning_rate": 4.357862063693486e-06, "loss": 0.0, "num_tokens": 1684286.0, "reward": 0.4172564446926117, "reward_std": 0.015801435336470604, "rewards/reward_function/mean": 0.4172564446926117, "rewards/reward_function/std": 0.015801437199115753, "step": 32 }, { "completion_length": 748.125, "completions/clipped_ratio": 0.0, "completions/max_length": 1067.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 748.125, "completions/mean_terminated_length": 748.125, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.12890625, "frac_reward_zero_std": 0.0, "grad_norm": 0.5278727412223816, "kl": 0.0005870972745469771, "learning_rate": 4.2983495008466285e-06, "loss": 0.0, "num_tokens": 1734087.0, "reward": -0.012617290019989014, "reward_std": 0.6455253958702087, "rewards/reward_function/mean": -0.012617290019989014, "rewards/reward_function/std": 0.6455253958702087, "step": 33 }, { "completion_length": 717.375, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 717.375, "completions/mean_terminated_length": 717.375, "completions/min_length": 624.0, "completions/min_terminated_length": 624.0, "epoch": 0.1328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.34698522090911865, "kl": 0.0004982690297765657, "learning_rate": 4.236645926147493e-06, "loss": 0.0, "num_tokens": 1783642.0, "reward": 0.15846197307109833, "reward_std": 0.5145913362503052, "rewards/reward_function/mean": 0.15846197307109833, "rewards/reward_function/std": 0.5145912766456604, "step": 34 }, { "completion_length": 594.625, "completions/clipped_ratio": 0.0, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 594.625, "completions/mean_terminated_length": 594.625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.13671875, "frac_reward_zero_std": 0.0, "grad_norm": 0.5477731823921204, "kl": 0.0005064220604253933, "learning_rate": 4.172826515897146e-06, "loss": 0.0, "num_tokens": 1832215.0, "reward": -0.2889060974121094, "reward_std": 0.7602130770683289, "rewards/reward_function/mean": -0.2889060974121094, "rewards/reward_function/std": 0.7602130770683289, "step": 35 }, { "completion_length": 835.625, "completions/clipped_ratio": 0.0, "completions/max_length": 1236.0, "completions/max_terminated_length": 1236.0, "completions/mean_length": 835.625, "completions/mean_terminated_length": 835.625, "completions/min_length": 624.0, "completions/min_terminated_length": 624.0, "epoch": 0.140625, "frac_reward_zero_std": 0.0, "grad_norm": 0.3209806978702545, "kl": 0.0005167589188204147, "learning_rate": 4.106969024216348e-06, "loss": 0.0, "num_tokens": 1882716.0, "reward": 0.23565331101417542, "reward_std": 0.4997692406177521, "rewards/reward_function/mean": 0.23565331101417542, "rewards/reward_function/std": 0.4997692406177521, "step": 36 }, { "completion_length": 1067.125, "completions/clipped_ratio": 0.0, "completions/max_length": 1422.0, "completions/max_terminated_length": 1422.0, "completions/mean_length": 1067.125, "completions/mean_terminated_length": 1067.125, "completions/min_length": 886.0, "completions/min_terminated_length": 886.0, "epoch": 0.14453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.31051820516586304, "kl": 0.0005310940687195398, "learning_rate": 4.039153688314146e-06, "loss": 0.0, "num_tokens": 1933333.0, "reward": 0.3604900538921356, "reward_std": 0.22474592924118042, "rewards/reward_function/mean": 0.3604900538921356, "rewards/reward_function/std": 0.22474592924118042, "step": 37 }, { "completion_length": 983.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1199.0, "completions/max_terminated_length": 1199.0, "completions/mean_length": 983.0, "completions/mean_terminated_length": 983.0, "completions/min_length": 791.0, "completions/min_terminated_length": 791.0, "epoch": 0.1484375, "frac_reward_zero_std": 0.0, "grad_norm": 0.28662100434303284, "kl": 0.0004848391836276278, "learning_rate": 3.969463130731183e-06, "loss": 0.0, "num_tokens": 1983277.0, "reward": -0.09930499643087387, "reward_std": 0.7459418773651123, "rewards/reward_function/mean": -0.09930499643087387, "rewards/reward_function/std": 0.7459418773651123, "step": 38 }, { "completion_length": 1083.375, "completions/clipped_ratio": 0.0, "completions/max_length": 1410.0, "completions/max_terminated_length": 1410.0, "completions/mean_length": 1083.375, "completions/mean_terminated_length": 1083.375, "completions/min_length": 937.0, "completions/min_terminated_length": 937.0, "epoch": 0.15234375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2731831967830658, "kl": 0.0004829144090763293, "learning_rate": 3.897982258676867e-06, "loss": 0.0, "num_tokens": 2034024.0, "reward": -0.18277299404144287, "reward_std": 0.7117413878440857, "rewards/reward_function/mean": -0.18277299404144287, "rewards/reward_function/std": 0.7117413878440857, "step": 39 }, { "completion_length": 1012.875, "completions/clipped_ratio": 0.0, "completions/max_length": 1179.0, "completions/max_terminated_length": 1179.0, "completions/mean_length": 1012.875, "completions/mean_terminated_length": 1012.875, "completions/min_length": 803.0, "completions/min_terminated_length": 803.0, "epoch": 0.15625, "frac_reward_zero_std": 0.0, "grad_norm": 0.24166430532932281, "kl": 0.00040847257332643494, "learning_rate": 3.824798160583012e-06, "loss": 0.0, "num_tokens": 2084207.0, "reward": 0.37631669640541077, "reward_std": 0.19093753397464752, "rewards/reward_function/mean": 0.37631669640541077, "rewards/reward_function/std": 0.19093753397464752, "step": 40 }, { "completion_length": 928.875, "completions/clipped_ratio": 0.0, "completions/max_length": 1163.0, "completions/max_terminated_length": 1163.0, "completions/mean_length": 928.875, "completions/mean_terminated_length": 928.875, "completions/min_length": 579.0, "completions/min_terminated_length": 579.0, "epoch": 0.16015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.32170408964157104, "kl": 0.000433544373663608, "learning_rate": 3.7500000000000005e-06, "loss": 0.0, "num_tokens": 2131326.0, "reward": 0.17959481477737427, "reward_std": 0.5257959365844727, "rewards/reward_function/mean": 0.17959481477737427, "rewards/reward_function/std": 0.5257958769798279, "step": 41 }, { "completion_length": 941.625, "completions/clipped_ratio": 0.0, "completions/max_length": 1077.0, "completions/max_terminated_length": 1077.0, "completions/mean_length": 941.625, "completions/mean_terminated_length": 941.625, "completions/min_length": 762.0, "completions/min_terminated_length": 762.0, "epoch": 0.1640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.40420642495155334, "kl": 0.0004869201438850723, "learning_rate": 3.6736789069647273e-06, "loss": 0.0, "num_tokens": 2178547.0, "reward": 0.3620530366897583, "reward_std": 0.2077617347240448, "rewards/reward_function/mean": 0.3620530366897583, "rewards/reward_function/std": 0.207761749625206, "step": 42 }, { "completion_length": 1078.625, "completions/clipped_ratio": 0.0, "completions/max_length": 1457.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 1078.625, "completions/mean_terminated_length": 1078.625, "completions/min_length": 908.0, "completions/min_terminated_length": 908.0, "epoch": 0.16796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.3174184262752533, "kl": 0.0005323004734236747, "learning_rate": 3.595927866972694e-06, "loss": 0.0, "num_tokens": 2226864.0, "reward": -0.10311660170555115, "reward_std": 0.7427304983139038, "rewards/reward_function/mean": -0.10311660170555115, "rewards/reward_function/std": 0.7427304983139038, "step": 43 }, { "completion_length": 920.125, "completions/clipped_ratio": 0.0, "completions/max_length": 1060.0, "completions/max_terminated_length": 1060.0, "completions/mean_length": 920.125, "completions/mean_terminated_length": 920.125, "completions/min_length": 758.0, "completions/min_terminated_length": 758.0, "epoch": 0.171875, "frac_reward_zero_std": 0.0, "grad_norm": 0.3262939751148224, "kl": 0.0004862127752858214, "learning_rate": 3.516841607689501e-06, "loss": 0.0, "num_tokens": 2273913.0, "reward": -0.10695745050907135, "reward_std": 0.7395932078361511, "rewards/reward_function/mean": -0.10695745050907135, "rewards/reward_function/std": 0.7395932078361511, "step": 44 }, { "completion_length": 806.875, "completions/clipped_ratio": 0.0, "completions/max_length": 943.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 806.875, "completions/mean_terminated_length": 806.875, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.17578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.37206342816352844, "kl": 0.00043304747669026256, "learning_rate": 3.436516483539781e-06, "loss": 0.0, "num_tokens": 2326376.0, "reward": 0.08102156221866608, "reward_std": 0.6678274869918823, "rewards/reward_function/mean": 0.08102156221866608, "rewards/reward_function/std": 0.6678274869918823, "step": 45 }, { "completion_length": 912.125, "completions/clipped_ratio": 0.0, "completions/max_length": 1138.0, "completions/max_terminated_length": 1138.0, "completions/mean_length": 912.125, "completions/mean_terminated_length": 912.125, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 0.1796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.31978893280029297, "kl": 0.0003971458208980039, "learning_rate": 3.3550503583141726e-06, "loss": 0.0, "num_tokens": 2379681.0, "reward": -0.09052923321723938, "reward_std": 0.7533401846885681, "rewards/reward_function/mean": -0.09052923321723938, "rewards/reward_function/std": 0.7533401846885681, "step": 46 }, { "completion_length": 906.375, "completions/clipped_ratio": 0.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 906.375, "completions/mean_terminated_length": 906.375, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "epoch": 0.18359375, "frac_reward_zero_std": 0.0, "grad_norm": 0.3475249707698822, "kl": 0.00039272347203223035, "learning_rate": 3.272542485937369e-06, "loss": 0.0, "num_tokens": 2432940.0, "reward": 0.455754816532135, "reward_std": 0.018690278753638268, "rewards/reward_function/mean": 0.455754816532135, "rewards/reward_function/std": 0.01869027502834797, "step": 47 }, { "completion_length": 849.875, "completions/clipped_ratio": 0.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 849.875, "completions/mean_terminated_length": 849.875, "completions/min_length": 757.0, "completions/min_terminated_length": 757.0, "epoch": 0.1875, "frac_reward_zero_std": 0.0, "grad_norm": 0.3427131772041321, "kl": 0.0004229802143527195, "learning_rate": 3.189093389542498e-06, "loss": 0.0, "num_tokens": 2485747.0, "reward": 0.2003743201494217, "reward_std": 0.5182459950447083, "rewards/reward_function/mean": 0.2003743201494217, "rewards/reward_function/std": 0.5182459950447083, "step": 48 }, { "completion_length": 890.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1381.0, "completions/max_terminated_length": 1381.0, "completions/mean_length": 890.25, "completions/mean_terminated_length": 890.25, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.19140625, "frac_reward_zero_std": 0.0, "grad_norm": 0.5939275026321411, "kl": 0.0005514047734322958, "learning_rate": 3.1048047389991693e-06, "loss": 0.0, "num_tokens": 2539709.0, "reward": -0.11278533935546875, "reward_std": 0.7347214221954346, "rewards/reward_function/mean": -0.11278533935546875, "rewards/reward_function/std": 0.7347214221954346, "step": 49 }, { "completion_length": 1022.125, "completions/clipped_ratio": 0.0, "completions/max_length": 1454.0, "completions/max_terminated_length": 1454.0, "completions/mean_length": 1022.125, "completions/mean_terminated_length": 1022.125, "completions/min_length": 797.0, "completions/min_terminated_length": 797.0, "epoch": 0.1953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.339578777551651, "kl": 0.00044445088860811666, "learning_rate": 3.019779227044398e-06, "loss": 0.0, "num_tokens": 2594726.0, "reward": 0.05837273597717285, "reward_std": 0.653872549533844, "rewards/reward_function/mean": 0.05837273597717285, "rewards/reward_function/std": 0.653872549533844, "step": 50 }, { "completion_length": 872.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1108.0, "completions/max_terminated_length": 1108.0, "completions/mean_length": 872.25, "completions/mean_terminated_length": 872.25, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.19921875, "frac_reward_zero_std": 0.0, "grad_norm": 1.0404114723205566, "kl": 0.0005189829680602998, "learning_rate": 2.9341204441673267e-06, "loss": 0.0, "num_tokens": 2648544.0, "reward": 0.24048176407814026, "reward_std": 0.5014303922653198, "rewards/reward_function/mean": 0.24048176407814026, "rewards/reward_function/std": 0.5014303922653198, "step": 51 }, { "completion_length": 967.125, "completions/clipped_ratio": 0.0, "completions/max_length": 1227.0, "completions/max_terminated_length": 1227.0, "completions/mean_length": 967.125, "completions/mean_terminated_length": 967.125, "completions/min_length": 698.0, "completions/min_terminated_length": 698.0, "epoch": 0.203125, "frac_reward_zero_std": 0.0, "grad_norm": 0.29855239391326904, "kl": 0.0004989826556993648, "learning_rate": 2.847932752400164e-06, "loss": 0.0, "num_tokens": 2703121.0, "reward": -0.1780097782611847, "reward_std": 0.7012866139411926, "rewards/reward_function/mean": -0.1780097782611847, "rewards/reward_function/std": 0.7012866139411926, "step": 52 }, { "completion_length": 1005.625, "completions/clipped_ratio": 0.0, "completions/max_length": 1206.0, "completions/max_terminated_length": 1206.0, "completions/mean_length": 1005.625, "completions/mean_terminated_length": 1005.625, "completions/min_length": 864.0, "completions/min_terminated_length": 864.0, "epoch": 0.20703125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2760545015335083, "kl": 0.0004165990831097588, "learning_rate": 2.761321158169134e-06, "loss": 0.0, "num_tokens": 2759446.0, "reward": 0.24998876452445984, "reward_std": 0.505235493183136, "rewards/reward_function/mean": 0.24998876452445984, "rewards/reward_function/std": 0.5052355527877808, "step": 53 }, { "completion_length": 1015.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1532.0, "completions/max_terminated_length": 1532.0, "completions/mean_length": 1015.75, "completions/mean_terminated_length": 1015.75, "completions/min_length": 660.0, "completions/min_terminated_length": 660.0, "epoch": 0.2109375, "frac_reward_zero_std": 0.0, "grad_norm": 0.3195282220840454, "kl": 0.0004872979479841888, "learning_rate": 2.6743911843603134e-06, "loss": 0.0, "num_tokens": 2815852.0, "reward": -0.2854868173599243, "reward_std": 0.7638800740242004, "rewards/reward_function/mean": -0.2854868173599243, "rewards/reward_function/std": 0.7638801336288452, "step": 54 }, { "completion_length": 1141.125, "completions/clipped_ratio": 0.0, "completions/max_length": 1444.0, "completions/max_terminated_length": 1444.0, "completions/mean_length": 1141.125, "completions/mean_terminated_length": 1141.125, "completions/min_length": 883.0, "completions/min_terminated_length": 883.0, "epoch": 0.21484375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2491772174835205, "kl": 0.0004129245280637406, "learning_rate": 2.587248741756253e-06, "loss": 0.0, "num_tokens": 2873261.0, "reward": -0.18247561156749725, "reward_std": 0.7091330289840698, "rewards/reward_function/mean": -0.18247561156749725, "rewards/reward_function/std": 0.7091330885887146, "step": 55 }, { "completion_length": 1048.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1244.0, "completions/max_terminated_length": 1244.0, "completions/mean_length": 1048.75, "completions/mean_terminated_length": 1048.75, "completions/min_length": 811.0, "completions/min_terminated_length": 811.0, "epoch": 0.21875, "frac_reward_zero_std": 0.0, "grad_norm": 0.24889753758907318, "kl": 0.00038504171243403107, "learning_rate": 2.5e-06, "loss": 0.0, "num_tokens": 2929931.0, "reward": 0.25053682923316956, "reward_std": 0.5054035186767578, "rewards/reward_function/mean": 0.25053682923316956, "rewards/reward_function/std": 0.5054035186767578, "step": 56 }, { "completion_length": 763.375, "completions/clipped_ratio": 0.0, "completions/max_length": 1179.0, "completions/max_terminated_length": 1179.0, "completions/mean_length": 763.375, "completions/mean_terminated_length": 763.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.22265625, "frac_reward_zero_std": 0.0, "grad_norm": 1.4766223430633545, "kl": 0.0004478975461097434, "learning_rate": 2.4127512582437486e-06, "loss": 0.0, "num_tokens": 2981454.0, "reward": 0.09302115440368652, "reward_std": 0.6747010946273804, "rewards/reward_function/mean": 0.09302115440368652, "rewards/reward_function/std": 0.6747010946273804, "step": 57 }, { "completion_length": 868.125, "completions/clipped_ratio": 0.0, "completions/max_length": 1033.0, "completions/max_terminated_length": 1033.0, "completions/mean_length": 868.125, "completions/mean_terminated_length": 868.125, "completions/min_length": 728.0, "completions/min_terminated_length": 728.0, "epoch": 0.2265625, "frac_reward_zero_std": 0.0, "grad_norm": 0.3186846673488617, "kl": 0.0003867191553581506, "learning_rate": 2.325608815639687e-06, "loss": -0.0, "num_tokens": 3033815.0, "reward": 0.4501683712005615, "reward_std": 0.012572327628731728, "rewards/reward_function/mean": 0.4501683712005615, "rewards/reward_function/std": 0.012572328560054302, "step": 58 }, { "completion_length": 850.625, "completions/clipped_ratio": 0.0, "completions/max_length": 1053.0, "completions/max_terminated_length": 1053.0, "completions/mean_length": 850.625, "completions/mean_terminated_length": 850.625, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.23046875, "frac_reward_zero_std": 0.0, "grad_norm": 0.4074990153312683, "kl": 0.0004984224360669032, "learning_rate": 2.238678841830867e-06, "loss": 0.0, "num_tokens": 3086036.0, "reward": -0.2775750756263733, "reward_std": 0.772331714630127, "rewards/reward_function/mean": -0.2775750756263733, "rewards/reward_function/std": 0.7723317742347717, "step": 59 }, { "completion_length": 824.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1076.0, "completions/max_terminated_length": 1076.0, "completions/mean_length": 824.0, "completions/mean_terminated_length": 824.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.234375, "frac_reward_zero_std": 0.0, "grad_norm": 0.8724004030227661, "kl": 0.0008119349731714465, "learning_rate": 2.1520672475998374e-06, "loss": 0.0, "num_tokens": 3138044.0, "reward": -0.4572715163230896, "reward_std": 0.7490780353546143, "rewards/reward_function/mean": -0.4572715163230896, "rewards/reward_function/std": 0.7490780353546143, "step": 60 }, { "completion_length": 1087.375, "completions/clipped_ratio": 0.0, "completions/max_length": 1528.0, "completions/max_terminated_length": 1528.0, "completions/mean_length": 1087.375, "completions/mean_terminated_length": 1087.375, "completions/min_length": 905.0, "completions/min_terminated_length": 905.0, "epoch": 0.23828125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2991656959056854, "kl": 0.00046085825306363404, "learning_rate": 2.0658795558326745e-06, "loss": 0.0, "num_tokens": 3192767.0, "reward": -0.007034048438072205, "reward_std": 0.6521883606910706, "rewards/reward_function/mean": -0.007034048438072205, "rewards/reward_function/std": 0.6521883606910706, "step": 61 }, { "completion_length": 1113.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1320.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 1113.5, "completions/mean_terminated_length": 1113.5, "completions/min_length": 815.0, "completions/min_terminated_length": 815.0, "epoch": 0.2421875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2844128906726837, "kl": 0.0004267314507160336, "learning_rate": 1.9802207729556023e-06, "loss": 0.0, "num_tokens": 3247699.0, "reward": 0.25297480821609497, "reward_std": 0.506567656993866, "rewards/reward_function/mean": 0.25297480821609497, "rewards/reward_function/std": 0.506567656993866, "step": 62 }, { "completion_length": 977.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1705.0, "completions/max_terminated_length": 1705.0, "completions/mean_length": 977.5, "completions/mean_terminated_length": 977.5, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.24609375, "frac_reward_zero_std": 0.0, "grad_norm": 0.4520728886127472, "kl": 0.0006246675111469813, "learning_rate": 1.895195261000831e-06, "loss": 0.0, "num_tokens": 3301543.0, "reward": -0.12443238496780396, "reward_std": 0.7253407835960388, "rewards/reward_function/mean": -0.12443238496780396, "rewards/reward_function/std": 0.7253408432006836, "step": 63 }, { "completion_length": 1241.875, "completions/clipped_ratio": 0.0, "completions/max_length": 2091.0, "completions/max_terminated_length": 2091.0, "completions/mean_length": 1241.875, "completions/mean_terminated_length": 1241.875, "completions/min_length": 840.0, "completions/min_terminated_length": 840.0, "epoch": 0.25, "frac_reward_zero_std": 0.0, "grad_norm": 0.28819605708122253, "kl": 0.0004499929418670945, "learning_rate": 1.8109066104575023e-06, "loss": 0.0, "num_tokens": 3357502.0, "reward": -0.2883961498737335, "reward_std": 0.760738730430603, "rewards/reward_function/mean": -0.2883961498737335, "rewards/reward_function/std": 0.7607387900352478, "step": 64 }, { "completion_length": 969.875, "completions/clipped_ratio": 0.0, "completions/max_length": 1196.0, "completions/max_terminated_length": 1196.0, "completions/mean_length": 969.875, "completions/mean_terminated_length": 969.875, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "epoch": 0.25390625, "frac_reward_zero_std": 0.0, "grad_norm": 0.3114400804042816, "kl": 0.0005156347615411505, "learning_rate": 1.7274575140626318e-06, "loss": 0.0, "num_tokens": 3410565.0, "reward": 0.06768127530813217, "reward_std": 0.6590126156806946, "rewards/reward_function/mean": 0.06768127530813217, "rewards/reward_function/std": 0.6590126156806946, "step": 65 }, { "completion_length": 1037.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1212.0, "completions/max_terminated_length": 1212.0, "completions/mean_length": 1037.5, "completions/mean_terminated_length": 1037.5, "completions/min_length": 834.0, "completions/min_terminated_length": 834.0, "epoch": 0.2578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.28374701738357544, "kl": 0.0004491405125008896, "learning_rate": 1.6449496416858285e-06, "loss": 0.0, "num_tokens": 3464169.0, "reward": 0.28254395723342896, "reward_std": 0.27969422936439514, "rewards/reward_function/mean": 0.28254395723342896, "rewards/reward_function/std": 0.27969422936439514, "step": 66 }, { "completion_length": 945.125, "completions/clipped_ratio": 0.0, "completions/max_length": 1096.0, "completions/max_terminated_length": 1096.0, "completions/mean_length": 945.125, "completions/mean_terminated_length": 945.125, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "epoch": 0.26171875, "frac_reward_zero_std": 0.0, "grad_norm": 0.3035137951374054, "kl": 0.0005614978290395811, "learning_rate": 1.56348351646022e-06, "loss": 0.0, "num_tokens": 3517034.0, "reward": 0.06603994220495224, "reward_std": 0.6580838561058044, "rewards/reward_function/mean": 0.06603994220495224, "rewards/reward_function/std": 0.6580838561058044, "step": 67 }, { "completion_length": 889.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1034.0, "completions/max_terminated_length": 1034.0, "completions/mean_length": 889.25, "completions/mean_terminated_length": 889.25, "completions/min_length": 761.0, "completions/min_terminated_length": 761.0, "epoch": 0.265625, "frac_reward_zero_std": 0.0, "grad_norm": 0.31817343831062317, "kl": 0.0004019583066110499, "learning_rate": 1.4831583923105e-06, "loss": 0.0, "num_tokens": 3569452.0, "reward": -0.1899593323469162, "reward_std": 0.7019972801208496, "rewards/reward_function/mean": -0.1899593323469162, "rewards/reward_function/std": 0.7019972801208496, "step": 68 }, { "completion_length": 978.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1133.0, "completions/max_terminated_length": 1133.0, "completions/mean_length": 978.75, "completions/mean_terminated_length": 978.75, "completions/min_length": 788.0, "completions/min_terminated_length": 788.0, "epoch": 0.26953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2677120566368103, "kl": 0.000421192504290957, "learning_rate": 1.4040721330273063e-06, "loss": 0.0, "num_tokens": 3622058.0, "reward": 0.26337215304374695, "reward_std": 0.5106496810913086, "rewards/reward_function/mean": 0.26337215304374695, "rewards/reward_function/std": 0.5106497406959534, "step": 69 }, { "completion_length": 1027.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1741.0, "completions/max_terminated_length": 1741.0, "completions/mean_length": 1027.5, "completions/mean_terminated_length": 1027.5, "completions/min_length": 785.0, "completions/min_terminated_length": 785.0, "epoch": 0.2734375, "frac_reward_zero_std": 0.0, "grad_norm": 0.3545612096786499, "kl": 0.0004943152016494423, "learning_rate": 1.3263210930352737e-06, "loss": 0.0, "num_tokens": 3675054.0, "reward": 0.4391711354255676, "reward_std": 0.04140199348330498, "rewards/reward_function/mean": 0.4391711354255676, "rewards/reward_function/std": 0.041401997208595276, "step": 70 }, { "completion_length": 1002.125, "completions/clipped_ratio": 0.0, "completions/max_length": 1330.0, "completions/max_terminated_length": 1330.0, "completions/mean_length": 1002.125, "completions/mean_terminated_length": 1002.125, "completions/min_length": 812.0, "completions/min_terminated_length": 812.0, "epoch": 0.27734375, "frac_reward_zero_std": 0.0, "grad_norm": 0.31723299622535706, "kl": 0.0004918482736684382, "learning_rate": 1.2500000000000007e-06, "loss": 0.0, "num_tokens": 3727847.0, "reward": -0.09308577328920364, "reward_std": 0.751098096370697, "rewards/reward_function/mean": -0.09308577328920364, "rewards/reward_function/std": 0.7510981559753418, "step": 71 }, { "completion_length": 1050.375, "completions/clipped_ratio": 0.0, "completions/max_length": 1413.0, "completions/max_terminated_length": 1413.0, "completions/mean_length": 1050.375, "completions/mean_terminated_length": 1050.375, "completions/min_length": 815.0, "completions/min_terminated_length": 815.0, "epoch": 0.28125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2855725884437561, "kl": 0.00046525982179446146, "learning_rate": 1.1752018394169882e-06, "loss": 0.0, "num_tokens": 3781026.0, "reward": 0.2761775255203247, "reward_std": 0.5158948302268982, "rewards/reward_function/mean": 0.2761775255203247, "rewards/reward_function/std": 0.5158948302268982, "step": 72 }, { "completion_length": 1050.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1197.0, "completions/max_terminated_length": 1197.0, "completions/mean_length": 1050.0, "completions/mean_terminated_length": 1050.0, "completions/min_length": 909.0, "completions/min_terminated_length": 909.0, "epoch": 0.28515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.4394182562828064, "kl": 0.0005783203814644367, "learning_rate": 1.1020177413231334e-06, "loss": 0.0, "num_tokens": 3836474.0, "reward": 0.24491167068481445, "reward_std": 0.5031947493553162, "rewards/reward_function/mean": 0.24491167068481445, "rewards/reward_function/std": 0.5031947493553162, "step": 73 }, { "completion_length": 1046.375, "completions/clipped_ratio": 0.0, "completions/max_length": 1290.0, "completions/max_terminated_length": 1290.0, "completions/mean_length": 1046.375, "completions/mean_terminated_length": 1046.375, "completions/min_length": 837.0, "completions/min_terminated_length": 837.0, "epoch": 0.2890625, "frac_reward_zero_std": 0.0, "grad_norm": 0.3345458507537842, "kl": 0.0005103340663481504, "learning_rate": 1.0305368692688175e-06, "loss": 0.0, "num_tokens": 3891893.0, "reward": 0.18502177298069, "reward_std": 0.5084241628646851, "rewards/reward_function/mean": 0.18502177298069, "rewards/reward_function/std": 0.5084241628646851, "step": 74 }, { "completion_length": 1180.125, "completions/clipped_ratio": 0.0, "completions/max_length": 1387.0, "completions/max_terminated_length": 1387.0, "completions/mean_length": 1180.125, "completions/mean_terminated_length": 1180.125, "completions/min_length": 939.0, "completions/min_terminated_length": 939.0, "epoch": 0.29296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2869652509689331, "kl": 0.0005238868543528952, "learning_rate": 9.608463116858544e-07, "loss": 0.0, "num_tokens": 3948382.0, "reward": 0.4311676621437073, "reward_std": 0.01504568662494421, "rewards/reward_function/mean": 0.4311676621437073, "rewards/reward_function/std": 0.01504568662494421, "step": 75 }, { "completion_length": 1176.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1854.0, "completions/max_terminated_length": 1854.0, "completions/mean_length": 1176.0, "completions/mean_terminated_length": 1176.0, "completions/min_length": 870.0, "completions/min_terminated_length": 870.0, "epoch": 0.296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.28590667247772217, "kl": 0.000421482436649967, "learning_rate": 8.930309757836517e-07, "loss": 0.0, "num_tokens": 4004838.0, "reward": 0.37514999508857727, "reward_std": 0.18447871506214142, "rewards/reward_function/mean": 0.37514999508857727, "rewards/reward_function/std": 0.18447871506214142, "step": 76 }, { "completion_length": 836.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1274.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 836.25, "completions/mean_terminated_length": 836.25, "completions/min_length": 694.0, "completions/min_terminated_length": 694.0, "epoch": 0.30078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.3467836380004883, "kl": 0.0005228954396443442, "learning_rate": 8.271734841028553e-07, "loss": 0.0, "num_tokens": 4056272.0, "reward": 0.2929620146751404, "reward_std": 0.27145516872406006, "rewards/reward_function/mean": 0.2929620146751404, "rewards/reward_function/std": 0.27145513892173767, "step": 77 }, { "completion_length": 821.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1038.0, "completions/max_terminated_length": 1038.0, "completions/mean_length": 821.0, "completions/mean_terminated_length": 821.0, "completions/min_length": 692.0, "completions/min_terminated_length": 692.0, "epoch": 0.3046875, "frac_reward_zero_std": 0.0, "grad_norm": 0.3295641839504242, "kl": 0.0004961359809385613, "learning_rate": 7.633540738525066e-07, "loss": 0.0, "num_tokens": 4107584.0, "reward": -0.0900302305817604, "reward_std": 0.6223264932632446, "rewards/reward_function/mean": -0.0900302305817604, "rewards/reward_function/std": 0.6223265528678894, "step": 78 }, { "completion_length": 975.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1555.0, "completions/max_terminated_length": 1555.0, "completions/mean_length": 975.25, "completions/mean_terminated_length": 975.25, "completions/min_length": 711.0, "completions/min_terminated_length": 711.0, "epoch": 0.30859375, "frac_reward_zero_std": 0.0, "grad_norm": 0.3428508937358856, "kl": 0.0005168440329725854, "learning_rate": 7.016504991533727e-07, "loss": 0.0, "num_tokens": 4160130.0, "reward": 0.17976805567741394, "reward_std": 0.5253881216049194, "rewards/reward_function/mean": 0.17976805567741394, "rewards/reward_function/std": 0.5253881216049194, "step": 79 }, { "completion_length": 896.625, "completions/clipped_ratio": 0.0, "completions/max_length": 1078.0, "completions/max_terminated_length": 1078.0, "completions/mean_length": 896.625, "completions/mean_terminated_length": 896.625, "completions/min_length": 721.0, "completions/min_terminated_length": 721.0, "epoch": 0.3125, "frac_reward_zero_std": 0.0, "grad_norm": 0.33451908826828003, "kl": 0.0004913711964036338, "learning_rate": 6.421379363065142e-07, "loss": 0.0, "num_tokens": 4212047.0, "reward": 0.2575118839740753, "reward_std": 0.5082336068153381, "rewards/reward_function/mean": 0.2575118839740753, "rewards/reward_function/std": 0.5082336068153381, "step": 80 }, { "completion_length": 1186.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1439.0, "completions/max_terminated_length": 1439.0, "completions/mean_length": 1186.5, "completions/mean_terminated_length": 1186.5, "completions/min_length": 1046.0, "completions/min_terminated_length": 1046.0, "epoch": 0.31640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2777305543422699, "kl": 0.0004566588904708624, "learning_rate": 5.848888922025553e-07, "loss": 0.0, "num_tokens": 4270267.0, "reward": 0.08119191229343414, "reward_std": 0.6674035787582397, "rewards/reward_function/mean": 0.08119191229343414, "rewards/reward_function/std": 0.6674035787582397, "step": 81 }, { "completion_length": 1302.625, "completions/clipped_ratio": 0.0, "completions/max_length": 1983.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 1302.625, "completions/mean_terminated_length": 1302.625, "completions/min_length": 849.0, "completions/min_terminated_length": 849.0, "epoch": 0.3203125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2776745855808258, "kl": 0.0004434596048668027, "learning_rate": 5.299731159831953e-07, "loss": 0.0, "num_tokens": 4329416.0, "reward": -0.09772323071956635, "reward_std": 0.7471930980682373, "rewards/reward_function/mean": -0.09772323071956635, "rewards/reward_function/std": 0.7471930980682373, "step": 82 }, { "completion_length": 1093.875, "completions/clipped_ratio": 0.0, "completions/max_length": 1306.0, "completions/max_terminated_length": 1306.0, "completions/mean_length": 1093.875, "completions/mean_terminated_length": 1093.875, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "epoch": 0.32421875, "frac_reward_zero_std": 0.0, "grad_norm": 0.3191281259059906, "kl": 0.0004105157349840738, "learning_rate": 4.774575140626317e-07, "loss": 0.0, "num_tokens": 4386895.0, "reward": 0.256418377161026, "reward_std": 0.5078137516975403, "rewards/reward_function/mean": 0.256418377161026, "rewards/reward_function/std": 0.5078137516975403, "step": 83 }, { "completion_length": 1108.375, "completions/clipped_ratio": 0.0, "completions/max_length": 1572.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 1108.375, "completions/mean_terminated_length": 1108.375, "completions/min_length": 853.0, "completions/min_terminated_length": 853.0, "epoch": 0.328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.26989954710006714, "kl": 0.0005018782903789543, "learning_rate": 4.27406068612396e-07, "loss": 0.0, "num_tokens": 4444490.0, "reward": 0.12714649736881256, "reward_std": 0.5166128277778625, "rewards/reward_function/mean": 0.12714649736881256, "rewards/reward_function/std": 0.5166128277778625, "step": 84 }, { "completion_length": 1082.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1199.0, "completions/max_terminated_length": 1199.0, "completions/mean_length": 1082.25, "completions/mean_terminated_length": 1082.25, "completions/min_length": 984.0, "completions/min_terminated_length": 984.0, "epoch": 0.33203125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2745755612850189, "kl": 0.0005251698967185803, "learning_rate": 3.798797596089351e-07, "loss": 0.0, "num_tokens": 4502412.0, "reward": -0.08731149882078171, "reward_std": 0.7558506727218628, "rewards/reward_function/mean": -0.08731149882078171, "rewards/reward_function/std": 0.7558506727218628, "step": 85 }, { "completion_length": 960.625, "completions/clipped_ratio": 0.0, "completions/max_length": 1336.0, "completions/max_terminated_length": 1336.0, "completions/mean_length": 960.625, "completions/mean_terminated_length": 960.625, "completions/min_length": 536.0, "completions/min_terminated_length": 536.0, "epoch": 0.3359375, "frac_reward_zero_std": 0.0, "grad_norm": 0.3008907735347748, "kl": 0.0004591922843246721, "learning_rate": 3.3493649053890325e-07, "loss": 0.0, "num_tokens": 4559361.0, "reward": 0.1179749146103859, "reward_std": 0.5200047492980957, "rewards/reward_function/mean": 0.1179749146103859, "rewards/reward_function/std": 0.5200048089027405, "step": 86 }, { "completion_length": 1191.625, "completions/clipped_ratio": 0.0, "completions/max_length": 1353.0, "completions/max_terminated_length": 1353.0, "completions/mean_length": 1191.625, "completions/mean_terminated_length": 1191.625, "completions/min_length": 1014.0, "completions/min_terminated_length": 1014.0, "epoch": 0.33984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.29382893443107605, "kl": 0.0004918071499560028, "learning_rate": 2.9263101785268253e-07, "loss": 0.0, "num_tokens": 4618158.0, "reward": 0.10288041085004807, "reward_std": 0.680895984172821, "rewards/reward_function/mean": 0.10288041085004807, "rewards/reward_function/std": 0.680895984172821, "step": 87 }, { "completion_length": 1024.625, "completions/clipped_ratio": 0.0, "completions/max_length": 1330.0, "completions/max_terminated_length": 1330.0, "completions/mean_length": 1024.625, "completions/mean_terminated_length": 1024.625, "completions/min_length": 830.0, "completions/min_terminated_length": 830.0, "epoch": 0.34375, "frac_reward_zero_std": 0.0, "grad_norm": 0.30418917536735535, "kl": 0.0005036265138187446, "learning_rate": 2.53014884252083e-07, "loss": 0.0, "num_tokens": 4675619.0, "reward": 0.20726439356803894, "reward_std": 0.5138247013092041, "rewards/reward_function/mean": 0.20726439356803894, "rewards/reward_function/std": 0.5138247609138489, "step": 88 }, { "completion_length": 1076.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1415.0, "completions/max_terminated_length": 1415.0, "completions/mean_length": 1076.25, "completions/mean_terminated_length": 1076.25, "completions/min_length": 825.0, "completions/min_terminated_length": 825.0, "epoch": 0.34765625, "frac_reward_zero_std": 0.0, "grad_norm": 0.29263392090797424, "kl": 0.0005031488981330767, "learning_rate": 2.1613635589349756e-07, "loss": 0.0, "num_tokens": 4727813.0, "reward": 0.18039438128471375, "reward_std": 0.5106222033500671, "rewards/reward_function/mean": 0.18039438128471375, "rewards/reward_function/std": 0.5106222033500671, "step": 89 }, { "completion_length": 1055.375, "completions/clipped_ratio": 0.0, "completions/max_length": 1210.0, "completions/max_terminated_length": 1210.0, "completions/mean_length": 1055.375, "completions/mean_terminated_length": 1055.375, "completions/min_length": 936.0, "completions/min_terminated_length": 936.0, "epoch": 0.3515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.29214635491371155, "kl": 0.0004412992129800841, "learning_rate": 1.8204036358303173e-07, "loss": 0.0, "num_tokens": 4779840.0, "reward": 0.4273361563682556, "reward_std": 0.012010330334305763, "rewards/reward_function/mean": 0.4273361563682556, "rewards/reward_function/std": 0.012010332196950912, "step": 90 }, { "completion_length": 1136.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1483.0, "completions/max_terminated_length": 1483.0, "completions/mean_length": 1136.25, "completions/mean_terminated_length": 1136.25, "completions/min_length": 810.0, "completions/min_terminated_length": 810.0, "epoch": 0.35546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2611209750175476, "kl": 0.0004386235450510867, "learning_rate": 1.507684480352292e-07, "loss": 0.0, "num_tokens": 4832514.0, "reward": 0.24031642079353333, "reward_std": 0.5015574097633362, "rewards/reward_function/mean": 0.24031642079353333, "rewards/reward_function/std": 0.5015574097633362, "step": 91 }, { "completion_length": 1092.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1367.0, "completions/max_terminated_length": 1367.0, "completions/mean_length": 1092.25, "completions/mean_terminated_length": 1092.25, "completions/min_length": 843.0, "completions/min_terminated_length": 843.0, "epoch": 0.359375, "frac_reward_zero_std": 0.0, "grad_norm": 0.31895720958709717, "kl": 0.0005823632091050968, "learning_rate": 1.223587092621162e-07, "loss": 0.0, "num_tokens": 4884836.0, "reward": -0.11290903389453888, "reward_std": 0.7347157001495361, "rewards/reward_function/mean": -0.11290903389453888, "rewards/reward_function/std": 0.7347157001495361, "step": 92 }, { "completion_length": 1156.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1366.0, "completions/max_terminated_length": 1366.0, "completions/mean_length": 1156.5, "completions/mean_terminated_length": 1156.5, "completions/min_length": 944.0, "completions/min_terminated_length": 944.0, "epoch": 0.36328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.3020682632923126, "kl": 0.0004940428771078587, "learning_rate": 9.684576015420277e-08, "loss": 0.0, "num_tokens": 4941160.0, "reward": 0.06459739059209824, "reward_std": 0.6571158766746521, "rewards/reward_function/mean": 0.06459739059209824, "rewards/reward_function/std": 0.6571159362792969, "step": 93 }, { "completion_length": 1061.625, "completions/clipped_ratio": 0.0, "completions/max_length": 1237.0, "completions/max_terminated_length": 1237.0, "completions/mean_length": 1061.625, "completions/mean_terminated_length": 1061.625, "completions/min_length": 781.0, "completions/min_terminated_length": 781.0, "epoch": 0.3671875, "frac_reward_zero_std": 0.0, "grad_norm": 0.3015153706073761, "kl": 0.0005082947827759199, "learning_rate": 7.426068431000883e-08, "loss": 0.0, "num_tokens": 4996725.0, "reward": 0.2391453981399536, "reward_std": 0.5007839202880859, "rewards/reward_function/mean": 0.2391453981399536, "rewards/reward_function/std": 0.5007839798927307, "step": 94 }, { "completion_length": 947.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1387.0, "completions/max_terminated_length": 1387.0, "completions/mean_length": 947.75, "completions/mean_terminated_length": 947.75, "completions/min_length": 641.0, "completions/min_terminated_length": 641.0, "epoch": 0.37109375, "frac_reward_zero_std": 0.0, "grad_norm": 0.31657955050468445, "kl": 0.0004573179394355975, "learning_rate": 5.463099816548578e-08, "loss": 0.0, "num_tokens": 5051379.0, "reward": -0.019695594906806946, "reward_std": 0.638481855392456, "rewards/reward_function/mean": -0.019695594906806946, "rewards/reward_function/std": 0.6384819149971008, "step": 95 }, { "completion_length": 980.875, "completions/clipped_ratio": 0.0, "completions/max_length": 1112.0, "completions/max_terminated_length": 1112.0, "completions/mean_length": 980.875, "completions/mean_terminated_length": 980.875, "completions/min_length": 808.0, "completions/min_terminated_length": 808.0, "epoch": 0.375, "frac_reward_zero_std": 0.0, "grad_norm": 0.38886314630508423, "kl": 0.0004613845740095712, "learning_rate": 3.798061746947995e-08, "loss": -0.0, "num_tokens": 5106298.0, "reward": 0.41173434257507324, "reward_std": 0.008369507268071175, "rewards/reward_function/mean": 0.41173434257507324, "rewards/reward_function/std": 0.008369502611458302, "step": 96 }, { "completion_length": 1003.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1291.0, "completions/max_terminated_length": 1291.0, "completions/mean_length": 1003.5, "completions/mean_terminated_length": 1003.5, "completions/min_length": 668.0, "completions/min_terminated_length": 668.0, "epoch": 0.37890625, "frac_reward_zero_std": 0.0, "grad_norm": 0.3679444193840027, "kl": 0.00046288836892927065, "learning_rate": 2.4329828146074096e-08, "loss": 0.0, "num_tokens": 5146238.0, "reward": 0.18697933852672577, "reward_std": 0.5134840607643127, "rewards/reward_function/mean": 0.18697933852672577, "rewards/reward_function/std": 0.5134840607643127, "step": 97 }, { "completion_length": 1033.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1228.0, "completions/max_terminated_length": 1228.0, "completions/mean_length": 1033.0, "completions/mean_terminated_length": 1033.0, "completions/min_length": 853.0, "completions/min_terminated_length": 853.0, "epoch": 0.3828125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2726477086544037, "kl": 0.00043194100726395845, "learning_rate": 1.3695261579316776e-08, "loss": 0.0, "num_tokens": 5186414.0, "reward": 0.07853780686855316, "reward_std": 0.6657574772834778, "rewards/reward_function/mean": 0.07853780686855316, "rewards/reward_function/std": 0.6657574772834778, "step": 98 }, { "completion_length": 1069.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1659.0, "completions/max_terminated_length": 1659.0, "completions/mean_length": 1069.5, "completions/mean_terminated_length": 1069.5, "completions/min_length": 922.0, "completions/min_terminated_length": 922.0, "epoch": 0.38671875, "frac_reward_zero_std": 0.0, "grad_norm": 0.30219021439552307, "kl": 0.000492539482365828, "learning_rate": 6.089874350439507e-09, "loss": 0.0, "num_tokens": 5226882.0, "reward": 0.07734433561563492, "reward_std": 0.6650528311729431, "rewards/reward_function/mean": 0.07734433561563492, "rewards/reward_function/std": 0.6650528907775879, "step": 99 }, { "completion_length": 918.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1200.0, "completions/max_terminated_length": 1200.0, "completions/mean_length": 918.0, "completions/mean_terminated_length": 918.0, "completions/min_length": 573.0, "completions/min_terminated_length": 573.0, "epoch": 0.390625, "frac_reward_zero_std": 0.0, "grad_norm": 0.44854360818862915, "kl": 0.00044552871986525133, "learning_rate": 1.5229324522605949e-09, "loss": 0.0, "num_tokens": 5266138.0, "reward": 0.25066205859184265, "reward_std": 0.5054365992546082, "rewards/reward_function/mean": 0.25066205859184265, "rewards/reward_function/std": 0.5054366588592529, "step": 100 }, { "completion_length": 971.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1311.0, "completions/max_terminated_length": 1311.0, "completions/mean_length": 971.75, "completions/mean_terminated_length": 971.75, "completions/min_length": 562.0, "completions/min_terminated_length": 562.0, "epoch": 0.39453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.35067111253738403, "kl": 0.0005845300620421767, "learning_rate": 0.0, "loss": 0.0, "num_tokens": 5305824.0, "reward": -0.11205706000328064, "reward_std": 0.7353475689888, "rewards/reward_function/mean": -0.11205706000328064, "rewards/reward_function/std": 0.7353475689888, "step": 101 }, { "completion_length": 1009.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1224.0, "completions/max_terminated_length": 1224.0, "completions/mean_length": 1009.75, "completions/mean_terminated_length": 1009.75, "completions/min_length": 697.0, "completions/min_terminated_length": 697.0, "epoch": 0.3984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.33843955397605896, "kl": 0.0005709103206754662, "learning_rate": 2.8910861626005774e-06, "loss": 0.0, "num_tokens": 5352350.0, "reward": -0.10698533058166504, "reward_std": 0.739599883556366, "rewards/reward_function/mean": -0.10698533058166504, "rewards/reward_function/std": 0.7395999431610107, "step": 102 }, { "completion_length": 1043.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1264.0, "completions/max_terminated_length": 1264.0, "completions/mean_length": 1043.0, "completions/mean_terminated_length": 1043.0, "completions/min_length": 945.0, "completions/min_terminated_length": 945.0, "epoch": 0.40234375, "frac_reward_zero_std": 0.0, "grad_norm": 0.3096908628940582, "kl": 0.0005897778464714065, "learning_rate": 2.847932752400164e-06, "loss": 0.0, "num_tokens": 5392606.0, "reward": 0.248357892036438, "reward_std": 0.5045785307884216, "rewards/reward_function/mean": 0.248357892036438, "rewards/reward_function/std": 0.5045785307884216, "step": 103 }, { "completion_length": 1079.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1243.0, "completions/max_terminated_length": 1243.0, "completions/mean_length": 1079.75, "completions/mean_terminated_length": 1079.75, "completions/min_length": 848.0, "completions/min_terminated_length": 848.0, "epoch": 0.40625, "frac_reward_zero_std": 0.0, "grad_norm": 0.3232628107070923, "kl": 0.0005662211769958958, "learning_rate": 2.804673358512869e-06, "loss": 0.0, "num_tokens": 5439692.0, "reward": 0.24970494210720062, "reward_std": 0.505219578742981, "rewards/reward_function/mean": 0.24970494210720062, "rewards/reward_function/std": 0.5052196383476257, "step": 104 }, { "completion_length": 1118.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1456.0, "completions/max_terminated_length": 1456.0, "completions/mean_length": 1118.75, "completions/mean_terminated_length": 1118.75, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 0.41015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.3483864665031433, "kl": 0.0005037105802330188, "learning_rate": 2.761321158169134e-06, "loss": 0.0, "num_tokens": 5492746.0, "reward": -0.00016714632511138916, "reward_std": 0.64275723695755, "rewards/reward_function/mean": -0.00016714632511138916, "rewards/reward_function/std": 0.64275723695755, "step": 105 }, { "completion_length": 941.875, "completions/clipped_ratio": 0.0, "completions/max_length": 1185.0, "completions/max_terminated_length": 1185.0, "completions/mean_length": 941.875, "completions/mean_terminated_length": 941.875, "completions/min_length": 637.0, "completions/min_terminated_length": 637.0, "epoch": 0.4140625, "frac_reward_zero_std": 0.0, "grad_norm": 0.3742240071296692, "kl": 0.0005701555346604437, "learning_rate": 2.717889356869146e-06, "loss": 0.0, "num_tokens": 5549881.0, "reward": 0.2512204349040985, "reward_std": 0.505646288394928, "rewards/reward_function/mean": 0.2512204349040985, "rewards/reward_function/std": 0.5056463479995728, "step": 106 }, { "completion_length": 990.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1131.0, "completions/max_terminated_length": 1131.0, "completions/mean_length": 990.75, "completions/mean_terminated_length": 990.75, "completions/min_length": 906.0, "completions/min_terminated_length": 906.0, "epoch": 0.41796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2908116579055786, "kl": 0.0004926583642372862, "learning_rate": 2.6743911843603134e-06, "loss": 0.0, "num_tokens": 5601911.0, "reward": 0.0615215003490448, "reward_std": 0.6552067399024963, "rewards/reward_function/mean": 0.0615215003490448, "rewards/reward_function/std": 0.6552067399024963, "step": 107 }, { "completion_length": 1138.625, "completions/clipped_ratio": 0.0, "completions/max_length": 1816.0, "completions/max_terminated_length": 1816.0, "completions/mean_length": 1138.625, "completions/mean_terminated_length": 1138.625, "completions/min_length": 861.0, "completions/min_terminated_length": 861.0, "epoch": 0.421875, "frac_reward_zero_std": 0.0, "grad_norm": 0.26039204001426697, "kl": 0.0006796395464334637, "learning_rate": 2.6308398906073603e-06, "loss": 0.0, "num_tokens": 5660620.0, "reward": -0.8185928463935852, "reward_std": 0.5130969882011414, "rewards/reward_function/mean": -0.8185928463935852, "rewards/reward_function/std": 0.5130969882011414, "step": 108 }, { "completion_length": 1128.375, "completions/clipped_ratio": 0.0, "completions/max_length": 1260.0, "completions/max_terminated_length": 1260.0, "completions/mean_length": 1128.375, "completions/mean_terminated_length": 1128.375, "completions/min_length": 983.0, "completions/min_terminated_length": 983.0, "epoch": 0.42578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.28983625769615173, "kl": 0.000518458073202055, "learning_rate": 2.587248741756253e-06, "loss": 0.0, "num_tokens": 5713751.0, "reward": 0.09786057472229004, "reward_std": 0.520240306854248, "rewards/reward_function/mean": 0.09786057472229004, "rewards/reward_function/std": 0.520240306854248, "step": 109 }, { "completion_length": 980.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1334.0, "completions/max_terminated_length": 1334.0, "completions/mean_length": 980.5, "completions/mean_terminated_length": 980.5, "completions/min_length": 697.0, "completions/min_terminated_length": 697.0, "epoch": 0.4296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.3524278700351715, "kl": 0.0005516579622053541, "learning_rate": 2.543631016093209e-06, "loss": 0.0, "num_tokens": 5771195.0, "reward": 0.06997495889663696, "reward_std": 0.660901665687561, "rewards/reward_function/mean": 0.06997495889663696, "rewards/reward_function/std": 0.6609017252922058, "step": 110 }, { "completion_length": 996.125, "completions/clipped_ratio": 0.0, "completions/max_length": 1295.0, "completions/max_terminated_length": 1295.0, "completions/mean_length": 996.125, "completions/mean_terminated_length": 996.125, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.43359375, "frac_reward_zero_std": 0.0, "grad_norm": 0.3624061644077301, "kl": 0.000616533579886891, "learning_rate": 2.5e-06, "loss": 0.0, "num_tokens": 5823268.0, "reward": -0.29218366742134094, "reward_std": 0.7567039728164673, "rewards/reward_function/mean": -0.29218366742134094, "rewards/reward_function/std": 0.7567040324211121, "step": 111 }, { "completion_length": 993.625, "completions/clipped_ratio": 0.0, "completions/max_length": 1481.0, "completions/max_terminated_length": 1481.0, "completions/mean_length": 993.625, "completions/mean_terminated_length": 993.625, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "epoch": 0.4375, "frac_reward_zero_std": 0.0, "grad_norm": 0.3491348624229431, "kl": 0.0005375376276788302, "learning_rate": 2.4563689839067913e-06, "loss": 0.0, "num_tokens": 5880817.0, "reward": 0.12125939130783081, "reward_std": 0.5074965357780457, "rewards/reward_function/mean": 0.12125939130783081, "rewards/reward_function/std": 0.5074965953826904, "step": 112 }, { "completion_length": 1085.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1872.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 1085.5, "completions/mean_terminated_length": 1085.5, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "epoch": 0.44140625, "frac_reward_zero_std": 0.0, "grad_norm": 0.33136165142059326, "kl": 0.0005395807238528505, "learning_rate": 2.4127512582437486e-06, "loss": 0.0, "num_tokens": 5926349.0, "reward": -0.11549624800682068, "reward_std": 0.7324655055999756, "rewards/reward_function/mean": -0.11549624800682068, "rewards/reward_function/std": 0.7324654459953308, "step": 113 }, { "completion_length": 956.625, "completions/clipped_ratio": 0.0, "completions/max_length": 1136.0, "completions/max_terminated_length": 1136.0, "completions/mean_length": 956.625, "completions/mean_terminated_length": 956.625, "completions/min_length": 763.0, "completions/min_terminated_length": 763.0, "epoch": 0.4453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.325693815946579, "kl": 0.000503135692270007, "learning_rate": 2.3691601093926406e-06, "loss": 0.0, "num_tokens": 5966210.0, "reward": -0.006239533424377441, "reward_std": 0.6517221927642822, "rewards/reward_function/mean": -0.006239533424377441, "rewards/reward_function/std": 0.651722252368927, "step": 114 }, { "completion_length": 1024.625, "completions/clipped_ratio": 0.0, "completions/max_length": 1456.0, "completions/max_terminated_length": 1456.0, "completions/mean_length": 1024.625, "completions/mean_terminated_length": 1024.625, "completions/min_length": 864.0, "completions/min_terminated_length": 864.0, "epoch": 0.44921875, "frac_reward_zero_std": 0.0, "grad_norm": 0.3105911612510681, "kl": 0.0004542986207525246, "learning_rate": 2.325608815639687e-06, "loss": 0.0, "num_tokens": 6011255.0, "reward": 0.0608128160238266, "reward_std": 0.6547765731811523, "rewards/reward_function/mean": 0.0608128160238266, "rewards/reward_function/std": 0.6547765731811523, "step": 115 }, { "completion_length": 1240.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1532.0, "completions/max_terminated_length": 1532.0, "completions/mean_length": 1240.75, "completions/mean_terminated_length": 1240.75, "completions/min_length": 942.0, "completions/min_terminated_length": 942.0, "epoch": 0.453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.26683756709098816, "kl": 0.0005024799829698168, "learning_rate": 2.2821106431308546e-06, "loss": 0.0, "num_tokens": 6053389.0, "reward": 0.2592804431915283, "reward_std": 0.50887131690979, "rewards/reward_function/mean": 0.2592804431915283, "rewards/reward_function/std": 0.5088713765144348, "step": 116 }, { "completion_length": 1007.875, "completions/clipped_ratio": 0.0, "completions/max_length": 1154.0, "completions/max_terminated_length": 1154.0, "completions/mean_length": 1007.875, "completions/mean_terminated_length": 1007.875, "completions/min_length": 794.0, "completions/min_terminated_length": 794.0, "epoch": 0.45703125, "frac_reward_zero_std": 0.0, "grad_norm": 0.3248363137245178, "kl": 0.0005079211114207283, "learning_rate": 2.238678841830867e-06, "loss": 0.0, "num_tokens": 6098300.0, "reward": -0.29335445165634155, "reward_std": 0.7554491758346558, "rewards/reward_function/mean": -0.29335445165634155, "rewards/reward_function/std": 0.7554491758346558, "step": 117 }, { "completion_length": 1096.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1226.0, "completions/max_terminated_length": 1226.0, "completions/mean_length": 1096.25, "completions/mean_terminated_length": 1096.25, "completions/min_length": 902.0, "completions/min_terminated_length": 902.0, "epoch": 0.4609375, "frac_reward_zero_std": 0.0, "grad_norm": 0.32839903235435486, "kl": 0.0005834192561451346, "learning_rate": 2.195326641487132e-06, "loss": 0.0, "num_tokens": 6139278.0, "reward": -0.10875681042671204, "reward_std": 0.7381133437156677, "rewards/reward_function/mean": -0.10875681042671204, "rewards/reward_function/std": 0.7381134033203125, "step": 118 }, { "completion_length": 1068.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1261.0, "completions/max_terminated_length": 1261.0, "completions/mean_length": 1068.5, "completions/mean_terminated_length": 1068.5, "completions/min_length": 838.0, "completions/min_terminated_length": 838.0, "epoch": 0.46484375, "frac_reward_zero_std": 0.0, "grad_norm": 0.3054167628288269, "kl": 0.0005187875867704861, "learning_rate": 2.1520672475998374e-06, "loss": 0.0, "num_tokens": 6184674.0, "reward": -0.356624573469162, "reward_std": 0.7082850933074951, "rewards/reward_function/mean": -0.356624573469162, "rewards/reward_function/std": 0.7082850933074951, "step": 119 }, { "completion_length": 1191.875, "completions/clipped_ratio": 0.0, "completions/max_length": 1684.0, "completions/max_terminated_length": 1684.0, "completions/mean_length": 1191.875, "completions/mean_terminated_length": 1191.875, "completions/min_length": 929.0, "completions/min_terminated_length": 929.0, "epoch": 0.46875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2923835217952728, "kl": 0.0005189872026676312, "learning_rate": 2.1089138373994226e-06, "loss": 0.0, "num_tokens": 6226417.0, "reward": -0.10591146349906921, "reward_std": 0.7404077053070068, "rewards/reward_function/mean": -0.10591146349906921, "rewards/reward_function/std": 0.7404076457023621, "step": 120 }, { "completion_length": 987.125, "completions/clipped_ratio": 0.0, "completions/max_length": 1249.0, "completions/max_terminated_length": 1249.0, "completions/mean_length": 987.125, "completions/mean_terminated_length": 987.125, "completions/min_length": 757.0, "completions/min_terminated_length": 757.0, "epoch": 0.47265625, "frac_reward_zero_std": 0.0, "grad_norm": 0.31898999214172363, "kl": 0.0005351051659090444, "learning_rate": 2.0658795558326745e-06, "loss": 0.0, "num_tokens": 6273762.0, "reward": -0.007490452378988266, "reward_std": 0.645849347114563, "rewards/reward_function/mean": -0.007490452378988266, "rewards/reward_function/std": 0.645849347114563, "step": 121 }, { "completion_length": 1239.125, "completions/clipped_ratio": 0.0, "completions/max_length": 2190.0, "completions/max_terminated_length": 2190.0, "completions/mean_length": 1239.125, "completions/mean_terminated_length": 1239.125, "completions/min_length": 834.0, "completions/min_terminated_length": 834.0, "epoch": 0.4765625, "frac_reward_zero_std": 0.0, "grad_norm": 0.3210497498512268, "kl": 0.0005187717179069296, "learning_rate": 2.022977511558638e-06, "loss": 0.0, "num_tokens": 6332283.0, "reward": 0.05918067693710327, "reward_std": 0.6538223028182983, "rewards/reward_function/mean": 0.05918067693710327, "rewards/reward_function/std": 0.6538223624229431, "step": 122 }, { "completion_length": 1006.125, "completions/clipped_ratio": 0.0, "completions/max_length": 1189.0, "completions/max_terminated_length": 1189.0, "completions/mean_length": 1006.125, "completions/mean_terminated_length": 1006.125, "completions/min_length": 898.0, "completions/min_terminated_length": 898.0, "epoch": 0.48046875, "frac_reward_zero_std": 0.0, "grad_norm": 0.3229573667049408, "kl": 0.0005403026298154145, "learning_rate": 1.9802207729556023e-06, "loss": 0.0, "num_tokens": 6379780.0, "reward": 0.17201174795627594, "reward_std": 0.5175948143005371, "rewards/reward_function/mean": 0.17201174795627594, "rewards/reward_function/std": 0.5175948739051819, "step": 123 }, { "completion_length": 971.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1154.0, "completions/max_terminated_length": 1154.0, "completions/mean_length": 971.0, "completions/mean_terminated_length": 971.0, "completions/min_length": 845.0, "completions/min_terminated_length": 845.0, "epoch": 0.484375, "frac_reward_zero_std": 0.0, "grad_norm": 0.309548944234848, "kl": 0.00047888144035823643, "learning_rate": 1.937622364140338e-06, "loss": 0.0, "num_tokens": 6436156.0, "reward": -0.12267381697893143, "reward_std": 0.7265216708183289, "rewards/reward_function/mean": -0.12267381697893143, "rewards/reward_function/std": 0.7265217304229736, "step": 124 }, { "completion_length": 917.125, "completions/clipped_ratio": 0.0, "completions/max_length": 1127.0, "completions/max_terminated_length": 1127.0, "completions/mean_length": 917.125, "completions/mean_terminated_length": 917.125, "completions/min_length": 698.0, "completions/min_terminated_length": 698.0, "epoch": 0.48828125, "frac_reward_zero_std": 0.0, "grad_norm": 0.3436194658279419, "kl": 0.0005273657734505832, "learning_rate": 1.895195261000831e-06, "loss": 0.0, "num_tokens": 6482941.0, "reward": 0.06335102021694183, "reward_std": 0.6563363075256348, "rewards/reward_function/mean": 0.06335102021694183, "rewards/reward_function/std": 0.6563363075256348, "step": 125 }, { "completion_length": 981.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1088.0, "completions/max_terminated_length": 1088.0, "completions/mean_length": 981.25, "completions/mean_terminated_length": 981.25, "completions/min_length": 882.0, "completions/min_terminated_length": 882.0, "epoch": 0.4921875, "frac_reward_zero_std": 0.0, "grad_norm": 0.3238428235054016, "kl": 0.00048635365965310484, "learning_rate": 1.852952387243698e-06, "loss": 0.0, "num_tokens": 6539399.0, "reward": 0.4136182963848114, "reward_std": 0.0051859593950212, "rewards/reward_function/mean": 0.4136182963848114, "rewards/reward_function/std": 0.005185958929359913, "step": 126 }, { "completion_length": 980.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1164.0, "completions/max_terminated_length": 1164.0, "completions/mean_length": 980.25, "completions/mean_terminated_length": 980.25, "completions/min_length": 865.0, "completions/min_terminated_length": 865.0, "epoch": 0.49609375, "frac_reward_zero_std": 0.0, "grad_norm": 0.32173460721969604, "kl": 0.0005382613235269673, "learning_rate": 1.8109066104575023e-06, "loss": -0.0, "num_tokens": 6586689.0, "reward": 0.4277029037475586, "reward_std": 0.009604846127331257, "rewards/reward_function/mean": 0.4277029037475586, "rewards/reward_function/std": 0.00960485078394413, "step": 127 }, { "completion_length": 1130.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1473.0, "completions/max_terminated_length": 1473.0, "completions/mean_length": 1130.5, "completions/mean_terminated_length": 1130.5, "completions/min_length": 904.0, "completions/min_terminated_length": 904.0, "epoch": 0.5, "frac_reward_zero_std": 0.0, "grad_norm": 0.27207547426223755, "kl": 0.0005242059778538533, "learning_rate": 1.7690707381931585e-06, "loss": 0.0, "num_tokens": 6644341.0, "reward": -0.4683274030685425, "reward_std": 0.7338022589683533, "rewards/reward_function/mean": -0.4683274030685425, "rewards/reward_function/std": 0.7338022589683533, "step": 128 }, { "completion_length": 994.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1057.0, "completions/max_terminated_length": 1057.0, "completions/mean_length": 994.5, "completions/mean_terminated_length": 994.5, "completions/min_length": 866.0, "completions/min_terminated_length": 866.0, "epoch": 0.50390625, "frac_reward_zero_std": 0.0, "grad_norm": 0.30755338072776794, "kl": 0.0005628820799756795, "learning_rate": 1.7274575140626318e-06, "loss": 0.0, "num_tokens": 6698833.0, "reward": -0.016938716173171997, "reward_std": 0.645233154296875, "rewards/reward_function/mean": -0.016938716173171997, "rewards/reward_function/std": 0.6452332139015198, "step": 129 }, { "completion_length": 1191.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1623.0, "completions/max_terminated_length": 1623.0, "completions/mean_length": 1191.5, "completions/mean_terminated_length": 1191.5, "completions/min_length": 890.0, "completions/min_terminated_length": 890.0, "epoch": 0.5078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2911682426929474, "kl": 0.0005079254988231696, "learning_rate": 1.686079613857109e-06, "loss": 0.0, "num_tokens": 6755397.0, "reward": -0.10992002487182617, "reward_std": 0.7371261119842529, "rewards/reward_function/mean": -0.10992002487182617, "rewards/reward_function/std": 0.7371261119842529, "step": 130 }, { "completion_length": 929.375, "completions/clipped_ratio": 0.0, "completions/max_length": 1217.0, "completions/max_terminated_length": 1217.0, "completions/mean_length": 929.375, "completions/mean_terminated_length": 929.375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.51171875, "frac_reward_zero_std": 0.0, "grad_norm": 1.3324370384216309, "kl": 0.0007534601754741743, "learning_rate": 1.6449496416858285e-06, "loss": 0.0, "num_tokens": 6809368.0, "reward": 0.1685469150543213, "reward_std": 0.5241107940673828, "rewards/reward_function/mean": 0.1685469150543213, "rewards/reward_function/std": 0.5241108536720276, "step": 131 }, { "completion_length": 1025.625, "completions/clipped_ratio": 0.0, "completions/max_length": 1330.0, "completions/max_terminated_length": 1330.0, "completions/mean_length": 1025.625, "completions/mean_terminated_length": 1025.625, "completions/min_length": 854.0, "completions/min_terminated_length": 854.0, "epoch": 0.515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.34628283977508545, "kl": 0.0005288709871820174, "learning_rate": 1.6040801261367494e-06, "loss": 0.0, "num_tokens": 6864605.0, "reward": -0.10574899613857269, "reward_std": 0.7405577301979065, "rewards/reward_function/mean": -0.10574899613857269, "rewards/reward_function/std": 0.7405577898025513, "step": 132 }, { "completion_length": 1157.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1519.0, "completions/max_terminated_length": 1519.0, "completions/mean_length": 1157.75, "completions/mean_terminated_length": 1157.75, "completions/min_length": 886.0, "completions/min_terminated_length": 886.0, "epoch": 0.51953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.29528382420539856, "kl": 0.0005665956414304674, "learning_rate": 1.56348351646022e-06, "loss": 0.0, "num_tokens": 6920403.0, "reward": -0.111003577709198, "reward_std": 0.7361726760864258, "rewards/reward_function/mean": -0.111003577709198, "rewards/reward_function/std": 0.7361726760864258, "step": 133 }, { "completion_length": 1294.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2684.0, "completions/max_terminated_length": 2684.0, "completions/mean_length": 1294.25, "completions/mean_terminated_length": 1294.25, "completions/min_length": 824.0, "completions/min_terminated_length": 824.0, "epoch": 0.5234375, "frac_reward_zero_std": 0.0, "grad_norm": 0.19954568147659302, "kl": 0.0004810404570889659, "learning_rate": 1.5231721787768162e-06, "loss": 0.0, "num_tokens": 6977789.0, "reward": 0.2454666793346405, "reward_std": 0.503404974937439, "rewards/reward_function/mean": 0.2454666793346405, "rewards/reward_function/std": 0.503404974937439, "step": 134 }, { "completion_length": 1026.875, "completions/clipped_ratio": 0.0, "completions/max_length": 1272.0, "completions/max_terminated_length": 1272.0, "completions/mean_length": 1026.875, "completions/mean_terminated_length": 1026.875, "completions/min_length": 663.0, "completions/min_terminated_length": 663.0, "epoch": 0.52734375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2833687961101532, "kl": 0.00051848181465175, "learning_rate": 1.4831583923105e-06, "loss": 0.0, "num_tokens": 7032540.0, "reward": 0.23018378019332886, "reward_std": 0.4979630708694458, "rewards/reward_function/mean": 0.23018378019332886, "rewards/reward_function/std": 0.4979631304740906, "step": 135 }, { "completion_length": 1143.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2065.0, "completions/max_terminated_length": 2065.0, "completions/mean_length": 1143.5, "completions/mean_terminated_length": 1143.5, "completions/min_length": 831.0, "completions/min_terminated_length": 831.0, "epoch": 0.53125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2916565239429474, "kl": 0.0005820317019242793, "learning_rate": 1.443454345648252e-06, "loss": 0.0, "num_tokens": 7088720.0, "reward": 0.0656561329960823, "reward_std": 0.6580207943916321, "rewards/reward_function/mean": 0.0656561329960823, "rewards/reward_function/std": 0.6580208539962769, "step": 136 }, { "completion_length": 1176.875, "completions/clipped_ratio": 0.0, "completions/max_length": 1339.0, "completions/max_terminated_length": 1339.0, "completions/mean_length": 1176.875, "completions/mean_terminated_length": 1176.875, "completions/min_length": 1011.0, "completions/min_terminated_length": 1011.0, "epoch": 0.53515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.3021836280822754, "kl": 0.00046610408753622323, "learning_rate": 1.4040721330273063e-06, "loss": 0.0, "num_tokens": 7143431.0, "reward": -0.28037768602371216, "reward_std": 0.7693378329277039, "rewards/reward_function/mean": -0.28037768602371216, "rewards/reward_function/std": 0.7693378329277039, "step": 137 }, { "completion_length": 1035.125, "completions/clipped_ratio": 0.0, "completions/max_length": 1316.0, "completions/max_terminated_length": 1316.0, "completions/mean_length": 1035.125, "completions/mean_terminated_length": 1035.125, "completions/min_length": 822.0, "completions/min_terminated_length": 822.0, "epoch": 0.5390625, "frac_reward_zero_std": 0.0, "grad_norm": 0.32248759269714355, "kl": 0.00046888978249626234, "learning_rate": 1.3650237506511333e-06, "loss": 0.0, "num_tokens": 7194392.0, "reward": 0.250247597694397, "reward_std": 0.5053042769432068, "rewards/reward_function/mean": 0.250247597694397, "rewards/reward_function/std": 0.5053042769432068, "step": 138 }, { "completion_length": 1030.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1268.0, "completions/max_terminated_length": 1268.0, "completions/mean_length": 1030.25, "completions/mean_terminated_length": 1030.25, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "epoch": 0.54296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.3456158936023712, "kl": 0.0005144016322446987, "learning_rate": 1.3263210930352737e-06, "loss": 0.0, "num_tokens": 7247930.0, "reward": 0.2453567385673523, "reward_std": 0.503298282623291, "rewards/reward_function/mean": 0.2453567385673523, "rewards/reward_function/std": 0.503298282623291, "step": 139 }, { "completion_length": 1074.125, "completions/clipped_ratio": 0.0, "completions/max_length": 1220.0, "completions/max_terminated_length": 1220.0, "completions/mean_length": 1074.125, "completions/mean_terminated_length": 1074.125, "completions/min_length": 887.0, "completions/min_terminated_length": 887.0, "epoch": 0.546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2869755029678345, "kl": 0.00043533078132895753, "learning_rate": 1.2879759493841577e-06, "loss": 0.0, "num_tokens": 7299203.0, "reward": -0.10356661677360535, "reward_std": 0.7423522472381592, "rewards/reward_function/mean": -0.10356661677360535, "rewards/reward_function/std": 0.7423522472381592, "step": 140 }, { "completion_length": 1284.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1708.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 1284.0, "completions/mean_terminated_length": 1284.0, "completions/min_length": 1076.0, "completions/min_terminated_length": 1076.0, "epoch": 0.55078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2680242955684662, "kl": 0.000499433335789945, "learning_rate": 1.2500000000000007e-06, "loss": 0.0, "num_tokens": 7354771.0, "reward": 0.014156922698020935, "reward_std": 0.6485656499862671, "rewards/reward_function/mean": 0.014156922698020935, "rewards/reward_function/std": 0.6485656499862671, "step": 141 }, { "completion_length": 1086.625, "completions/clipped_ratio": 0.0, "completions/max_length": 1583.0, "completions/max_terminated_length": 1583.0, "completions/mean_length": 1086.625, "completions/mean_terminated_length": 1086.625, "completions/min_length": 876.0, "completions/min_terminated_length": 876.0, "epoch": 0.5546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.28385043144226074, "kl": 0.0004665546293836087, "learning_rate": 1.2124048127248644e-06, "loss": 0.0, "num_tokens": 7406144.0, "reward": 0.07084375619888306, "reward_std": 0.6610893607139587, "rewards/reward_function/mean": 0.07084375619888306, "rewards/reward_function/std": 0.661089301109314, "step": 142 }, { "completion_length": 1445.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1970.0, "completions/max_terminated_length": 1970.0, "completions/mean_length": 1445.0, "completions/mean_terminated_length": 1445.0, "completions/min_length": 1009.0, "completions/min_terminated_length": 1009.0, "epoch": 0.55859375, "frac_reward_zero_std": 0.0, "grad_norm": 0.27446597814559937, "kl": 0.0005598404095508158, "learning_rate": 1.1752018394169882e-06, "loss": 0.0, "num_tokens": 7463000.0, "reward": -0.09585624933242798, "reward_std": 0.7488118410110474, "rewards/reward_function/mean": -0.09585624933242798, "rewards/reward_function/std": 0.7488118410110474, "step": 143 }, { "completion_length": 1051.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1423.0, "completions/max_terminated_length": 1423.0, "completions/mean_length": 1051.5, "completions/mean_terminated_length": 1051.5, "completions/min_length": 884.0, "completions/min_terminated_length": 884.0, "epoch": 0.5625, "frac_reward_zero_std": 0.0, "grad_norm": 0.28686314821243286, "kl": 0.000490581318445038, "learning_rate": 1.1384024124624324e-06, "loss": 0.0, "num_tokens": 7514092.0, "reward": 0.25037020444869995, "reward_std": 0.5052564740180969, "rewards/reward_function/mean": 0.25037020444869995, "rewards/reward_function/std": 0.5052564740180969, "step": 144 }, { "completion_length": 910.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1172.0, "completions/max_terminated_length": 1172.0, "completions/mean_length": 910.25, "completions/mean_terminated_length": 910.25, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.56640625, "frac_reward_zero_std": 0.0, "grad_norm": 2.037623882293701, "kl": 0.0005943369615124539, "learning_rate": 1.1020177413231334e-06, "loss": 0.0, "num_tokens": 7570974.0, "reward": 0.25001391768455505, "reward_std": 0.5055700540542603, "rewards/reward_function/mean": 0.25001391768455505, "rewards/reward_function/std": 0.505570113658905, "step": 145 }, { "completion_length": 1040.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1460.0, "completions/max_terminated_length": 1460.0, "completions/mean_length": 1040.5, "completions/mean_terminated_length": 1040.5, "completions/min_length": 847.0, "completions/min_terminated_length": 847.0, "epoch": 0.5703125, "frac_reward_zero_std": 0.0, "grad_norm": 0.4468381106853485, "kl": 0.0005619311414193362, "learning_rate": 1.0660589091223854e-06, "loss": 0.0, "num_tokens": 7620194.0, "reward": 0.23401594161987305, "reward_std": 0.49918097257614136, "rewards/reward_function/mean": 0.23401594161987305, "rewards/reward_function/std": 0.49918097257614136, "step": 146 }, { "completion_length": 1247.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1814.0, "completions/max_terminated_length": 1814.0, "completions/mean_length": 1247.5, "completions/mean_terminated_length": 1247.5, "completions/min_length": 901.0, "completions/min_terminated_length": 901.0, "epoch": 0.57421875, "frac_reward_zero_std": 0.0, "grad_norm": 0.3102348744869232, "kl": 0.0004937511766911484, "learning_rate": 1.0305368692688175e-06, "loss": 0.0, "num_tokens": 7679774.0, "reward": 0.08763788640499115, "reward_std": 0.671554684638977, "rewards/reward_function/mean": 0.08763788640499115, "rewards/reward_function/std": 0.671554684638977, "step": 147 }, { "completion_length": 1088.375, "completions/clipped_ratio": 0.0, "completions/max_length": 1270.0, "completions/max_terminated_length": 1270.0, "completions/mean_length": 1088.375, "completions/mean_terminated_length": 1088.375, "completions/min_length": 955.0, "completions/min_terminated_length": 955.0, "epoch": 0.578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.3440247178077698, "kl": 0.000519426612299867, "learning_rate": 9.95462442119879e-07, "loss": 0.0, "num_tokens": 7729377.0, "reward": 0.004164457321166992, "reward_std": 0.6441723108291626, "rewards/reward_function/mean": 0.004164457321166992, "rewards/reward_function/std": 0.6441722512245178, "step": 148 }, { "completion_length": 1232.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1499.0, "completions/max_terminated_length": 1499.0, "completions/mean_length": 1232.0, "completions/mean_terminated_length": 1232.0, "completions/min_length": 906.0, "completions/min_terminated_length": 906.0, "epoch": 0.58203125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2699142098426819, "kl": 0.0005399520887294784, "learning_rate": 9.608463116858544e-07, "loss": 0.0, "num_tokens": 7788833.0, "reward": -0.1629893034696579, "reward_std": 0.7170795202255249, "rewards/reward_function/mean": -0.1629893034696579, "rewards/reward_function/std": 0.7170795202255249, "step": 149 }, { "completion_length": 1058.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1149.0, "completions/max_terminated_length": 1149.0, "completions/mean_length": 1058.25, "completions/mean_terminated_length": 1058.25, "completions/min_length": 867.0, "completions/min_terminated_length": 867.0, "epoch": 0.5859375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2994002103805542, "kl": 0.0004978677097824402, "learning_rate": 9.266990223754069e-07, "loss": 0.0, "num_tokens": 7838195.0, "reward": -0.6465986371040344, "reward_std": 0.6543768048286438, "rewards/reward_function/mean": -0.6465986371040344, "rewards/reward_function/std": 0.6543768048286438, "step": 150 }, { "completion_length": 1092.375, "completions/clipped_ratio": 0.0, "completions/max_length": 1344.0, "completions/max_terminated_length": 1344.0, "completions/mean_length": 1092.375, "completions/mean_terminated_length": 1092.375, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "epoch": 0.58984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.30443111062049866, "kl": 0.000516350322868675, "learning_rate": 8.930309757836517e-07, "loss": 0.0, "num_tokens": 7896534.0, "reward": 0.008587591350078583, "reward_std": 0.6500032544136047, "rewards/reward_function/mean": 0.008587591350078583, "rewards/reward_function/std": 0.6500033140182495, "step": 151 }, { "completion_length": 969.875, "completions/clipped_ratio": 0.0, "completions/max_length": 1327.0, "completions/max_terminated_length": 1327.0, "completions/mean_length": 969.875, "completions/mean_terminated_length": 969.875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.59375, "frac_reward_zero_std": 0.0, "grad_norm": 0.7537317276000977, "kl": 0.0005359335409593768, "learning_rate": 8.598524275237321e-07, "loss": 0.0, "num_tokens": 7945189.0, "reward": -0.24850904941558838, "reward_std": 0.6657283306121826, "rewards/reward_function/mean": -0.24850904941558838, "rewards/reward_function/std": 0.6657283306121826, "step": 152 }, { "completion_length": 1033.625, "completions/clipped_ratio": 0.0, "completions/max_length": 1727.0, "completions/max_terminated_length": 1727.0, "completions/mean_length": 1033.625, "completions/mean_terminated_length": 1033.625, "completions/min_length": 667.0, "completions/min_terminated_length": 667.0, "epoch": 0.59765625, "frac_reward_zero_std": 0.0, "grad_norm": 0.34412840008735657, "kl": 0.000547852658201009, "learning_rate": 8.271734841028553e-07, "loss": 0.0, "num_tokens": 7996642.0, "reward": 0.16972142457962036, "reward_std": 0.5201033353805542, "rewards/reward_function/mean": 0.16972142457962036, "rewards/reward_function/std": 0.5201033353805542, "step": 153 }, { "completion_length": 828.875, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 828.875, "completions/mean_terminated_length": 828.875, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "epoch": 0.6015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.3883880376815796, "kl": 0.0005664439522661269, "learning_rate": 7.950040998437541e-07, "loss": 0.0, "num_tokens": 8046737.0, "reward": -0.36253300309181213, "reward_std": 0.7088689208030701, "rewards/reward_function/mean": -0.36253300309181213, "rewards/reward_function/std": 0.7088689208030701, "step": 154 }, { "completion_length": 1078.375, "completions/clipped_ratio": 0.0, "completions/max_length": 1602.0, "completions/max_terminated_length": 1602.0, "completions/mean_length": 1078.375, "completions/mean_terminated_length": 1078.375, "completions/min_length": 809.0, "completions/min_terminated_length": 809.0, "epoch": 0.60546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.3525019586086273, "kl": 0.000516050225996878, "learning_rate": 7.633540738525066e-07, "loss": 0.0, "num_tokens": 8098548.0, "reward": 0.010962013155221939, "reward_std": 0.6493006348609924, "rewards/reward_function/mean": 0.010962013155221939, "rewards/reward_function/std": 0.6493006944656372, "step": 155 }, { "completion_length": 900.625, "completions/clipped_ratio": 0.0, "completions/max_length": 1039.0, "completions/max_terminated_length": 1039.0, "completions/mean_length": 900.625, "completions/mean_terminated_length": 900.625, "completions/min_length": 755.0, "completions/min_terminated_length": 755.0, "epoch": 0.609375, "frac_reward_zero_std": 0.0, "grad_norm": 0.4146842658519745, "kl": 0.0006649910937994719, "learning_rate": 7.322330470336314e-07, "loss": 0.0, "num_tokens": 8149217.0, "reward": 0.07593274116516113, "reward_std": 0.6641193628311157, "rewards/reward_function/mean": 0.07593274116516113, "rewards/reward_function/std": 0.6641193628311157, "step": 156 }, { "completion_length": 794.0, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 794.0, "completions/mean_terminated_length": 794.0, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.61328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.9788252711296082, "kl": 0.000908300731680356, "learning_rate": 7.016504991533727e-07, "loss": 0.0, "num_tokens": 8198753.0, "reward": -0.28955256938934326, "reward_std": 0.7595077753067017, "rewards/reward_function/mean": -0.28955256938934326, "rewards/reward_function/std": 0.7595077753067017, "step": 157 }, { "completion_length": 859.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 859.25, "completions/mean_terminated_length": 859.25, "completions/min_length": 712.0, "completions/min_terminated_length": 712.0, "epoch": 0.6171875, "frac_reward_zero_std": 0.0, "grad_norm": 0.3605237305164337, "kl": 0.0005935588560532779, "learning_rate": 6.716157459520739e-07, "loss": 0.0, "num_tokens": 8249091.0, "reward": 0.25031226873397827, "reward_std": 0.5052971839904785, "rewards/reward_function/mean": 0.25031226873397827, "rewards/reward_function/std": 0.5052971839904785, "step": 158 }, { "completion_length": 1081.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1559.0, "completions/max_terminated_length": 1559.0, "completions/mean_length": 1081.75, "completions/mean_terminated_length": 1081.75, "completions/min_length": 846.0, "completions/min_terminated_length": 846.0, "epoch": 0.62109375, "frac_reward_zero_std": 0.0, "grad_norm": 0.39315265417099, "kl": 0.0004596119179041125, "learning_rate": 6.421379363065142e-07, "loss": 0.0, "num_tokens": 8300929.0, "reward": 0.24795930087566376, "reward_std": 0.5042985677719116, "rewards/reward_function/mean": 0.24795930087566376, "rewards/reward_function/std": 0.5042985677719116, "step": 159 }, { "completion_length": 1107.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1957.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 1107.25, "completions/mean_terminated_length": 1107.25, "completions/min_length": 792.0, "completions/min_terminated_length": 792.0, "epoch": 0.625, "frac_reward_zero_std": 0.0, "grad_norm": 0.352717787027359, "kl": 0.0006316443686955608, "learning_rate": 6.1322604944307e-07, "loss": 0.0, "num_tokens": 8353251.0, "reward": -0.1729724407196045, "reward_std": 0.7113326787948608, "rewards/reward_function/mean": -0.1729724407196045, "rewards/reward_function/std": 0.7113326787948608, "step": 160 }, { "completion_length": 1091.875, "completions/clipped_ratio": 0.0, "completions/max_length": 1355.0, "completions/max_terminated_length": 1355.0, "completions/mean_length": 1091.875, "completions/mean_terminated_length": 1091.875, "completions/min_length": 734.0, "completions/min_terminated_length": 734.0, "epoch": 0.62890625, "frac_reward_zero_std": 0.0, "grad_norm": 0.303595632314682, "kl": 0.0004378617668407969, "learning_rate": 5.848888922025553e-07, "loss": 0.0, "num_tokens": 8406186.0, "reward": -0.46176353096961975, "reward_std": 0.7428471446037292, "rewards/reward_function/mean": -0.46176353096961975, "rewards/reward_function/std": 0.742847204208374, "step": 161 }, { "completion_length": 1162.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1402.0, "completions/max_terminated_length": 1402.0, "completions/mean_length": 1162.75, "completions/mean_terminated_length": 1162.75, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.6328125, "frac_reward_zero_std": 0.0, "grad_norm": 1.2087342739105225, "kl": 0.0008970491835498251, "learning_rate": 5.571350963575728e-07, "loss": 0.0, "num_tokens": 8459736.0, "reward": -0.175228551030159, "reward_std": 0.7207473516464233, "rewards/reward_function/mean": -0.175228551030159, "rewards/reward_function/std": 0.7207473516464233, "step": 162 }, { "completion_length": 1177.125, "completions/clipped_ratio": 0.0, "completions/max_length": 1390.0, "completions/max_terminated_length": 1390.0, "completions/mean_length": 1177.125, "completions/mean_terminated_length": 1177.125, "completions/min_length": 978.0, "completions/min_terminated_length": 978.0, "epoch": 0.63671875, "frac_reward_zero_std": 0.0, "grad_norm": 0.29434847831726074, "kl": 0.0004823799536097795, "learning_rate": 5.299731159831953e-07, "loss": 0.0, "num_tokens": 8513353.0, "reward": 0.08043806254863739, "reward_std": 0.6669427156448364, "rewards/reward_function/mean": 0.08043806254863739, "rewards/reward_function/std": 0.6669427156448364, "step": 163 }, { "completion_length": 1363.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1791.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 1363.25, "completions/mean_terminated_length": 1363.25, "completions/min_length": 1010.0, "completions/min_terminated_length": 1010.0, "epoch": 0.640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.27553999423980713, "kl": 0.0005040961696067825, "learning_rate": 5.034112248817685e-07, "loss": 0.0, "num_tokens": 8568507.0, "reward": 0.09454244375228882, "reward_std": 0.6757357120513916, "rewards/reward_function/mean": 0.09454244375228882, "rewards/reward_function/std": 0.6757358312606812, "step": 164 }, { "completion_length": 1147.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1400.0, "completions/max_terminated_length": 1400.0, "completions/mean_length": 1147.0, "completions/mean_terminated_length": 1147.0, "completions/min_length": 667.0, "completions/min_terminated_length": 667.0, "epoch": 0.64453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.31530848145484924, "kl": 0.0004901376814814284, "learning_rate": 4.774575140626317e-07, "loss": 0.0, "num_tokens": 8621883.0, "reward": -0.2817423939704895, "reward_std": 0.7679179310798645, "rewards/reward_function/mean": -0.2817423939704895, "rewards/reward_function/std": 0.7679178714752197, "step": 165 }, { "completion_length": 1058.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1364.0, "completions/max_terminated_length": 1364.0, "completions/mean_length": 1058.0, "completions/mean_terminated_length": 1058.0, "completions/min_length": 522.0, "completions/min_terminated_length": 522.0, "epoch": 0.6484375, "frac_reward_zero_std": 0.0, "grad_norm": 0.35542306303977966, "kl": 0.000599144957959652, "learning_rate": 4.5211988927752026e-07, "loss": 0.0, "num_tokens": 8674595.0, "reward": 0.014520317316055298, "reward_std": 0.6543052196502686, "rewards/reward_function/mean": 0.014520317316055298, "rewards/reward_function/std": 0.6543052196502686, "step": 166 }, { "completion_length": 1177.875, "completions/clipped_ratio": 0.0, "completions/max_length": 2078.0, "completions/max_terminated_length": 2078.0, "completions/mean_length": 1177.875, "completions/mean_terminated_length": 1177.875, "completions/min_length": 863.0, "completions/min_terminated_length": 863.0, "epoch": 0.65234375, "frac_reward_zero_std": 0.0, "grad_norm": 0.290801465511322, "kl": 0.000524831673828885, "learning_rate": 4.27406068612396e-07, "loss": 0.0, "num_tokens": 8728218.0, "reward": -0.28122541308403015, "reward_std": 0.7684498429298401, "rewards/reward_function/mean": -0.28122541308403015, "rewards/reward_function/std": 0.7684498429298401, "step": 167 }, { "completion_length": 1099.375, "completions/clipped_ratio": 0.0, "completions/max_length": 1303.0, "completions/max_terminated_length": 1303.0, "completions/mean_length": 1099.375, "completions/mean_terminated_length": 1099.375, "completions/min_length": 820.0, "completions/min_terminated_length": 820.0, "epoch": 0.65625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2773479223251343, "kl": 0.00045157810382079333, "learning_rate": 4.033235801364402e-07, "loss": 0.0, "num_tokens": 8781261.0, "reward": 0.0877399668097496, "reward_std": 0.671479344367981, "rewards/reward_function/mean": 0.0877399668097496, "rewards/reward_function/std": 0.671479344367981, "step": 168 }, { "completion_length": 1257.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2235.0, "completions/max_terminated_length": 2235.0, "completions/mean_length": 1257.25, "completions/mean_terminated_length": 1257.25, "completions/min_length": 939.0, "completions/min_terminated_length": 939.0, "epoch": 0.66015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.29819291830062866, "kl": 0.00048441492253914475, "learning_rate": 3.798797596089351e-07, "loss": 0.0, "num_tokens": 8838007.0, "reward": -0.043121978640556335, "reward_std": 0.630191445350647, "rewards/reward_function/mean": -0.043121978640556335, "rewards/reward_function/std": 0.630191445350647, "step": 169 }, { "completion_length": 1023.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1202.0, "completions/max_terminated_length": 1202.0, "completions/mean_length": 1023.0, "completions/mean_terminated_length": 1023.0, "completions/min_length": 802.0, "completions/min_terminated_length": 802.0, "epoch": 0.6640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.338838666677475, "kl": 0.000568984636629466, "learning_rate": 3.5708174824471947e-07, "loss": 0.0, "num_tokens": 8887223.0, "reward": 0.05965143442153931, "reward_std": 0.6547546982765198, "rewards/reward_function/mean": 0.05965143442153931, "rewards/reward_function/std": 0.654754638671875, "step": 170 }, { "completion_length": 1429.75, "completions/clipped_ratio": 0.125, "completions/max_length": 3500.0, "completions/max_terminated_length": 1337.0, "completions/mean_length": 1429.75, "completions/mean_terminated_length": 1134.0, "completions/min_length": 860.0, "completions/min_terminated_length": 860.0, "epoch": 0.66796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.27894267439842224, "kl": 0.0004727757041109726, "learning_rate": 3.3493649053890325e-07, "loss": 0.0, "num_tokens": 8945349.0, "reward": -0.09473268687725067, "reward_std": 0.7497490644454956, "rewards/reward_function/mean": -0.09473268687725067, "rewards/reward_function/std": 0.7497490048408508, "step": 171 }, { "completion_length": 1153.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1421.0, "completions/max_terminated_length": 1421.0, "completions/mean_length": 1153.25, "completions/mean_terminated_length": 1153.25, "completions/min_length": 959.0, "completions/min_terminated_length": 959.0, "epoch": 0.671875, "frac_reward_zero_std": 0.0, "grad_norm": 0.3112727403640747, "kl": 0.0005989774799672887, "learning_rate": 3.134507321515107e-07, "loss": 0.0, "num_tokens": 8995607.0, "reward": -0.17081062495708466, "reward_std": 0.7099243998527527, "rewards/reward_function/mean": -0.17081062495708466, "rewards/reward_function/std": 0.7099243998527527, "step": 172 }, { "completion_length": 1032.125, "completions/clipped_ratio": 0.0, "completions/max_length": 1514.0, "completions/max_terminated_length": 1514.0, "completions/mean_length": 1032.125, "completions/mean_terminated_length": 1032.125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.67578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.9038693308830261, "kl": 0.0006451614681282081, "learning_rate": 2.9263101785268253e-07, "loss": 0.0, "num_tokens": 9050552.0, "reward": 0.08472549915313721, "reward_std": 0.669560432434082, "rewards/reward_function/mean": 0.08472549915313721, "rewards/reward_function/std": 0.669560432434082, "step": 173 }, { "completion_length": 934.125, "completions/clipped_ratio": 0.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 934.125, "completions/mean_terminated_length": 934.125, "completions/min_length": 730.0, "completions/min_terminated_length": 730.0, "epoch": 0.6796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.3780133128166199, "kl": 0.0005501867926795967, "learning_rate": 2.7248368952908055e-07, "loss": 0.0, "num_tokens": 9099057.0, "reward": 0.24916215240955353, "reward_std": 0.5048820376396179, "rewards/reward_function/mean": 0.24916215240955353, "rewards/reward_function/std": 0.5048820972442627, "step": 174 }, { "completion_length": 1211.375, "completions/clipped_ratio": 0.0, "completions/max_length": 1936.0, "completions/max_terminated_length": 1936.0, "completions/mean_length": 1211.375, "completions/mean_terminated_length": 1211.375, "completions/min_length": 953.0, "completions/min_terminated_length": 953.0, "epoch": 0.68359375, "frac_reward_zero_std": 0.0, "grad_norm": 0.36097532510757446, "kl": 0.0005826732158311643, "learning_rate": 2.53014884252083e-07, "loss": 0.0, "num_tokens": 9155436.0, "reward": 0.2617029845714569, "reward_std": 0.5099496245384216, "rewards/reward_function/mean": 0.2617029845714569, "rewards/reward_function/std": 0.5099496245384216, "step": 175 }, { "completion_length": 1103.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1672.0, "completions/max_terminated_length": 1672.0, "completions/mean_length": 1103.25, "completions/mean_terminated_length": 1103.25, "completions/min_length": 870.0, "completions/min_terminated_length": 870.0, "epoch": 0.6875, "frac_reward_zero_std": 0.0, "grad_norm": 0.30660855770111084, "kl": 0.000493127474328503, "learning_rate": 2.3423053240837518e-07, "loss": 0.0, "num_tokens": 9205294.0, "reward": -0.10572409629821777, "reward_std": 0.7405680418014526, "rewards/reward_function/mean": -0.10572409629821777, "rewards/reward_function/std": 0.7405680418014526, "step": 176 }, { "completion_length": 964.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1215.0, "completions/max_terminated_length": 1215.0, "completions/mean_length": 964.5, "completions/mean_terminated_length": 964.5, "completions/min_length": 851.0, "completions/min_terminated_length": 851.0, "epoch": 0.69140625, "frac_reward_zero_std": 0.0, "grad_norm": 0.3269523084163666, "kl": 0.00049879898870131, "learning_rate": 2.1613635589349756e-07, "loss": 0.0, "num_tokens": 9261322.0, "reward": -0.4611111581325531, "reward_std": 0.743747889995575, "rewards/reward_function/mean": -0.4611111581325531, "rewards/reward_function/std": 0.743747889995575, "step": 177 }, { "completion_length": 1027.625, "completions/clipped_ratio": 0.0, "completions/max_length": 1438.0, "completions/max_terminated_length": 1438.0, "completions/mean_length": 1027.625, "completions/mean_terminated_length": 1027.625, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.6953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.5114771127700806, "kl": 0.0005179118452360854, "learning_rate": 1.9873786636889908e-07, "loss": 0.0, "num_tokens": 9318415.0, "reward": 0.2489490509033203, "reward_std": 0.5048139095306396, "rewards/reward_function/mean": 0.2489490509033203, "rewards/reward_function/std": 0.5048139691352844, "step": 178 }, { "completion_length": 1043.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2555.0, "completions/max_terminated_length": 2555.0, "completions/mean_length": 1043.5, "completions/mean_terminated_length": 1043.5, "completions/min_length": 565.0, "completions/min_terminated_length": 565.0, "epoch": 0.69921875, "frac_reward_zero_std": 0.0, "grad_norm": 0.339715838432312, "kl": 0.0005509828915819526, "learning_rate": 1.8204036358303173e-07, "loss": 0.0, "num_tokens": 9375075.0, "reward": -0.17033275961875916, "reward_std": 0.7077411413192749, "rewards/reward_function/mean": -0.17033275961875916, "rewards/reward_function/std": 0.7077411413192749, "step": 179 }, { "completion_length": 1009.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1202.0, "completions/max_terminated_length": 1202.0, "completions/mean_length": 1009.5, "completions/mean_terminated_length": 1009.5, "completions/min_length": 620.0, "completions/min_terminated_length": 620.0, "epoch": 0.703125, "frac_reward_zero_std": 0.0, "grad_norm": 0.38060900568962097, "kl": 0.0005021176039008424, "learning_rate": 1.6604893375699594e-07, "loss": 0.0, "num_tokens": 9432023.0, "reward": 0.24431279301643372, "reward_std": 0.5027908682823181, "rewards/reward_function/mean": 0.24431279301643372, "rewards/reward_function/std": 0.5027908682823181, "step": 180 }, { "completion_length": 921.375, "completions/clipped_ratio": 0.0, "completions/max_length": 1216.0, "completions/max_terminated_length": 1216.0, "completions/mean_length": 921.375, "completions/mean_terminated_length": 921.375, "completions/min_length": 742.0, "completions/min_terminated_length": 742.0, "epoch": 0.70703125, "frac_reward_zero_std": 0.0, "grad_norm": 0.405112624168396, "kl": 0.0005638250586343929, "learning_rate": 1.507684480352292e-07, "loss": 0.0, "num_tokens": 9487706.0, "reward": 0.2474900782108307, "reward_std": 0.5042015910148621, "rewards/reward_function/mean": 0.2474900782108307, "rewards/reward_function/std": 0.5042015910148621, "step": 181 }, { "completion_length": 1200.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2126.0, "completions/max_terminated_length": 2126.0, "completions/mean_length": 1200.25, "completions/mean_terminated_length": 1200.25, "completions/min_length": 815.0, "completions/min_terminated_length": 815.0, "epoch": 0.7109375, "frac_reward_zero_std": 0.0, "grad_norm": 0.3271883726119995, "kl": 0.0006362074636854231, "learning_rate": 1.362035610017079e-07, "loss": 0.0, "num_tokens": 9546180.0, "reward": 0.05625756084918976, "reward_std": 0.6525231599807739, "rewards/reward_function/mean": 0.05625756084918976, "rewards/reward_function/std": 0.6525231599807739, "step": 182 }, { "completion_length": 926.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1297.0, "completions/max_terminated_length": 1297.0, "completions/mean_length": 926.0, "completions/mean_terminated_length": 926.0, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 0.71484375, "frac_reward_zero_std": 0.0, "grad_norm": 0.34687018394470215, "kl": 0.0005171666198293678, "learning_rate": 1.223587092621162e-07, "loss": 0.0, "num_tokens": 9601900.0, "reward": 0.0704411044716835, "reward_std": 0.6608877182006836, "rewards/reward_function/mean": 0.0704411044716835, "rewards/reward_function/std": 0.6608877182006836, "step": 183 }, { "completion_length": 1005.125, "completions/clipped_ratio": 0.0, "completions/max_length": 1611.0, "completions/max_terminated_length": 1611.0, "completions/mean_length": 1005.125, "completions/mean_terminated_length": 1005.125, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 0.71875, "frac_reward_zero_std": 0.0, "grad_norm": 0.35485363006591797, "kl": 0.0004917978003504686, "learning_rate": 1.0923811009241142e-07, "loss": 0.0, "num_tokens": 9658813.0, "reward": 0.2492428421974182, "reward_std": 0.5048900842666626, "rewards/reward_function/mean": 0.2492428421974182, "rewards/reward_function/std": 0.5048900842666626, "step": 184 }, { "completion_length": 990.625, "completions/clipped_ratio": 0.0, "completions/max_length": 1283.0, "completions/max_terminated_length": 1283.0, "completions/mean_length": 990.625, "completions/mean_terminated_length": 990.625, "completions/min_length": 751.0, "completions/min_terminated_length": 751.0, "epoch": 0.72265625, "frac_reward_zero_std": 0.0, "grad_norm": 0.36808741092681885, "kl": 0.0005794434182462282, "learning_rate": 9.684576015420277e-08, "loss": 0.0, "num_tokens": 9708434.0, "reward": -0.28416281938552856, "reward_std": 0.7652737498283386, "rewards/reward_function/mean": -0.28416281938552856, "rewards/reward_function/std": 0.7652737498283386, "step": 185 }, { "completion_length": 1321.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2448.0, "completions/max_terminated_length": 2448.0, "completions/mean_length": 1321.0, "completions/mean_terminated_length": 1321.0, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "epoch": 0.7265625, "frac_reward_zero_std": 0.0, "grad_norm": 0.34419044852256775, "kl": 0.000558189676667098, "learning_rate": 8.518543427732951e-08, "loss": 0.0, "num_tokens": 9762722.0, "reward": -0.2857913076877594, "reward_std": 0.7635413408279419, "rewards/reward_function/mean": -0.2857913076877594, "rewards/reward_function/std": 0.7635413408279419, "step": 186 }, { "completion_length": 1260.375, "completions/clipped_ratio": 0.0, "completions/max_length": 1672.0, "completions/max_terminated_length": 1672.0, "completions/mean_length": 1260.375, "completions/mean_terminated_length": 1260.375, "completions/min_length": 913.0, "completions/min_terminated_length": 913.0, "epoch": 0.73046875, "frac_reward_zero_std": 0.0, "grad_norm": 0.3106219470500946, "kl": 0.000590090683544986, "learning_rate": 7.426068431000883e-08, "loss": 0.0, "num_tokens": 9814501.0, "reward": -0.10060018301010132, "reward_std": 0.7447962164878845, "rewards/reward_function/mean": -0.10060018301010132, "rewards/reward_function/std": 0.7447961568832397, "step": 187 }, { "completion_length": 883.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1120.0, "completions/max_terminated_length": 1120.0, "completions/mean_length": 883.25, "completions/mean_terminated_length": 883.25, "completions/min_length": 576.0, "completions/min_terminated_length": 576.0, "epoch": 0.734375, "frac_reward_zero_std": 0.0, "grad_norm": 0.4308620095252991, "kl": 0.0005764737070421688, "learning_rate": 6.407483803691216e-08, "loss": 0.0, "num_tokens": 9865287.0, "reward": -0.11281464993953705, "reward_std": 0.7346830368041992, "rewards/reward_function/mean": -0.11281464993953705, "rewards/reward_function/std": 0.7346829771995544, "step": 188 }, { "completion_length": 987.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1191.0, "completions/max_terminated_length": 1191.0, "completions/mean_length": 987.0, "completions/mean_terminated_length": 987.0, "completions/min_length": 773.0, "completions/min_terminated_length": 773.0, "epoch": 0.73828125, "frac_reward_zero_std": 0.0, "grad_norm": 0.34522199630737305, "kl": 0.0005745336093241349, "learning_rate": 5.463099816548578e-08, "loss": -0.0, "num_tokens": 9914879.0, "reward": 0.43888330459594727, "reward_std": 0.013809502124786377, "rewards/reward_function/mean": 0.43888330459594727, "rewards/reward_function/std": 0.013809490017592907, "step": 189 }, { "completion_length": 1093.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1639.0, "completions/max_terminated_length": 1639.0, "completions/mean_length": 1093.0, "completions/mean_terminated_length": 1093.0, "completions/min_length": 577.0, "completions/min_terminated_length": 577.0, "epoch": 0.7421875, "frac_reward_zero_std": 0.0, "grad_norm": 0.33963632583618164, "kl": 0.0006164438163978048, "learning_rate": 4.593204138084006e-08, "loss": 0.0, "num_tokens": 9967343.0, "reward": -0.4747491180896759, "reward_std": 0.7252565622329712, "rewards/reward_function/mean": -0.4747491180896759, "rewards/reward_function/std": 0.725256621837616, "step": 190 }, { "completion_length": 974.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1168.0, "completions/max_terminated_length": 1168.0, "completions/mean_length": 974.0, "completions/mean_terminated_length": 974.0, "completions/min_length": 856.0, "completions/min_terminated_length": 856.0, "epoch": 0.74609375, "frac_reward_zero_std": 0.0, "grad_norm": 0.33787915110588074, "kl": 0.0005020303433411755, "learning_rate": 3.798061746947995e-08, "loss": 0.0, "num_tokens": 10016831.0, "reward": 0.07625642418861389, "reward_std": 0.6643387675285339, "rewards/reward_function/mean": 0.07625642418861389, "rewards/reward_function/std": 0.6643387675285339, "step": 191 }, { "completion_length": 1223.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1915.0, "completions/max_terminated_length": 1915.0, "completions/mean_length": 1223.5, "completions/mean_terminated_length": 1223.5, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.75, "frac_reward_zero_std": 0.0, "grad_norm": 0.3466210961341858, "kl": 0.0006273999460972846, "learning_rate": 3.077914851215585e-08, "loss": 0.0, "num_tokens": 10070339.0, "reward": -0.4622008502483368, "reward_std": 0.7422569990158081, "rewards/reward_function/mean": -0.4622008502483368, "rewards/reward_function/std": 0.7422570586204529, "step": 192 }, { "completion_length": 950.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1292.0, "completions/max_terminated_length": 1292.0, "completions/mean_length": 950.75, "completions/mean_terminated_length": 950.75, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 0.75390625, "frac_reward_zero_std": 0.0, "grad_norm": 0.3800007402896881, "kl": 0.0005864633276360109, "learning_rate": 2.4329828146074096e-08, "loss": 0.0, "num_tokens": 10124977.0, "reward": -0.28742432594299316, "reward_std": 0.76183021068573, "rewards/reward_function/mean": -0.28742432594299316, "rewards/reward_function/std": 0.7618302702903748, "step": 193 }, { "completion_length": 977.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1252.0, "completions/max_terminated_length": 1252.0, "completions/mean_length": 977.0, "completions/mean_terminated_length": 977.0, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.7578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.550014317035675, "kl": 0.0005290872577461414, "learning_rate": 1.8634620896695044e-08, "loss": 0.0, "num_tokens": 10175113.0, "reward": 0.24153178930282593, "reward_std": 0.5017148852348328, "rewards/reward_function/mean": 0.24153178930282593, "rewards/reward_function/std": 0.5017149448394775, "step": 194 }, { "completion_length": 1148.125, "completions/clipped_ratio": 0.0, "completions/max_length": 1667.0, "completions/max_terminated_length": 1667.0, "completions/mean_length": 1148.125, "completions/mean_terminated_length": 1148.125, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "epoch": 0.76171875, "frac_reward_zero_std": 0.0, "grad_norm": 0.3343985378742218, "kl": 0.0005916821173741482, "learning_rate": 1.3695261579316776e-08, "loss": 0.0, "num_tokens": 10231330.0, "reward": 0.09651876986026764, "reward_std": 0.511390209197998, "rewards/reward_function/mean": 0.09651876986026764, "rewards/reward_function/std": 0.511390209197998, "step": 195 }, { "completion_length": 1165.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1567.0, "completions/max_terminated_length": 1567.0, "completions/mean_length": 1165.5, "completions/mean_terminated_length": 1165.5, "completions/min_length": 1004.0, "completions/min_terminated_length": 1004.0, "epoch": 0.765625, "frac_reward_zero_std": 0.0, "grad_norm": 0.33010783791542053, "kl": 0.0005538033583434299, "learning_rate": 9.513254770636138e-09, "loss": 0.0, "num_tokens": 10282974.0, "reward": -0.1761539876461029, "reward_std": 0.7018446922302246, "rewards/reward_function/mean": -0.1761539876461029, "rewards/reward_function/std": 0.7018447518348694, "step": 196 }, { "completion_length": 1035.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1474.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 1035.0, "completions/mean_terminated_length": 1035.0, "completions/min_length": 764.0, "completions/min_terminated_length": 764.0, "epoch": 0.76953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.27371305227279663, "kl": 0.000488734214741271, "learning_rate": 6.089874350439507e-09, "loss": -0.0, "num_tokens": 10338286.0, "reward": 0.4277012050151825, "reward_std": 0.013616181910037994, "rewards/reward_function/mean": 0.4277012050151825, "rewards/reward_function/std": 0.013616186566650867, "step": 197 }, { "completion_length": 888.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1162.0, "completions/max_terminated_length": 1162.0, "completions/mean_length": 888.75, "completions/mean_terminated_length": 888.75, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.7734375, "frac_reward_zero_std": 0.0, "grad_norm": 0.8447083830833435, "kl": 0.0005265605868771672, "learning_rate": 3.4261631135654174e-09, "loss": 0.0, "num_tokens": 10387716.0, "reward": 0.40771353244781494, "reward_std": 0.02116406336426735, "rewards/reward_function/mean": 0.40771353244781494, "rewards/reward_function/std": 0.02116405963897705, "step": 198 }, { "completion_length": 1005.875, "completions/clipped_ratio": 0.0, "completions/max_length": 1336.0, "completions/max_terminated_length": 1336.0, "completions/mean_length": 1005.875, "completions/mean_terminated_length": 1005.875, "completions/min_length": 863.0, "completions/min_terminated_length": 863.0, "epoch": 0.77734375, "frac_reward_zero_std": 0.0, "grad_norm": 0.3126639127731323, "kl": 0.0005563631129916757, "learning_rate": 1.5229324522605949e-09, "loss": 0.0, "num_tokens": 10442795.0, "reward": 0.24751342833042145, "reward_std": 0.5041323304176331, "rewards/reward_function/mean": 0.24751342833042145, "rewards/reward_function/std": 0.5041323304176331, "step": 199 }, { "completion_length": 950.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1109.0, "completions/max_terminated_length": 1109.0, "completions/mean_length": 950.5, "completions/mean_terminated_length": 950.5, "completions/min_length": 681.0, "completions/min_terminated_length": 681.0, "epoch": 0.78125, "frac_reward_zero_std": 0.0, "grad_norm": 0.40675655007362366, "kl": 0.0006997519376454875, "learning_rate": 3.8076210902182607e-10, "loss": 0.0, "num_tokens": 10492719.0, "reward": -0.18069900572299957, "reward_std": 0.7014786601066589, "rewards/reward_function/mean": -0.18069900572299957, "rewards/reward_function/std": 0.7014786601066589, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 10492719, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }