{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.998452810727179, "eval_steps": 500, "global_step": 242, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4308035714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4077.0, "completions/mean_length": 2700.8662109375, "completions/mean_terminated_length": 1644.9412841796875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.0041258380608561115, "grad_norm": 0.1046895682811737, "kl": 0.0, "learning_rate": 1e-06, "loss": -0.0074, "num_tokens": 1477661.0, "reward": 0.0424107164144516, "reward_std": 0.08185788989067078, "rewards/code_format_reward/mean": 0.03125, "rewards/code_format_reward/std": 0.17418713867664337, "rewards/curriculum_aware_reward_fn/mean": 0.01116071455180645, "rewards/curriculum_aware_reward_fn/std": 0.0543251559138298, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4055.0, "completions/mean_length": 2686.825927734375, "completions/mean_terminated_length": 1722.6541748046875, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.008251676121712223, "grad_norm": 0.14168378710746765, "kl": 0.0005257129669189453, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 2931567.0, "reward": 0.0463169664144516, "reward_std": 0.09716739505529404, "rewards/code_format_reward/mean": 0.02678571455180645, "rewards/code_format_reward/std": 0.1616371124982834, "rewards/curriculum_aware_reward_fn/mean": 0.01953125, "rewards/curriculum_aware_reward_fn/std": 0.06716705113649368, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4263392857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4057.0, "completions/mean_length": 2571.357177734375, "completions/mean_terminated_length": 1438.2568359375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.012377514182568335, "grad_norm": 0.14563687145709991, "kl": 0.0005273818969726562, "learning_rate": 1e-06, "loss": 0.0035, "num_tokens": 4336816.0, "reward": 0.0764508992433548, "reward_std": 0.11111512035131454, "rewards/code_format_reward/mean": 0.0446428582072258, "rewards/code_format_reward/std": 0.2067493349313736, "rewards/curriculum_aware_reward_fn/mean": 0.0318080373108387, "rewards/curriculum_aware_reward_fn/std": 0.08985739946365356, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5736607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 3101.29931640625, "completions/mean_terminated_length": 1762.879638671875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.016503352243424446, "grad_norm": 0.1036723181605339, "kl": 0.0005540847778320312, "learning_rate": 1e-06, "loss": 0.0143, "num_tokens": 6007045.0, "reward": 0.0837053582072258, "reward_std": 0.1315813809633255, "rewards/code_format_reward/mean": 0.0580357126891613, "rewards/code_format_reward/std": 0.23407234251499176, "rewards/curriculum_aware_reward_fn/mean": 0.02566964365541935, "rewards/curriculum_aware_reward_fn/std": 0.07596947997808456, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5357142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4066.0, "completions/mean_length": 2905.504638671875, "completions/mean_terminated_length": 1531.8558349609375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.020629190304280558, "grad_norm": 835166.625, "kl": 58624.00070476532, "learning_rate": 1e-06, "loss": 583.8669, "num_tokens": 7588494.0, "reward": 0.1434151828289032, "reward_std": 0.18934299051761627, "rewards/code_format_reward/mean": 0.0959821417927742, "rewards/code_format_reward/std": 0.29489606618881226, "rewards/curriculum_aware_reward_fn/mean": 0.0474330373108387, "rewards/curriculum_aware_reward_fn/std": 0.10367463529109955, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 2611.6474609375, "completions/mean_terminated_length": 1596.03759765625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.02475502836513667, "grad_norm": 0.19707472622394562, "kl": 0.0011153221130371094, "learning_rate": 1e-06, "loss": 0.0189, "num_tokens": 9034655.0, "reward": 0.220982164144516, "reward_std": 0.29879894852638245, "rewards/code_format_reward/mean": 0.1473214328289032, "rewards/code_format_reward/std": 0.3548222482204437, "rewards/curriculum_aware_reward_fn/mean": 0.0736607164144516, "rewards/curriculum_aware_reward_fn/std": 0.11409792304039001, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4046.0, "completions/mean_length": 2626.0068359375, "completions/mean_terminated_length": 1482.6787109375, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.02888086642599278, "grad_norm": 0.17933674156665802, "kl": 0.001499176025390625, "learning_rate": 1e-06, "loss": 0.0351, "num_tokens": 10492674.0, "reward": 0.2879464328289032, "reward_std": 0.2878584861755371, "rewards/code_format_reward/mean": 0.1964285671710968, "rewards/code_format_reward/std": 0.39774051308631897, "rewards/curriculum_aware_reward_fn/mean": 0.0915178582072258, "rewards/curriculum_aware_reward_fn/std": 0.1205669566988945, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4081.0, "completions/mean_length": 2744.79931640625, "completions/mean_terminated_length": 1475.4891357421875, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.03300670448684889, "grad_norm": 0.43404316902160645, "kl": 0.0015611648559570312, "learning_rate": 1e-06, "loss": 0.0158, "num_tokens": 11986545.0, "reward": 0.3063616156578064, "reward_std": 0.24489383399486542, "rewards/code_format_reward/mean": 0.2120535671710968, "rewards/code_format_reward/std": 0.40921953320503235, "rewards/curriculum_aware_reward_fn/mean": 0.0943080335855484, "rewards/curriculum_aware_reward_fn/std": 0.12130890786647797, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5446428571428572, "completions/max_length": 4096.0, "completions/max_terminated_length": 4030.0, "completions/mean_length": 2948.500244140625, "completions/mean_terminated_length": 1576.0, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.037132542547705004, "grad_norm": 0.18609629571437836, "kl": 0.0015954971313476562, "learning_rate": 1e-06, "loss": 0.0272, "num_tokens": 13569798.0, "reward": 0.2907366156578064, "reward_std": 0.2816566526889801, "rewards/code_format_reward/mean": 0.1986607164144516, "rewards/code_format_reward/std": 0.3994380831718445, "rewards/curriculum_aware_reward_fn/mean": 0.0920758917927742, "rewards/curriculum_aware_reward_fn/std": 0.12072089314460754, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4241071428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 2680.91748046875, "completions/mean_terminated_length": 1638.8023681640625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.041258380608561115, "grad_norm": 0.172908753156662, "kl": 0.0022592544555664062, "learning_rate": 1e-06, "loss": 0.0405, "num_tokens": 15042162.0, "reward": 0.482700914144516, "reward_std": 0.3393717110157013, "rewards/code_format_reward/mean": 0.3571428656578064, "rewards/code_format_reward/std": 0.47969308495521545, "rewards/curriculum_aware_reward_fn/mean": 0.1255580335855484, "rewards/curriculum_aware_reward_fn/std": 0.12513849139213562, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4866071428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 2782.64306640625, "completions/mean_terminated_length": 1537.80859375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.04538421866941723, "grad_norm": 0.1751062572002411, "kl": 0.002716064453125, "learning_rate": 1e-06, "loss": 0.0617, "num_tokens": 16554660.0, "reward": 0.508370578289032, "reward_std": 0.3163003921508789, "rewards/code_format_reward/mean": 0.3861607015132904, "rewards/code_format_reward/std": 0.4874124228954315, "rewards/curriculum_aware_reward_fn/mean": 0.1222098246216774, "rewards/curriculum_aware_reward_fn/std": 0.12510856986045837, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4866071428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4065.0, "completions/mean_length": 2774.6630859375, "completions/mean_terminated_length": 1522.26513671875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.04951005673027334, "grad_norm": 0.9701169729232788, "kl": 0.003597259521484375, "learning_rate": 1e-06, "loss": 0.0593, "num_tokens": 18066502.0, "reward": 0.5267857313156128, "reward_std": 0.3041911721229553, "rewards/code_format_reward/mean": 0.4174107015132904, "rewards/code_format_reward/std": 0.4936830997467041, "rewards/curriculum_aware_reward_fn/mean": 0.109375, "rewards/curriculum_aware_reward_fn/std": 0.12415824085474014, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6584821428571428, "completions/max_length": 4096.0, "completions/max_terminated_length": 4047.0, "completions/mean_length": 3336.609619140625, "completions/mean_terminated_length": 1872.4248046875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.05363589479112945, "grad_norm": 0.1315094232559204, "kl": 0.0018253326416015625, "learning_rate": 1e-06, "loss": 0.0457, "num_tokens": 19830684.0, "reward": 0.4073660969734192, "reward_std": 0.27718180418014526, "rewards/code_format_reward/mean": 0.3258928656578064, "rewards/code_format_reward/std": 0.4692314565181732, "rewards/curriculum_aware_reward_fn/mean": 0.0814732164144516, "rewards/curriculum_aware_reward_fn/std": 0.1173078641295433, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5066964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4076.0, "completions/mean_length": 2732.910888671875, "completions/mean_terminated_length": 1332.8145751953125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.05776173285198556, "grad_norm": 0.15690460801124573, "kl": 0.0043087005615234375, "learning_rate": 1e-06, "loss": 0.0651, "num_tokens": 21340929.0, "reward": 0.590401828289032, "reward_std": 0.23798644542694092, "rewards/code_format_reward/mean": 0.4754464328289032, "rewards/code_format_reward/std": 0.49995502829551697, "rewards/curriculum_aware_reward_fn/mean": 0.1149553582072258, "rewards/curriculum_aware_reward_fn/std": 0.131288543343544, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4977678571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4074.0, "completions/mean_length": 2652.2568359375, "completions/mean_terminated_length": 1221.3466796875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.06188757091284167, "grad_norm": 0.1379333734512329, "kl": 0.0039424896240234375, "learning_rate": 1e-06, "loss": 0.0535, "num_tokens": 22784570.0, "reward": 0.6032366156578064, "reward_std": 0.206678569316864, "rewards/code_format_reward/mean": 0.4866071343421936, "rewards/code_format_reward/std": 0.5003793835639954, "rewards/curriculum_aware_reward_fn/mean": 0.1166294664144516, "rewards/curriculum_aware_reward_fn/std": 0.12485884875059128, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4040178571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4068.0, "completions/mean_length": 2427.598388671875, "completions/mean_terminated_length": 1296.584228515625, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.06601340897369778, "grad_norm": 0.14749102294445038, "kl": 0.00396728515625, "learning_rate": 1e-06, "loss": 0.0744, "num_tokens": 24157566.0, "reward": 0.7042410969734192, "reward_std": 0.17767514288425446, "rewards/code_format_reward/mean": 0.5825892686843872, "rewards/code_format_reward/std": 0.4936830997467041, "rewards/curriculum_aware_reward_fn/mean": 0.1216517835855484, "rewards/curriculum_aware_reward_fn/std": 0.12509484589099884, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4241071428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 3996.0, "completions/mean_length": 2584.977783203125, "completions/mean_terminated_length": 1472.2093505859375, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.07013924703455389, "grad_norm": 0.15793055295944214, "kl": 0.00390625, "learning_rate": 1e-06, "loss": 0.1035, "num_tokens": 25575553.0, "reward": 0.6975446939468384, "reward_std": 0.2576451301574707, "rewards/code_format_reward/mean": 0.5691964030265808, "rewards/code_format_reward/std": 0.4957422614097595, "rewards/curriculum_aware_reward_fn/mean": 0.1283482164144516, "rewards/curriculum_aware_reward_fn/std": 0.13268840312957764, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4285714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4068.0, "completions/mean_length": 2531.3818359375, "completions/mean_terminated_length": 1357.91796875, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.07426508509541001, "grad_norm": 0.12563329935073853, "kl": 0.0038623809814453125, "learning_rate": 1e-06, "loss": 0.0573, "num_tokens": 26979791.0, "reward": 0.6981027126312256, "reward_std": 0.1848166137933731, "rewards/code_format_reward/mean": 0.5647321343421936, "rewards/code_format_reward/std": 0.4963463246822357, "rewards/curriculum_aware_reward_fn/mean": 0.1333705335855484, "rewards/curriculum_aware_reward_fn/std": 0.12485884875059128, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5066964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4045.0, "completions/mean_length": 2890.832763671875, "completions/mean_terminated_length": 1652.94580078125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.07839092315626611, "grad_norm": 0.1281888633966446, "kl": 0.00278472900390625, "learning_rate": 1e-06, "loss": 0.0768, "num_tokens": 28566203.0, "reward": 0.5714285969734192, "reward_std": 0.23000064492225647, "rewards/code_format_reward/mean": 0.46875, "rewards/code_format_reward/std": 0.49958035349845886, "rewards/curriculum_aware_reward_fn/mean": 0.1026785746216774, "rewards/curriculum_aware_reward_fn/std": 0.12312836199998856, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5602678571428572, "completions/max_length": 4096.0, "completions/max_terminated_length": 4047.0, "completions/mean_length": 2969.250244140625, "completions/mean_terminated_length": 1533.6446533203125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.08251676121712223, "grad_norm": 0.1232510358095169, "kl": 0.0032062530517578125, "learning_rate": 1e-06, "loss": 0.064, "num_tokens": 30166480.0, "reward": 0.5340402126312256, "reward_std": 0.18625172972679138, "rewards/code_format_reward/mean": 0.4397321343421936, "rewards/code_format_reward/std": 0.49690937995910645, "rewards/curriculum_aware_reward_fn/mean": 0.0943080335855484, "rewards/curriculum_aware_reward_fn/std": 0.12130890786647797, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4486607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4080.0, "completions/mean_length": 2699.654052734375, "completions/mean_terminated_length": 1563.3563232421875, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.08664259927797834, "grad_norm": 0.14180642366409302, "kl": 0.0038776397705078125, "learning_rate": 1e-06, "loss": 0.0643, "num_tokens": 31637788.0, "reward": 0.6651785969734192, "reward_std": 0.19474074244499207, "rewards/code_format_reward/mean": 0.5446428656578064, "rewards/code_format_reward/std": 0.49855974316596985, "rewards/curriculum_aware_reward_fn/mean": 0.1205357164144516, "rewards/curriculum_aware_reward_fn/std": 0.12505991756916046, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4464285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4028.0, "completions/mean_length": 2693.1787109375, "completions/mean_terminated_length": 1561.8709716796875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.09076843733883445, "grad_norm": 0.15430673956871033, "kl": 0.003658294677734375, "learning_rate": 1e-06, "loss": 0.0874, "num_tokens": 33102331.0, "reward": 0.6796875596046448, "reward_std": 0.22615034878253937, "rewards/code_format_reward/mean": 0.5535714030265808, "rewards/code_format_reward/std": 0.4976775646209717, "rewards/curriculum_aware_reward_fn/mean": 0.1261160671710968, "rewards/curriculum_aware_reward_fn/std": 0.12624716758728027, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3950892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 2583.727783203125, "completions/mean_terminated_length": 1596.0074462890625, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.09489427539969056, "grad_norm": 0.15241876244544983, "kl": 0.004024505615234375, "learning_rate": 1e-06, "loss": 0.0689, "num_tokens": 34523244.0, "reward": 0.7393973469734192, "reward_std": 0.21243248879909515, "rewards/code_format_reward/mean": 0.6004464030265808, "rewards/code_format_reward/std": 0.49035418033599854, "rewards/curriculum_aware_reward_fn/mean": 0.1389508992433548, "rewards/curriculum_aware_reward_fn/std": 0.12547722458839417, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3638392857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 2436.129638671875, "completions/mean_terminated_length": 1486.800048828125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.09902011346054668, "grad_norm": 0.1570708006620407, "kl": 0.0048961639404296875, "learning_rate": 1e-06, "loss": 0.0699, "num_tokens": 35887679.0, "reward": 0.7795759439468384, "reward_std": 0.21272383630275726, "rewards/code_format_reward/mean": 0.6361607313156128, "rewards/code_format_reward/std": 0.4816409945487976, "rewards/curriculum_aware_reward_fn/mean": 0.1434151828289032, "rewards/curriculum_aware_reward_fn/std": 0.12377429753541946, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3683035714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 2458.484375, "completions/mean_terminated_length": 1503.7491455078125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.10314595152140278, "grad_norm": 0.19118757545948029, "kl": 0.00563812255859375, "learning_rate": 1e-06, "loss": 0.0905, "num_tokens": 37260337.0, "reward": 0.7717634439468384, "reward_std": 0.22223451733589172, "rewards/code_format_reward/mean": 0.6294642686843872, "rewards/code_format_reward/std": 0.48348814249038696, "rewards/curriculum_aware_reward_fn/mean": 0.1422991007566452, "rewards/curriculum_aware_reward_fn/std": 0.12393557280302048, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3883928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 2453.40185546875, "completions/mean_terminated_length": 1410.2919921875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.1072717895822589, "grad_norm": 0.3272775113582611, "kl": 0.0047740936279296875, "learning_rate": 1e-06, "loss": 0.0586, "num_tokens": 38637620.0, "reward": 0.7248884439468384, "reward_std": 0.16748620569705963, "rewards/code_format_reward/mean": 0.5959821343421936, "rewards/code_format_reward/std": 0.49124953150749207, "rewards/curriculum_aware_reward_fn/mean": 0.12890625, "rewards/curriculum_aware_reward_fn/std": 0.12507861852645874, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4575892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4063.0, "completions/mean_length": 2687.3037109375, "completions/mean_terminated_length": 1498.8970947265625, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.111397627643115, "grad_norm": 0.5656012296676636, "kl": 0.0044689178466796875, "learning_rate": 1e-06, "loss": 0.0619, "num_tokens": 40117850.0, "reward": 0.6679688096046448, "reward_std": 0.21167722344398499, "rewards/code_format_reward/mean": 0.5424107313156128, "rewards/code_format_reward/std": 0.4987550377845764, "rewards/curriculum_aware_reward_fn/mean": 0.1255580335855484, "rewards/curriculum_aware_reward_fn/std": 0.13167192041873932, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4441964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4075.0, "completions/mean_length": 2593.060302734375, "completions/mean_terminated_length": 1391.9156494140625, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.11552346570397112, "grad_norm": 0.37587133049964905, "kl": 0.0052947998046875, "learning_rate": 1e-06, "loss": 0.0829, "num_tokens": 41551617.0, "reward": 0.680245578289032, "reward_std": 0.21719147264957428, "rewards/code_format_reward/mean": 0.5513392686843872, "rewards/code_format_reward/std": 0.49791327118873596, "rewards/curriculum_aware_reward_fn/mean": 0.12890625, "rewards/curriculum_aware_reward_fn/std": 0.12507861852645874, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3950892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 2534.004638671875, "completions/mean_terminated_length": 1513.80810546875, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.11964930376482723, "grad_norm": 0.15881648659706116, "kl": 0.005340576171875, "learning_rate": 1e-06, "loss": 0.0813, "num_tokens": 42964709.0, "reward": 0.7455357313156128, "reward_std": 0.2159542590379715, "rewards/code_format_reward/mean": 0.6049107313156128, "rewards/code_format_reward/std": 0.4894163906574249, "rewards/curriculum_aware_reward_fn/mean": 0.140625, "rewards/curriculum_aware_reward_fn/std": 0.13074065744876862, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4196428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4075.0, "completions/mean_length": 2647.8125, "completions/mean_terminated_length": 1600.6614990234375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.12377514182568335, "grad_norm": 0.34859126806259155, "kl": 0.00583648681640625, "learning_rate": 1e-06, "loss": 0.1225, "num_tokens": 44415151.0, "reward": 0.6986607909202576, "reward_std": 0.2894200086593628, "rewards/code_format_reward/mean": 0.5758928656578064, "rewards/code_format_reward/std": 0.494759202003479, "rewards/curriculum_aware_reward_fn/mean": 0.1227678582072258, "rewards/curriculum_aware_reward_fn/std": 0.12511979043483734, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4040178571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 2473.044677734375, "completions/mean_terminated_length": 1372.8389892578125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.12790097988653945, "grad_norm": 0.14530473947525024, "kl": 0.005558013916015625, "learning_rate": 1e-06, "loss": 0.0845, "num_tokens": 45805861.0, "reward": 0.7220982313156128, "reward_std": 0.1985878348350525, "rewards/code_format_reward/mean": 0.5959821343421936, "rewards/code_format_reward/std": 0.49124953150749207, "rewards/curriculum_aware_reward_fn/mean": 0.1261160671710968, "rewards/curriculum_aware_reward_fn/std": 0.1327260285615921, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3861607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4056.0, "completions/mean_length": 2420.3974609375, "completions/mean_terminated_length": 1366.2908935546875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.13202681794739557, "grad_norm": 0.15969304740428925, "kl": 0.006267547607421875, "learning_rate": 1e-06, "loss": 0.0952, "num_tokens": 47161645.0, "reward": 0.7472098469734192, "reward_std": 0.20535717904567719, "rewards/code_format_reward/mean": 0.6138392686843872, "rewards/code_format_reward/std": 0.4874124526977539, "rewards/curriculum_aware_reward_fn/mean": 0.1333705335855484, "rewards/curriculum_aware_reward_fn/std": 0.12485884875059128, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3995535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4063.0, "completions/mean_length": 2528.055908203125, "completions/mean_terminated_length": 1484.70263671875, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.1361526560082517, "grad_norm": 0.168403759598732, "kl": 0.00608062744140625, "learning_rate": 1e-06, "loss": 0.1297, "num_tokens": 48572653.0, "reward": 0.7338169813156128, "reward_std": 0.2873011529445648, "rewards/code_format_reward/mean": 0.6004464030265808, "rewards/code_format_reward/std": 0.49035418033599854, "rewards/curriculum_aware_reward_fn/mean": 0.1333705335855484, "rewards/curriculum_aware_reward_fn/std": 0.12485884875059128, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4308035714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4047.0, "completions/mean_length": 2699.91748046875, "completions/mean_terminated_length": 1643.274658203125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.14027849406910778, "grad_norm": 0.30353009700775146, "kl": 0.005817413330078125, "learning_rate": 1e-06, "loss": 0.0996, "num_tokens": 50044254.0, "reward": 0.6908482313156128, "reward_std": 0.28030288219451904, "rewards/code_format_reward/mean": 0.5691964030265808, "rewards/code_format_reward/std": 0.4957422912120819, "rewards/curriculum_aware_reward_fn/mean": 0.1216517835855484, "rewards/curriculum_aware_reward_fn/std": 0.12509484589099884, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4174107142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4022.0, "completions/mean_length": 2402.15185546875, "completions/mean_terminated_length": 1188.5516357421875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.1444043321299639, "grad_norm": 0.16529737412929535, "kl": 0.0071563720703125, "learning_rate": 1e-06, "loss": 0.092, "num_tokens": 51405654.0, "reward": 0.7098214626312256, "reward_std": 0.23024560511112213, "rewards/code_format_reward/mean": 0.5803571343421936, "rewards/code_format_reward/std": 0.4940521717071533, "rewards/curriculum_aware_reward_fn/mean": 0.1294642835855484, "rewards/curriculum_aware_reward_fn/std": 0.12505991756916046, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 2338.90185546875, "completions/mean_terminated_length": 1480.7840576171875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.14853017019082002, "grad_norm": 47453.81640625, "kl": 1795.790958404541, "learning_rate": 1e-06, "loss": 18.2057, "num_tokens": 52723612.0, "reward": 0.8085938096046448, "reward_std": 0.252255380153656, "rewards/code_format_reward/mean": 0.671875, "rewards/code_format_reward/std": 0.470055490732193, "rewards/curriculum_aware_reward_fn/mean": 0.13671875, "rewards/curriculum_aware_reward_fn/std": 0.12458859384059906, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3102678571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4049.0, "completions/mean_length": 2285.90625, "completions/mean_terminated_length": 1471.656982421875, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.15265600825167613, "grad_norm": 0.21048317849636078, "kl": 0.011928558349609375, "learning_rate": 1e-06, "loss": 0.1207, "num_tokens": 54016624.0, "reward": 0.8359375596046448, "reward_std": 0.3040117621421814, "rewards/code_format_reward/mean": 0.6897321343421936, "rewards/code_format_reward/std": 0.46312034130096436, "rewards/curriculum_aware_reward_fn/mean": 0.1462053507566452, "rewards/curriculum_aware_reward_fn/std": 0.12332591414451599, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2767857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4069.0, "completions/mean_length": 2063.3125, "completions/mean_terminated_length": 1285.370361328125, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.15678184631253222, "grad_norm": 0.20206981897354126, "kl": 0.01004791259765625, "learning_rate": 1e-06, "loss": 0.079, "num_tokens": 55199330.0, "reward": 0.8878348469734192, "reward_std": 0.2320551574230194, "rewards/code_format_reward/mean": 0.7232142686843872, "rewards/code_format_reward/std": 0.44790980219841003, "rewards/curriculum_aware_reward_fn/mean": 0.1646205335855484, "rewards/curriculum_aware_reward_fn/std": 0.1186872199177742, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2946428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 3976.0, "completions/mean_length": 2240.28125, "completions/mean_terminated_length": 1465.107666015625, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.16090768437338834, "grad_norm": 0.19678539037704468, "kl": 0.0091705322265625, "learning_rate": 1e-06, "loss": 0.1584, "num_tokens": 56474485.0, "reward": 0.8498884439468384, "reward_std": 0.3051430583000183, "rewards/code_format_reward/mean": 0.703125, "rewards/code_format_reward/std": 0.45739173889160156, "rewards/curriculum_aware_reward_fn/mean": 0.1467633992433548, "rewards/curriculum_aware_reward_fn/std": 0.12322844564914703, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4441964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 3982.0, "completions/mean_length": 2650.609375, "completions/mean_terminated_length": 1495.457763671875, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.16503352243424446, "grad_norm": 0.18934322893619537, "kl": 0.008159637451171875, "learning_rate": 1e-06, "loss": 0.123, "num_tokens": 57926481.0, "reward": 0.6640625, "reward_std": 0.27679774165153503, "rewards/code_format_reward/mean": 0.5535714030265808, "rewards/code_format_reward/std": 0.49767759442329407, "rewards/curriculum_aware_reward_fn/mean": 0.1104910746216774, "rewards/curriculum_aware_reward_fn/std": 0.12429390102624893, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4196428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4074.0, "completions/mean_length": 2459.743408203125, "completions/mean_terminated_length": 1276.6038818359375, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.16915936049510058, "grad_norm": 0.17007939517498016, "kl": 0.00934600830078125, "learning_rate": 1e-06, "loss": 0.1042, "num_tokens": 59299948.0, "reward": 0.699776828289032, "reward_std": 0.20519039034843445, "rewards/code_format_reward/mean": 0.5758928656578064, "rewards/code_format_reward/std": 0.494759202003479, "rewards/curriculum_aware_reward_fn/mean": 0.1238839253783226, "rewards/curriculum_aware_reward_fn/std": 0.1273498684167862, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2834821428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 1898.51123046875, "completions/mean_terminated_length": 1029.099609375, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.17328519855595667, "grad_norm": 0.1494082510471344, "kl": 0.0142974853515625, "learning_rate": 1e-06, "loss": 0.0595, "num_tokens": 60392723.0, "reward": 0.8761160969734192, "reward_std": 0.1509488821029663, "rewards/code_format_reward/mean": 0.7120535969734192, "rewards/code_format_reward/std": 0.4533122181892395, "rewards/curriculum_aware_reward_fn/mean": 0.1640625, "rewards/curriculum_aware_reward_fn/std": 0.11887246370315552, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3772321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 3876.0, "completions/mean_length": 2363.09619140625, "completions/mean_terminated_length": 1313.415771484375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.1774110366168128, "grad_norm": 1675.9124755859375, "kl": 119.21817016601562, "learning_rate": 1e-06, "loss": 1.2901, "num_tokens": 61712938.0, "reward": 0.7617188096046448, "reward_std": 0.24466641247272491, "rewards/code_format_reward/mean": 0.6227678656578064, "rewards/code_format_reward/std": 0.4852356016635895, "rewards/curriculum_aware_reward_fn/mean": 0.1389508992433548, "rewards/curriculum_aware_reward_fn/std": 0.13093030452728271, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3147321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4048.0, "completions/mean_length": 2308.930908203125, "completions/mean_terminated_length": 1488.1595458984375, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.1815368746776689, "grad_norm": 0.1728798747062683, "kl": 0.010223388671875, "learning_rate": 1e-06, "loss": 0.0852, "num_tokens": 62993690.0, "reward": 0.8085938096046448, "reward_std": 0.23875407874584198, "rewards/code_format_reward/mean": 0.671875, "rewards/code_format_reward/std": 0.470055490732193, "rewards/curriculum_aware_reward_fn/mean": 0.13671875, "rewards/curriculum_aware_reward_fn/std": 0.13221123814582825, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 2481.15869140625, "completions/mean_terminated_length": 1635.2890625, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.18566271273852503, "grad_norm": 0.1963280290365219, "kl": 0.009490966796875, "learning_rate": 1e-06, "loss": 0.134, "num_tokens": 64395497.0, "reward": 0.7912946939468384, "reward_std": 0.3097690939903259, "rewards/code_format_reward/mean": 0.6540178656578064, "rewards/code_format_reward/std": 0.47621920704841614, "rewards/curriculum_aware_reward_fn/mean": 0.1372767835855484, "rewards/curriculum_aware_reward_fn/std": 0.1256524622440338, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 4096.0, "completions/max_terminated_length": 3884.0, "completions/mean_length": 1956.6763916015625, "completions/mean_terminated_length": 1243.5684814453125, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.18978855079938112, "grad_norm": 0.3371032476425171, "kl": 0.0122528076171875, "learning_rate": 1e-06, "loss": 0.0941, "num_tokens": 65541091.0, "reward": 0.906808078289032, "reward_std": 0.17700563371181488, "rewards/code_format_reward/mean": 0.75, "rewards/code_format_reward/std": 0.43349677324295044, "rewards/curriculum_aware_reward_fn/mean": 0.1568080335855484, "rewards/curriculum_aware_reward_fn/std": 0.12666550278663635, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3191964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4065.0, "completions/mean_length": 2245.1474609375, "completions/mean_terminated_length": 1377.3704833984375, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.19391438886023724, "grad_norm": 0.17097435891628265, "kl": 0.01092529296875, "learning_rate": 1e-06, "loss": 0.0745, "num_tokens": 66817887.0, "reward": 0.8197544813156128, "reward_std": 0.19386102259159088, "rewards/code_format_reward/mean": 0.6741071343421936, "rewards/code_format_reward/std": 0.4692314565181732, "rewards/curriculum_aware_reward_fn/mean": 0.1456473171710968, "rewards/curriculum_aware_reward_fn/std": 0.12342077493667603, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4082.0, "completions/mean_length": 1998.12060546875, "completions/mean_terminated_length": 1239.3131103515625, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.19804022692109335, "grad_norm": 0.62779301404953, "kl": 0.01213836669921875, "learning_rate": 1e-06, "loss": 0.1424, "num_tokens": 67968608.0, "reward": 0.887276828289032, "reward_std": 0.21606780588626862, "rewards/code_format_reward/mean": 0.734375, "rewards/code_format_reward/std": 0.44215917587280273, "rewards/curriculum_aware_reward_fn/mean": 0.1529017835855484, "rewards/curriculum_aware_reward_fn/std": 0.1219823881983757, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2566964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4055.0, "completions/mean_length": 1901.529052734375, "completions/mean_terminated_length": 1143.6787109375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.20216606498194944, "grad_norm": 0.1816868633031845, "kl": 0.0135345458984375, "learning_rate": 1e-06, "loss": 0.1173, "num_tokens": 69091295.0, "reward": 0.9006696939468384, "reward_std": 0.19481246173381805, "rewards/code_format_reward/mean": 0.7410714030265808, "rewards/code_format_reward/std": 0.43853598833084106, "rewards/curriculum_aware_reward_fn/mean": 0.1595982164144516, "rewards/curriculum_aware_reward_fn/std": 0.128131702542305, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3370535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4012.0, "completions/mean_length": 2170.27685546875, "completions/mean_terminated_length": 1191.205322265625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.20629190304280556, "grad_norm": 0.18150708079338074, "kl": 0.011688232421875, "learning_rate": 1e-06, "loss": 0.1259, "num_tokens": 70324917.0, "reward": 0.946986734867096, "reward_std": 0.24670284986495972, "rewards/code_format_reward/mean": 0.6629464030265808, "rewards/code_format_reward/std": 0.47323182225227356, "rewards/curriculum_aware_reward_fn/mean": 0.2840401828289032, "rewards/curriculum_aware_reward_fn/std": 0.23787550628185272, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2388392857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4080.0, "completions/mean_length": 2079.5224609375, "completions/mean_terminated_length": 1446.785888671875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.21041774110366168, "grad_norm": 0.2106163054704666, "kl": 0.01255035400390625, "learning_rate": 1e-06, "loss": 0.0931, "num_tokens": 71521311.0, "reward": 1.0742188692092896, "reward_std": 0.3095935583114624, "rewards/code_format_reward/mean": 0.7589285969734192, "rewards/code_format_reward/std": 0.4282117187976837, "rewards/curriculum_aware_reward_fn/mean": 0.3152901828289032, "rewards/curriculum_aware_reward_fn/std": 0.23063845932483673, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3861607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4028.0, "completions/mean_length": 2419.9375, "completions/mean_terminated_length": 1365.5418701171875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.2145435791645178, "grad_norm": 0.18075338006019592, "kl": 0.0103759765625, "learning_rate": 1e-06, "loss": 0.1135, "num_tokens": 72897336.0, "reward": 0.821428656578064, "reward_std": 0.2871280312538147, "rewards/code_format_reward/mean": 0.6138392686843872, "rewards/code_format_reward/std": 0.4874124526977539, "rewards/curriculum_aware_reward_fn/mean": 0.2075892835855484, "rewards/curriculum_aware_reward_fn/std": 0.23444539308547974, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2991071428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 2260.649658203125, "completions/mean_terminated_length": 1477.410888671875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.2186694172253739, "grad_norm": 0.18771013617515564, "kl": 0.011383056640625, "learning_rate": 1e-06, "loss": 0.0869, "num_tokens": 74174819.0, "reward": 0.9642858505249023, "reward_std": 0.29447072744369507, "rewards/code_format_reward/mean": 0.7008928656578064, "rewards/code_format_reward/std": 0.45837873220443726, "rewards/curriculum_aware_reward_fn/mean": 0.2633928656578064, "rewards/curriculum_aware_reward_fn/std": 0.2499200701713562, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3236607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4064.0, "completions/mean_length": 2193.8037109375, "completions/mean_terminated_length": 1283.5115966796875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.22279525528623, "grad_norm": 0.2113543450832367, "kl": 0.01250457763671875, "learning_rate": 1e-06, "loss": 0.1088, "num_tokens": 75412009.0, "reward": 0.9441965222358704, "reward_std": 0.2789962887763977, "rewards/code_format_reward/mean": 0.6607142686843872, "rewards/code_format_reward/std": 0.47399622201919556, "rewards/curriculum_aware_reward_fn/mean": 0.2834821343421936, "rewards/curriculum_aware_reward_fn/std": 0.24000264704227448, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2678571428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4050.0, "completions/mean_length": 2136.676513671875, "completions/mean_terminated_length": 1419.8505859375, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.22692109334708613, "grad_norm": 1.6972644329071045, "kl": 0.0386810302734375, "learning_rate": 1e-06, "loss": 0.1264, "num_tokens": 76664416.0, "reward": 1.006138563156128, "reward_std": 0.31591087579727173, "rewards/code_format_reward/mean": 0.7276785969734192, "rewards/code_format_reward/std": 0.4456520676612854, "rewards/curriculum_aware_reward_fn/mean": 0.2784598171710968, "rewards/curriculum_aware_reward_fn/std": 0.2466764897108078, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4046.0, "completions/mean_length": 2378.185302734375, "completions/mean_terminated_length": 1539.25244140625, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.23104693140794225, "grad_norm": 0.18813899159431458, "kl": 0.011688232421875, "learning_rate": 1e-06, "loss": 0.1216, "num_tokens": 78020399.0, "reward": 0.9280134439468384, "reward_std": 0.3132590353488922, "rewards/code_format_reward/mean": 0.6696428656578064, "rewards/code_format_reward/std": 0.4708675146102905, "rewards/curriculum_aware_reward_fn/mean": 0.2583705484867096, "rewards/curriculum_aware_reward_fn/std": 0.24015797674655914, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4090.0, "completions/mean_length": 1929.29248046875, "completions/mean_terminated_length": 1322.6142578125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.23517276946879834, "grad_norm": 0.21503381431102753, "kl": 0.01425933837890625, "learning_rate": 1e-06, "loss": 0.1124, "num_tokens": 79163266.0, "reward": 1.1372770071029663, "reward_std": 0.290945827960968, "rewards/code_format_reward/mean": 0.7790178656578064, "rewards/code_format_reward/std": 0.4153723120689392, "rewards/curriculum_aware_reward_fn/mean": 0.3582589328289032, "rewards/curriculum_aware_reward_fn/std": 0.23291480541229248, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3258928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4050.0, "completions/mean_length": 2254.46875, "completions/mean_terminated_length": 1364.1920166015625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.23929860752965446, "grad_norm": 0.2180740386247635, "kl": 0.0124664306640625, "learning_rate": 1e-06, "loss": 0.1529, "num_tokens": 80435005.0, "reward": 0.9414063692092896, "reward_std": 0.34737643599510193, "rewards/code_format_reward/mean": 0.671875, "rewards/code_format_reward/std": 0.470055490732193, "rewards/curriculum_aware_reward_fn/mean": 0.26953125, "rewards/curriculum_aware_reward_fn/std": 0.2418311983346939, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3080357142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 2117.957763671875, "completions/mean_terminated_length": 1237.40966796875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.24342444559051057, "grad_norm": 2.0398552417755127, "kl": 0.18520355224609375, "learning_rate": 1e-06, "loss": 0.0847, "num_tokens": 81654484.0, "reward": 0.949776828289032, "reward_std": 0.262865275144577, "rewards/code_format_reward/mean": 0.6919642686843872, "rewards/code_format_reward/std": 0.46219751238822937, "rewards/curriculum_aware_reward_fn/mean": 0.2578125, "rewards/curriculum_aware_reward_fn/std": 0.24220581352710724, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4018.0, "completions/mean_length": 2058.332763671875, "completions/mean_terminated_length": 1260.9844970703125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.2475502836513667, "grad_norm": 0.20060843229293823, "kl": 0.0138397216796875, "learning_rate": 1e-06, "loss": 0.0767, "num_tokens": 82858948.0, "reward": 0.9916296005249023, "reward_std": 0.21733404695987701, "rewards/code_format_reward/mean": 0.71875, "rewards/code_format_reward/std": 0.45011183619499207, "rewards/curriculum_aware_reward_fn/mean": 0.2728794515132904, "rewards/curriculum_aware_reward_fn/std": 0.24037623405456543, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2633928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4072.0, "completions/mean_length": 2015.8773193359375, "completions/mean_terminated_length": 1272.257568359375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.2516761217122228, "grad_norm": 0.36040976643562317, "kl": 0.01433563232421875, "learning_rate": 1e-06, "loss": 0.0735, "num_tokens": 84023154.0, "reward": 1.040178656578064, "reward_std": 0.22874124348163605, "rewards/code_format_reward/mean": 0.7366071343421936, "rewards/code_format_reward/std": 0.44096609950065613, "rewards/curriculum_aware_reward_fn/mean": 0.3035714328289032, "rewards/curriculum_aware_reward_fn/std": 0.23454120755195618, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2209821428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 3966.0, "completions/mean_length": 1874.5023193359375, "completions/mean_terminated_length": 1244.3353271484375, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.2558019597730789, "grad_norm": 0.26239436864852905, "kl": 0.0160369873046875, "learning_rate": 1e-06, "loss": 0.1443, "num_tokens": 85147579.0, "reward": 1.084263563156128, "reward_std": 0.3016469478607178, "rewards/code_format_reward/mean": 0.7745535969734192, "rewards/code_format_reward/std": 0.41834312677383423, "rewards/curriculum_aware_reward_fn/mean": 0.3097098171710968, "rewards/curriculum_aware_reward_fn/std": 0.24731335043907166, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2544642857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4066.0, "completions/mean_length": 1978.2255859375, "completions/mean_terminated_length": 1255.3922119140625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.259927797833935, "grad_norm": 129.74130249023438, "kl": 5.70086669921875, "learning_rate": 1e-06, "loss": 0.1569, "num_tokens": 86297192.0, "reward": 1.0340403318405151, "reward_std": 0.23461735248565674, "rewards/code_format_reward/mean": 0.7477678656578064, "rewards/code_format_reward/std": 0.4347793161869049, "rewards/curriculum_aware_reward_fn/mean": 0.2862723171710968, "rewards/curriculum_aware_reward_fn/std": 0.24105042219161987, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2232142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4082.0, "completions/mean_length": 2063.598388671875, "completions/mean_terminated_length": 1479.57470703125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.26405363589479114, "grad_norm": 3.3981306552886963, "kl": 0.02306365966796875, "learning_rate": 1e-06, "loss": 0.1086, "num_tokens": 87502377.0, "reward": 1.040178656578064, "reward_std": 0.28381508588790894, "rewards/code_format_reward/mean": 0.7700892686843872, "rewards/code_format_reward/std": 0.42124560475349426, "rewards/curriculum_aware_reward_fn/mean": 0.2700892984867096, "rewards/curriculum_aware_reward_fn/std": 0.23324955999851227, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2477678571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4070.0, "completions/mean_length": 1879.0023193359375, "completions/mean_terminated_length": 1148.7744140625, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.26817947395564723, "grad_norm": 0.24225927889347076, "kl": 0.01624298095703125, "learning_rate": 1e-06, "loss": 0.0991, "num_tokens": 88614419.0, "reward": 1.0429688692092896, "reward_std": 0.25699782371520996, "rewards/code_format_reward/mean": 0.7455357313156128, "rewards/code_format_reward/std": 0.4360465407371521, "rewards/curriculum_aware_reward_fn/mean": 0.2974330484867096, "rewards/curriculum_aware_reward_fn/std": 0.23674975335597992, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2276785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4059.0, "completions/mean_length": 1847.6787109375, "completions/mean_terminated_length": 1184.8785400390625, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.2723053120165034, "grad_norm": 0.2089075744152069, "kl": 0.01638031005859375, "learning_rate": 1e-06, "loss": 0.0839, "num_tokens": 89713100.0, "reward": 1.0954241752624512, "reward_std": 0.21511933207511902, "rewards/code_format_reward/mean": 0.7723214030265808, "rewards/code_format_reward/std": 0.41980284452438354, "rewards/curriculum_aware_reward_fn/mean": 0.3231026828289032, "rewards/curriculum_aware_reward_fn/std": 0.229498952627182, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2321428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 3977.0, "completions/mean_length": 1905.44873046875, "completions/mean_terminated_length": 1243.18896484375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.27643115007735947, "grad_norm": 0.21615156531333923, "kl": 0.01703643798828125, "learning_rate": 1e-06, "loss": 0.0889, "num_tokens": 90824726.0, "reward": 1.0998884439468384, "reward_std": 0.2688700258731842, "rewards/code_format_reward/mean": 0.7678571343421936, "rewards/code_format_reward/std": 0.4226716458797455, "rewards/curriculum_aware_reward_fn/mean": 0.33203125, "rewards/curriculum_aware_reward_fn/std": 0.23612669110298157, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3972.0, "completions/mean_length": 1596.3148193359375, "completions/mean_terminated_length": 1149.002685546875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.28055698813821556, "grad_norm": 0.31372275948524475, "kl": 0.020721435546875, "learning_rate": 1e-06, "loss": 0.0943, "num_tokens": 91805007.0, "reward": 1.1685268878936768, "reward_std": 0.21287237107753754, "rewards/code_format_reward/mean": 0.8482142686843872, "rewards/code_format_reward/std": 0.3592142164707184, "rewards/curriculum_aware_reward_fn/mean": 0.3203125, "rewards/curriculum_aware_reward_fn/std": 0.22265510261058807, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2120535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4013.0, "completions/mean_length": 1951.4107666015625, "completions/mean_terminated_length": 1374.2548828125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.2846828261990717, "grad_norm": 0.9955394268035889, "kl": 0.01727294921875, "learning_rate": 1e-06, "loss": 0.1333, "num_tokens": 92938071.0, "reward": 1.094866156578064, "reward_std": 0.2818447947502136, "rewards/code_format_reward/mean": 0.7879464030265808, "rewards/code_format_reward/std": 0.40921956300735474, "rewards/curriculum_aware_reward_fn/mean": 0.3069196343421936, "rewards/curriculum_aware_reward_fn/std": 0.23013247549533844, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3928.0, "completions/mean_length": 1635.7410888671875, "completions/mean_terminated_length": 1233.1531982421875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.2888086642599278, "grad_norm": 0.2072010189294815, "kl": 0.0184326171875, "learning_rate": 1e-06, "loss": 0.0757, "num_tokens": 93927384.0, "reward": 1.1975446939468384, "reward_std": 0.22524969279766083, "rewards/code_format_reward/mean": 0.859375, "rewards/code_format_reward/std": 0.3480229377746582, "rewards/curriculum_aware_reward_fn/mean": 0.3381696343421936, "rewards/curriculum_aware_reward_fn/std": 0.22003811597824097, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1540178571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 3862.0, "completions/mean_length": 1586.66748046875, "completions/mean_terminated_length": 1129.8232421875, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.2929345023207839, "grad_norm": 0.21777544915676117, "kl": 0.01844024658203125, "learning_rate": 1e-06, "loss": 0.1254, "num_tokens": 94912737.0, "reward": 1.1802457571029663, "reward_std": 0.23534706234931946, "rewards/code_format_reward/mean": 0.8459821343421936, "rewards/code_format_reward/std": 0.36136940121650696, "rewards/curriculum_aware_reward_fn/mean": 0.3342633843421936, "rewards/curriculum_aware_reward_fn/std": 0.22313886880874634, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2276785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4019.0, "completions/mean_length": 1923.6943359375, "completions/mean_terminated_length": 1283.303466796875, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.29706034038164003, "grad_norm": 1.8488253355026245, "kl": 0.01645660400390625, "learning_rate": 1e-06, "loss": 0.096, "num_tokens": 96055847.0, "reward": 1.072544813156128, "reward_std": 0.23921768367290497, "rewards/code_format_reward/mean": 0.7723214030265808, "rewards/code_format_reward/std": 0.41980284452438354, "rewards/curriculum_aware_reward_fn/mean": 0.3002232015132904, "rewards/curriculum_aware_reward_fn/std": 0.23229221999645233, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2410714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 1918.35498046875, "completions/mean_terminated_length": 1226.63232421875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.3011861784424961, "grad_norm": 0.3103954792022705, "kl": 0.01715850830078125, "learning_rate": 1e-06, "loss": 0.1134, "num_tokens": 97181417.0, "reward": 1.0078126192092896, "reward_std": 0.24227580428123474, "rewards/code_format_reward/mean": 0.7589285969734192, "rewards/code_format_reward/std": 0.4282117486000061, "rewards/curriculum_aware_reward_fn/mean": 0.2488839328289032, "rewards/curriculum_aware_reward_fn/std": 0.24348074197769165, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3147321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4041.0, "completions/mean_length": 2205.97119140625, "completions/mean_terminated_length": 1337.9119873046875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.30531201650335227, "grad_norm": 0.172196626663208, "kl": 0.015716552734375, "learning_rate": 1e-06, "loss": 0.121, "num_tokens": 98463335.0, "reward": 0.9748885035514832, "reward_std": 0.25062766671180725, "rewards/code_format_reward/mean": 0.6875, "rewards/code_format_reward/std": 0.46403056383132935, "rewards/curriculum_aware_reward_fn/mean": 0.2873883843421936, "rewards/curriculum_aware_reward_fn/std": 0.23971574008464813, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2142857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3930.0, "completions/mean_length": 1744.9866943359375, "completions/mean_terminated_length": 1103.8011474609375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.30943785456420836, "grad_norm": 0.2059830278158188, "kl": 0.019775390625, "learning_rate": 1e-06, "loss": 0.065, "num_tokens": 99512272.0, "reward": 1.1138393878936768, "reward_std": 0.1958550065755844, "rewards/code_format_reward/mean": 0.7857142686843872, "rewards/code_format_reward/std": 0.41078460216522217, "rewards/curriculum_aware_reward_fn/mean": 0.328125, "rewards/curriculum_aware_reward_fn/std": 0.24182668328285217, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2700892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 2075.29931640625, "completions/mean_terminated_length": 1327.5780029296875, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.31356369262506445, "grad_norm": 17.49361801147461, "kl": 1.5825347900390625, "learning_rate": 1e-06, "loss": 0.1568, "num_tokens": 100727809.0, "reward": 1.0027902126312256, "reward_std": 0.34511077404022217, "rewards/code_format_reward/mean": 0.7276785969734192, "rewards/code_format_reward/std": 0.4456520974636078, "rewards/curriculum_aware_reward_fn/mean": 0.2751116156578064, "rewards/curriculum_aware_reward_fn/std": 0.23722384870052338, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1852678571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4026.0, "completions/mean_length": 1702.2366943359375, "completions/mean_terminated_length": 1157.9013671875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.3176895306859206, "grad_norm": 0.2730734348297119, "kl": 0.01885986328125, "learning_rate": 1e-06, "loss": 0.1004, "num_tokens": 101770579.0, "reward": 1.1540179252624512, "reward_std": 0.2661118507385254, "rewards/code_format_reward/mean": 0.8102678656578064, "rewards/code_format_reward/std": 0.39252743124961853, "rewards/curriculum_aware_reward_fn/mean": 0.34375, "rewards/curriculum_aware_reward_fn/std": 0.23560336232185364, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2142857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4070.0, "completions/mean_length": 1857.1094970703125, "completions/mean_terminated_length": 1246.5029296875, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.3218153687467767, "grad_norm": 2405.08642578125, "kl": 75.51731872558594, "learning_rate": 1e-06, "loss": 0.8786, "num_tokens": 102878952.0, "reward": 1.0842634439468384, "reward_std": 0.2500801086425781, "rewards/code_format_reward/mean": 0.7834821343421936, "rewards/code_format_reward/std": 0.41233164072036743, "rewards/curriculum_aware_reward_fn/mean": 0.30078125, "rewards/curriculum_aware_reward_fn/std": 0.23486502468585968, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1651785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 1748.9241943359375, "completions/mean_terminated_length": 1284.5294189453125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.3259412068076328, "grad_norm": 0.6941543221473694, "kl": 0.0203399658203125, "learning_rate": 1e-06, "loss": 0.0674, "num_tokens": 103934609.0, "reward": 1.1043528318405151, "reward_std": 0.2072954624891281, "rewards/code_format_reward/mean": 0.8348214030265808, "rewards/code_format_reward/std": 0.37175676226615906, "rewards/curriculum_aware_reward_fn/mean": 0.26953125, "rewards/curriculum_aware_reward_fn/std": 0.23950733244419098, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2455357142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 1882.96435546875, "completions/mean_terminated_length": 1162.74560546875, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.3300670448684889, "grad_norm": 0.2440539300441742, "kl": 0.0200042724609375, "learning_rate": 1e-06, "loss": 0.1245, "num_tokens": 105041335.0, "reward": 1.0106028318405151, "reward_std": 0.25957995653152466, "rewards/code_format_reward/mean": 0.7544642686843872, "rewards/code_format_reward/std": 0.43088552355766296, "rewards/curriculum_aware_reward_fn/mean": 0.2561383843421936, "rewards/curriculum_aware_reward_fn/std": 0.23905864357948303, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2232142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 1856.6251220703125, "completions/mean_terminated_length": 1213.12646484375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.334192882929345, "grad_norm": 0.20898547768592834, "kl": 0.01934051513671875, "learning_rate": 1e-06, "loss": 0.081, "num_tokens": 106145347.0, "reward": 1.1054688692092896, "reward_std": 0.2386128008365631, "rewards/code_format_reward/mean": 0.7790178656578064, "rewards/code_format_reward/std": 0.4153723120689392, "rewards/curriculum_aware_reward_fn/mean": 0.3264508843421936, "rewards/curriculum_aware_reward_fn/std": 0.26680248975753784, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2254464285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4038.0, "completions/mean_length": 1951.0045166015625, "completions/mean_terminated_length": 1326.6685791015625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.33831872099020116, "grad_norm": 0.45301738381385803, "kl": 0.01995849609375, "learning_rate": 1e-06, "loss": 0.1011, "num_tokens": 107288589.0, "reward": 1.088169813156128, "reward_std": 0.23264777660369873, "rewards/code_format_reward/mean": 0.7745535969734192, "rewards/code_format_reward/std": 0.41834309697151184, "rewards/curriculum_aware_reward_fn/mean": 0.3136160671710968, "rewards/curriculum_aware_reward_fn/std": 0.23441077768802643, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2790178571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4033.0, "completions/mean_length": 2092.859375, "completions/mean_terminated_length": 1317.650146484375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.34244455905105725, "grad_norm": 0.26481834053993225, "kl": 0.0216064453125, "learning_rate": 1e-06, "loss": 0.1104, "num_tokens": 108519000.0, "reward": 0.9983260035514832, "reward_std": 0.3085317015647888, "rewards/code_format_reward/mean": 0.7209821343421936, "rewards/code_format_reward/std": 0.449017733335495, "rewards/curriculum_aware_reward_fn/mean": 0.27734375, "rewards/curriculum_aware_reward_fn/std": 0.23932482302188873, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2700892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4074.0, "completions/mean_length": 2151.540283203125, "completions/mean_terminated_length": 1432.0306396484375, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.34657039711191334, "grad_norm": 0.37744393944740295, "kl": 0.14849853515625, "learning_rate": 1e-06, "loss": 0.1315, "num_tokens": 109766635.0, "reward": 0.981584906578064, "reward_std": 0.30362850427627563, "rewards/code_format_reward/mean": 0.7299107313156128, "rewards/code_format_reward/std": 0.444502055644989, "rewards/curriculum_aware_reward_fn/mean": 0.2516741156578064, "rewards/curriculum_aware_reward_fn/std": 0.23971574008464813, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1986607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4044.0, "completions/mean_length": 1862.97998046875, "completions/mean_terminated_length": 1309.389892578125, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.3506962351727695, "grad_norm": 0.23975680768489838, "kl": 0.020782470703125, "learning_rate": 1e-06, "loss": 0.1582, "num_tokens": 110882811.0, "reward": 1.0664063692092896, "reward_std": 0.34618672728538513, "rewards/code_format_reward/mean": 0.8013392686843872, "rewards/code_format_reward/std": 0.3994380831718445, "rewards/curriculum_aware_reward_fn/mean": 0.2650669515132904, "rewards/curriculum_aware_reward_fn/std": 0.23272913694381714, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2589285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4078.0, "completions/mean_length": 2102.96435546875, "completions/mean_terminated_length": 1406.6024169921875, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.3548220732336256, "grad_norm": 0.21237419545650482, "kl": 0.0214691162109375, "learning_rate": 1e-06, "loss": 0.1226, "num_tokens": 112083365.0, "reward": 0.9654018878936768, "reward_std": 0.30100569128990173, "rewards/code_format_reward/mean": 0.7366071343421936, "rewards/code_format_reward/std": 0.44096609950065613, "rewards/curriculum_aware_reward_fn/mean": 0.2287946492433548, "rewards/curriculum_aware_reward_fn/std": 0.2408204972743988, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1651785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4051.0, "completions/mean_length": 1681.2679443359375, "completions/mean_terminated_length": 1203.4866943359375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.35894791129448167, "grad_norm": 0.25105348229408264, "kl": 0.0233154296875, "learning_rate": 1e-06, "loss": 0.1011, "num_tokens": 113121753.0, "reward": 1.1584821939468384, "reward_std": 0.22147175669670105, "rewards/code_format_reward/mean": 0.8348214030265808, "rewards/code_format_reward/std": 0.37175676226615906, "rewards/curriculum_aware_reward_fn/mean": 0.3236607015132904, "rewards/curriculum_aware_reward_fn/std": 0.2271760106086731, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2098214285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4063.0, "completions/mean_length": 1831.419677734375, "completions/mean_terminated_length": 1230.0904541015625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.3630737493553378, "grad_norm": 0.20898748934268951, "kl": 0.0230712890625, "learning_rate": 1e-06, "loss": 0.1039, "num_tokens": 114218771.0, "reward": 1.078125238418579, "reward_std": 0.25099310278892517, "rewards/code_format_reward/mean": 0.7834821343421936, "rewards/code_format_reward/std": 0.41233164072036743, "rewards/curriculum_aware_reward_fn/mean": 0.2946428656578064, "rewards/curriculum_aware_reward_fn/std": 0.23402969539165497, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2276785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4030.0, "completions/mean_length": 1856.5224609375, "completions/mean_terminated_length": 1196.3294677734375, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.3671995874161939, "grad_norm": 0.35406267642974854, "kl": 0.03717041015625, "learning_rate": 1e-06, "loss": 0.1187, "num_tokens": 115333209.0, "reward": 1.0039063692092896, "reward_std": 0.24784553050994873, "rewards/code_format_reward/mean": 0.7723214030265808, "rewards/code_format_reward/std": 0.41980284452438354, "rewards/curriculum_aware_reward_fn/mean": 0.2315848171710968, "rewards/curriculum_aware_reward_fn/std": 0.23842592537403107, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1919642857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4077.0, "completions/mean_length": 1926.24560546875, "completions/mean_terminated_length": 1410.779052734375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.37132542547705005, "grad_norm": 1.0097814798355103, "kl": 0.144134521484375, "learning_rate": 1e-06, "loss": 0.1606, "num_tokens": 116476524.0, "reward": 1.1093751192092896, "reward_std": 0.3380582928657532, "rewards/code_format_reward/mean": 0.8013392686843872, "rewards/code_format_reward/std": 0.3994380831718445, "rewards/curriculum_aware_reward_fn/mean": 0.3080357015132904, "rewards/curriculum_aware_reward_fn/std": 0.23287460207939148, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1986607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4072.0, "completions/mean_length": 1777.1429443359375, "completions/mean_terminated_length": 1202.27294921875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.37545126353790614, "grad_norm": 0.22592586278915405, "kl": 0.0236053466796875, "learning_rate": 1e-06, "loss": 0.1169, "num_tokens": 117551676.0, "reward": 1.088169813156128, "reward_std": 0.22282269597053528, "rewards/code_format_reward/mean": 0.8013392686843872, "rewards/code_format_reward/std": 0.3994380831718445, "rewards/curriculum_aware_reward_fn/mean": 0.2868303656578064, "rewards/curriculum_aware_reward_fn/std": 0.23479391634464264, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2433035714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4024.0, "completions/mean_length": 2006.97998046875, "completions/mean_terminated_length": 1335.2890625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.37957710159876223, "grad_norm": 3.0563249588012695, "kl": 0.4225006103515625, "learning_rate": 1e-06, "loss": 0.1485, "num_tokens": 118710307.0, "reward": 1.0223215818405151, "reward_std": 0.30270916223526, "rewards/code_format_reward/mean": 0.7544642686843872, "rewards/code_format_reward/std": 0.43088552355766296, "rewards/curriculum_aware_reward_fn/mean": 0.2678571343421936, "rewards/curriculum_aware_reward_fn/std": 0.272662490606308, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1629464285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 1677.888427734375, "completions/mean_terminated_length": 1207.16259765625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.3837029396596184, "grad_norm": 0.24719642102718353, "kl": 0.0235595703125, "learning_rate": 1e-06, "loss": 0.1163, "num_tokens": 119736282.0, "reward": 1.1618304252624512, "reward_std": 0.2898777723312378, "rewards/code_format_reward/mean": 0.8370535969734192, "rewards/code_format_reward/std": 0.3697296679019928, "rewards/curriculum_aware_reward_fn/mean": 0.3247767984867096, "rewards/curriculum_aware_reward_fn/std": 0.25025704503059387, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1830357142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4052.0, "completions/mean_length": 1777.868408203125, "completions/mean_terminated_length": 1258.50537109375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.38782877772047447, "grad_norm": 0.2288348227739334, "kl": 0.0231170654296875, "learning_rate": 1e-06, "loss": 0.132, "num_tokens": 120799131.0, "reward": 1.1277903318405151, "reward_std": 0.2564563453197479, "rewards/code_format_reward/mean": 0.8169642686843872, "rewards/code_format_reward/std": 0.387128084897995, "rewards/curriculum_aware_reward_fn/mean": 0.3108258843421936, "rewards/curriculum_aware_reward_fn/std": 0.23781250417232513, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1763392857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4060.0, "completions/mean_length": 1826.2232666015625, "completions/mean_terminated_length": 1340.2818603515625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.39195461578133056, "grad_norm": 0.21234188973903656, "kl": 0.02154541015625, "learning_rate": 1e-06, "loss": 0.1079, "num_tokens": 121893849.0, "reward": 1.1668527126312256, "reward_std": 0.28739964962005615, "rewards/code_format_reward/mean": 0.8191964030265808, "rewards/code_format_reward/std": 0.3852855861186981, "rewards/curriculum_aware_reward_fn/mean": 0.34765625, "rewards/curriculum_aware_reward_fn/std": 0.23310969769954681, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1473214285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4078.0, "completions/mean_length": 1696.6451416015625, "completions/mean_terminated_length": 1282.096923828125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.3960804538421867, "grad_norm": 0.45756828784942627, "kl": 0.0218353271484375, "learning_rate": 1e-06, "loss": 0.0609, "num_tokens": 122926001.0, "reward": 1.1813616752624512, "reward_std": 0.2528018653392792, "rewards/code_format_reward/mean": 0.8504464030265808, "rewards/code_format_reward/std": 0.3570319712162018, "rewards/curriculum_aware_reward_fn/mean": 0.3309151828289032, "rewards/curriculum_aware_reward_fn/std": 0.23052476346492767, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1830357142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 1778.212158203125, "completions/mean_terminated_length": 1258.9261474609375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.4002062919030428, "grad_norm": 0.22586114704608917, "kl": 0.020263671875, "learning_rate": 1e-06, "loss": 0.0852, "num_tokens": 123989487.0, "reward": 1.141741156578064, "reward_std": 0.23073901236057281, "rewards/code_format_reward/mean": 0.8191964030265808, "rewards/code_format_reward/std": 0.38528555631637573, "rewards/curriculum_aware_reward_fn/mean": 0.3225446343421936, "rewards/curriculum_aware_reward_fn/std": 0.24527889490127563, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4069.0, "completions/mean_length": 1997.7857666015625, "completions/mean_terminated_length": 1355.4752197265625, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.4043321299638989, "grad_norm": 0.22004811465740204, "kl": 0.0184173583984375, "learning_rate": 1e-06, "loss": 0.1259, "num_tokens": 125161037.0, "reward": 1.224888563156128, "reward_std": 0.3483339548110962, "rewards/code_format_reward/mean": 0.7700892686843872, "rewards/code_format_reward/std": 0.42124560475349426, "rewards/curriculum_aware_reward_fn/mean": 0.4547991156578064, "rewards/curriculum_aware_reward_fn/std": 0.36105790734291077, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1808035714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4080.0, "completions/mean_length": 1730.10498046875, "completions/mean_terminated_length": 1207.931884765625, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.40845796802475504, "grad_norm": 0.21898438036441803, "kl": 0.020751953125, "learning_rate": 1e-06, "loss": 0.1138, "num_tokens": 126207206.0, "reward": 1.2566964626312256, "reward_std": 0.27331340312957764, "rewards/code_format_reward/mean": 0.8191964030265808, "rewards/code_format_reward/std": 0.38528555631637573, "rewards/curriculum_aware_reward_fn/mean": 0.4375, "rewards/curriculum_aware_reward_fn/std": 0.34957626461982727, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3102678571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 2256.05810546875, "completions/mean_terminated_length": 1428.3819580078125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.4125838060856111, "grad_norm": 0.20322370529174805, "kl": 0.0179290771484375, "learning_rate": 1e-06, "loss": 0.1487, "num_tokens": 127483409.0, "reward": 1.0820313692092896, "reward_std": 0.3708450198173523, "rewards/code_format_reward/mean": 0.6830357313156128, "rewards/code_format_reward/std": 0.4658135175704956, "rewards/curriculum_aware_reward_fn/mean": 0.3989955484867096, "rewards/curriculum_aware_reward_fn/std": 0.35824280977249146, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2120535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 1926.0068359375, "completions/mean_terminated_length": 1342.01416015625, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.4167096441464673, "grad_norm": 0.2344144731760025, "kl": 0.0195465087890625, "learning_rate": 1e-06, "loss": 0.1115, "num_tokens": 128630124.0, "reward": 1.2204241752624512, "reward_std": 0.294097900390625, "rewards/code_format_reward/mean": 0.7924107313156128, "rewards/code_format_reward/std": 0.4060344398021698, "rewards/curriculum_aware_reward_fn/mean": 0.4280133843421936, "rewards/curriculum_aware_reward_fn/std": 0.3451191484928131, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3926.0, "completions/mean_length": 2156.75, "completions/mean_terminated_length": 1397.9130859375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.42083548220732336, "grad_norm": 0.45799532532691956, "kl": 0.019683837890625, "learning_rate": 1e-06, "loss": 0.1155, "num_tokens": 129882550.0, "reward": 1.1283483505249023, "reward_std": 0.3579336702823639, "rewards/code_format_reward/mean": 0.7254464030265808, "rewards/code_format_reward/std": 0.44678795337677, "rewards/curriculum_aware_reward_fn/mean": 0.4029017984867096, "rewards/curriculum_aware_reward_fn/std": 0.35482051968574524, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2254464285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4052.0, "completions/mean_length": 1941.88623046875, "completions/mean_terminated_length": 1314.896240234375, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.42496132026817945, "grad_norm": 0.5113571882247925, "kl": 0.0253143310546875, "learning_rate": 1e-06, "loss": 0.1012, "num_tokens": 131033775.0, "reward": 1.165178656578064, "reward_std": 0.2878580689430237, "rewards/code_format_reward/mean": 0.7790178656578064, "rewards/code_format_reward/std": 0.4153723120689392, "rewards/curriculum_aware_reward_fn/mean": 0.3861607015132904, "rewards/curriculum_aware_reward_fn/std": 0.3501971364021301, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2633928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4071.0, "completions/mean_length": 2113.2412109375, "completions/mean_terminated_length": 1404.2545166015625, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.4290871583290356, "grad_norm": 0.479640394449234, "kl": 0.018890380859375, "learning_rate": 1e-06, "loss": 0.108, "num_tokens": 132237358.0, "reward": 1.1512277126312256, "reward_std": 0.3316226005554199, "rewards/code_format_reward/mean": 0.7410714030265808, "rewards/code_format_reward/std": 0.43853598833084106, "rewards/curriculum_aware_reward_fn/mean": 0.41015625, "rewards/curriculum_aware_reward_fn/std": 0.33801397681236267, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1339285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4054.0, "completions/mean_length": 1529.060302734375, "completions/mean_terminated_length": 1132.11083984375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.4332129963898917, "grad_norm": 0.24146206676959991, "kl": 0.02374267578125, "learning_rate": 1e-06, "loss": 0.1513, "num_tokens": 133183323.0, "reward": 1.360491156578064, "reward_std": 0.3237505853176117, "rewards/code_format_reward/mean": 0.8683035969734192, "rewards/code_format_reward/std": 0.3385384678840637, "rewards/curriculum_aware_reward_fn/mean": 0.4921875, "rewards/curriculum_aware_reward_fn/std": 0.338927298784256, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1852678571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4075.0, "completions/mean_length": 1833.837158203125, "completions/mean_terminated_length": 1319.427490234375, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.4373388344507478, "grad_norm": 0.2235928177833557, "kl": 0.0216064453125, "learning_rate": 1e-06, "loss": 0.1185, "num_tokens": 134279772.0, "reward": 1.262834906578064, "reward_std": 0.3254813253879547, "rewards/code_format_reward/mean": 0.8191964030265808, "rewards/code_format_reward/std": 0.38528555631637573, "rewards/curriculum_aware_reward_fn/mean": 0.4436383843421936, "rewards/curriculum_aware_reward_fn/std": 0.34397777915000916, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1741071428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4020.0, "completions/mean_length": 1804.2054443359375, "completions/mean_terminated_length": 1321.0703125, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.44146467251160393, "grad_norm": 0.23870478570461273, "kl": 0.0242156982421875, "learning_rate": 1e-06, "loss": 0.0756, "num_tokens": 135341757.0, "reward": 1.2946429252624512, "reward_std": 0.28161728382110596, "rewards/code_format_reward/mean": 0.8325892686843872, "rewards/code_format_reward/std": 0.37375950813293457, "rewards/curriculum_aware_reward_fn/mean": 0.4620535671710968, "rewards/curriculum_aware_reward_fn/std": 0.34223541617393494, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4049.0, "completions/mean_length": 1701.419677734375, "completions/mean_terminated_length": 1257.978759765625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.44559051057246, "grad_norm": 0.20999369025230408, "kl": 0.0244903564453125, "learning_rate": 1e-06, "loss": 0.092, "num_tokens": 136372961.0, "reward": 1.234375, "reward_std": 0.27727842330932617, "rewards/code_format_reward/mean": 0.84375, "rewards/code_format_reward/std": 0.36349809169769287, "rewards/curriculum_aware_reward_fn/mean": 0.390625, "rewards/curriculum_aware_reward_fn/std": 0.35792598128318787, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2053571428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 1929.8013916015625, "completions/mean_terminated_length": 1369.9971923828125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.44971634863331617, "grad_norm": 0.2564976215362549, "kl": 0.0235595703125, "learning_rate": 1e-06, "loss": 0.097, "num_tokens": 137505745.0, "reward": 1.2589287757873535, "reward_std": 0.31317228078842163, "rewards/code_format_reward/mean": 0.7857142686843872, "rewards/code_format_reward/std": 0.41078460216522217, "rewards/curriculum_aware_reward_fn/mean": 0.4732142984867096, "rewards/curriculum_aware_reward_fn/std": 0.36577144265174866, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1651785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4014.0, "completions/mean_length": 1801.02685546875, "completions/mean_terminated_length": 1346.941162109375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.45384218669417226, "grad_norm": 0.22446659207344055, "kl": 0.024993896484375, "learning_rate": 1e-06, "loss": 0.1049, "num_tokens": 138585493.0, "reward": 1.2371653318405151, "reward_std": 0.3297220468521118, "rewards/code_format_reward/mean": 0.8348214030265808, "rewards/code_format_reward/std": 0.37175676226615906, "rewards/curriculum_aware_reward_fn/mean": 0.40234375, "rewards/curriculum_aware_reward_fn/std": 0.3358352482318878, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0915178571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 3929.0, "completions/mean_length": 1361.3170166015625, "completions/mean_terminated_length": 1085.8328857421875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.45796802475502835, "grad_norm": 0.2520643174648285, "kl": 0.0305328369140625, "learning_rate": 1e-06, "loss": 0.1094, "num_tokens": 139441679.0, "reward": 1.4743304252624512, "reward_std": 0.2411898672580719, "rewards/code_format_reward/mean": 0.9084821343421936, "rewards/code_format_reward/std": 0.2886664867401123, "rewards/curriculum_aware_reward_fn/mean": 0.5658482313156128, "rewards/curriculum_aware_reward_fn/std": 0.3099125921726227, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0848214285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4077.0, "completions/mean_length": 1464.3460693359375, "completions/mean_terminated_length": 1220.4366455078125, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.4620938628158845, "grad_norm": 0.4099730849266052, "kl": 0.0286865234375, "learning_rate": 1e-06, "loss": 0.075, "num_tokens": 140380443.0, "reward": 1.4079242944717407, "reward_std": 0.2221948206424713, "rewards/code_format_reward/mean": 0.9151785969734192, "rewards/code_format_reward/std": 0.2789272665977478, "rewards/curriculum_aware_reward_fn/mean": 0.4927455484867096, "rewards/curriculum_aware_reward_fn/std": 0.3345801830291748, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1785714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 3933.0, "completions/mean_length": 1821.825927734375, "completions/mean_terminated_length": 1327.440185546875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.4662197008767406, "grad_norm": 0.23430632054805756, "kl": 0.02587890625, "learning_rate": 1e-06, "loss": 0.1224, "num_tokens": 141482648.0, "reward": 1.224888563156128, "reward_std": 0.342571884393692, "rewards/code_format_reward/mean": 0.8258928656578064, "rewards/code_format_reward/std": 0.37962549924850464, "rewards/curriculum_aware_reward_fn/mean": 0.3989955484867096, "rewards/curriculum_aware_reward_fn/std": 0.3348412811756134, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1897321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 1756.6474609375, "completions/mean_terminated_length": 1208.864990234375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.4703455389375967, "grad_norm": 0.22820448875427246, "kl": 0.026947021484375, "learning_rate": 1e-06, "loss": 0.1406, "num_tokens": 142522683.0, "reward": 1.2661831378936768, "reward_std": 0.3532147705554962, "rewards/code_format_reward/mean": 0.8147321343421936, "rewards/code_format_reward/std": 0.38894903659820557, "rewards/curriculum_aware_reward_fn/mean": 0.4514508843421936, "rewards/curriculum_aware_reward_fn/std": 0.34557464718818665, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0691964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4070.0, "completions/mean_length": 1491.571533203125, "completions/mean_terminated_length": 1297.9569091796875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.4744713769984528, "grad_norm": 0.8920139670372009, "kl": 0.041351318359375, "learning_rate": 1e-06, "loss": 0.1264, "num_tokens": 143453204.0, "reward": 1.4402902126312256, "reward_std": 0.28800126910209656, "rewards/code_format_reward/mean": 0.9330357313156128, "rewards/code_format_reward/std": 0.2502395808696747, "rewards/curriculum_aware_reward_fn/mean": 0.5072544813156128, "rewards/curriculum_aware_reward_fn/std": 0.34970080852508545, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4044.0, "completions/mean_length": 1787.3349609375, "completions/mean_terminated_length": 1374.205322265625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.4785972150593089, "grad_norm": 0.24828733503818512, "kl": 0.028778076171875, "learning_rate": 1e-06, "loss": 0.1543, "num_tokens": 144520238.0, "reward": 1.2583706378936768, "reward_std": 0.3413547873497009, "rewards/code_format_reward/mean": 0.8504464030265808, "rewards/code_format_reward/std": 0.3570319712162018, "rewards/curriculum_aware_reward_fn/mean": 0.4079241156578064, "rewards/curriculum_aware_reward_fn/std": 0.3402997851371765, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0848214285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 3916.0, "completions/mean_length": 1429.7835693359375, "completions/mean_terminated_length": 1182.6707763671875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.48272305312016506, "grad_norm": 0.28653624653816223, "kl": 0.0347900390625, "learning_rate": 1e-06, "loss": 0.1143, "num_tokens": 145440627.0, "reward": 1.4001116752624512, "reward_std": 0.2624467611312866, "rewards/code_format_reward/mean": 0.9129464030265808, "rewards/code_format_reward/std": 0.2822287082672119, "rewards/curriculum_aware_reward_fn/mean": 0.4871651828289032, "rewards/curriculum_aware_reward_fn/std": 0.3331555426120758, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 1476.118408203125, "completions/mean_terminated_length": 1205.0960693359375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.48684889118102115, "grad_norm": 0.33151334524154663, "kl": 0.0331573486328125, "learning_rate": 1e-06, "loss": 0.1138, "num_tokens": 146374004.0, "reward": 1.4040180444717407, "reward_std": 0.26717686653137207, "rewards/code_format_reward/mean": 0.90625, "rewards/code_format_reward/std": 0.2918064594268799, "rewards/curriculum_aware_reward_fn/mean": 0.4977678656578064, "rewards/curriculum_aware_reward_fn/std": 0.3229573965072632, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1049107142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4081.0, "completions/mean_length": 1542.76123046875, "completions/mean_terminated_length": 1243.5037841796875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.49097472924187724, "grad_norm": 0.4114866256713867, "kl": 0.032562255859375, "learning_rate": 1e-06, "loss": 0.1222, "num_tokens": 147338469.0, "reward": 1.3861607313156128, "reward_std": 0.28715091943740845, "rewards/code_format_reward/mean": 0.8995535969734192, "rewards/code_format_reward/std": 0.30093035101890564, "rewards/curriculum_aware_reward_fn/mean": 0.4866071343421936, "rewards/curriculum_aware_reward_fn/std": 0.40665504336357117, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0959821428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4028.0, "completions/mean_length": 1582.04248046875, "completions/mean_terminated_length": 1315.12841796875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.4951005673027334, "grad_norm": 0.2522932291030884, "kl": 0.031341552734375, "learning_rate": 1e-06, "loss": 0.1164, "num_tokens": 148310697.0, "reward": 1.3638395071029663, "reward_std": 0.28626155853271484, "rewards/code_format_reward/mean": 0.9040178656578064, "rewards/code_format_reward/std": 0.29489606618881226, "rewards/curriculum_aware_reward_fn/mean": 0.4598214328289032, "rewards/curriculum_aware_reward_fn/std": 0.32948601245880127, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0848214285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4031.0, "completions/mean_length": 1414.5848388671875, "completions/mean_terminated_length": 1166.0634765625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.4992264053635895, "grad_norm": 0.24609112739562988, "kl": 0.0311431884765625, "learning_rate": 1e-06, "loss": 0.1099, "num_tokens": 149212419.0, "reward": 1.393973469734192, "reward_std": 0.24073131382465363, "rewards/code_format_reward/mean": 0.9084821343421936, "rewards/code_format_reward/std": 0.2886664867401123, "rewards/curriculum_aware_reward_fn/mean": 0.4854910671710968, "rewards/curriculum_aware_reward_fn/std": 0.34280943870544434, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0669642857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 1362.212158203125, "completions/mean_terminated_length": 1166.007080078125, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.5033522434244456, "grad_norm": 0.2834206521511078, "kl": 0.0321807861328125, "learning_rate": 1e-06, "loss": 0.0867, "num_tokens": 150089439.0, "reward": 1.4436384439468384, "reward_std": 0.2840638756752014, "rewards/code_format_reward/mean": 0.9330357313156128, "rewards/code_format_reward/std": 0.2502395808696747, "rewards/curriculum_aware_reward_fn/mean": 0.5106026530265808, "rewards/curriculum_aware_reward_fn/std": 0.3281605541706085, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0870535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4067.0, "completions/mean_length": 1611.4398193359375, "completions/mean_terminated_length": 1374.5257568359375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.5074780814853017, "grad_norm": 0.2541446387767792, "kl": 0.0277862548828125, "learning_rate": 1e-06, "loss": 0.0649, "num_tokens": 151093905.0, "reward": 1.3560268878936768, "reward_std": 0.2548825442790985, "rewards/code_format_reward/mean": 0.9174107313156128, "rewards/code_format_reward/std": 0.2755681276321411, "rewards/curriculum_aware_reward_fn/mean": 0.4386160671710968, "rewards/curriculum_aware_reward_fn/std": 0.34127652645111084, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4023.0, "completions/mean_length": 1669.732177734375, "completions/mean_terminated_length": 1272.7064208984375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.5116039195461578, "grad_norm": 0.27325934171676636, "kl": 0.0275115966796875, "learning_rate": 1e-06, "loss": 0.1469, "num_tokens": 152098880.0, "reward": 1.2678571939468384, "reward_std": 0.3051081597805023, "rewards/code_format_reward/mean": 0.8638392686843872, "rewards/code_format_reward/std": 0.34334254264831543, "rewards/curriculum_aware_reward_fn/mean": 0.4040178656578064, "rewards/curriculum_aware_reward_fn/std": 0.34514129161834717, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1160714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4048.0, "completions/mean_length": 1610.993408203125, "completions/mean_terminated_length": 1284.6793212890625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.5157297576070139, "grad_norm": 0.3332519829273224, "kl": 0.0283203125, "learning_rate": 1e-06, "loss": 0.0851, "num_tokens": 153083931.0, "reward": 1.2901787757873535, "reward_std": 0.25318700075149536, "rewards/code_format_reward/mean": 0.8883928656578064, "rewards/code_format_reward/std": 0.315234512090683, "rewards/curriculum_aware_reward_fn/mean": 0.4017857015132904, "rewards/curriculum_aware_reward_fn/std": 0.35174086689949036, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1316964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4058.0, "completions/mean_length": 1601.6898193359375, "completions/mean_terminated_length": 1223.3753662109375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.51985559566787, "grad_norm": 0.24008211493492126, "kl": 0.027740478515625, "learning_rate": 1e-06, "loss": 0.117, "num_tokens": 154061741.0, "reward": 1.3459821939468384, "reward_std": 0.28992971777915955, "rewards/code_format_reward/mean": 0.8683035969734192, "rewards/code_format_reward/std": 0.3385384678840637, "rewards/curriculum_aware_reward_fn/mean": 0.4776785671710968, "rewards/curriculum_aware_reward_fn/std": 0.34034061431884766, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0758928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4059.0, "completions/mean_length": 1450.169677734375, "completions/mean_terminated_length": 1232.879150390625, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.5239814337287262, "grad_norm": 4.1945061683654785, "kl": 0.1844940185546875, "learning_rate": 1e-06, "loss": 0.0982, "num_tokens": 154985663.0, "reward": 1.3465402126312256, "reward_std": 0.26785099506378174, "rewards/code_format_reward/mean": 0.921875, "rewards/code_format_reward/std": 0.26866820454597473, "rewards/curriculum_aware_reward_fn/mean": 0.4246651828289032, "rewards/curriculum_aware_reward_fn/std": 0.3567657172679901, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4072.0, "completions/mean_length": 1795.9888916015625, "completions/mean_terminated_length": 1467.415771484375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.5281072717895823, "grad_norm": 0.22446002066135406, "kl": 0.02386474609375, "learning_rate": 1e-06, "loss": 0.0603, "num_tokens": 156057771.0, "reward": 1.333147406578064, "reward_std": 0.2784731984138489, "rewards/code_format_reward/mean": 0.875, "rewards/code_format_reward/std": 0.3310886323451996, "rewards/curriculum_aware_reward_fn/mean": 0.4581473171710968, "rewards/curriculum_aware_reward_fn/std": 0.3415740728378296, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 1770.040283203125, "completions/mean_terminated_length": 1339.306884765625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.5322331098504384, "grad_norm": 5.54105806350708, "kl": 0.023284912109375, "learning_rate": 1e-06, "loss": 0.1057, "num_tokens": 157149544.0, "reward": 1.28125, "reward_std": 0.3118419051170349, "rewards/code_format_reward/mean": 0.84375, "rewards/code_format_reward/std": 0.36349809169769287, "rewards/curriculum_aware_reward_fn/mean": 0.4375, "rewards/curriculum_aware_reward_fn/std": 0.3499760329723358, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1183035714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 1637.0023193359375, "completions/mean_terminated_length": 1307.060791015625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.5363589479112945, "grad_norm": 0.2837858498096466, "kl": 0.024078369140625, "learning_rate": 1e-06, "loss": 0.0795, "num_tokens": 158150433.0, "reward": 1.3621653318405151, "reward_std": 0.276816189289093, "rewards/code_format_reward/mean": 0.8772321343421936, "rewards/code_format_reward/std": 0.3285374045372009, "rewards/curriculum_aware_reward_fn/mean": 0.4849330484867096, "rewards/curriculum_aware_reward_fn/std": 0.3542196750640869, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4060.0, "completions/mean_length": 1794.009033203125, "completions/mean_terminated_length": 1353.2020263671875, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.5404847859721505, "grad_norm": 1.386220097541809, "kl": 0.04083251953125, "learning_rate": 1e-06, "loss": 0.0766, "num_tokens": 159232966.0, "reward": 1.2879464626312256, "reward_std": 0.2730337083339691, "rewards/code_format_reward/mean": 0.8415178656578064, "rewards/code_format_reward/std": 0.36560073494911194, "rewards/curriculum_aware_reward_fn/mean": 0.4464285671710968, "rewards/curriculum_aware_reward_fn/std": 0.34136614203453064, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4070.0, "completions/mean_length": 1691.6251220703125, "completions/mean_terminated_length": 1298.1817626953125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.5446106240330068, "grad_norm": 0.2902727425098419, "kl": 0.0252227783203125, "learning_rate": 1e-06, "loss": 0.1115, "num_tokens": 160274181.0, "reward": 1.3169643878936768, "reward_std": 0.3020443320274353, "rewards/code_format_reward/mean": 0.8571428656578064, "rewards/code_format_reward/std": 0.3503182828426361, "rewards/curriculum_aware_reward_fn/mean": 0.4598214328289032, "rewards/curriculum_aware_reward_fn/std": 0.3399295210838318, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4062.0, "completions/mean_length": 1716.4910888671875, "completions/mean_terminated_length": 1327.1168212890625, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.5487364620938628, "grad_norm": 0.23055890202522278, "kl": 0.0243072509765625, "learning_rate": 1e-06, "loss": 0.0755, "num_tokens": 161322504.0, "reward": 1.2466518878936768, "reward_std": 0.2668859362602234, "rewards/code_format_reward/mean": 0.8616071343421936, "rewards/code_format_reward/std": 0.34569787979125977, "rewards/curriculum_aware_reward_fn/mean": 0.3850446343421936, "rewards/curriculum_aware_reward_fn/std": 0.34822914004325867, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0848214285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 3985.0, "completions/mean_length": 1515.243408203125, "completions/mean_terminated_length": 1276.05126953125, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.5528623001547189, "grad_norm": 0.25876471400260925, "kl": 0.02783203125, "learning_rate": 1e-06, "loss": 0.0942, "num_tokens": 162272709.0, "reward": 1.3922991752624512, "reward_std": 0.2758462131023407, "rewards/code_format_reward/mean": 0.9151785969734192, "rewards/code_format_reward/std": 0.2789272665977478, "rewards/curriculum_aware_reward_fn/mean": 0.4771205484867096, "rewards/curriculum_aware_reward_fn/std": 0.32753223180770874, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0669642857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4039.0, "completions/mean_length": 1554.3660888671875, "completions/mean_terminated_length": 1371.9521484375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.556988138215575, "grad_norm": 0.24866972863674164, "kl": 0.0265655517578125, "learning_rate": 1e-06, "loss": 0.0835, "num_tokens": 163253559.0, "reward": 1.4202009439468384, "reward_std": 0.2786424458026886, "rewards/code_format_reward/mean": 0.9308035969734192, "rewards/code_format_reward/std": 0.25407159328460693, "rewards/curriculum_aware_reward_fn/mean": 0.4893973171710968, "rewards/curriculum_aware_reward_fn/std": 0.3427489101886749, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0669642857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4051.0, "completions/mean_length": 1529.4420166015625, "completions/mean_terminated_length": 1345.2391357421875, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.5611139762764311, "grad_norm": 0.23123760521411896, "kl": 0.0279998779296875, "learning_rate": 1e-06, "loss": 0.0744, "num_tokens": 164201438.0, "reward": 1.4324777126312256, "reward_std": 0.21654264628887177, "rewards/code_format_reward/mean": 0.9330357313156128, "rewards/code_format_reward/std": 0.2502395808696747, "rewards/curriculum_aware_reward_fn/mean": 0.4994419515132904, "rewards/curriculum_aware_reward_fn/std": 0.3218805193901062, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0848214285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4046.0, "completions/mean_length": 1427.821533203125, "completions/mean_terminated_length": 1180.52685546875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.5652398143372873, "grad_norm": 0.227939635515213, "kl": 0.028717041015625, "learning_rate": 1e-06, "loss": 0.0957, "num_tokens": 165107452.0, "reward": 1.340959906578064, "reward_std": 0.22032277286052704, "rewards/code_format_reward/mean": 0.9040178656578064, "rewards/code_format_reward/std": 0.29489606618881226, "rewards/curriculum_aware_reward_fn/mean": 0.4369419515132904, "rewards/curriculum_aware_reward_fn/std": 0.35758358240127563, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0736607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4040.0, "completions/mean_length": 1452.040283203125, "completions/mean_terminated_length": 1241.797607421875, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.5693656523981434, "grad_norm": 0.29260191321372986, "kl": 0.031219482421875, "learning_rate": 1e-06, "loss": 0.0444, "num_tokens": 166034739.0, "reward": 1.4441965818405151, "reward_std": 0.2309379279613495, "rewards/code_format_reward/mean": 0.9285714030265808, "rewards/code_format_reward/std": 0.2578272819519043, "rewards/curriculum_aware_reward_fn/mean": 0.515625, "rewards/curriculum_aware_reward_fn/std": 0.3484244644641876, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0669642857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 3985.0, "completions/mean_length": 1325.2679443359375, "completions/mean_terminated_length": 1126.4114990234375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.5734914904589995, "grad_norm": 0.3624438941478729, "kl": 0.0299072265625, "learning_rate": 1e-06, "loss": 0.0855, "num_tokens": 166880102.0, "reward": 1.4614956378936768, "reward_std": 0.23535968363285065, "rewards/code_format_reward/mean": 0.9330357313156128, "rewards/code_format_reward/std": 0.2502395808696747, "rewards/curriculum_aware_reward_fn/mean": 0.5284598469734192, "rewards/curriculum_aware_reward_fn/std": 0.3171095550060272, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0580357142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 1532.94873046875, "completions/mean_terminated_length": 1375.03564453125, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.5776173285198556, "grad_norm": 0.39535659551620483, "kl": 0.0291595458984375, "learning_rate": 1e-06, "loss": 0.0379, "num_tokens": 167843111.0, "reward": 1.4324778318405151, "reward_std": 0.27631857991218567, "rewards/code_format_reward/mean": 0.9419642686843872, "rewards/code_format_reward/std": 0.23407234251499176, "rewards/curriculum_aware_reward_fn/mean": 0.4905133843421936, "rewards/curriculum_aware_reward_fn/std": 0.3332679271697998, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0892857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4074.0, "completions/mean_length": 1409.63623046875, "completions/mean_terminated_length": 1146.2672119140625, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.5817431665807117, "grad_norm": 0.2379978448152542, "kl": 0.031463623046875, "learning_rate": 1e-06, "loss": 0.0652, "num_tokens": 168746200.0, "reward": 1.4441964626312256, "reward_std": 0.20717236399650574, "rewards/code_format_reward/mean": 0.9107142686843872, "rewards/code_format_reward/std": 0.2854744791984558, "rewards/curriculum_aware_reward_fn/mean": 0.5334821343421936, "rewards/curriculum_aware_reward_fn/std": 0.3276851177215576, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1160714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4009.0, "completions/mean_length": 1701.87060546875, "completions/mean_terminated_length": 1387.4898681640625, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.5858690046415678, "grad_norm": 0.2616744935512543, "kl": 0.028411865234375, "learning_rate": 1e-06, "loss": 0.0809, "num_tokens": 169799194.0, "reward": 1.3063616752624512, "reward_std": 0.25762638449668884, "rewards/code_format_reward/mean": 0.8816964030265808, "rewards/code_format_reward/std": 0.32332828640937805, "rewards/curriculum_aware_reward_fn/mean": 0.4246651828289032, "rewards/curriculum_aware_reward_fn/std": 0.3622109293937683, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1361607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4074.0, "completions/mean_length": 1853.5513916015625, "completions/mean_terminated_length": 1500.0904541015625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.589994842702424, "grad_norm": 0.2574971318244934, "kl": 0.0262451171875, "learning_rate": 1e-06, "loss": 0.1291, "num_tokens": 170911874.0, "reward": 1.2963171005249023, "reward_std": 0.34903791546821594, "rewards/code_format_reward/mean": 0.8683035969734192, "rewards/code_format_reward/std": 0.3385384678840637, "rewards/curriculum_aware_reward_fn/mean": 0.4280133843421936, "rewards/curriculum_aware_reward_fn/std": 0.3463324308395386, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1160714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4065.0, "completions/mean_length": 1672.60498046875, "completions/mean_terminated_length": 1354.38134765625, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.5941206807632801, "grad_norm": 0.2555321156978607, "kl": 0.025360107421875, "learning_rate": 1e-06, "loss": 0.0667, "num_tokens": 171921125.0, "reward": 1.3867188692092896, "reward_std": 0.2532961368560791, "rewards/code_format_reward/mean": 0.8861607313156128, "rewards/code_format_reward/std": 0.31797102093696594, "rewards/curriculum_aware_reward_fn/mean": 0.5005580186843872, "rewards/curriculum_aware_reward_fn/std": 0.32447636127471924, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1316964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 1727.0379638671875, "completions/mean_terminated_length": 1367.7352294921875, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.5982465188241362, "grad_norm": 0.23264895379543304, "kl": 0.0254364013671875, "learning_rate": 1e-06, "loss": 0.1189, "num_tokens": 172965073.0, "reward": 1.329241156578064, "reward_std": 0.2897050380706787, "rewards/code_format_reward/mean": 0.8705357313156128, "rewards/code_format_reward/std": 0.3360883891582489, "rewards/curriculum_aware_reward_fn/mean": 0.4587053656578064, "rewards/curriculum_aware_reward_fn/std": 0.33938372135162354, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1316964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4082.0, "completions/mean_length": 1562.3460693359375, "completions/mean_terminated_length": 1178.064208984375, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.6023723568849922, "grad_norm": 0.24680227041244507, "kl": 0.026519775390625, "learning_rate": 1e-06, "loss": 0.0752, "num_tokens": 173921568.0, "reward": 1.3906251192092896, "reward_std": 0.23298880457878113, "rewards/code_format_reward/mean": 0.8727678656578064, "rewards/code_format_reward/std": 0.3336053788661957, "rewards/curriculum_aware_reward_fn/mean": 0.5178571343421936, "rewards/curriculum_aware_reward_fn/std": 0.34305882453918457, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1116071428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 3911.0, "completions/mean_length": 1621.810302734375, "completions/mean_terminated_length": 1310.982421875, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.6064981949458483, "grad_norm": 0.23276476562023163, "kl": 0.023681640625, "learning_rate": 1e-06, "loss": 0.114, "num_tokens": 174914776.0, "reward": 1.5853794813156128, "reward_std": 0.34051597118377686, "rewards/code_format_reward/mean": 0.8861607313156128, "rewards/code_format_reward/std": 0.31797102093696594, "rewards/curriculum_aware_reward_fn/mean": 0.69921875, "rewards/curriculum_aware_reward_fn/std": 0.44566574692726135, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2008928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4075.0, "completions/mean_length": 2018.6898193359375, "completions/mean_terminated_length": 1496.4608154296875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.6106240330067045, "grad_norm": 0.22757108509540558, "kl": 0.0224456787109375, "learning_rate": 1e-06, "loss": 0.1062, "num_tokens": 176091589.0, "reward": 1.3331475257873535, "reward_std": 0.38912174105644226, "rewards/code_format_reward/mean": 0.8013392686843872, "rewards/code_format_reward/std": 0.3994380831718445, "rewards/curriculum_aware_reward_fn/mean": 0.5318080186843872, "rewards/curriculum_aware_reward_fn/std": 0.44960641860961914, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2433035714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 2072.685302734375, "completions/mean_terminated_length": 1422.1209716796875, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.6147498710675606, "grad_norm": 0.21814404428005219, "kl": 0.021636962890625, "learning_rate": 1e-06, "loss": 0.1356, "num_tokens": 177309441.0, "reward": 1.26171875, "reward_std": 0.3499029576778412, "rewards/code_format_reward/mean": 0.7544642686843872, "rewards/code_format_reward/std": 0.43088552355766296, "rewards/curriculum_aware_reward_fn/mean": 0.5072544813156128, "rewards/curriculum_aware_reward_fn/std": 0.4689197540283203, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1316964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4046.0, "completions/mean_length": 1769.4420166015625, "completions/mean_terminated_length": 1416.5706787109375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.6188757091284167, "grad_norm": 0.31112489104270935, "kl": 0.0232086181640625, "learning_rate": 1e-06, "loss": 0.1199, "num_tokens": 178388640.0, "reward": 1.4916294813156128, "reward_std": 0.33390578627586365, "rewards/code_format_reward/mean": 0.875, "rewards/code_format_reward/std": 0.3310886323451996, "rewards/curriculum_aware_reward_fn/mean": 0.6166294813156128, "rewards/curriculum_aware_reward_fn/std": 0.44204020500183105, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1227678571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 1639.294677734375, "completions/mean_terminated_length": 1295.48095703125, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.6230015471892728, "grad_norm": 0.2728213965892792, "kl": 0.0240020751953125, "learning_rate": 1e-06, "loss": 0.0856, "num_tokens": 179387014.0, "reward": 1.5574778318405151, "reward_std": 0.3352726101875305, "rewards/code_format_reward/mean": 0.8772321343421936, "rewards/code_format_reward/std": 0.3285374045372009, "rewards/curriculum_aware_reward_fn/mean": 0.6802455186843872, "rewards/curriculum_aware_reward_fn/std": 0.42570555210113525, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0714285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 3908.0, "completions/mean_length": 1386.1719970703125, "completions/mean_terminated_length": 1177.7236328125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.6271273852501289, "grad_norm": 9493.4013671875, "kl": 173.02638244628906, "learning_rate": 1e-06, "loss": 1.8191, "num_tokens": 180265801.0, "reward": 1.6411831378936768, "reward_std": 0.29393258690834045, "rewards/code_format_reward/mean": 0.9285714030265808, "rewards/code_format_reward/std": 0.2578272819519043, "rewards/curriculum_aware_reward_fn/mean": 0.7126116156578064, "rewards/curriculum_aware_reward_fn/std": 0.3994157910346985, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0669642857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4017.0, "completions/mean_length": 1396.482177734375, "completions/mean_terminated_length": 1202.73681640625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.631253223310985, "grad_norm": 0.4193595349788666, "kl": 0.030120849609375, "learning_rate": 1e-06, "loss": 0.1039, "num_tokens": 181166630.0, "reward": 1.6155134439468384, "reward_std": 0.27149245142936707, "rewards/code_format_reward/mean": 0.9330357313156128, "rewards/code_format_reward/std": 0.2502395808696747, "rewards/curriculum_aware_reward_fn/mean": 0.6824776530265808, "rewards/curriculum_aware_reward_fn/std": 0.40898725390434265, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3971.0, "completions/mean_length": 1631.180908203125, "completions/mean_terminated_length": 1227.8466796875, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.6353790613718412, "grad_norm": 0.24393942952156067, "kl": 0.02801513671875, "learning_rate": 1e-06, "loss": 0.1284, "num_tokens": 182168236.0, "reward": 1.520647406578064, "reward_std": 0.3639439344406128, "rewards/code_format_reward/mean": 0.8571428656578064, "rewards/code_format_reward/std": 0.3503182828426361, "rewards/curriculum_aware_reward_fn/mean": 0.6635044813156128, "rewards/curriculum_aware_reward_fn/std": 0.41962698101997375, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0691964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 3935.0, "completions/mean_length": 1488.997802734375, "completions/mean_terminated_length": 1295.19189453125, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.6395048994326973, "grad_norm": 0.24519453942775726, "kl": 0.0286712646484375, "learning_rate": 1e-06, "loss": 0.0596, "num_tokens": 183118744.0, "reward": 1.5418527126312256, "reward_std": 0.28998079895973206, "rewards/code_format_reward/mean": 0.9308035969734192, "rewards/code_format_reward/std": 0.25407159328460693, "rewards/curriculum_aware_reward_fn/mean": 0.6110491156578064, "rewards/curriculum_aware_reward_fn/std": 0.45067140460014343, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0959821428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4072.0, "completions/mean_length": 1458.91748046875, "completions/mean_terminated_length": 1178.930908203125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.6436307374935534, "grad_norm": 0.2833305597305298, "kl": 0.031219482421875, "learning_rate": 1e-06, "loss": 0.0691, "num_tokens": 184048615.0, "reward": 1.5625001192092896, "reward_std": 0.22553038597106934, "rewards/code_format_reward/mean": 0.9040178656578064, "rewards/code_format_reward/std": 0.29489603638648987, "rewards/curriculum_aware_reward_fn/mean": 0.6584821343421936, "rewards/curriculum_aware_reward_fn/std": 0.41805654764175415, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044642857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 4026.0, "completions/mean_length": 1351.66748046875, "completions/mean_terminated_length": 1223.427490234375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.6477565755544095, "grad_norm": 0.23887711763381958, "kl": 0.0341796875, "learning_rate": 1e-06, "loss": 0.0761, "num_tokens": 184932615.0, "reward": 1.6177456378936768, "reward_std": 0.2380119115114212, "rewards/code_format_reward/mean": 0.953125, "rewards/code_format_reward/std": 0.21160738170146942, "rewards/curriculum_aware_reward_fn/mean": 0.6646205186843872, "rewards/curriculum_aware_reward_fn/std": 0.4662471413612366, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0736607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4045.0, "completions/mean_length": 1501.12060546875, "completions/mean_terminated_length": 1294.78076171875, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.6518824136152656, "grad_norm": 0.2671646773815155, "kl": 0.031646728515625, "learning_rate": 1e-06, "loss": 0.0737, "num_tokens": 185850208.0, "reward": 1.4492188692092896, "reward_std": 0.29938623309135437, "rewards/code_format_reward/mean": 0.9241071343421936, "rewards/code_format_reward/std": 0.265122652053833, "rewards/curriculum_aware_reward_fn/mean": 0.5251116156578064, "rewards/curriculum_aware_reward_fn/std": 0.458342969417572, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.060267857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3965.0, "completions/mean_length": 1513.7701416015625, "completions/mean_terminated_length": 1348.1639404296875, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.6560082516761218, "grad_norm": 0.29023823142051697, "kl": 0.032379150390625, "learning_rate": 1e-06, "loss": 0.0534, "num_tokens": 186809616.0, "reward": 1.5669643878936768, "reward_std": 0.2523789405822754, "rewards/code_format_reward/mean": 0.9397321343421936, "rewards/code_format_reward/std": 0.23824848234653473, "rewards/curriculum_aware_reward_fn/mean": 0.6272321343421936, "rewards/curriculum_aware_reward_fn/std": 0.43542200326919556, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0669642857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4015.0, "completions/mean_length": 1465.0982666015625, "completions/mean_terminated_length": 1276.2774658203125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.6601340897369778, "grad_norm": 0.26688751578330994, "kl": 0.0313720703125, "learning_rate": 1e-06, "loss": 0.0827, "num_tokens": 187766672.0, "reward": 1.5385044813156128, "reward_std": 0.2747633159160614, "rewards/code_format_reward/mean": 0.9308035969734192, "rewards/code_format_reward/std": 0.25407159328460693, "rewards/curriculum_aware_reward_fn/mean": 0.6077008843421936, "rewards/curriculum_aware_reward_fn/std": 0.4386039674282074, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0691964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 1635.1160888671875, "completions/mean_terminated_length": 1452.1727294921875, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.6642599277978339, "grad_norm": 0.507167637348175, "kl": 0.03192138671875, "learning_rate": 1e-06, "loss": 0.081, "num_tokens": 188772652.0, "reward": 1.6110491752624512, "reward_std": 0.3116791844367981, "rewards/code_format_reward/mean": 0.9330357313156128, "rewards/code_format_reward/std": 0.2502395808696747, "rewards/curriculum_aware_reward_fn/mean": 0.6780133843421936, "rewards/curriculum_aware_reward_fn/std": 0.4424128830432892, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0535714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4049.0, "completions/mean_length": 1457.5626220703125, "completions/mean_terminated_length": 1308.217041015625, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.66838576585869, "grad_norm": 0.24933311343193054, "kl": 0.032867431640625, "learning_rate": 1e-06, "loss": 0.0462, "num_tokens": 189695399.0, "reward": 1.6099331378936768, "reward_std": 0.290281742811203, "rewards/code_format_reward/mean": 0.9419642686843872, "rewards/code_format_reward/std": 0.23407234251499176, "rewards/curriculum_aware_reward_fn/mean": 0.66796875, "rewards/curriculum_aware_reward_fn/std": 0.4274497628211975, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0424107142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3995.0, "completions/mean_length": 1397.3482666015625, "completions/mean_terminated_length": 1277.8275146484375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.6725116039195461, "grad_norm": 0.26965391635894775, "kl": 0.03326416015625, "learning_rate": 1e-06, "loss": 0.0669, "num_tokens": 190589091.0, "reward": 1.520647406578064, "reward_std": 0.2290925681591034, "rewards/code_format_reward/mean": 0.9553571343421936, "rewards/code_format_reward/std": 0.2067493349313736, "rewards/curriculum_aware_reward_fn/mean": 0.5652901530265808, "rewards/curriculum_aware_reward_fn/std": 0.4540465772151947, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0691964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4077.0, "completions/mean_length": 1418.1920166015625, "completions/mean_terminated_length": 1219.122314453125, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.6766374419804023, "grad_norm": 0.23531630635261536, "kl": 0.0329742431640625, "learning_rate": 1e-06, "loss": 0.0451, "num_tokens": 191495894.0, "reward": 1.6333706378936768, "reward_std": 0.23477166891098022, "rewards/code_format_reward/mean": 0.9308035969734192, "rewards/code_format_reward/std": 0.25407156348228455, "rewards/curriculum_aware_reward_fn/mean": 0.7025669813156128, "rewards/curriculum_aware_reward_fn/std": 0.411127507686615, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4072.0, "completions/mean_length": 1574.9688720703125, "completions/mean_terminated_length": 1314.2684326171875, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.6807632800412584, "grad_norm": 0.2831145226955414, "kl": 0.0308685302734375, "learning_rate": 1e-06, "loss": 0.0951, "num_tokens": 192482075.0, "reward": 1.6037946939468384, "reward_std": 0.3270074129104614, "rewards/code_format_reward/mean": 0.9084821343421936, "rewards/code_format_reward/std": 0.2886664867401123, "rewards/curriculum_aware_reward_fn/mean": 0.6953125, "rewards/curriculum_aware_reward_fn/std": 0.4024815261363983, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0691964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 1588.5045166015625, "completions/mean_terminated_length": 1402.095947265625, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.6848891181021145, "grad_norm": 0.30057018995285034, "kl": 0.0283660888671875, "learning_rate": 1e-06, "loss": 0.0795, "num_tokens": 193479822.0, "reward": 1.5837054252624512, "reward_std": 0.31081902980804443, "rewards/code_format_reward/mean": 0.9285714030265808, "rewards/code_format_reward/std": 0.2578272819519043, "rewards/curriculum_aware_reward_fn/mean": 0.6551339030265808, "rewards/curriculum_aware_reward_fn/std": 0.4467586278915405, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0803571428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4030.0, "completions/mean_length": 1408.1563720703125, "completions/mean_terminated_length": 1173.58251953125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.6890149561629706, "grad_norm": 0.2894628345966339, "kl": 0.033660888671875, "learning_rate": 1e-06, "loss": 0.093, "num_tokens": 194377908.0, "reward": 1.5184152126312256, "reward_std": 0.2959974706172943, "rewards/code_format_reward/mean": 0.9151785969734192, "rewards/code_format_reward/std": 0.2789272665977478, "rewards/curriculum_aware_reward_fn/mean": 0.6032366156578064, "rewards/curriculum_aware_reward_fn/std": 0.45744597911834717, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.060267857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 1560.6251220703125, "completions/mean_terminated_length": 1398.0238037109375, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.6931407942238267, "grad_norm": 0.24402505159378052, "kl": 0.0276641845703125, "learning_rate": 1e-06, "loss": 0.049, "num_tokens": 195368562.0, "reward": 1.5937501192092896, "reward_std": 0.293010950088501, "rewards/code_format_reward/mean": 0.9419642686843872, "rewards/code_format_reward/std": 0.23407234251499176, "rewards/curriculum_aware_reward_fn/mean": 0.6517857313156128, "rewards/curriculum_aware_reward_fn/std": 0.4134986698627472, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1026785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4074.0, "completions/mean_length": 1704.462158203125, "completions/mean_terminated_length": 1430.803466796875, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.6972666322846828, "grad_norm": 0.2717946469783783, "kl": 0.0318450927734375, "learning_rate": 1e-06, "loss": 0.0658, "num_tokens": 196418412.0, "reward": 1.4324777126312256, "reward_std": 0.32240214943885803, "rewards/code_format_reward/mean": 0.8973214030265808, "rewards/code_format_reward/std": 0.30387791991233826, "rewards/curriculum_aware_reward_fn/mean": 0.53515625, "rewards/curriculum_aware_reward_fn/std": 0.4744802415370941, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1941964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 2030.5023193359375, "completions/mean_terminated_length": 1532.722900390625, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.701392470345539, "grad_norm": 0.2802993655204773, "kl": 0.026092529296875, "learning_rate": 1e-06, "loss": 0.0919, "num_tokens": 197621083.0, "reward": 1.3928571939468384, "reward_std": 0.39308470487594604, "rewards/code_format_reward/mean": 0.8058035969734192, "rewards/code_format_reward/std": 0.3960230052471161, "rewards/curriculum_aware_reward_fn/mean": 0.5870535969734192, "rewards/curriculum_aware_reward_fn/std": 0.44897323846817017, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1584821428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 1918.5179443359375, "completions/mean_terminated_length": 1508.4349365234375, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.7055183084063951, "grad_norm": 0.36477211117744446, "kl": 0.0260162353515625, "learning_rate": 1e-06, "loss": 0.0777, "num_tokens": 198755252.0, "reward": 1.454241156578064, "reward_std": 0.3312704563140869, "rewards/code_format_reward/mean": 0.8325892686843872, "rewards/code_format_reward/std": 0.37375950813293457, "rewards/curriculum_aware_reward_fn/mean": 0.6216517686843872, "rewards/curriculum_aware_reward_fn/std": 0.4318680167198181, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1495535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4078.0, "completions/mean_length": 1739.9241943359375, "completions/mean_terminated_length": 1325.60107421875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.7096441464672512, "grad_norm": 0.46442899107933044, "kl": 0.0276031494140625, "learning_rate": 1e-06, "loss": 0.1028, "num_tokens": 199796048.0, "reward": 1.5027902126312256, "reward_std": 0.32464084029197693, "rewards/code_format_reward/mean": 0.8504464030265808, "rewards/code_format_reward/std": 0.3570319712162018, "rewards/curriculum_aware_reward_fn/mean": 0.65234375, "rewards/curriculum_aware_reward_fn/std": 0.468325674533844, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1361607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 1747.138427734375, "completions/mean_terminated_length": 1376.9044189453125, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.7137699845281072, "grad_norm": 0.2245771586894989, "kl": 0.0282440185546875, "learning_rate": 1e-06, "loss": 0.0923, "num_tokens": 200836555.0, "reward": 1.5725446939468384, "reward_std": 0.3155837953090668, "rewards/code_format_reward/mean": 0.8683035969734192, "rewards/code_format_reward/std": 0.3385384678840637, "rewards/curriculum_aware_reward_fn/mean": 0.7042410969734192, "rewards/curriculum_aware_reward_fn/std": 0.43042027950286865, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0714285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 3981.0, "completions/mean_length": 1397.227783203125, "completions/mean_terminated_length": 1189.6298828125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.7178958225889633, "grad_norm": 0.33881711959838867, "kl": 0.0340576171875, "learning_rate": 1e-06, "loss": 0.0967, "num_tokens": 201729878.0, "reward": 1.6674107313156128, "reward_std": 0.2932094633579254, "rewards/code_format_reward/mean": 0.9241071343421936, "rewards/code_format_reward/std": 0.265122652053833, "rewards/curriculum_aware_reward_fn/mean": 0.7433035969734192, "rewards/curriculum_aware_reward_fn/std": 0.4337673783302307, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1495535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4034.0, "completions/mean_length": 1938.9287109375, "completions/mean_terminated_length": 1559.60107421875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.7220216606498195, "grad_norm": 0.2635800242424011, "kl": 0.0278778076171875, "learning_rate": 1e-06, "loss": 0.0897, "num_tokens": 202897922.0, "reward": 1.4179688692092896, "reward_std": 0.38792121410369873, "rewards/code_format_reward/mean": 0.8526785969734192, "rewards/code_format_reward/std": 0.3548222780227661, "rewards/curriculum_aware_reward_fn/mean": 0.5652901530265808, "rewards/curriculum_aware_reward_fn/std": 0.446594774723053, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1160714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4053.0, "completions/mean_length": 1676.79248046875, "completions/mean_terminated_length": 1359.11865234375, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.7261474987106756, "grad_norm": 0.23359020054340363, "kl": 0.03045654296875, "learning_rate": 1e-06, "loss": 0.0611, "num_tokens": 203918884.0, "reward": 1.6484376192092896, "reward_std": 0.3379304111003876, "rewards/code_format_reward/mean": 0.8794642686843872, "rewards/code_format_reward/std": 0.3259509205818176, "rewards/curriculum_aware_reward_fn/mean": 0.7689732313156128, "rewards/curriculum_aware_reward_fn/std": 0.43693751096725464, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1629464285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4038.0, "completions/mean_length": 1882.0201416015625, "completions/mean_terminated_length": 1451.031982421875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.7302733367715317, "grad_norm": 0.23006048798561096, "kl": 0.0271148681640625, "learning_rate": 1e-06, "loss": 0.1149, "num_tokens": 205031249.0, "reward": 1.4631696939468384, "reward_std": 0.3731190860271454, "rewards/code_format_reward/mean": 0.8482142686843872, "rewards/code_format_reward/std": 0.3592142164707184, "rewards/curriculum_aware_reward_fn/mean": 0.6149553656578064, "rewards/curriculum_aware_reward_fn/std": 0.49001163244247437, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1741071428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 1831.4554443359375, "completions/mean_terminated_length": 1354.06494140625, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.7343991748323878, "grad_norm": 0.24213403463363647, "kl": 0.0294342041015625, "learning_rate": 1e-06, "loss": 0.0726, "num_tokens": 206132973.0, "reward": 1.4341518878936768, "reward_std": 0.3073028028011322, "rewards/code_format_reward/mean": 0.828125, "rewards/code_format_reward/std": 0.3776935040950775, "rewards/curriculum_aware_reward_fn/mean": 0.6060267686843872, "rewards/curriculum_aware_reward_fn/std": 0.4343699514865875, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0870535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4068.0, "completions/mean_length": 1576.2879638671875, "completions/mean_terminated_length": 1336.0220947265625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.7385250128932439, "grad_norm": 0.2813894748687744, "kl": 0.03204345703125, "learning_rate": 1e-06, "loss": 0.0251, "num_tokens": 207089113.0, "reward": 1.5351563692092896, "reward_std": 0.2363065779209137, "rewards/code_format_reward/mean": 0.9174107313156128, "rewards/code_format_reward/std": 0.2755681276321411, "rewards/curriculum_aware_reward_fn/mean": 0.6177455186843872, "rewards/curriculum_aware_reward_fn/std": 0.4880596995353699, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4068.0, "completions/mean_length": 1835.216552734375, "completions/mean_terminated_length": 1557.576416015625, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.7426508509541001, "grad_norm": 0.4142996370792389, "kl": 0.0323028564453125, "learning_rate": 1e-06, "loss": 0.0862, "num_tokens": 208171816.0, "reward": 1.4564732313156128, "reward_std": 0.31579554080963135, "rewards/code_format_reward/mean": 0.8928571343421936, "rewards/code_format_reward/std": 0.3096405565738678, "rewards/curriculum_aware_reward_fn/mean": 0.5636160969734192, "rewards/curriculum_aware_reward_fn/std": 0.44824275374412537, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1763392857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 3952.0, "completions/mean_length": 1840.94873046875, "completions/mean_terminated_length": 1358.159912109375, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.7467766890149562, "grad_norm": 0.40619099140167236, "kl": 0.031097412109375, "learning_rate": 1e-06, "loss": 0.0761, "num_tokens": 209278927.0, "reward": 1.4319196939468384, "reward_std": 0.3028517961502075, "rewards/code_format_reward/mean": 0.8236607313156128, "rewards/code_format_reward/std": 0.3815346360206604, "rewards/curriculum_aware_reward_fn/mean": 0.6082589030265808, "rewards/curriculum_aware_reward_fn/std": 0.4515061378479004, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1272321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 1845.9107666015625, "completions/mean_terminated_length": 1517.892578125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.7509025270758123, "grad_norm": 0.2417289912700653, "kl": 0.0299072265625, "learning_rate": 1e-06, "loss": 0.1114, "num_tokens": 210361941.0, "reward": 1.4838169813156128, "reward_std": 0.3714328110218048, "rewards/code_format_reward/mean": 0.8727678656578064, "rewards/code_format_reward/std": 0.3336053490638733, "rewards/curriculum_aware_reward_fn/mean": 0.6110491156578064, "rewards/curriculum_aware_reward_fn/std": 0.46261295676231384, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1272321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4033.0, "completions/mean_length": 1753.477783203125, "completions/mean_terminated_length": 1411.984619140625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.7550283651366684, "grad_norm": 0.2215878665447235, "kl": 0.0321044921875, "learning_rate": 1e-06, "loss": 0.0735, "num_tokens": 211407794.0, "reward": 1.5697544813156128, "reward_std": 0.3379369080066681, "rewards/code_format_reward/mean": 0.875, "rewards/code_format_reward/std": 0.3310886323451996, "rewards/curriculum_aware_reward_fn/mean": 0.6947544813156128, "rewards/curriculum_aware_reward_fn/std": 0.42586973309516907, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1205357142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 1907.5068359375, "completions/mean_terminated_length": 1607.5609130859375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.7591542031975245, "grad_norm": 0.25174498558044434, "kl": 0.0454559326171875, "learning_rate": 1e-06, "loss": 0.0811, "num_tokens": 212547824.0, "reward": 1.4787946939468384, "reward_std": 0.35792452096939087, "rewards/code_format_reward/mean": 0.8816964030265808, "rewards/code_format_reward/std": 0.32332828640937805, "rewards/curriculum_aware_reward_fn/mean": 0.5970982313156128, "rewards/curriculum_aware_reward_fn/std": 0.43645721673965454, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0736607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3922.0, "completions/mean_length": 1554.5179443359375, "completions/mean_terminated_length": 1352.424072265625, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.7632800412583806, "grad_norm": 0.23494736850261688, "kl": 0.03240966796875, "learning_rate": 1e-06, "loss": 0.0397, "num_tokens": 213514945.0, "reward": 1.4916294813156128, "reward_std": 0.24327993392944336, "rewards/code_format_reward/mean": 0.9263392686843872, "rewards/code_format_reward/std": 0.2615099549293518, "rewards/curriculum_aware_reward_fn/mean": 0.5652901530265808, "rewards/curriculum_aware_reward_fn/std": 0.46168097853660583, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1339285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 1819.962158203125, "completions/mean_terminated_length": 1467.9974365234375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.7674058793192368, "grad_norm": 0.966958224773407, "kl": 0.0330657958984375, "learning_rate": 1e-06, "loss": 0.0939, "num_tokens": 214591164.0, "reward": 1.4726563692092896, "reward_std": 0.275736927986145, "rewards/code_format_reward/mean": 0.8616071343421936, "rewards/code_format_reward/std": 0.34569787979125977, "rewards/curriculum_aware_reward_fn/mean": 0.6110491156578064, "rewards/curriculum_aware_reward_fn/std": 0.4194989800453186, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044642857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3917.0, "completions/mean_length": 1395.8326416015625, "completions/mean_terminated_length": 1269.656494140625, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.7715317173800929, "grad_norm": 0.2275300920009613, "kl": 0.034637451171875, "learning_rate": 1e-06, "loss": 0.0766, "num_tokens": 215484940.0, "reward": 1.5781251192092896, "reward_std": 0.22841887176036835, "rewards/code_format_reward/mean": 0.9553571343421936, "rewards/code_format_reward/std": 0.2067493200302124, "rewards/curriculum_aware_reward_fn/mean": 0.6227678656578064, "rewards/curriculum_aware_reward_fn/std": 0.4536646008491516, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0401785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4077.0, "completions/mean_length": 1437.8482666015625, "completions/mean_terminated_length": 1326.5767822265625, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.7756575554409489, "grad_norm": 0.22897613048553467, "kl": 0.03436279296875, "learning_rate": 1e-06, "loss": 0.0523, "num_tokens": 216392807.0, "reward": 1.6791294813156128, "reward_std": 0.23795431852340698, "rewards/code_format_reward/mean": 0.9598214030265808, "rewards/code_format_reward/std": 0.1965973675251007, "rewards/curriculum_aware_reward_fn/mean": 0.7193080186843872, "rewards/curriculum_aware_reward_fn/std": 0.39788445830345154, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0803571428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4082.0, "completions/mean_length": 1588.37060546875, "completions/mean_terminated_length": 1369.25732421875, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.779783393501805, "grad_norm": 0.2361663430929184, "kl": 0.0339813232421875, "learning_rate": 1e-06, "loss": 0.0549, "num_tokens": 217363355.0, "reward": 1.5965402126312256, "reward_std": 0.3110112249851227, "rewards/code_format_reward/mean": 0.9196428656578064, "rewards/code_format_reward/std": 0.2721492052078247, "rewards/curriculum_aware_reward_fn/mean": 0.6768973469734192, "rewards/curriculum_aware_reward_fn/std": 0.44254547357559204, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0758928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4054.0, "completions/mean_length": 1609.6585693359375, "completions/mean_terminated_length": 1405.4661865234375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.7839092315626611, "grad_norm": 105.76721954345703, "kl": 8.5748291015625, "learning_rate": 1e-06, "loss": 0.1734, "num_tokens": 218338099.0, "reward": 1.6116071939468384, "reward_std": 0.32082968950271606, "rewards/code_format_reward/mean": 0.9241071343421936, "rewards/code_format_reward/std": 0.2651226818561554, "rewards/curriculum_aware_reward_fn/mean": 0.6875, "rewards/curriculum_aware_reward_fn/std": 0.45826974511146545, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.049107142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 1434.0648193359375, "completions/mean_terminated_length": 1296.5938720703125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.7880350696235173, "grad_norm": 0.2986173927783966, "kl": 0.035888671875, "learning_rate": 1e-06, "loss": 0.1232, "num_tokens": 219230558.0, "reward": 1.5585938692092896, "reward_std": 0.30678367614746094, "rewards/code_format_reward/mean": 0.9486607313156128, "rewards/code_format_reward/std": 0.22093553841114044, "rewards/curriculum_aware_reward_fn/mean": 0.6099330186843872, "rewards/curriculum_aware_reward_fn/std": 0.4697762131690979, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0669642857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4021.0, "completions/mean_length": 1682.0023193359375, "completions/mean_terminated_length": 1508.748779296875, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.7921609076843734, "grad_norm": 0.22439171373844147, "kl": 0.0318145751953125, "learning_rate": 1e-06, "loss": 0.0876, "num_tokens": 220277927.0, "reward": 1.583147406578064, "reward_std": 0.2523297071456909, "rewards/code_format_reward/mean": 0.9352678656578064, "rewards/code_format_reward/std": 0.24632768332958221, "rewards/curriculum_aware_reward_fn/mean": 0.6478794813156128, "rewards/curriculum_aware_reward_fn/std": 0.41676414012908936, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4063.0, "completions/mean_length": 1550.071533203125, "completions/mean_terminated_length": 1334.3148193359375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.7962867457452295, "grad_norm": 0.23406584560871124, "kl": 0.0334625244140625, "learning_rate": 1e-06, "loss": 0.0729, "num_tokens": 221246598.0, "reward": 1.5809152126312256, "reward_std": 0.2662461996078491, "rewards/code_format_reward/mean": 0.9241071343421936, "rewards/code_format_reward/std": 0.265122652053833, "rewards/curriculum_aware_reward_fn/mean": 0.6568080186843872, "rewards/curriculum_aware_reward_fn/std": 0.427119642496109, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1183035714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4075.0, "completions/mean_length": 1826.5938720703125, "completions/mean_terminated_length": 1522.0911865234375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.8004125838060856, "grad_norm": 0.21730855107307434, "kl": 0.0298004150390625, "learning_rate": 1e-06, "loss": 0.0458, "num_tokens": 222330142.0, "reward": 1.5094866752624512, "reward_std": 0.29588255286216736, "rewards/code_format_reward/mean": 0.8794642686843872, "rewards/code_format_reward/std": 0.3259509205818176, "rewards/curriculum_aware_reward_fn/mean": 0.6300223469734192, "rewards/curriculum_aware_reward_fn/std": 0.4289279580116272, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0915178571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4081.0, "completions/mean_length": 1633.388427734375, "completions/mean_terminated_length": 1385.31201171875, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.8045384218669417, "grad_norm": 0.30130332708358765, "kl": 0.0331573486328125, "learning_rate": 1e-06, "loss": 0.0836, "num_tokens": 223321579.0, "reward": 1.4949777126312256, "reward_std": 0.295271098613739, "rewards/code_format_reward/mean": 0.9129464030265808, "rewards/code_format_reward/std": 0.2822287082672119, "rewards/curriculum_aware_reward_fn/mean": 0.58203125, "rewards/curriculum_aware_reward_fn/std": 0.44192442297935486, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.060267857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 4077.0, "completions/mean_length": 1580.6451416015625, "completions/mean_terminated_length": 1419.327880859375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.8086642599277978, "grad_norm": 0.23669199645519257, "kl": 0.034393310546875, "learning_rate": 1e-06, "loss": 0.0955, "num_tokens": 224302008.0, "reward": 1.4860491752624512, "reward_std": 0.27263176441192627, "rewards/code_format_reward/mean": 0.9330357313156128, "rewards/code_format_reward/std": 0.2502395808696747, "rewards/curriculum_aware_reward_fn/mean": 0.5530133843421936, "rewards/curriculum_aware_reward_fn/std": 0.45564672350883484, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0915178571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4018.0, "completions/mean_length": 1787.5045166015625, "completions/mean_terminated_length": 1554.9532470703125, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.812790097988654, "grad_norm": 0.23128311336040497, "kl": 0.0313262939453125, "learning_rate": 1e-06, "loss": 0.0688, "num_tokens": 225370693.0, "reward": 1.4927457571029663, "reward_std": 0.34113866090774536, "rewards/code_format_reward/mean": 0.9107142686843872, "rewards/code_format_reward/std": 0.2854745090007782, "rewards/curriculum_aware_reward_fn/mean": 0.58203125, "rewards/curriculum_aware_reward_fn/std": 0.46774688363075256, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1160714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4024.0, "completions/mean_length": 1613.60498046875, "completions/mean_terminated_length": 1287.6337890625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.8169159360495101, "grad_norm": 0.2518206536769867, "kl": 0.0317840576171875, "learning_rate": 1e-06, "loss": 0.0955, "num_tokens": 226348311.0, "reward": 1.5078126192092896, "reward_std": 0.2923620939254761, "rewards/code_format_reward/mean": 0.8883928656578064, "rewards/code_format_reward/std": 0.31523454189300537, "rewards/curriculum_aware_reward_fn/mean": 0.6194196343421936, "rewards/curriculum_aware_reward_fn/std": 0.42334306240081787, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0580357142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 1561.1607666015625, "completions/mean_terminated_length": 1404.98583984375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.8210417741103662, "grad_norm": 0.24923692643642426, "kl": 0.0327911376953125, "learning_rate": 1e-06, "loss": 0.0653, "num_tokens": 227307719.0, "reward": 1.6110491752624512, "reward_std": 0.28037238121032715, "rewards/code_format_reward/mean": 0.9441964030265808, "rewards/code_format_reward/std": 0.22979861497879028, "rewards/curriculum_aware_reward_fn/mean": 0.6668526530265808, "rewards/curriculum_aware_reward_fn/std": 0.42690619826316833, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0691964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 1702.05810546875, "completions/mean_terminated_length": 1524.0911865234375, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.8251676121712223, "grad_norm": 0.2382507473230362, "kl": 0.031768798828125, "learning_rate": 1e-06, "loss": 0.0812, "num_tokens": 228353041.0, "reward": 1.4849331378936768, "reward_std": 0.27355092763900757, "rewards/code_format_reward/mean": 0.9308035969734192, "rewards/code_format_reward/std": 0.25407159328460693, "rewards/curriculum_aware_reward_fn/mean": 0.5541294813156128, "rewards/curriculum_aware_reward_fn/std": 0.4691247045993805, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0558035714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4039.0, "completions/mean_length": 1397.5670166015625, "completions/mean_terminated_length": 1238.0850830078125, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.8292934502320783, "grad_norm": 0.3091779351234436, "kl": 0.035430908203125, "learning_rate": 1e-06, "loss": 0.0267, "num_tokens": 229248721.0, "reward": 1.6277902126312256, "reward_std": 0.25161013007164, "rewards/code_format_reward/mean": 0.9486607313156128, "rewards/code_format_reward/std": 0.22093553841114044, "rewards/curriculum_aware_reward_fn/mean": 0.6791294813156128, "rewards/curriculum_aware_reward_fn/std": 0.4202304482460022, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0647321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 1521.2388916015625, "completions/mean_terminated_length": 1343.033447265625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.8334192882929345, "grad_norm": 0.25191670656204224, "kl": 0.035369873046875, "learning_rate": 1e-06, "loss": 0.0772, "num_tokens": 230190321.0, "reward": 1.613281488418579, "reward_std": 0.2954835891723633, "rewards/code_format_reward/mean": 0.9330357313156128, "rewards/code_format_reward/std": 0.2502395808696747, "rewards/curriculum_aware_reward_fn/mean": 0.6802455186843872, "rewards/curriculum_aware_reward_fn/std": 0.3996157944202423, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0736607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4031.0, "completions/mean_length": 1783.04248046875, "completions/mean_terminated_length": 1599.1204833984375, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.8375451263537906, "grad_norm": 0.23308780789375305, "kl": 0.0330963134765625, "learning_rate": 1e-06, "loss": 0.0635, "num_tokens": 231271952.0, "reward": 1.5106027126312256, "reward_std": 0.32970985770225525, "rewards/code_format_reward/mean": 0.9308035969734192, "rewards/code_format_reward/std": 0.25407156348228455, "rewards/curriculum_aware_reward_fn/mean": 0.5797991156578064, "rewards/curriculum_aware_reward_fn/std": 0.4192906618118286, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.049107142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3916.0, "completions/mean_length": 1422.5670166015625, "completions/mean_terminated_length": 1284.5023193359375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.8416709644146467, "grad_norm": 0.6658977270126343, "kl": 0.03765869140625, "learning_rate": 1e-06, "loss": 0.06, "num_tokens": 232174947.0, "reward": 1.641741156578064, "reward_std": 0.251468300819397, "rewards/code_format_reward/mean": 0.9464285969734192, "rewards/code_format_reward/std": 0.2254217267036438, "rewards/curriculum_aware_reward_fn/mean": 0.6953125, "rewards/curriculum_aware_reward_fn/std": 0.4049060046672821, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044642857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 1428.4532470703125, "completions/mean_terminated_length": 1303.8013916015625, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.8457968024755028, "grad_norm": 0.20766569674015045, "kl": 0.0347900390625, "learning_rate": 1e-06, "loss": 0.0215, "num_tokens": 233080558.0, "reward": 1.6088169813156128, "reward_std": 0.1832144558429718, "rewards/code_format_reward/mean": 0.9553571343421936, "rewards/code_format_reward/std": 0.2067493200302124, "rewards/curriculum_aware_reward_fn/mean": 0.6534598469734192, "rewards/curriculum_aware_reward_fn/std": 0.501980721950531, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3983.0, "completions/mean_length": 1395.8192138671875, "completions/mean_terminated_length": 1321.502197265625, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.8499226405363589, "grad_norm": 0.273607075214386, "kl": 0.03741455078125, "learning_rate": 1e-06, "loss": 0.0568, "num_tokens": 233967881.0, "reward": 1.6724331378936768, "reward_std": 0.21942391991615295, "rewards/code_format_reward/mean": 0.9732142686843872, "rewards/code_format_reward/std": 0.1616371124982834, "rewards/curriculum_aware_reward_fn/mean": 0.69921875, "rewards/curriculum_aware_reward_fn/std": 0.4062768816947937, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0401785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 3814.0, "completions/mean_length": 1502.5938720703125, "completions/mean_terminated_length": 1394.0325927734375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.8540484785972151, "grad_norm": 0.24034377932548523, "kl": 0.035125732421875, "learning_rate": 1e-06, "loss": 0.0611, "num_tokens": 234898629.0, "reward": 1.6238839626312256, "reward_std": 0.20875972509384155, "rewards/code_format_reward/mean": 0.9598214030265808, "rewards/code_format_reward/std": 0.1965973675251007, "rewards/curriculum_aware_reward_fn/mean": 0.6640625, "rewards/curriculum_aware_reward_fn/std": 0.4008280038833618, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0558035714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4078.0, "completions/mean_length": 1555.4910888671875, "completions/mean_terminated_length": 1405.3427734375, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.8581743166580712, "grad_norm": 0.32704412937164307, "kl": 0.0338287353515625, "learning_rate": 1e-06, "loss": 0.0606, "num_tokens": 235857555.0, "reward": 1.5145089626312256, "reward_std": 0.2827695310115814, "rewards/code_format_reward/mean": 0.9419642686843872, "rewards/code_format_reward/std": 0.23407234251499176, "rewards/curriculum_aware_reward_fn/mean": 0.5725446343421936, "rewards/curriculum_aware_reward_fn/std": 0.4402620196342468, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0647321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4047.0, "completions/mean_length": 1573.5201416015625, "completions/mean_terminated_length": 1398.9332275390625, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.8623001547189273, "grad_norm": 0.2380460798740387, "kl": 0.034698486328125, "learning_rate": 1e-06, "loss": 0.0658, "num_tokens": 236842184.0, "reward": 1.618303656578064, "reward_std": 0.27431195974349976, "rewards/code_format_reward/mean": 0.9308035969734192, "rewards/code_format_reward/std": 0.25407159328460693, "rewards/curriculum_aware_reward_fn/mean": 0.6875, "rewards/curriculum_aware_reward_fn/std": 0.42106181383132935, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4042.0, "completions/mean_length": 1800.904052734375, "completions/mean_terminated_length": 1519.0501708984375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.8664259927797834, "grad_norm": 0.281207412481308, "kl": 0.0328216552734375, "learning_rate": 1e-06, "loss": 0.0827, "num_tokens": 237916550.0, "reward": 1.5267857313156128, "reward_std": 0.32390275597572327, "rewards/code_format_reward/mean": 0.890625, "rewards/code_format_reward/std": 0.3124580383300781, "rewards/curriculum_aware_reward_fn/mean": 0.6361607313156128, "rewards/curriculum_aware_reward_fn/std": 0.4513692855834961, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1071428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4051.0, "completions/mean_length": 1704.77685546875, "completions/mean_terminated_length": 1417.8299560546875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.8705518308406395, "grad_norm": 0.23124706745147705, "kl": 0.03436279296875, "learning_rate": 1e-06, "loss": 0.0972, "num_tokens": 238935888.0, "reward": 1.5452009439468384, "reward_std": 0.29965800046920776, "rewards/code_format_reward/mean": 0.8883928656578064, "rewards/code_format_reward/std": 0.315234512090683, "rewards/curriculum_aware_reward_fn/mean": 0.6568080186843872, "rewards/curriculum_aware_reward_fn/std": 0.4488282799720764, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.024553571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 1319.421875, "completions/mean_terminated_length": 1249.5308837890625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.8746776689014956, "grad_norm": 0.22175519168376923, "kl": 0.035797119140625, "learning_rate": 1e-06, "loss": 0.0146, "num_tokens": 239791372.0, "reward": 1.715959906578064, "reward_std": 0.19139330089092255, "rewards/code_format_reward/mean": 0.9709821343421936, "rewards/code_format_reward/std": 0.16804419457912445, "rewards/curriculum_aware_reward_fn/mean": 0.7449776530265808, "rewards/curriculum_aware_reward_fn/std": 0.39017921686172485, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1071428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4003.0, "completions/mean_length": 1668.9219970703125, "completions/mean_terminated_length": 1377.6724853515625, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.8788035069623518, "grad_norm": 0.25599682331085205, "kl": 0.0331573486328125, "learning_rate": 1e-06, "loss": 0.0407, "num_tokens": 240811762.0, "reward": 1.5524554252624512, "reward_std": 0.2817254066467285, "rewards/code_format_reward/mean": 0.8928571343421936, "rewards/code_format_reward/std": 0.3096405565738678, "rewards/curriculum_aware_reward_fn/mean": 0.6595982313156128, "rewards/curriculum_aware_reward_fn/std": 0.41996780037879944, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0825892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4060.0, "completions/mean_length": 1719.2813720703125, "completions/mean_terminated_length": 1505.3187255859375, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.8829293450232079, "grad_norm": 0.2377719134092331, "kl": 0.03167724609375, "learning_rate": 1e-06, "loss": 0.0809, "num_tokens": 241862945.0, "reward": 1.5591518878936768, "reward_std": 0.325631320476532, "rewards/code_format_reward/mean": 0.9151785969734192, "rewards/code_format_reward/std": 0.2789272665977478, "rewards/curriculum_aware_reward_fn/mean": 0.6439732313156128, "rewards/curriculum_aware_reward_fn/std": 0.413931667804718, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0558035714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4051.0, "completions/mean_length": 1636.587158203125, "completions/mean_terminated_length": 1491.231689453125, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.887055183084064, "grad_norm": 0.2097349315881729, "kl": 0.030731201171875, "learning_rate": 1e-06, "loss": 0.04, "num_tokens": 242856130.0, "reward": 1.6071429252624512, "reward_std": 0.2852386236190796, "rewards/code_format_reward/mean": 0.9419642686843872, "rewards/code_format_reward/std": 0.23407234251499176, "rewards/curriculum_aware_reward_fn/mean": 0.6651785969734192, "rewards/curriculum_aware_reward_fn/std": 0.4181341826915741, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0825892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 1646.435302734375, "completions/mean_terminated_length": 1425.914794921875, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.89118102114492, "grad_norm": 0.20821727812290192, "kl": 0.0323028564453125, "learning_rate": 1e-06, "loss": 0.0798, "num_tokens": 243863751.0, "reward": 1.5122768878936768, "reward_std": 0.24135467410087585, "rewards/code_format_reward/mean": 0.9174107313156128, "rewards/code_format_reward/std": 0.2755681276321411, "rewards/curriculum_aware_reward_fn/mean": 0.5948660969734192, "rewards/curriculum_aware_reward_fn/std": 0.4480087459087372, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0714285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 1559.0335693359375, "completions/mean_terminated_length": 1363.8822021484375, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.8953068592057761, "grad_norm": 0.2385074347257614, "kl": 0.033721923828125, "learning_rate": 1e-06, "loss": 0.0869, "num_tokens": 244825202.0, "reward": 1.4441964626312256, "reward_std": 0.2371453046798706, "rewards/code_format_reward/mean": 0.9308035969734192, "rewards/code_format_reward/std": 0.25407159328460693, "rewards/curriculum_aware_reward_fn/mean": 0.5133928656578064, "rewards/curriculum_aware_reward_fn/std": 0.4614189863204956, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4014.0, "completions/mean_length": 1555.3973388671875, "completions/mean_terminated_length": 1340.092041015625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.8994326972666323, "grad_norm": 0.25095048546791077, "kl": 0.0374755859375, "learning_rate": 1e-06, "loss": 0.0435, "num_tokens": 245780285.0, "reward": 1.6936384439468384, "reward_std": 0.27349722385406494, "rewards/code_format_reward/mean": 0.9263392686843872, "rewards/code_format_reward/std": 0.2615099549293518, "rewards/curriculum_aware_reward_fn/mean": 0.7672991156578064, "rewards/curriculum_aware_reward_fn/std": 0.4104436933994293, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0691964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4078.0, "completions/mean_length": 1663.5045166015625, "completions/mean_terminated_length": 1482.6715087890625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.9035585353274884, "grad_norm": 0.20970918238162994, "kl": 0.0321502685546875, "learning_rate": 1e-06, "loss": 0.0496, "num_tokens": 246796722.0, "reward": 1.511160969734192, "reward_std": 0.23341883718967438, "rewards/code_format_reward/mean": 0.9330357313156128, "rewards/code_format_reward/std": 0.2502395808696747, "rewards/curriculum_aware_reward_fn/mean": 0.578125, "rewards/curriculum_aware_reward_fn/std": 0.47831177711486816, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1071428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4059.0, "completions/mean_length": 1584.825927734375, "completions/mean_terminated_length": 1283.4849853515625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.9076843733883445, "grad_norm": 1.9628300666809082, "kl": 0.26776123046875, "learning_rate": 1e-06, "loss": 0.0724, "num_tokens": 247762217.0, "reward": 1.6065850257873535, "reward_std": 0.309932142496109, "rewards/code_format_reward/mean": 0.8928571343421936, "rewards/code_format_reward/std": 0.3096405565738678, "rewards/curriculum_aware_reward_fn/mean": 0.7137276530265808, "rewards/curriculum_aware_reward_fn/std": 0.44141846895217896, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0401785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 3974.0, "completions/mean_length": 1528.55810546875, "completions/mean_terminated_length": 1421.083740234375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.9118102114492006, "grad_norm": 0.5420495867729187, "kl": 0.0351409912109375, "learning_rate": 1e-06, "loss": 0.0779, "num_tokens": 248699599.0, "reward": 1.6489956378936768, "reward_std": 0.27486705780029297, "rewards/code_format_reward/mean": 0.9575892686843872, "rewards/code_format_reward/std": 0.20174959301948547, "rewards/curriculum_aware_reward_fn/mean": 0.69140625, "rewards/curriculum_aware_reward_fn/std": 0.39650148153305054, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0736607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4065.0, "completions/mean_length": 1595.5826416015625, "completions/mean_terminated_length": 1396.7542724609375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.9159360495100567, "grad_norm": 0.4353036880493164, "kl": 0.035675048828125, "learning_rate": 1e-06, "loss": 0.0789, "num_tokens": 249678768.0, "reward": 1.5965402126312256, "reward_std": 0.3013409972190857, "rewards/code_format_reward/mean": 0.9241071343421936, "rewards/code_format_reward/std": 0.265122652053833, "rewards/curriculum_aware_reward_fn/mean": 0.6724330186843872, "rewards/curriculum_aware_reward_fn/std": 0.41468310356140137, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0558035714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 3997.0, "completions/mean_length": 1619.3460693359375, "completions/mean_terminated_length": 1472.9716796875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.9200618875709129, "grad_norm": 0.21462301909923553, "kl": 0.0312957763671875, "learning_rate": 1e-06, "loss": 0.0676, "num_tokens": 250670298.0, "reward": 1.6289063692092896, "reward_std": 0.2668452262878418, "rewards/code_format_reward/mean": 0.9441964030265808, "rewards/code_format_reward/std": 0.22979861497879028, "rewards/curriculum_aware_reward_fn/mean": 0.6847098469734192, "rewards/curriculum_aware_reward_fn/std": 0.4456545412540436, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 1730.560302734375, "completions/mean_terminated_length": 1530.099365234375, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.924187725631769, "grad_norm": 0.27297359704971313, "kl": 0.0303497314453125, "learning_rate": 1e-06, "loss": 0.089, "num_tokens": 251724287.0, "reward": 1.5797991752624512, "reward_std": 0.306170791387558, "rewards/code_format_reward/mean": 0.921875, "rewards/code_format_reward/std": 0.26866820454597473, "rewards/curriculum_aware_reward_fn/mean": 0.6579241156578064, "rewards/curriculum_aware_reward_fn/std": 0.413392573595047, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0691964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4049.0, "completions/mean_length": 1456.3660888671875, "completions/mean_terminated_length": 1260.13427734375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.9283135636926251, "grad_norm": 0.2921152114868164, "kl": 0.035400390625, "learning_rate": 1e-06, "loss": 0.0889, "num_tokens": 252646211.0, "reward": 1.6049107313156128, "reward_std": 0.2598549425601959, "rewards/code_format_reward/mean": 0.9308035969734192, "rewards/code_format_reward/std": 0.25407159328460693, "rewards/curriculum_aware_reward_fn/mean": 0.6741071343421936, "rewards/curriculum_aware_reward_fn/std": 0.42118039727211, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 1423.0960693359375, "completions/mean_terminated_length": 1324.0994873046875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.9324394017534812, "grad_norm": 0.2500095069408417, "kl": 0.03564453125, "learning_rate": 1e-06, "loss": 0.0302, "num_tokens": 253548197.0, "reward": 1.6584821939468384, "reward_std": 0.24217143654823303, "rewards/code_format_reward/mean": 0.9642857313156128, "rewards/code_format_reward/std": 0.18578432500362396, "rewards/curriculum_aware_reward_fn/mean": 0.6941964030265808, "rewards/curriculum_aware_reward_fn/std": 0.3960230350494385, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0848214285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4048.0, "completions/mean_length": 1628.3013916015625, "completions/mean_terminated_length": 1399.5877685546875, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.9365652398143373, "grad_norm": 0.2387629598379135, "kl": 0.035858154296875, "learning_rate": 1e-06, "loss": 0.0707, "num_tokens": 254541135.0, "reward": 1.4481027126312256, "reward_std": 0.26029425859451294, "rewards/code_format_reward/mean": 0.9151785969734192, "rewards/code_format_reward/std": 0.2789272665977478, "rewards/curriculum_aware_reward_fn/mean": 0.5329241156578064, "rewards/curriculum_aware_reward_fn/std": 0.46300947666168213, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044642857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 1509.05810546875, "completions/mean_terminated_length": 1388.1728515625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.9406910778751933, "grad_norm": 0.48750928044319153, "kl": 0.0355224609375, "learning_rate": 1e-06, "loss": 0.0798, "num_tokens": 255483116.0, "reward": 1.6411831378936768, "reward_std": 0.27610358595848083, "rewards/code_format_reward/mean": 0.9486607313156128, "rewards/code_format_reward/std": 0.22093555331230164, "rewards/curriculum_aware_reward_fn/mean": 0.6925223469734192, "rewards/curriculum_aware_reward_fn/std": 0.43177083134651184, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0513392857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 1402.279052734375, "completions/mean_terminated_length": 1256.5010986328125, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.9448169159360496, "grad_norm": 0.36268895864486694, "kl": 0.04205322265625, "learning_rate": 1e-06, "loss": 0.0993, "num_tokens": 256393290.0, "reward": 1.622209906578064, "reward_std": 0.25994282960891724, "rewards/code_format_reward/mean": 0.9486607313156128, "rewards/code_format_reward/std": 0.22093555331230164, "rewards/curriculum_aware_reward_fn/mean": 0.6735491156578064, "rewards/curriculum_aware_reward_fn/std": 0.418915331363678, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0424107142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4042.0, "completions/mean_length": 1465.154052734375, "completions/mean_terminated_length": 1348.6363525390625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.9489427539969056, "grad_norm": 0.2450164556503296, "kl": 0.0352783203125, "learning_rate": 1e-06, "loss": 0.0229, "num_tokens": 257319921.0, "reward": 1.6601563692092896, "reward_std": 0.23058827221393585, "rewards/code_format_reward/mean": 0.9553571343421936, "rewards/code_format_reward/std": 0.2067493349313736, "rewards/curriculum_aware_reward_fn/mean": 0.7047991156578064, "rewards/curriculum_aware_reward_fn/std": 0.40865132212638855, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0580357142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4039.0, "completions/mean_length": 1533.9085693359375, "completions/mean_terminated_length": 1376.0545654296875, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.9530685920577617, "grad_norm": 0.2817387282848358, "kl": 0.033966064453125, "learning_rate": 1e-06, "loss": 0.076, "num_tokens": 258269598.0, "reward": 1.622209906578064, "reward_std": 0.25159940123558044, "rewards/code_format_reward/mean": 0.9397321343421936, "rewards/code_format_reward/std": 0.23824846744537354, "rewards/curriculum_aware_reward_fn/mean": 0.6824776530265808, "rewards/curriculum_aware_reward_fn/std": 0.4478258192539215, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0401785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4035.0, "completions/mean_length": 1496.1920166015625, "completions/mean_terminated_length": 1387.36279296875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.9571944301186178, "grad_norm": 0.2651069462299347, "kl": 0.0347900390625, "learning_rate": 1e-06, "loss": 0.0674, "num_tokens": 259207481.0, "reward": 1.6953126192092896, "reward_std": 0.24807168543338776, "rewards/code_format_reward/mean": 0.9598214030265808, "rewards/code_format_reward/std": 0.1965973675251007, "rewards/curriculum_aware_reward_fn/mean": 0.7354910969734192, "rewards/curriculum_aware_reward_fn/std": 0.4042024314403534, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4014.0, "completions/mean_length": 1585.40185546875, "completions/mean_terminated_length": 1372.6392822265625, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.9613202681794739, "grad_norm": 0.23186470568180084, "kl": 0.03326416015625, "learning_rate": 1e-06, "loss": 0.0765, "num_tokens": 260186422.0, "reward": 1.6177456378936768, "reward_std": 0.29622310400009155, "rewards/code_format_reward/mean": 0.921875, "rewards/code_format_reward/std": 0.26866820454597473, "rewards/curriculum_aware_reward_fn/mean": 0.6958705186843872, "rewards/curriculum_aware_reward_fn/std": 0.41063833236694336, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0848214285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 1678.5357666015625, "completions/mean_terminated_length": 1454.47802734375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.9654461062403301, "grad_norm": 0.2390112727880478, "kl": 0.0320587158203125, "learning_rate": 1e-06, "loss": 0.0546, "num_tokens": 261211245.0, "reward": 1.5351563692092896, "reward_std": 0.25613510608673096, "rewards/code_format_reward/mean": 0.9174107313156128, "rewards/code_format_reward/std": 0.2755681276321411, "rewards/curriculum_aware_reward_fn/mean": 0.6177455186843872, "rewards/curriculum_aware_reward_fn/std": 0.4455256462097168, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 1753.9398193359375, "completions/mean_terminated_length": 1466.3182373046875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.9695719443011862, "grad_norm": 0.23524099588394165, "kl": 0.0356292724609375, "learning_rate": 1e-06, "loss": 0.083, "num_tokens": 262268834.0, "reward": 1.5407366752624512, "reward_std": 0.29423972964286804, "rewards/code_format_reward/mean": 0.8928571343421936, "rewards/code_format_reward/std": 0.3096405565738678, "rewards/curriculum_aware_reward_fn/mean": 0.6478794813156128, "rewards/curriculum_aware_reward_fn/std": 0.4649709165096283, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0825892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 1669.2701416015625, "completions/mean_terminated_length": 1450.8052978515625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.9736977823620423, "grad_norm": 0.23558498919010162, "kl": 0.0329437255859375, "learning_rate": 1e-06, "loss": 0.0918, "num_tokens": 263291977.0, "reward": 1.5541294813156128, "reward_std": 0.3235393464565277, "rewards/code_format_reward/mean": 0.9196428656578064, "rewards/code_format_reward/std": 0.2721492052078247, "rewards/curriculum_aware_reward_fn/mean": 0.6344866156578064, "rewards/curriculum_aware_reward_fn/std": 0.43242380023002625, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0892857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4048.0, "completions/mean_length": 1622.96435546875, "completions/mean_terminated_length": 1380.5098876953125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.9778236204228984, "grad_norm": 0.455951988697052, "kl": 0.03375244140625, "learning_rate": 1e-06, "loss": 0.0575, "num_tokens": 264276217.0, "reward": 1.5993304252624512, "reward_std": 0.26301223039627075, "rewards/code_format_reward/mean": 0.9174107313156128, "rewards/code_format_reward/std": 0.2755681276321411, "rewards/curriculum_aware_reward_fn/mean": 0.6819196343421936, "rewards/curriculum_aware_reward_fn/std": 0.4117908477783203, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0892857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 1797.055908203125, "completions/mean_terminated_length": 1571.669189453125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.9819494584837545, "grad_norm": 120.07787322998047, "kl": 15.591033935546875, "learning_rate": 1e-06, "loss": 0.2321, "num_tokens": 265358078.0, "reward": 1.5848215818405151, "reward_std": 0.26491549611091614, "rewards/code_format_reward/mean": 0.9107142686843872, "rewards/code_format_reward/std": 0.2854744791984558, "rewards/curriculum_aware_reward_fn/mean": 0.6741071343421936, "rewards/curriculum_aware_reward_fn/std": 0.41279762983322144, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0870535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 1681.9241943359375, "completions/mean_terminated_length": 1451.7310791015625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.9860752965446106, "grad_norm": 0.23227033019065857, "kl": 0.032562255859375, "learning_rate": 1e-06, "loss": 0.1121, "num_tokens": 266379673.0, "reward": 1.3275669813156128, "reward_std": 0.2691631019115448, "rewards/code_format_reward/mean": 0.9129464030265808, "rewards/code_format_reward/std": 0.2822287082672119, "rewards/curriculum_aware_reward_fn/mean": 0.4146205484867096, "rewards/curriculum_aware_reward_fn/std": 0.457773357629776, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0870535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4067.0, "completions/mean_length": 1748.779052734375, "completions/mean_terminated_length": 1524.9609375, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.9902011346054668, "grad_norm": 0.23980259895324707, "kl": 0.031524658203125, "learning_rate": 1e-06, "loss": 0.0558, "num_tokens": 267436721.0, "reward": 1.5820313692092896, "reward_std": 0.29253730177879333, "rewards/code_format_reward/mean": 0.9151785969734192, "rewards/code_format_reward/std": 0.2789272665977478, "rewards/curriculum_aware_reward_fn/mean": 0.6668526530265808, "rewards/curriculum_aware_reward_fn/std": 0.41528475284576416, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0825892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4051.0, "completions/mean_length": 1729.62060546875, "completions/mean_terminated_length": 1516.5887451171875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.9943269726663229, "grad_norm": 0.2937520146369934, "kl": 0.0323028564453125, "learning_rate": 1e-06, "loss": 0.0652, "num_tokens": 268488763.0, "reward": 1.4051339626312256, "reward_std": 0.2679254710674286, "rewards/code_format_reward/mean": 0.9151785969734192, "rewards/code_format_reward/std": 0.2789272665977478, "rewards/curriculum_aware_reward_fn/mean": 0.4899553656578064, "rewards/curriculum_aware_reward_fn/std": 0.4645240008831024, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04379562043795615, "completions/max_length": 4096.0, "completions/max_terminated_length": 4037.0, "completions/mean_length": 1597.912353515625, "completions/mean_terminated_length": 1483.4962158203125, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.998452810727179, "grad_norm": 0.22581250965595245, "kl": 0.0320281982421875, "learning_rate": 1e-06, "loss": 0.0783, "num_tokens": 269513003.0, "reward": 1.5580357313156128, "reward_std": 0.2645653486251831, "rewards/code_format_reward/mean": 0.9330357313156128, "rewards/code_format_reward/std": 0.2502395808696747, "rewards/curriculum_aware_reward_fn/mean": 0.625, "rewards/curriculum_aware_reward_fn/std": 0.4363899827003479, "step": 242 }, { "epoch": 0.998452810727179, "step": 242, "total_flos": 0.0, "train_loss": 2.5904848981785302, "train_runtime": 94263.811, "train_samples_per_second": 0.164, "train_steps_per_second": 0.003 } ], "logging_steps": 1, "max_steps": 242, "num_input_tokens_seen": 269513003, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }