{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 205, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 236.5, "completions/mean_terminated_length": 226.6666717529297, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.45487434044480324, "epoch": 0.004878048780487805, "frac_reward_zero_std": 0.625, "grad_norm": 0.04558239504694939, "kl": 0.0, "learning_rate": 9e-06, "loss": 0.0225, "num_tokens": 85808.0, "reward": 1.6640625, "reward_std": 0.594359815120697, "rewards/answer_reward/mean": 0.75, "rewards/answer_reward/std": 0.4399413466453552, "rewards/format_reward/mean": 0.9140625, "rewards/format_reward/std": 0.27393028140068054, "sampling/importance_sampling_ratio/max": 2.0195910930633545, "sampling/importance_sampling_ratio/mean": 0.8151408433914185, "sampling/importance_sampling_ratio/min": 0.1251400262117386, "sampling/sampling_logp_difference/max": 1.5235940217971802, "sampling/sampling_logp_difference/mean": 0.01995115727186203, "step": 1, "step_time": 11.550526609644294 }, { "clip_ratio/high_max": 0.0013483481016010046, "clip_ratio/high_mean": 0.00033708702540025115, "clip_ratio/low_mean": 8.138021075865254e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004184672434348613, "completions/clipped_ratio": 0.1875, "completions/max_length": 384.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 257.5, "completions/mean_terminated_length": 228.3076934814453, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.4151771403849125, "epoch": 0.00975609756097561, "frac_reward_zero_std": 0.875, "grad_norm": 0.01572185754776001, "kl": 0.0008572587539674714, "learning_rate": 8.95609756097561e-06, "loss": -0.001, "num_tokens": 168326.0, "reward": 1.484375, "reward_std": 0.7749544382095337, "rewards/answer_reward/mean": 0.65625, "rewards/answer_reward/std": 0.4825586974620819, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.3671901822090149, "sampling/importance_sampling_ratio/max": 2.384889841079712, "sampling/importance_sampling_ratio/mean": 0.8450397253036499, "sampling/importance_sampling_ratio/min": 0.1483311802148819, "sampling/sampling_logp_difference/max": 0.8770904541015625, "sampling/sampling_logp_difference/mean": 0.01609395630657673, "step": 2, "step_time": 10.935106498654932 }, { "clip_ratio/high_max": 0.0052421706495806575, "clip_ratio/high_mean": 0.0013105426623951644, "clip_ratio/low_mean": 0.0004111842135898769, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017217268759850413, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 228.65625, "completions/mean_terminated_length": 218.30001831054688, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.44841642677783966, "epoch": 0.014634146341463415, "frac_reward_zero_std": 0.75, "grad_norm": 0.027422722429037094, "kl": 0.0010078709165100008, "learning_rate": 8.91219512195122e-06, "loss": 0.0434, "num_tokens": 265567.0, "reward": 1.59375, "reward_std": 0.6148366928100586, "rewards/answer_reward/mean": 0.65625, "rewards/answer_reward/std": 0.4825586974620819, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "sampling/importance_sampling_ratio/max": 2.874709367752075, "sampling/importance_sampling_ratio/mean": 0.8312649130821228, "sampling/importance_sampling_ratio/min": 0.23329804837703705, "sampling/sampling_logp_difference/max": 1.1406102180480957, "sampling/sampling_logp_difference/mean": 0.019440844655036926, "step": 3, "step_time": 11.11850585276261 }, { "clip_ratio/high_max": 0.0013612619368359447, "clip_ratio/high_mean": 0.000490555859869346, "clip_ratio/low_mean": 0.000903838976228144, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00139439483609749, "completions/clipped_ratio": 0.09375, "completions/max_length": 384.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 241.125, "completions/mean_terminated_length": 226.34483337402344, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.4599086008965969, "epoch": 0.01951219512195122, "frac_reward_zero_std": 0.625, "grad_norm": 0.04793599992990494, "kl": 0.0010079035637318157, "learning_rate": 8.86829268292683e-06, "loss": 0.0998, "num_tokens": 357113.0, "reward": 1.421875, "reward_std": 0.6822564005851746, "rewards/answer_reward/mean": 0.53125, "rewards/answer_reward/std": 0.507007360458374, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.2974183261394501, "sampling/importance_sampling_ratio/max": 2.3109569549560547, "sampling/importance_sampling_ratio/mean": 0.8832716345787048, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6609764695167542, "sampling/sampling_logp_difference/mean": 0.01860066130757332, "step": 4, "step_time": 11.451738181058317 }, { "clip_ratio/high_max": 0.002567008195910603, "clip_ratio/high_mean": 0.0008062257402343675, "clip_ratio/low_mean": 0.0003035861882381141, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011098119284724817, "completions/clipped_ratio": 0.125, "completions/max_length": 384.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 240.40625, "completions/mean_terminated_length": 219.8928680419922, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.43317702785134315, "epoch": 0.024390243902439025, "frac_reward_zero_std": 0.625, "grad_norm": 0.038888074457645416, "kl": 0.000925359083339572, "learning_rate": 8.824390243902438e-06, "loss": 0.0297, "num_tokens": 437268.0, "reward": 1.6328125, "reward_std": 0.6719745993614197, "rewards/answer_reward/mean": 0.75, "rewards/answer_reward/std": 0.4399413466453552, "rewards/format_reward/mean": 0.8828125, "rewards/format_reward/std": 0.30443212389945984, "sampling/importance_sampling_ratio/max": 2.164376974105835, "sampling/importance_sampling_ratio/mean": 0.7578344345092773, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.7340753078460693, "sampling/sampling_logp_difference/mean": 0.017508644610643387, "step": 5, "step_time": 10.343289981596172 }, { "clip_ratio/high_max": 0.007148410106310621, "clip_ratio/high_mean": 0.0019677383606904186, "clip_ratio/low_mean": 0.0003602049546316266, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023279433080460876, "completions/clipped_ratio": 0.09375, "completions/max_length": 384.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 241.4375, "completions/mean_terminated_length": 226.6896514892578, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.48263201117515564, "epoch": 0.02926829268292683, "frac_reward_zero_std": 0.625, "grad_norm": 0.04244877025485039, "kl": 0.0014110119373071939, "learning_rate": 8.780487804878048e-06, "loss": 0.0286, "num_tokens": 535318.0, "reward": 1.6640625, "reward_std": 0.6337592005729675, "rewards/answer_reward/mean": 0.75, "rewards/answer_reward/std": 0.4399413466453552, "rewards/format_reward/mean": 0.9140625, "rewards/format_reward/std": 0.27393028140068054, "sampling/importance_sampling_ratio/max": 2.679584503173828, "sampling/importance_sampling_ratio/mean": 0.7900279760360718, "sampling/importance_sampling_ratio/min": 0.1352062225341797, "sampling/sampling_logp_difference/max": 1.5307142734527588, "sampling/sampling_logp_difference/mean": 0.02059682086110115, "step": 6, "step_time": 10.616379391867667 }, { "clip_ratio/high_max": 0.0024226909736171365, "clip_ratio/high_mean": 0.0006056727434042841, "clip_ratio/low_mean": 0.0011607309861574322, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017664037295617163, "completions/clipped_ratio": 0.15625, "completions/max_length": 384.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 237.03125, "completions/mean_terminated_length": 209.8148193359375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.43470465019345284, "epoch": 0.03414634146341464, "frac_reward_zero_std": 0.8125, "grad_norm": 0.027135657146573067, "kl": 0.0009752626210683957, "learning_rate": 8.736585365853658e-06, "loss": 0.0465, "num_tokens": 609315.0, "reward": 1.5625, "reward_std": 0.7593502998352051, "rewards/answer_reward/mean": 0.71875, "rewards/answer_reward/std": 0.45680341124534607, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3689020276069641, "sampling/importance_sampling_ratio/max": 1.7231906652450562, "sampling/importance_sampling_ratio/mean": 0.7014956474304199, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2421488761901855, "sampling/sampling_logp_difference/mean": 0.017867177724838257, "step": 7, "step_time": 10.432691829744726 }, { "clip_ratio/high_max": 0.0021982089965604246, "clip_ratio/high_mean": 0.0005495522491401061, "clip_ratio/low_mean": 0.0003420549910515547, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008916072401916608, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 236.8125, "completions/mean_terminated_length": 232.06451416015625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.4120408035814762, "epoch": 0.03902439024390244, "frac_reward_zero_std": 0.75, "grad_norm": 0.0498298779129982, "kl": 0.00090484361135168, "learning_rate": 8.692682926829268e-06, "loss": -0.0255, "num_tokens": 696055.0, "reward": 1.6640625, "reward_std": 0.5221287608146667, "rewards/answer_reward/mean": 0.6875, "rewards/answer_reward/std": 0.4709290862083435, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.13258251547813416, "sampling/importance_sampling_ratio/max": 2.8426010608673096, "sampling/importance_sampling_ratio/mean": 0.9005006551742554, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.600479781627655, "sampling/sampling_logp_difference/mean": 0.017349526286125183, "step": 8, "step_time": 10.365724839270115 }, { "clip_ratio/high_max": 0.002021610882366076, "clip_ratio/high_mean": 0.000505402720591519, "clip_ratio/low_mean": 0.0005256929944152944, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010310957077308558, "completions/clipped_ratio": 0.09375, "completions/max_length": 384.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 241.09375, "completions/mean_terminated_length": 226.3103485107422, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.4374147206544876, "epoch": 0.04390243902439024, "frac_reward_zero_std": 0.6875, "grad_norm": 0.03801951929926872, "kl": 0.0009326529398094863, "learning_rate": 8.648780487804878e-06, "loss": 0.0015, "num_tokens": 785434.0, "reward": 1.7734375, "reward_std": 0.5585095882415771, "rewards/answer_reward/mean": 0.84375, "rewards/answer_reward/std": 0.3689020276069641, "rewards/format_reward/mean": 0.9296875, "rewards/format_reward/std": 0.2310073971748352, "sampling/importance_sampling_ratio/max": 2.4348058700561523, "sampling/importance_sampling_ratio/mean": 0.9108383655548096, "sampling/importance_sampling_ratio/min": 0.12963537871837616, "sampling/sampling_logp_difference/max": 0.867497444152832, "sampling/sampling_logp_difference/mean": 0.018413696438074112, "step": 9, "step_time": 11.008081156294793 }, { "clip_ratio/high_max": 0.0013237882521934807, "clip_ratio/high_mean": 0.0003309470630483702, "clip_ratio/low_mean": 0.00016108246927615255, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004920295323245227, "completions/clipped_ratio": 0.09375, "completions/max_length": 384.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 226.25, "completions/mean_terminated_length": 209.9310302734375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.4347679689526558, "epoch": 0.04878048780487805, "frac_reward_zero_std": 0.75, "grad_norm": 0.05612954497337341, "kl": 0.0009539900784147903, "learning_rate": 8.604878048780488e-06, "loss": -0.0053, "num_tokens": 870644.0, "reward": 1.625, "reward_std": 0.6599119901657104, "rewards/answer_reward/mean": 0.71875, "rewards/answer_reward/std": 0.45680341124534607, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "sampling/importance_sampling_ratio/max": 2.6957921981811523, "sampling/importance_sampling_ratio/mean": 1.0683538913726807, "sampling/importance_sampling_ratio/min": 0.10242640972137451, "sampling/sampling_logp_difference/max": 0.8506159782409668, "sampling/sampling_logp_difference/mean": 0.018407460302114487, "step": 10, "step_time": 10.377598612569273 }, { "clip_ratio/high_max": 0.002954202995169908, "clip_ratio/high_mean": 0.000738550748792477, "clip_ratio/low_mean": 0.0005859916709596291, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001324542419752106, "completions/clipped_ratio": 0.1875, "completions/max_length": 384.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 254.46875, "completions/mean_terminated_length": 224.57693481445312, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.4255176857113838, "epoch": 0.05365853658536585, "frac_reward_zero_std": 0.625, "grad_norm": 0.043310217559337616, "kl": 0.0009655185167503078, "learning_rate": 8.560975609756098e-06, "loss": -0.0107, "num_tokens": 961701.0, "reward": 1.453125, "reward_std": 0.7762541770935059, "rewards/answer_reward/mean": 0.625, "rewards/answer_reward/std": 0.49186936020851135, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.3726404309272766, "sampling/importance_sampling_ratio/max": 2.75307297706604, "sampling/importance_sampling_ratio/mean": 0.9865622520446777, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7510389089584351, "sampling/sampling_logp_difference/mean": 0.016617847606539726, "step": 11, "step_time": 10.373775450047106 }, { "clip_ratio/high_max": 0.003720981883816421, "clip_ratio/high_mean": 0.0012697597121587023, "clip_ratio/low_mean": 0.0008682753614266403, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021380350735853426, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 235.6875, "completions/mean_terminated_length": 225.80001831054688, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.4825473502278328, "epoch": 0.05853658536585366, "frac_reward_zero_std": 0.4375, "grad_norm": 0.062373362481594086, "kl": 0.0009440175708732568, "learning_rate": 8.517073170731708e-06, "loss": -0.0089, "num_tokens": 1035933.0, "reward": 1.5625, "reward_std": 0.6189220547676086, "rewards/answer_reward/mean": 0.625, "rewards/answer_reward/std": 0.49186936020851135, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "sampling/importance_sampling_ratio/max": 2.656355381011963, "sampling/importance_sampling_ratio/mean": 0.879611611366272, "sampling/importance_sampling_ratio/min": 0.17241045832633972, "sampling/sampling_logp_difference/max": 0.6117346286773682, "sampling/sampling_logp_difference/mean": 0.019267737865447998, "step": 12, "step_time": 9.999413645360619 }, { "clip_ratio/high_max": 0.0015742292744107544, "clip_ratio/high_mean": 0.0003935573186026886, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003935573186026886, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 244.06668090820312, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.4906906373798847, "epoch": 0.06341463414634146, "frac_reward_zero_std": 0.8125, "grad_norm": 0.06609569489955902, "kl": 0.0009692223175079562, "learning_rate": 8.473170731707316e-06, "loss": -0.0378, "num_tokens": 1116519.0, "reward": 1.65625, "reward_std": 0.6015772223472595, "rewards/answer_reward/mean": 0.71875, "rewards/answer_reward/std": 0.45680341124534607, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "sampling/importance_sampling_ratio/max": 2.567021608352661, "sampling/importance_sampling_ratio/mean": 0.7601916790008545, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.030838966369629, "sampling/sampling_logp_difference/mean": 0.018852218985557556, "step": 13, "step_time": 10.928391702473164 }, { "clip_ratio/high_max": 0.0029226120095700026, "clip_ratio/high_mean": 0.0007306530023925006, "clip_ratio/low_mean": 0.00012159533071098849, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008522483331034891, "completions/clipped_ratio": 0.09375, "completions/max_length": 384.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 242.40625, "completions/mean_terminated_length": 227.7586212158203, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.4702279828488827, "epoch": 0.06829268292682927, "frac_reward_zero_std": 0.8125, "grad_norm": 0.03039458580315113, "kl": 0.001274055590329226, "learning_rate": 8.429268292682928e-06, "loss": -0.008, "num_tokens": 1195028.0, "reward": 1.5, "reward_std": 0.6720215082168579, "rewards/answer_reward/mean": 0.59375, "rewards/answer_reward/std": 0.49899089336395264, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "sampling/importance_sampling_ratio/max": 2.4283981323242188, "sampling/importance_sampling_ratio/mean": 0.6639824509620667, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.616560697555542, "sampling/sampling_logp_difference/mean": 0.01967538334429264, "step": 14, "step_time": 11.205044919159263 }, { "clip_ratio/high_max": 0.0007225433364510536, "clip_ratio/high_mean": 0.0001806358341127634, "clip_ratio/low_mean": 0.00011837121564894915, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029900704976171255, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 222.03125, "completions/mean_terminated_length": 211.23333740234375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.453625813126564, "epoch": 0.07317073170731707, "frac_reward_zero_std": 0.9375, "grad_norm": 0.020886672660708427, "kl": 0.0012097211001673713, "learning_rate": 8.385365853658538e-06, "loss": -0.0134, "num_tokens": 1294917.0, "reward": 1.65625, "reward_std": 0.6015772223472595, "rewards/answer_reward/mean": 0.71875, "rewards/answer_reward/std": 0.45680341124534607, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "sampling/importance_sampling_ratio/max": 2.8972060680389404, "sampling/importance_sampling_ratio/mean": 0.7579200863838196, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1011431217193604, "sampling/sampling_logp_difference/mean": 0.020521733909845352, "step": 15, "step_time": 11.170089395251125 }, { "clip_ratio/high_max": 0.00455408327979967, "clip_ratio/high_mean": 0.0013634996867040172, "clip_ratio/low_mean": 0.0003042821626877412, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016677818493917584, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 229.25, "completions/mean_terminated_length": 218.933349609375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.46426226384937763, "epoch": 0.07804878048780488, "frac_reward_zero_std": 0.625, "grad_norm": 0.03689539059996605, "kl": 0.0012160674305050634, "learning_rate": 8.341463414634147e-06, "loss": 0.0201, "num_tokens": 1391917.0, "reward": 1.40625, "reward_std": 0.6148366928100586, "rewards/answer_reward/mean": 0.46875, "rewards/answer_reward/std": 0.507007360458374, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "sampling/importance_sampling_ratio/max": 2.1973605155944824, "sampling/importance_sampling_ratio/mean": 0.7800529599189758, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.7803244590759277, "sampling/sampling_logp_difference/mean": 0.01894780993461609, "step": 16, "step_time": 11.071669754106551 }, { "clip_ratio/high_max": 0.002475247485563159, "clip_ratio/high_mean": 0.0007798943552188575, "clip_ratio/low_mean": 0.0006545381329488009, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014344324881676584, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 225.15625, "completions/mean_terminated_length": 225.15625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.4997505024075508, "epoch": 0.08292682926829269, "frac_reward_zero_std": 0.875, "grad_norm": 0.026097500696778297, "kl": 0.0012611713318619877, "learning_rate": 8.297560975609757e-06, "loss": -0.019, "num_tokens": 1489458.0, "reward": 1.6328125, "reward_std": 0.5312203764915466, "rewards/answer_reward/mean": 0.65625, "rewards/answer_reward/std": 0.4825586974620819, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.13258251547813416, "sampling/importance_sampling_ratio/max": 2.27899169921875, "sampling/importance_sampling_ratio/mean": 0.7153455018997192, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.775173187255859, "sampling/sampling_logp_difference/mean": 0.02286536619067192, "step": 17, "step_time": 11.0951445591636 }, { "clip_ratio/high_max": 0.0026107223820872605, "clip_ratio/high_mean": 0.0006526805955218151, "clip_ratio/low_mean": 0.0005345513636711985, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011872319591930136, "completions/clipped_ratio": 0.15625, "completions/max_length": 384.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 257.875, "completions/mean_terminated_length": 234.51852416992188, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.41546468436717987, "epoch": 0.08780487804878048, "frac_reward_zero_std": 0.625, "grad_norm": 0.03923935070633888, "kl": 0.001213295035995543, "learning_rate": 8.253658536585366e-06, "loss": 0.084, "num_tokens": 1580204.0, "reward": 1.453125, "reward_std": 0.7335219383239746, "rewards/answer_reward/mean": 0.59375, "rewards/answer_reward/std": 0.49899089336395264, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3415895998477936, "sampling/importance_sampling_ratio/max": 2.0624868869781494, "sampling/importance_sampling_ratio/mean": 0.7597397565841675, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0584347248077393, "sampling/sampling_logp_difference/mean": 0.016922926530241966, "step": 18, "step_time": 10.205001638270915 }, { "clip_ratio/high_max": 0.0017201835289597511, "clip_ratio/high_mean": 0.0008581280708312988, "clip_ratio/low_mean": 0.0002246998847112991, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001082827955542598, "completions/clipped_ratio": 0.15625, "completions/max_length": 384.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 249.75, "completions/mean_terminated_length": 224.88888549804688, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.4038604088127613, "epoch": 0.09268292682926829, "frac_reward_zero_std": 0.8125, "grad_norm": 0.03827754035592079, "kl": 0.0010679275656002574, "learning_rate": 8.209756097560976e-06, "loss": -0.0344, "num_tokens": 1666464.0, "reward": 1.625, "reward_std": 0.7513428926467896, "rewards/answer_reward/mean": 0.78125, "rewards/answer_reward/std": 0.420013427734375, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3689020276069641, "sampling/importance_sampling_ratio/max": 2.7601513862609863, "sampling/importance_sampling_ratio/mean": 0.8299179077148438, "sampling/importance_sampling_ratio/min": 0.15807947516441345, "sampling/sampling_logp_difference/max": 0.6480059623718262, "sampling/sampling_logp_difference/mean": 0.01686183363199234, "step": 19, "step_time": 10.326803258154541 }, { "clip_ratio/high_max": 0.003928416001144797, "clip_ratio/high_mean": 0.0009821040002861992, "clip_ratio/low_mean": 0.0004931948787998408, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00147529887908604, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 228.59375, "completions/mean_terminated_length": 218.23333740234375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.47537605091929436, "epoch": 0.0975609756097561, "frac_reward_zero_std": 0.5625, "grad_norm": 0.06714574247598648, "kl": 0.0010877468739636242, "learning_rate": 8.165853658536585e-06, "loss": 0.0638, "num_tokens": 1741253.0, "reward": 1.625, "reward_std": 0.5990583896636963, "rewards/answer_reward/mean": 0.6875, "rewards/answer_reward/std": 0.4709290862083435, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.2199706733226776, "sampling/importance_sampling_ratio/max": 2.1850311756134033, "sampling/importance_sampling_ratio/mean": 0.8047342300415039, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8779163360595703, "sampling/sampling_logp_difference/mean": 0.01931113749742508, "step": 20, "step_time": 10.537213985342532 }, { "clip_ratio/high_max": 0.002903377404436469, "clip_ratio/high_mean": 0.0007258443511091173, "clip_ratio/low_mean": 0.00015470296784769744, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008805473189568147, "completions/clipped_ratio": 0.09375, "completions/max_length": 384.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 232.03125, "completions/mean_terminated_length": 216.3103485107422, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.44494859501719475, "epoch": 0.1024390243902439, "frac_reward_zero_std": 0.875, "grad_norm": 0.02120947651565075, "kl": 0.0011454233899712563, "learning_rate": 8.121951219512195e-06, "loss": 0.0463, "num_tokens": 1827444.0, "reward": 1.4765625, "reward_std": 0.6548804640769958, "rewards/answer_reward/mean": 0.5625, "rewards/answer_reward/std": 0.504016101360321, "rewards/format_reward/mean": 0.9140625, "rewards/format_reward/std": 0.27393028140068054, "sampling/importance_sampling_ratio/max": 2.5996663570404053, "sampling/importance_sampling_ratio/mean": 0.8410799503326416, "sampling/importance_sampling_ratio/min": 0.20291541516780853, "sampling/sampling_logp_difference/max": 0.5728070735931396, "sampling/sampling_logp_difference/mean": 0.017538364976644516, "step": 21, "step_time": 10.532630940899253 }, { "clip_ratio/high_max": 0.003099173540249467, "clip_ratio/high_mean": 0.001011535758152604, "clip_ratio/low_mean": 8.138021075865254e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010929159689112566, "completions/clipped_ratio": 0.125, "completions/max_length": 384.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 248.46875, "completions/mean_terminated_length": 229.10714721679688, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.4029152952134609, "epoch": 0.1073170731707317, "frac_reward_zero_std": 0.8125, "grad_norm": 0.02548247203230858, "kl": 0.0010262145879096352, "learning_rate": 8.078048780487805e-06, "loss": 0.0245, "num_tokens": 1919727.0, "reward": 1.53125, "reward_std": 0.717719316482544, "rewards/answer_reward/mean": 0.65625, "rewards/answer_reward/std": 0.4825586974620819, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33601075410842896, "sampling/importance_sampling_ratio/max": 2.7946197986602783, "sampling/importance_sampling_ratio/mean": 0.8885304927825928, "sampling/importance_sampling_ratio/min": 0.10960470139980316, "sampling/sampling_logp_difference/max": 0.5226888656616211, "sampling/sampling_logp_difference/mean": 0.016314420849084854, "step": 22, "step_time": 10.934702535625547 }, { "clip_ratio/high_max": 0.005095954053103924, "clip_ratio/high_mean": 0.001401021028868854, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001401021028868854, "completions/clipped_ratio": 0.09375, "completions/max_length": 384.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 239.9375, "completions/mean_terminated_length": 225.03448486328125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.4513583593070507, "epoch": 0.11219512195121951, "frac_reward_zero_std": 0.625, "grad_norm": 0.04165147244930267, "kl": 0.0013165839554858394, "learning_rate": 8.034146341463415e-06, "loss": 0.05, "num_tokens": 2002777.0, "reward": 1.65625, "reward_std": 0.6530017852783203, "rewards/answer_reward/mean": 0.75, "rewards/answer_reward/std": 0.4399413466453552, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "sampling/importance_sampling_ratio/max": 2.4120402336120605, "sampling/importance_sampling_ratio/mean": 0.7301255464553833, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6974658966064453, "sampling/sampling_logp_difference/mean": 0.0183754600584507, "step": 23, "step_time": 11.071318143047392 }, { "clip_ratio/high_max": 0.004376540542580187, "clip_ratio/high_mean": 0.0013326847401913255, "clip_ratio/low_mean": 0.0002991502478835173, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001631835002626758, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 222.53125, "completions/mean_terminated_length": 217.32257080078125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.5213893316686153, "epoch": 0.11707317073170732, "frac_reward_zero_std": 0.75, "grad_norm": 0.06730229407548904, "kl": 0.0015682090306654572, "learning_rate": 7.990243902439025e-06, "loss": -0.0366, "num_tokens": 2100142.0, "reward": 1.609375, "reward_std": 0.5195279121398926, "rewards/answer_reward/mean": 0.625, "rewards/answer_reward/std": 0.49186936020851135, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0883883461356163, "sampling/importance_sampling_ratio/max": 2.6528284549713135, "sampling/importance_sampling_ratio/mean": 0.8558666110038757, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2006051540374756, "sampling/sampling_logp_difference/mean": 0.02326766401529312, "step": 24, "step_time": 10.42135714367032 }, { "clip_ratio/high_max": 0.003065476194024086, "clip_ratio/high_mean": 0.0007663690485060215, "clip_ratio/low_mean": 0.0004078065903740935, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001174175638880115, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 217.28125, "completions/mean_terminated_length": 211.90321350097656, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.46236416697502136, "epoch": 0.12195121951219512, "frac_reward_zero_std": 0.5625, "grad_norm": 0.0592721663415432, "kl": 0.0017062597471522167, "learning_rate": 7.946341463414635e-06, "loss": -0.0339, "num_tokens": 2192299.0, "reward": 1.5, "reward_std": 0.5679618120193481, "rewards/answer_reward/mean": 0.53125, "rewards/answer_reward/std": 0.507007360458374, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 1.9587960243225098, "sampling/importance_sampling_ratio/mean": 0.8211201429367065, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.7250020503997803, "sampling/sampling_logp_difference/mean": 0.020450182259082794, "step": 25, "step_time": 10.746596784330904 }, { "clip_ratio/high_max": 0.0068384717451408505, "clip_ratio/high_mean": 0.002308627968886867, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002308627968886867, "completions/clipped_ratio": 0.09375, "completions/max_length": 384.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 242.09375, "completions/mean_terminated_length": 227.41378784179688, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.4656551331281662, "epoch": 0.12682926829268293, "frac_reward_zero_std": 0.625, "grad_norm": 0.061459530144929886, "kl": 0.0014516226292471401, "learning_rate": 7.902439024390243e-06, "loss": -0.0394, "num_tokens": 2287470.0, "reward": 1.65625, "reward_std": 0.6530017852783203, "rewards/answer_reward/mean": 0.75, "rewards/answer_reward/std": 0.4399413466453552, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "sampling/importance_sampling_ratio/max": 2.7204465866088867, "sampling/importance_sampling_ratio/mean": 0.9058985710144043, "sampling/importance_sampling_ratio/min": 0.18960736691951752, "sampling/sampling_logp_difference/max": 0.5948166847229004, "sampling/sampling_logp_difference/mean": 0.01951504498720169, "step": 26, "step_time": 10.900358975399286 }, { "clip_ratio/high_max": 0.002045851550064981, "clip_ratio/high_mean": 0.0006064476910978556, "clip_ratio/low_mean": 0.0004835754953091964, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001090023186407052, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 234.8125, "completions/mean_terminated_length": 230.0, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.4549492858350277, "epoch": 0.13170731707317074, "frac_reward_zero_std": 0.75, "grad_norm": 0.027680985629558563, "kl": 0.0015303581167245284, "learning_rate": 7.858536585365853e-06, "loss": 0.0333, "num_tokens": 2362948.0, "reward": 1.71875, "reward_std": 0.5226715207099915, "rewards/answer_reward/mean": 0.75, "rewards/answer_reward/std": 0.4399413466453552, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.0998833179473877, "sampling/importance_sampling_ratio/mean": 0.7969750165939331, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8335514068603516, "sampling/sampling_logp_difference/mean": 0.019734302535653114, "step": 27, "step_time": 11.038234162610024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 218.71875, "completions/mean_terminated_length": 218.71875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.5339497365057468, "epoch": 0.13658536585365855, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004977538483217359, "kl": 0.0018534217670094222, "learning_rate": 7.814634146341463e-06, "loss": 0.0, "num_tokens": 2446711.0, "reward": 1.75, "reward_std": 0.4399413466453552, "rewards/answer_reward/mean": 0.75, "rewards/answer_reward/std": 0.4399413466453552, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.2973110675811768, "sampling/importance_sampling_ratio/mean": 1.0180081129074097, "sampling/importance_sampling_ratio/min": 0.19585348665714264, "sampling/sampling_logp_difference/max": 0.9157562255859375, "sampling/sampling_logp_difference/mean": 0.02274218201637268, "step": 28, "step_time": 10.176566608250141 }, { "clip_ratio/high_max": 0.005964706419035792, "clip_ratio/high_mean": 0.0016934419982135296, "clip_ratio/low_mean": 0.0007721543297520839, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024655963279656135, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 236.65625, "completions/mean_terminated_length": 231.90321350097656, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.5105221085250378, "epoch": 0.14146341463414633, "frac_reward_zero_std": 0.6875, "grad_norm": 0.02868981659412384, "kl": 0.002936592645710334, "learning_rate": 7.770731707317073e-06, "loss": 0.0072, "num_tokens": 2542820.0, "reward": 1.71875, "reward_std": 0.5226715207099915, "rewards/answer_reward/mean": 0.75, "rewards/answer_reward/std": 0.4399413466453552, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.392137050628662, "sampling/importance_sampling_ratio/mean": 0.7716360092163086, "sampling/importance_sampling_ratio/min": 0.05495074391365051, "sampling/sampling_logp_difference/max": 1.4996533393859863, "sampling/sampling_logp_difference/mean": 0.02269141376018524, "step": 29, "step_time": 10.555466800462455 }, { "clip_ratio/high_max": 0.0029285850760061294, "clip_ratio/high_mean": 0.0007321462690015323, "clip_ratio/low_mean": 0.000692529632942751, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014246759164961986, "completions/clipped_ratio": 0.09375, "completions/max_length": 384.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 230.4375, "completions/mean_terminated_length": 214.55172729492188, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.46762334555387497, "epoch": 0.14634146341463414, "frac_reward_zero_std": 0.75, "grad_norm": 0.03677405044436455, "kl": 0.0016829167871037498, "learning_rate": 7.726829268292683e-06, "loss": 0.0221, "num_tokens": 2621738.0, "reward": 1.625, "reward_std": 0.5956833958625793, "rewards/answer_reward/mean": 0.6875, "rewards/answer_reward/std": 0.4709290862083435, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.21060587465763092, "sampling/importance_sampling_ratio/max": 2.905714273452759, "sampling/importance_sampling_ratio/mean": 0.8889265060424805, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6631779670715332, "sampling/sampling_logp_difference/mean": 0.019964046776294708, "step": 30, "step_time": 10.969341307878494 }, { "clip_ratio/high_max": 0.0034808090422302485, "clip_ratio/high_mean": 0.0008702022605575621, "clip_ratio/low_mean": 0.0008536933601135388, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001723895620671101, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 233.25, "completions/mean_terminated_length": 233.25, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.5513717904686928, "epoch": 0.15121951219512195, "frac_reward_zero_std": 0.6875, "grad_norm": 0.027868084609508514, "kl": 0.0018341631075600162, "learning_rate": 7.682926829268293e-06, "loss": -0.0053, "num_tokens": 2707656.0, "reward": 1.59375, "reward_std": 0.498990923166275, "rewards/answer_reward/mean": 0.59375, "rewards/answer_reward/std": 0.49899089336395264, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9892253875732422, "sampling/importance_sampling_ratio/mean": 0.7115558385848999, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8299820423126221, "sampling/sampling_logp_difference/mean": 0.023282520473003387, "step": 31, "step_time": 10.565436326898634 }, { "clip_ratio/high_max": 0.0017246377537958324, "clip_ratio/high_mean": 0.0004311594384489581, "clip_ratio/low_mean": 0.0005854124538018368, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001016571892250795, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 235.25, "completions/mean_terminated_length": 225.33334350585938, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.5157393738627434, "epoch": 0.15609756097560976, "frac_reward_zero_std": 0.8125, "grad_norm": 0.04127691686153412, "kl": 0.0023348392714979127, "learning_rate": 7.639024390243903e-06, "loss": -0.0251, "num_tokens": 2805772.0, "reward": 1.703125, "reward_std": 0.55152827501297, "rewards/answer_reward/mean": 0.75, "rewards/answer_reward/std": 0.4399413466453552, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.19507545232772827, "sampling/importance_sampling_ratio/max": 2.6918134689331055, "sampling/importance_sampling_ratio/mean": 0.842719316482544, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8110966682434082, "sampling/sampling_logp_difference/mean": 0.023382481187582016, "step": 32, "step_time": 10.72528725489974 }, { "clip_ratio/high_max": 0.0016565915429964662, "clip_ratio/high_mean": 0.00041414788574911654, "clip_ratio/low_mean": 0.0005713619466405362, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009855098323896527, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 210.71875, "completions/mean_terminated_length": 210.71875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.4824623428285122, "epoch": 0.16097560975609757, "frac_reward_zero_std": 0.8125, "grad_norm": 0.0447763092815876, "kl": 0.002048063703114167, "learning_rate": 7.595121951219512e-06, "loss": -0.0671, "num_tokens": 2895207.0, "reward": 1.78125, "reward_std": 0.420013427734375, "rewards/answer_reward/mean": 0.78125, "rewards/answer_reward/std": 0.420013427734375, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8762876987457275, "sampling/importance_sampling_ratio/mean": 0.9170467853546143, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2772331237792969, "sampling/sampling_logp_difference/mean": 0.022362567484378815, "step": 33, "step_time": 11.0103964721784 }, { "clip_ratio/high_max": 0.004994971328414977, "clip_ratio/high_mean": 0.0013907882967032492, "clip_ratio/low_mean": 0.00021043770539108664, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016012260020943359, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 231.21875, "completions/mean_terminated_length": 226.29031372070312, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.5011247470974922, "epoch": 0.16585365853658537, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0565413273870945, "kl": 0.002075166688882746, "learning_rate": 7.551219512195122e-06, "loss": -0.0623, "num_tokens": 2994758.0, "reward": 1.6875, "reward_std": 0.5350610613822937, "rewards/answer_reward/mean": 0.71875, "rewards/answer_reward/std": 0.45680341124534607, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.5171658992767334, "sampling/importance_sampling_ratio/mean": 0.8619438409805298, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2942347526550293, "sampling/sampling_logp_difference/mean": 0.02174638956785202, "step": 34, "step_time": 10.241550699807703 }, { "clip_ratio/high_max": 0.006089852511649951, "clip_ratio/high_mean": 0.0022169076473801397, "clip_ratio/low_mean": 0.0005561558209592476, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027730634683393873, "completions/clipped_ratio": 0.09375, "completions/max_length": 384.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 236.78125, "completions/mean_terminated_length": 221.55172729492188, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.4057164266705513, "epoch": 0.17073170731707318, "frac_reward_zero_std": 0.625, "grad_norm": 0.041300203651189804, "kl": 0.004376085416879505, "learning_rate": 7.507317073170732e-06, "loss": 0.0395, "num_tokens": 3080403.0, "reward": 1.6640625, "reward_std": 0.6337592005729675, "rewards/answer_reward/mean": 0.75, "rewards/answer_reward/std": 0.4399413466453552, "rewards/format_reward/mean": 0.9140625, "rewards/format_reward/std": 0.27393028140068054, "sampling/importance_sampling_ratio/max": 2.6454672813415527, "sampling/importance_sampling_ratio/mean": 0.9857398867607117, "sampling/importance_sampling_ratio/min": 0.1031593307852745, "sampling/sampling_logp_difference/max": 1.6611382961273193, "sampling/sampling_logp_difference/mean": 0.017723016440868378, "step": 35, "step_time": 9.978441542945802 }, { "clip_ratio/high_max": 0.002132236957550049, "clip_ratio/high_mean": 0.0005330592393875122, "clip_ratio/low_mean": 0.0003881987649947405, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009212580043822527, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 216.34375, "completions/mean_terminated_length": 216.34375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.43352097272872925, "epoch": 0.17560975609756097, "frac_reward_zero_std": 0.8125, "grad_norm": 0.047643136233091354, "kl": 0.014415435813134536, "learning_rate": 7.463414634146341e-06, "loss": -0.0235, "num_tokens": 3168286.0, "reward": 1.78125, "reward_std": 0.420013427734375, "rewards/answer_reward/mean": 0.78125, "rewards/answer_reward/std": 0.420013427734375, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.710761070251465, "sampling/importance_sampling_ratio/mean": 0.9685940146446228, "sampling/importance_sampling_ratio/min": 0.03373658284544945, "sampling/sampling_logp_difference/max": 1.9030696153640747, "sampling/sampling_logp_difference/mean": 0.020585577934980392, "step": 36, "step_time": 10.685656030196697 }, { "clip_ratio/high_max": 0.0021429979242384434, "clip_ratio/high_mean": 0.0006177704781293869, "clip_ratio/low_mean": 0.0002006550203077495, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008184254984371364, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 228.28125, "completions/mean_terminated_length": 223.258056640625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.46170755848288536, "epoch": 0.18048780487804877, "frac_reward_zero_std": 0.8125, "grad_norm": 0.08575189858675003, "kl": 0.02100449550198391, "learning_rate": 7.419512195121951e-06, "loss": 0.0234, "num_tokens": 3257725.0, "reward": 1.7578125, "reward_std": 0.48144763708114624, "rewards/answer_reward/mean": 0.78125, "rewards/answer_reward/std": 0.420013427734375, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.13258251547813416, "sampling/importance_sampling_ratio/max": 2.3194284439086914, "sampling/importance_sampling_ratio/mean": 0.8794113397598267, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.0725936889648438, "sampling/sampling_logp_difference/mean": 0.02204008772969246, "step": 37, "step_time": 10.496698076371104 }, { "clip_ratio/high_max": 0.004324117675423622, "clip_ratio/high_mean": 0.0010810294188559055, "clip_ratio/low_mean": 0.0011696535220835358, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022506829409394413, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 233.78125, "completions/mean_terminated_length": 233.78125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.49030400812625885, "epoch": 0.18536585365853658, "frac_reward_zero_std": 0.75, "grad_norm": 0.03152948245406151, "kl": 0.0031579996284563094, "learning_rate": 7.375609756097561e-06, "loss": -0.0381, "num_tokens": 3354826.0, "reward": 1.625, "reward_std": 0.49186936020851135, "rewards/answer_reward/mean": 0.625, "rewards/answer_reward/std": 0.49186936020851135, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.481741189956665, "sampling/importance_sampling_ratio/mean": 0.7777850031852722, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6459008455276489, "sampling/sampling_logp_difference/mean": 0.023015951737761497, "step": 38, "step_time": 10.31284431135282 }, { "clip_ratio/high_max": 0.0015046712360344827, "clip_ratio/high_mean": 0.0005478710954776034, "clip_ratio/low_mean": 0.0007652460044482723, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013131170999258757, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 224.375, "completions/mean_terminated_length": 224.375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.488705862313509, "epoch": 0.1902439024390244, "frac_reward_zero_std": 0.75, "grad_norm": 0.04360480234026909, "kl": 0.004071205868967809, "learning_rate": 7.331707317073171e-06, "loss": 0.0205, "num_tokens": 3446642.0, "reward": 1.7734375, "reward_std": 0.4181341826915741, "rewards/answer_reward/mean": 0.78125, "rewards/answer_reward/std": 0.420013427734375, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.36177659034729, "sampling/importance_sampling_ratio/mean": 0.794922947883606, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3266563415527344, "sampling/sampling_logp_difference/mean": 0.022426854819059372, "step": 39, "step_time": 10.519494338426739 }, { "clip_ratio/high_max": 0.0020885482663288713, "clip_ratio/high_mean": 0.0005221370665822178, "clip_ratio/low_mean": 0.0010203170022577979, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015424540688400157, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 212.6875, "completions/mean_terminated_length": 212.6875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.4904809705913067, "epoch": 0.1951219512195122, "frac_reward_zero_std": 0.8125, "grad_norm": 0.025321129709482193, "kl": 0.0034989779960596934, "learning_rate": 7.2878048780487815e-06, "loss": -0.0094, "num_tokens": 3539134.0, "reward": 1.65625, "reward_std": 0.4825586974620819, "rewards/answer_reward/mean": 0.65625, "rewards/answer_reward/std": 0.4825586974620819, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0822365283966064, "sampling/importance_sampling_ratio/mean": 0.5993125438690186, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.4955928325653076, "sampling/sampling_logp_difference/mean": 0.023110268637537956, "step": 40, "step_time": 10.935050040949136 }, { "clip_ratio/high_max": 0.001646745076868683, "clip_ratio/high_mean": 0.0005049698665970936, "clip_ratio/low_mean": 0.0007270213682204485, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001231991220265627, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 228.78125, "completions/mean_terminated_length": 223.77418518066406, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.4727805405855179, "epoch": 0.2, "frac_reward_zero_std": 0.75, "grad_norm": 0.04103660210967064, "kl": 0.005718181113479659, "learning_rate": 7.243902439024391e-06, "loss": -0.0544, "num_tokens": 3609459.0, "reward": 1.578125, "reward_std": 0.5253167152404785, "rewards/answer_reward/mean": 0.59375, "rewards/answer_reward/std": 0.49899089336395264, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0883883461356163, "sampling/importance_sampling_ratio/max": 2.6190927028656006, "sampling/importance_sampling_ratio/mean": 0.8007964491844177, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0930960178375244, "sampling/sampling_logp_difference/mean": 0.02303743176162243, "step": 41, "step_time": 10.17232642415911 }, { "clip_ratio/high_max": 0.002646659704623744, "clip_ratio/high_mean": 0.0009345906946691684, "clip_ratio/low_mean": 0.0004987043212167919, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014332950158859603, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 227.59375, "completions/mean_terminated_length": 217.1666717529297, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.47006719931960106, "epoch": 0.2048780487804878, "frac_reward_zero_std": 0.6875, "grad_norm": 0.03479534760117531, "kl": 0.0061148157401476055, "learning_rate": 7.2000000000000005e-06, "loss": -0.0252, "num_tokens": 3708660.0, "reward": 1.6640625, "reward_std": 0.5806329250335693, "rewards/answer_reward/mean": 0.71875, "rewards/answer_reward/std": 0.45680341124534607, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.21752247214317322, "sampling/importance_sampling_ratio/max": 1.9399240016937256, "sampling/importance_sampling_ratio/mean": 0.7261408567428589, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.255643129348755, "sampling/sampling_logp_difference/mean": 0.0239977166056633, "step": 42, "step_time": 10.487732320092618 }, { "clip_ratio/high_max": 0.003040820301976055, "clip_ratio/high_mean": 0.0007602050754940137, "clip_ratio/low_mean": 0.000557002320419997, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013172073813620955, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 209.78125, "completions/mean_terminated_length": 204.16128540039062, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.518859688192606, "epoch": 0.2097560975609756, "frac_reward_zero_std": 0.75, "grad_norm": 0.034217655658721924, "kl": 0.0050877362955361605, "learning_rate": 7.1560975609756104e-06, "loss": -0.0054, "num_tokens": 3782987.0, "reward": 1.671875, "reward_std": 0.5017610192298889, "rewards/answer_reward/mean": 0.6875, "rewards/answer_reward/std": 0.4709290862083435, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0883883461356163, "sampling/importance_sampling_ratio/max": 1.9648051261901855, "sampling/importance_sampling_ratio/mean": 0.7545971870422363, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.5439283847808838, "sampling/sampling_logp_difference/mean": 0.02481948584318161, "step": 43, "step_time": 11.278675698675215 }, { "clip_ratio/high_max": 0.0011467889416962862, "clip_ratio/high_mean": 0.00043204607209190726, "clip_ratio/low_mean": 0.0002802690723910928, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000712315144483, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 213.125, "completions/mean_terminated_length": 213.125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.47029734402894974, "epoch": 0.2146341463414634, "frac_reward_zero_std": 0.8125, "grad_norm": 0.033608853816986084, "kl": 0.0038877171755302697, "learning_rate": 7.11219512195122e-06, "loss": 0.0084, "num_tokens": 3874359.0, "reward": 1.71875, "reward_std": 0.45680341124534607, "rewards/answer_reward/mean": 0.71875, "rewards/answer_reward/std": 0.45680341124534607, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.978834629058838, "sampling/importance_sampling_ratio/mean": 0.6548266410827637, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.8893768787384033, "sampling/sampling_logp_difference/mean": 0.024706728756427765, "step": 44, "step_time": 10.805110983084887 }, { "clip_ratio/high_max": 0.008944265078753233, "clip_ratio/high_mean": 0.0022360662696883082, "clip_ratio/low_mean": 0.0007542976964032277, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002990363966091536, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 228.34375, "completions/mean_terminated_length": 223.32257080078125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.5289417952299118, "epoch": 0.21951219512195122, "frac_reward_zero_std": 0.625, "grad_norm": 0.060505449771881104, "kl": 0.0035444071982055902, "learning_rate": 7.0682926829268295e-06, "loss": 0.1305, "num_tokens": 3971108.0, "reward": 1.734375, "reward_std": 0.4749257266521454, "rewards/answer_reward/mean": 0.75, "rewards/answer_reward/std": 0.4399413466453552, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0883883461356163, "sampling/importance_sampling_ratio/max": 2.517712354660034, "sampling/importance_sampling_ratio/mean": 0.8168565034866333, "sampling/importance_sampling_ratio/min": 0.1494995355606079, "sampling/sampling_logp_difference/max": 0.7752656936645508, "sampling/sampling_logp_difference/mean": 0.025015059858560562, "step": 45, "step_time": 11.442232412751764 }, { "clip_ratio/high_max": 0.002964715618873015, "clip_ratio/high_mean": 0.0007411789047182538, "clip_ratio/low_mean": 0.0002559612039476633, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000997140108665917, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 222.9375, "completions/mean_terminated_length": 212.20001220703125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.4323978088796139, "epoch": 0.22439024390243903, "frac_reward_zero_std": 0.8125, "grad_norm": 0.010640453547239304, "kl": 0.005387827957747504, "learning_rate": 7.024390243902439e-06, "loss": 0.0093, "num_tokens": 4056310.0, "reward": 1.6328125, "reward_std": 0.5888218879699707, "rewards/answer_reward/mean": 0.6875, "rewards/answer_reward/std": 0.4709290862083435, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.21752247214317322, "sampling/importance_sampling_ratio/max": 2.1510355472564697, "sampling/importance_sampling_ratio/mean": 0.501771867275238, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.349775791168213, "sampling/sampling_logp_difference/mean": 0.022949447855353355, "step": 46, "step_time": 11.483183315955102 }, { "clip_ratio/high_max": 0.005627745558740571, "clip_ratio/high_mean": 0.0014069363896851428, "clip_ratio/low_mean": 0.00041094396874541417, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018178803511545993, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 218.4375, "completions/mean_terminated_length": 218.4375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.47491561621427536, "epoch": 0.22926829268292684, "frac_reward_zero_std": 0.6875, "grad_norm": 0.03308742120862007, "kl": 0.004515227046795189, "learning_rate": 6.980487804878049e-06, "loss": 0.0163, "num_tokens": 4147062.0, "reward": 1.59375, "reward_std": 0.498990923166275, "rewards/answer_reward/mean": 0.59375, "rewards/answer_reward/std": 0.49899089336395264, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5992398262023926, "sampling/importance_sampling_ratio/mean": 0.7042809724807739, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1094727516174316, "sampling/sampling_logp_difference/mean": 0.024059128016233444, "step": 47, "step_time": 10.558641617186368 }, { "clip_ratio/high_max": 0.0028658254886977375, "clip_ratio/high_mean": 0.0009117688896367326, "clip_ratio/low_mean": 0.0005486640438903123, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014604329189751297, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 224.625, "completions/mean_terminated_length": 214.00001525878906, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.4751594066619873, "epoch": 0.23414634146341465, "frac_reward_zero_std": 0.75, "grad_norm": 0.18119321763515472, "kl": 0.0899049931904301, "learning_rate": 6.936585365853659e-06, "loss": -0.0177, "num_tokens": 4242132.0, "reward": 1.5390625, "reward_std": 0.6027804613113403, "rewards/answer_reward/mean": 0.59375, "rewards/answer_reward/std": 0.49899089336395264, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.21752247214317322, "sampling/importance_sampling_ratio/max": 2.8533685207366943, "sampling/importance_sampling_ratio/mean": 0.9854819178581238, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.4867959022521973, "sampling/sampling_logp_difference/mean": 0.022435851395130157, "step": 48, "step_time": 10.664352198131382 }, { "clip_ratio/high_max": 0.00070821528788656, "clip_ratio/high_mean": 0.00017705382197164, "clip_ratio/low_mean": 0.00016276042151730508, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003398142507649027, "completions/clipped_ratio": 0.09375, "completions/max_length": 384.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 235.5, "completions/mean_terminated_length": 220.13792419433594, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.44019098579883575, "epoch": 0.23902439024390243, "frac_reward_zero_std": 0.8125, "grad_norm": 0.022716790437698364, "kl": 0.004093223426025361, "learning_rate": 6.892682926829268e-06, "loss": 0.0241, "num_tokens": 4318012.0, "reward": 1.625, "reward_std": 0.6599119901657104, "rewards/answer_reward/mean": 0.71875, "rewards/answer_reward/std": 0.45680341124534607, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "sampling/importance_sampling_ratio/max": 2.674631118774414, "sampling/importance_sampling_ratio/mean": 0.7501329779624939, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1893327236175537, "sampling/sampling_logp_difference/mean": 0.02094566449522972, "step": 49, "step_time": 10.467873891349882 }, { "clip_ratio/high_max": 0.001983438618481159, "clip_ratio/high_mean": 0.0009073145338334143, "clip_ratio/low_mean": 0.000347004781360738, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012543193151941523, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 243.09375, "completions/mean_terminated_length": 233.70001220703125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.5166932232677937, "epoch": 0.24390243902439024, "frac_reward_zero_std": 0.75, "grad_norm": 0.05326803773641586, "kl": 0.0034853820980060846, "learning_rate": 6.848780487804878e-06, "loss": 0.0253, "num_tokens": 4411647.0, "reward": 1.53125, "reward_std": 0.5526694655418396, "rewards/answer_reward/mean": 0.5625, "rewards/answer_reward/std": 0.504016101360321, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.12296734005212784, "sampling/importance_sampling_ratio/max": 2.558199405670166, "sampling/importance_sampling_ratio/mean": 0.9979549646377563, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6962003707885742, "sampling/sampling_logp_difference/mean": 0.022886939346790314, "step": 50, "step_time": 11.309908171184361 }, { "clip_ratio/high_max": 0.001243781065568328, "clip_ratio/high_mean": 0.0004062196530867368, "clip_ratio/low_mean": 0.000338267142069526, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007444867951562628, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 233.375, "completions/mean_terminated_length": 228.51612854003906, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.4764145202934742, "epoch": 0.24878048780487805, "frac_reward_zero_std": 0.75, "grad_norm": 0.04588745906949043, "kl": 0.0031135591270867735, "learning_rate": 6.804878048780488e-06, "loss": -0.0254, "num_tokens": 4508373.0, "reward": 1.859375, "reward_std": 0.3859294354915619, "rewards/answer_reward/mean": 0.875, "rewards/answer_reward/std": 0.33601075410842896, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0883883461356163, "sampling/importance_sampling_ratio/max": 2.8658206462860107, "sampling/importance_sampling_ratio/mean": 0.8444061875343323, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8799257278442383, "sampling/sampling_logp_difference/mean": 0.02241414785385132, "step": 51, "step_time": 11.038568420801312 }, { "clip_ratio/high_max": 0.0037714775535278022, "clip_ratio/high_mean": 0.0009428693883819506, "clip_ratio/low_mean": 0.0003312240296509117, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012740934325847775, "completions/clipped_ratio": 0.09375, "completions/max_length": 384.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 242.15625, "completions/mean_terminated_length": 227.48275756835938, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.4154508151113987, "epoch": 0.25365853658536586, "frac_reward_zero_std": 0.625, "grad_norm": 0.044459983706474304, "kl": 0.004664821288315579, "learning_rate": 6.760975609756098e-06, "loss": -0.0351, "num_tokens": 4591878.0, "reward": 1.5859375, "reward_std": 0.6617711186408997, "rewards/answer_reward/mean": 0.6875, "rewards/answer_reward/std": 0.4709290862083435, "rewards/format_reward/mean": 0.8984375, "rewards/format_reward/std": 0.2968882620334625, "sampling/importance_sampling_ratio/max": 2.55808162689209, "sampling/importance_sampling_ratio/mean": 0.876739501953125, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9544294476509094, "sampling/sampling_logp_difference/mean": 0.020277973264455795, "step": 52, "step_time": 10.692133543547243 }, { "clip_ratio/high_max": 0.0028641351382248104, "clip_ratio/high_mean": 0.0009545834182063118, "clip_ratio/low_mean": 0.00020032051543239504, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011549039336387068, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 226.375, "completions/mean_terminated_length": 226.375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.4983530156314373, "epoch": 0.25853658536585367, "frac_reward_zero_std": 0.6875, "grad_norm": 0.041858769953250885, "kl": 0.002943053186754696, "learning_rate": 6.717073170731707e-06, "loss": -0.0121, "num_tokens": 4686658.0, "reward": 1.59375, "reward_std": 0.5341181755065918, "rewards/answer_reward/mean": 0.625, "rewards/answer_reward/std": 0.49186936020851135, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.13839517533779144, "sampling/importance_sampling_ratio/max": 2.750025749206543, "sampling/importance_sampling_ratio/mean": 0.7822805047035217, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1717112064361572, "sampling/sampling_logp_difference/mean": 0.024302711710333824, "step": 53, "step_time": 10.914116536732763 }, { "clip_ratio/high_max": 0.0027131292154081166, "clip_ratio/high_mean": 0.0010320558649254963, "clip_ratio/low_mean": 0.0002754795932560228, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001307535458181519, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 220.03125, "completions/mean_terminated_length": 209.10000610351562, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.4174237288534641, "epoch": 0.2634146341463415, "frac_reward_zero_std": 0.75, "grad_norm": 0.049165841192007065, "kl": 0.003916973510058597, "learning_rate": 6.673170731707317e-06, "loss": -0.0071, "num_tokens": 4771637.0, "reward": 1.546875, "reward_std": 0.5869463682174683, "rewards/answer_reward/mean": 0.59375, "rewards/answer_reward/std": 0.49899089336395264, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.19507545232772827, "sampling/importance_sampling_ratio/max": 2.5629098415374756, "sampling/importance_sampling_ratio/mean": 0.9409783482551575, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0153902769088745, "sampling/sampling_logp_difference/mean": 0.02094930037856102, "step": 54, "step_time": 10.375479507260025 }, { "clip_ratio/high_max": 0.0006906077614985406, "clip_ratio/high_mean": 0.00017265194037463516, "clip_ratio/low_mean": 0.0008726056548766792, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010452575806993991, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 217.0, "completions/mean_terminated_length": 211.61289978027344, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.4954943358898163, "epoch": 0.2682926829268293, "frac_reward_zero_std": 0.8125, "grad_norm": 0.024923495948314667, "kl": 0.0032566340814810246, "learning_rate": 6.629268292682927e-06, "loss": 0.0748, "num_tokens": 4854501.0, "reward": 1.65625, "reward_std": 0.5453247427940369, "rewards/answer_reward/mean": 0.6875, "rewards/answer_reward/std": 0.4709290862083435, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 1.796933889389038, "sampling/importance_sampling_ratio/mean": 0.6444294452667236, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7916944026947021, "sampling/sampling_logp_difference/mean": 0.022670406848192215, "step": 55, "step_time": 10.566038527991623 }, { "clip_ratio/high_max": 0.0017522397683933377, "clip_ratio/high_mean": 0.00043805994209833443, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00043805994209833443, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 211.59375, "completions/mean_terminated_length": 211.59375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.4567127116024494, "epoch": 0.2731707317073171, "frac_reward_zero_std": 0.875, "grad_norm": 0.02831697277724743, "kl": 0.002442362339934334, "learning_rate": 6.585365853658537e-06, "loss": -0.0149, "num_tokens": 4936410.0, "reward": 1.671875, "reward_std": 0.5017610192298889, "rewards/answer_reward/mean": 0.6875, "rewards/answer_reward/std": 0.4709290862083435, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0883883461356163, "sampling/importance_sampling_ratio/max": 1.9650088548660278, "sampling/importance_sampling_ratio/mean": 0.703760027885437, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6693353652954102, "sampling/sampling_logp_difference/mean": 0.021342452615499496, "step": 56, "step_time": 10.809541343711317 }, { "clip_ratio/high_max": 0.0021839198598172516, "clip_ratio/high_mean": 0.0005459799649543129, "clip_ratio/low_mean": 0.0002765486715361476, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008225286364904605, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 240.5625, "completions/mean_terminated_length": 235.9354705810547, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.4449859596788883, "epoch": 0.2780487804878049, "frac_reward_zero_std": 0.6875, "grad_norm": 0.05741678923368454, "kl": 0.0025756562245078385, "learning_rate": 6.541463414634146e-06, "loss": -0.0805, "num_tokens": 5020870.0, "reward": 1.6953125, "reward_std": 0.5109067559242249, "rewards/answer_reward/mean": 0.71875, "rewards/answer_reward/std": 0.45680341124534607, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.13258251547813416, "sampling/importance_sampling_ratio/max": 2.8538033962249756, "sampling/importance_sampling_ratio/mean": 0.9059665203094482, "sampling/importance_sampling_ratio/min": 0.1358150988817215, "sampling/sampling_logp_difference/max": 1.4859380722045898, "sampling/sampling_logp_difference/mean": 0.019545143470168114, "step": 57, "step_time": 9.992725926917046 }, { "clip_ratio/high_max": 0.0021657724864780903, "clip_ratio/high_mean": 0.0005414431216195226, "clip_ratio/low_mean": 0.00016025641525629908, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007016995368758217, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 209.5, "completions/mean_terminated_length": 203.87095642089844, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.500361867249012, "epoch": 0.28292682926829266, "frac_reward_zero_std": 0.6875, "grad_norm": 0.03688355162739754, "kl": 0.0031993776792660356, "learning_rate": 6.497560975609756e-06, "loss": -0.0417, "num_tokens": 5109398.0, "reward": 1.59375, "reward_std": 0.5599179267883301, "rewards/answer_reward/mean": 0.625, "rewards/answer_reward/std": 0.49186936020851135, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.9239535331726074, "sampling/importance_sampling_ratio/mean": 0.8558993339538574, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8777744770050049, "sampling/sampling_logp_difference/mean": 0.02262384071946144, "step": 58, "step_time": 10.261757185216993 }, { "clip_ratio/high_max": 0.0004681647988036275, "clip_ratio/high_mean": 0.00011704119970090687, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00011704119970090687, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 208.96875, "completions/mean_terminated_length": 208.96875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.459903497248888, "epoch": 0.28780487804878047, "frac_reward_zero_std": 0.9375, "grad_norm": 0.02031203731894493, "kl": 0.0026566239685053006, "learning_rate": 6.453658536585366e-06, "loss": -0.0257, "num_tokens": 5215039.0, "reward": 1.96875, "reward_std": 0.1767766922712326, "rewards/answer_reward/mean": 0.96875, "rewards/answer_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9863730669021606, "sampling/importance_sampling_ratio/mean": 0.7549005746841431, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8516249656677246, "sampling/sampling_logp_difference/mean": 0.022490231320261955, "step": 59, "step_time": 11.222033787053078 }, { "clip_ratio/high_max": 0.004640385741367936, "clip_ratio/high_mean": 0.001160096435341984, "clip_ratio/low_mean": 0.00030283338855952024, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014629297947976738, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 230.15625, "completions/mean_terminated_length": 225.19354248046875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.47000033780932426, "epoch": 0.2926829268292683, "frac_reward_zero_std": 0.6875, "grad_norm": 0.04717700555920601, "kl": 0.003128852928057313, "learning_rate": 6.409756097560976e-06, "loss": 0.0599, "num_tokens": 5304654.0, "reward": 1.5625, "reward_std": 0.5644009113311768, "rewards/answer_reward/mean": 0.59375, "rewards/answer_reward/std": 0.49899089336395264, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.3304474353790283, "sampling/importance_sampling_ratio/mean": 0.9126774668693542, "sampling/importance_sampling_ratio/min": 0.17052499949932098, "sampling/sampling_logp_difference/max": 0.8919205665588379, "sampling/sampling_logp_difference/mean": 0.021476587280631065, "step": 60, "step_time": 10.614137754775584 }, { "clip_ratio/high_max": 0.007831865805201232, "clip_ratio/high_mean": 0.002114216476911679, "clip_ratio/low_mean": 0.00043449953955132514, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025487159873591736, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 228.28125, "completions/mean_terminated_length": 223.258056640625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.45100002363324165, "epoch": 0.2975609756097561, "frac_reward_zero_std": 0.6875, "grad_norm": 0.06742876023054123, "kl": 0.003054998174775392, "learning_rate": 6.365853658536585e-06, "loss": 0.0619, "num_tokens": 5413371.0, "reward": 1.8125, "reward_std": 0.4709290862083435, "rewards/answer_reward/mean": 0.84375, "rewards/answer_reward/std": 0.3689020276069641, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.383427381515503, "sampling/importance_sampling_ratio/mean": 0.7557109594345093, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9384844303131104, "sampling/sampling_logp_difference/mean": 0.020888276398181915, "step": 61, "step_time": 10.797777844127268 }, { "clip_ratio/high_max": 0.0011312217684462667, "clip_ratio/high_mean": 0.00028280544211156666, "clip_ratio/low_mean": 0.00013586955901701003, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004186750156804919, "completions/clipped_ratio": 0.09375, "completions/max_length": 384.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 220.0625, "completions/mean_terminated_length": 203.10345458984375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.42394576594233513, "epoch": 0.3024390243902439, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0532158799469471, "kl": 0.003929715225240216, "learning_rate": 6.321951219512195e-06, "loss": -0.0624, "num_tokens": 5506581.0, "reward": 1.53125, "reward_std": 0.671271026134491, "rewards/answer_reward/mean": 0.625, "rewards/answer_reward/std": 0.49186936020851135, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "sampling/importance_sampling_ratio/max": 2.3454604148864746, "sampling/importance_sampling_ratio/mean": 0.8490308523178101, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9066394567489624, "sampling/sampling_logp_difference/mean": 0.018819771707057953, "step": 62, "step_time": 10.819079339504242 }, { "clip_ratio/high_max": 0.0014980152482166886, "clip_ratio/high_mean": 0.00037450381205417216, "clip_ratio/low_mean": 0.00016276042151730508, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005372642335714772, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 221.5, "completions/mean_terminated_length": 210.6666717529297, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.46575015038251877, "epoch": 0.3073170731707317, "frac_reward_zero_std": 0.75, "grad_norm": 0.03290977701544762, "kl": 0.003663779265480116, "learning_rate": 6.278048780487805e-06, "loss": 0.0022, "num_tokens": 5597415.0, "reward": 1.4140625, "reward_std": 0.5840948820114136, "rewards/answer_reward/mean": 0.46875, "rewards/answer_reward/std": 0.507007360458374, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.17659839987754822, "sampling/importance_sampling_ratio/max": 2.5510785579681396, "sampling/importance_sampling_ratio/mean": 0.7496539950370789, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7562971115112305, "sampling/sampling_logp_difference/mean": 0.02053637057542801, "step": 63, "step_time": 10.832582022994757 }, { "clip_ratio/high_max": 0.0012686435075011104, "clip_ratio/high_mean": 0.0003171608768752776, "clip_ratio/low_mean": 0.00017655367264524102, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004937145495205186, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 218.96875, "completions/mean_terminated_length": 213.64515686035156, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.4826985336840153, "epoch": 0.3121951219512195, "frac_reward_zero_std": 0.5625, "grad_norm": 0.06169905513525009, "kl": 0.002978830103529617, "learning_rate": 6.234146341463415e-06, "loss": 0.1032, "num_tokens": 5677440.0, "reward": 1.59375, "reward_std": 0.549009382724762, "rewards/answer_reward/mean": 0.625, "rewards/answer_reward/std": 0.49186936020851135, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.13839517533779144, "sampling/importance_sampling_ratio/max": 2.147642135620117, "sampling/importance_sampling_ratio/mean": 0.8335617780685425, "sampling/importance_sampling_ratio/min": 0.09541506320238113, "sampling/sampling_logp_difference/max": 0.7832868099212646, "sampling/sampling_logp_difference/mean": 0.022081460803747177, "step": 64, "step_time": 9.950304591562599 }, { "clip_ratio/high_max": 0.00500533462036401, "clip_ratio/high_mean": 0.0012513336550910026, "clip_ratio/low_mean": 0.000501063244882971, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017523968999739736, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 226.1875, "completions/mean_terminated_length": 221.09677124023438, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.4574001505970955, "epoch": 0.3170731707317073, "frac_reward_zero_std": 0.6875, "grad_norm": 0.06617464125156403, "kl": 0.0032315582793671638, "learning_rate": 6.190243902439024e-06, "loss": 0.0779, "num_tokens": 5764322.0, "reward": 1.8125, "reward_std": 0.4709290862083435, "rewards/answer_reward/mean": 0.84375, "rewards/answer_reward/std": 0.3689020276069641, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.5191421508789062, "sampling/importance_sampling_ratio/mean": 0.931397557258606, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.778644323348999, "sampling/sampling_logp_difference/mean": 0.021437622606754303, "step": 65, "step_time": 10.335037499666214 }, { "clip_ratio/high_max": 0.003229860798455775, "clip_ratio/high_mean": 0.0008074651996139437, "clip_ratio/low_mean": 0.00023408239940181375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010415475990157574, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 219.71875, "completions/mean_terminated_length": 219.71875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.4585128128528595, "epoch": 0.32195121951219513, "frac_reward_zero_std": 0.8125, "grad_norm": 0.050099607557058334, "kl": 0.0030265965906437486, "learning_rate": 6.1463414634146346e-06, "loss": 0.0717, "num_tokens": 5861801.0, "reward": 1.78125, "reward_std": 0.420013427734375, "rewards/answer_reward/mean": 0.78125, "rewards/answer_reward/std": 0.420013427734375, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.648573398590088, "sampling/importance_sampling_ratio/mean": 0.7634668946266174, "sampling/importance_sampling_ratio/min": 0.03280496969819069, "sampling/sampling_logp_difference/max": 0.7581448554992676, "sampling/sampling_logp_difference/mean": 0.021791774779558182, "step": 66, "step_time": 10.953878466971219 }, { "clip_ratio/high_max": 0.005395654297899455, "clip_ratio/high_mean": 0.0013489135744748637, "clip_ratio/low_mean": 8.138021075865254e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014302937779575586, "completions/clipped_ratio": 0.09375, "completions/max_length": 384.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 226.25, "completions/mean_terminated_length": 209.9310302734375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.4422372169792652, "epoch": 0.32682926829268294, "frac_reward_zero_std": 0.6875, "grad_norm": 0.04628661647439003, "kl": 0.0028884605853818357, "learning_rate": 6.1024390243902445e-06, "loss": -0.0675, "num_tokens": 5947121.0, "reward": 1.6015625, "reward_std": 0.6086055636405945, "rewards/answer_reward/mean": 0.6875, "rewards/answer_reward/std": 0.4709290862083435, "rewards/format_reward/mean": 0.9140625, "rewards/format_reward/std": 0.24271129071712494, "sampling/importance_sampling_ratio/max": 2.669809103012085, "sampling/importance_sampling_ratio/mean": 0.84369957447052, "sampling/importance_sampling_ratio/min": 0.07179512083530426, "sampling/sampling_logp_difference/max": 1.1247926950454712, "sampling/sampling_logp_difference/mean": 0.02017967775464058, "step": 67, "step_time": 10.086589827667922 }, { "clip_ratio/high_max": 0.004410556750372052, "clip_ratio/high_mean": 0.0012543381890282035, "clip_ratio/low_mean": 0.0008150598732754588, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020693980623036623, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 212.59375, "completions/mean_terminated_length": 207.06451416015625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.4661427438259125, "epoch": 0.33170731707317075, "frac_reward_zero_std": 0.625, "grad_norm": 0.04123358055949211, "kl": 0.004074439581017941, "learning_rate": 6.0585365853658544e-06, "loss": -0.008, "num_tokens": 6042604.0, "reward": 1.5625, "reward_std": 0.5644009113311768, "rewards/answer_reward/mean": 0.59375, "rewards/answer_reward/std": 0.49899089336395264, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.9287493228912354, "sampling/importance_sampling_ratio/mean": 0.825698971748352, "sampling/importance_sampling_ratio/min": 0.07141714543104172, "sampling/sampling_logp_difference/max": 1.3847846984863281, "sampling/sampling_logp_difference/mean": 0.02298871800303459, "step": 68, "step_time": 10.645810864400119 }, { "clip_ratio/high_max": 0.0035610408522188663, "clip_ratio/high_mean": 0.0011331334244459867, "clip_ratio/low_mean": 0.00016191709437407553, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012950505188200623, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 243.1875, "completions/mean_terminated_length": 233.80001831054688, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.44261573627591133, "epoch": 0.33658536585365856, "frac_reward_zero_std": 0.625, "grad_norm": 0.024833301082253456, "kl": 0.003463102242676541, "learning_rate": 6.0146341463414635e-06, "loss": -0.0003, "num_tokens": 6135672.0, "reward": 1.515625, "reward_std": 0.6252015829086304, "rewards/answer_reward/mean": 0.59375, "rewards/answer_reward/std": 0.49899089336395264, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.24949544668197632, "sampling/importance_sampling_ratio/max": 2.062290906906128, "sampling/importance_sampling_ratio/mean": 0.6995272636413574, "sampling/importance_sampling_ratio/min": 0.015850622206926346, "sampling/sampling_logp_difference/max": 2.231865882873535, "sampling/sampling_logp_difference/mean": 0.020183466374874115, "step": 69, "step_time": 11.0476228101179 }, { "clip_ratio/high_max": 0.0056777336285449564, "clip_ratio/high_mean": 0.001977761698071845, "clip_ratio/low_mean": 0.0015970167514751665, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003574778442271054, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 224.21875, "completions/mean_terminated_length": 219.06451416015625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.47000304982066154, "epoch": 0.34146341463414637, "frac_reward_zero_std": 0.5625, "grad_norm": 0.049614764750003815, "kl": 0.0034863646142184734, "learning_rate": 5.9707317073170734e-06, "loss": -0.0051, "num_tokens": 6221455.0, "reward": 1.53125, "reward_std": 0.5670737028121948, "rewards/answer_reward/mean": 0.5625, "rewards/answer_reward/std": 0.504016101360321, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.2046234607696533, "sampling/importance_sampling_ratio/mean": 0.7964699864387512, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.2644290924072266, "sampling/sampling_logp_difference/mean": 0.021278703585267067, "step": 70, "step_time": 11.159715018700808 }, { "clip_ratio/high_max": 0.0012195121962577105, "clip_ratio/high_mean": 0.0003048780490644276, "clip_ratio/low_mean": 0.00010629251482896507, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004111705638933927, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 230.71875, "completions/mean_terminated_length": 225.77418518066406, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.4836859405040741, "epoch": 0.3463414634146341, "frac_reward_zero_std": 0.8125, "grad_norm": 0.05026715248823166, "kl": 0.003689877310534939, "learning_rate": 5.926829268292683e-06, "loss": -0.0085, "num_tokens": 6318586.0, "reward": 1.625, "reward_std": 0.5535807013511658, "rewards/answer_reward/mean": 0.65625, "rewards/answer_reward/std": 0.4825586974620819, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.672159433364868, "sampling/importance_sampling_ratio/mean": 0.7593006491661072, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7141246795654297, "sampling/sampling_logp_difference/mean": 0.02192062698304653, "step": 71, "step_time": 10.683846821077168 }, { "clip_ratio/high_max": 0.003421555331442505, "clip_ratio/high_mean": 0.0009967915539164096, "clip_ratio/low_mean": 0.0001201923078042455, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011169838617206551, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 241.78125, "completions/mean_terminated_length": 232.30001831054688, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.4767580032348633, "epoch": 0.35121951219512193, "frac_reward_zero_std": 0.6875, "grad_norm": 0.04632245749235153, "kl": 0.003041467411094345, "learning_rate": 5.882926829268293e-06, "loss": -0.0231, "num_tokens": 6405605.0, "reward": 1.5703125, "reward_std": 0.5998466610908508, "rewards/answer_reward/mean": 0.625, "rewards/answer_reward/std": 0.49186936020851135, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.21752247214317322, "sampling/importance_sampling_ratio/max": 2.6062631607055664, "sampling/importance_sampling_ratio/mean": 1.0011464357376099, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.006958246231079, "sampling/sampling_logp_difference/mean": 0.021535612642765045, "step": 72, "step_time": 11.254930234514177 }, { "clip_ratio/high_max": 0.0024771385360509157, "clip_ratio/high_mean": 0.0006192846340127289, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006192846340127289, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 221.78125, "completions/mean_terminated_length": 221.78125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.48671144247055054, "epoch": 0.35609756097560974, "frac_reward_zero_std": 0.875, "grad_norm": 0.011796095408499241, "kl": 0.00371285411529243, "learning_rate": 5.839024390243902e-06, "loss": 0.0101, "num_tokens": 6499848.0, "reward": 1.8125, "reward_std": 0.3965577781200409, "rewards/answer_reward/mean": 0.8125, "rewards/answer_reward/std": 0.3965577781200409, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.4030282497406006, "sampling/importance_sampling_ratio/mean": 0.7161678075790405, "sampling/importance_sampling_ratio/min": 0.038043342530727386, "sampling/sampling_logp_difference/max": 0.8422527313232422, "sampling/sampling_logp_difference/mean": 0.023928867653012276, "step": 73, "step_time": 10.766288570128381 }, { "clip_ratio/high_max": 0.0008417508215643466, "clip_ratio/high_mean": 0.00021043770539108664, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021043770539108664, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 227.0625, "completions/mean_terminated_length": 227.0625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.45167214423418045, "epoch": 0.36097560975609755, "frac_reward_zero_std": 0.9375, "grad_norm": 0.012249421328306198, "kl": 0.003067908779485151, "learning_rate": 5.795121951219512e-06, "loss": 0.0084, "num_tokens": 6591814.0, "reward": 1.84375, "reward_std": 0.3689020276069641, "rewards/answer_reward/mean": 0.84375, "rewards/answer_reward/std": 0.3689020276069641, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.870575189590454, "sampling/importance_sampling_ratio/mean": 0.8070812225341797, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0197649002075195, "sampling/sampling_logp_difference/mean": 0.021360913291573524, "step": 74, "step_time": 10.20875250454992 }, { "clip_ratio/high_max": 0.00044014083687216043, "clip_ratio/high_mean": 0.00011003520921804011, "clip_ratio/low_mean": 0.0001849112450145185, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002949464542325586, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 214.09375, "completions/mean_terminated_length": 214.09375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.4942481331527233, "epoch": 0.36585365853658536, "frac_reward_zero_std": 0.9375, "grad_norm": 0.009616587311029434, "kl": 0.003608565020840615, "learning_rate": 5.751219512195122e-06, "loss": -0.0138, "num_tokens": 6681287.0, "reward": 1.65625, "reward_std": 0.4825586974620819, "rewards/answer_reward/mean": 0.65625, "rewards/answer_reward/std": 0.4825586974620819, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.6155896186828613, "sampling/importance_sampling_ratio/mean": 0.7632205486297607, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.4494844675064087, "sampling/sampling_logp_difference/mean": 0.023352663964033127, "step": 75, "step_time": 10.857617914211005 }, { "clip_ratio/high_max": 0.0031204906990751624, "clip_ratio/high_mean": 0.0007801226747687906, "clip_ratio/low_mean": 0.00040849673678167164, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011886194115504622, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 219.875, "completions/mean_terminated_length": 214.5806427001953, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.4607093334197998, "epoch": 0.37073170731707317, "frac_reward_zero_std": 0.8125, "grad_norm": 0.03661885857582092, "kl": 0.009103492280701175, "learning_rate": 5.707317073170732e-06, "loss": -0.0586, "num_tokens": 6764719.0, "reward": 1.78125, "reward_std": 0.4908435642719269, "rewards/answer_reward/mean": 0.8125, "rewards/answer_reward/std": 0.3965577781200409, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.5536136627197266, "sampling/importance_sampling_ratio/mean": 0.767670214176178, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.548677444458008, "sampling/sampling_logp_difference/mean": 0.021770499646663666, "step": 76, "step_time": 10.491351027972996 }, { "clip_ratio/high_max": 0.004518072120845318, "clip_ratio/high_mean": 0.0011295180302113295, "clip_ratio/low_mean": 0.0005425998533610255, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016721178544685245, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 222.46875, "completions/mean_terminated_length": 222.46875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.49083130434155464, "epoch": 0.375609756097561, "frac_reward_zero_std": 0.8125, "grad_norm": 0.04738627001643181, "kl": 0.0048617102147545666, "learning_rate": 5.663414634146341e-06, "loss": 0.0345, "num_tokens": 6855224.0, "reward": 1.59375, "reward_std": 0.498990923166275, "rewards/answer_reward/mean": 0.59375, "rewards/answer_reward/std": 0.49899089336395264, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.571396827697754, "sampling/importance_sampling_ratio/mean": 0.860047459602356, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.730867624282837, "sampling/sampling_logp_difference/mean": 0.024264901876449585, "step": 77, "step_time": 10.086119148880243 }, { "clip_ratio/high_max": 0.003703721275087446, "clip_ratio/high_mean": 0.0010238927061436698, "clip_ratio/low_mean": 9.765625145519152e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011215489575988613, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 233.84375, "completions/mean_terminated_length": 223.83334350585938, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.4425269290804863, "epoch": 0.3804878048780488, "frac_reward_zero_std": 0.625, "grad_norm": 0.03773188963532448, "kl": 0.0036327966663520783, "learning_rate": 5.619512195121951e-06, "loss": 0.057, "num_tokens": 6933083.0, "reward": 1.5859375, "reward_std": 0.5701208710670471, "rewards/answer_reward/mean": 0.625, "rewards/answer_reward/std": 0.49186936020851135, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.18082889914512634, "sampling/importance_sampling_ratio/max": 2.883322238922119, "sampling/importance_sampling_ratio/mean": 0.9076295495033264, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9331510066986084, "sampling/sampling_logp_difference/mean": 0.0196191668510437, "step": 78, "step_time": 10.228960340376943 }, { "clip_ratio/high_max": 0.004740454663988203, "clip_ratio/high_mean": 0.0015848991024540737, "clip_ratio/low_mean": 0.0006483040997409262, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002233203202195, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 226.09375, "completions/mean_terminated_length": 221.0, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.480973981320858, "epoch": 0.3853658536585366, "frac_reward_zero_std": 0.5, "grad_norm": 0.05736730247735977, "kl": 0.003001857054186985, "learning_rate": 5.575609756097561e-06, "loss": 0.0257, "num_tokens": 7018234.0, "reward": 1.578125, "reward_std": 0.5253167152404785, "rewards/answer_reward/mean": 0.59375, "rewards/answer_reward/std": 0.49899089336395264, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0883883461356163, "sampling/importance_sampling_ratio/max": 1.9486180543899536, "sampling/importance_sampling_ratio/mean": 0.8807474970817566, "sampling/importance_sampling_ratio/min": 0.17010854184627533, "sampling/sampling_logp_difference/max": 1.494616985321045, "sampling/sampling_logp_difference/mean": 0.021555040031671524, "step": 79, "step_time": 10.56827099295333 }, { "clip_ratio/high_max": 0.002531431324314326, "clip_ratio/high_mean": 0.0006328578310785815, "clip_ratio/low_mean": 0.00026371306739747524, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008965708984760568, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 226.25, "completions/mean_terminated_length": 221.16128540039062, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.41876252368092537, "epoch": 0.3902439024390244, "frac_reward_zero_std": 0.625, "grad_norm": 0.048021476715803146, "kl": 0.003330543724587187, "learning_rate": 5.531707317073171e-06, "loss": -0.0388, "num_tokens": 7109796.0, "reward": 1.5546875, "reward_std": 0.5740854144096375, "rewards/answer_reward/mean": 0.59375, "rewards/answer_reward/std": 0.49899089336395264, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.18082889914512634, "sampling/importance_sampling_ratio/max": 2.2465624809265137, "sampling/importance_sampling_ratio/mean": 0.7682920098304749, "sampling/importance_sampling_ratio/min": 0.09448876231908798, "sampling/sampling_logp_difference/max": 1.3275697231292725, "sampling/sampling_logp_difference/mean": 0.019996892660856247, "step": 80, "step_time": 10.74914779420942 }, { "clip_ratio/high_max": 0.0032914653711486608, "clip_ratio/high_mean": 0.0008228663427871652, "clip_ratio/low_mean": 0.0006310141980065964, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014538805262418464, "completions/clipped_ratio": 0.125, "completions/max_length": 384.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 231.78125, "completions/mean_terminated_length": 210.0357208251953, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.4251158833503723, "epoch": 0.3951219512195122, "frac_reward_zero_std": 0.5, "grad_norm": 0.06589759886264801, "kl": 0.004558836197247729, "learning_rate": 5.48780487804878e-06, "loss": -0.0896, "num_tokens": 7203627.0, "reward": 1.5703125, "reward_std": 0.6389077305793762, "rewards/answer_reward/mean": 0.65625, "rewards/answer_reward/std": 0.4825586974620819, "rewards/format_reward/mean": 0.9140625, "rewards/format_reward/std": 0.24271129071712494, "sampling/importance_sampling_ratio/max": 2.7476093769073486, "sampling/importance_sampling_ratio/mean": 0.8263241052627563, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0925378799438477, "sampling/sampling_logp_difference/mean": 0.019135713577270508, "step": 81, "step_time": 10.978759698569775 }, { "clip_ratio/high_max": 0.0017816199106164277, "clip_ratio/high_mean": 0.0004454049776541069, "clip_ratio/low_mean": 0.0002672754635568708, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007126804412109777, "completions/clipped_ratio": 0.125, "completions/max_length": 384.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 236.59375, "completions/mean_terminated_length": 215.5357208251953, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.4356975592672825, "epoch": 0.4, "frac_reward_zero_std": 0.8125, "grad_norm": 0.04266076534986496, "kl": 0.003968496748711914, "learning_rate": 5.44390243902439e-06, "loss": 0.0164, "num_tokens": 7292078.0, "reward": 1.578125, "reward_std": 0.6852051019668579, "rewards/answer_reward/mean": 0.6875, "rewards/answer_reward/std": 0.4709290862083435, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.30412155389785767, "sampling/importance_sampling_ratio/max": 2.52759051322937, "sampling/importance_sampling_ratio/mean": 0.7123097777366638, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9552326202392578, "sampling/sampling_logp_difference/mean": 0.02042466774582863, "step": 82, "step_time": 11.551223640330136 }, { "clip_ratio/high_max": 0.0060963190626353025, "clip_ratio/high_mean": 0.0015240797656588256, "clip_ratio/low_mean": 8.138021075865254e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016054599764174782, "completions/clipped_ratio": 0.125, "completions/max_length": 384.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 243.5, "completions/mean_terminated_length": 223.4285888671875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.42638740688562393, "epoch": 0.40487804878048783, "frac_reward_zero_std": 0.625, "grad_norm": 0.035880882292985916, "kl": 0.004300528948078863, "learning_rate": 5.4e-06, "loss": -0.0084, "num_tokens": 7374178.0, "reward": 1.3203125, "reward_std": 0.6786917448043823, "rewards/answer_reward/mean": 0.4375, "rewards/answer_reward/std": 0.504016101360321, "rewards/format_reward/mean": 0.8828125, "rewards/format_reward/std": 0.31740108132362366, "sampling/importance_sampling_ratio/max": 2.710395574569702, "sampling/importance_sampling_ratio/mean": 0.6136900186538696, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.073319435119629, "sampling/sampling_logp_difference/mean": 0.019496260210871696, "step": 83, "step_time": 10.301659624557942 }, { "clip_ratio/high_max": 0.0009541984763927758, "clip_ratio/high_mean": 0.00023854961909819394, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023854961909819394, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 211.5, "completions/mean_terminated_length": 205.9354705810547, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.43496450036764145, "epoch": 0.4097560975609756, "frac_reward_zero_std": 0.9375, "grad_norm": 0.006664374843239784, "kl": 0.0033555967384018004, "learning_rate": 5.35609756097561e-06, "loss": 0.0037, "num_tokens": 7468050.0, "reward": 1.84375, "reward_std": 0.4478893280029297, "rewards/answer_reward/mean": 0.875, "rewards/answer_reward/std": 0.33601075410842896, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.450230121612549, "sampling/importance_sampling_ratio/mean": 0.8502267003059387, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1929280757904053, "sampling/sampling_logp_difference/mean": 0.0214078351855278, "step": 84, "step_time": 10.604936160612851 }, { "clip_ratio/high_max": 0.004009730997495353, "clip_ratio/high_mean": 0.0014864408876746893, "clip_ratio/low_mean": 0.0017277450679102913, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0032141859555849805, "completions/clipped_ratio": 0.125, "completions/max_length": 384.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 240.90625, "completions/mean_terminated_length": 220.46429443359375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.45579832792282104, "epoch": 0.4146341463414634, "frac_reward_zero_std": 0.25, "grad_norm": 0.06952547281980515, "kl": 0.0029007613484282047, "learning_rate": 5.312195121951219e-06, "loss": -0.0636, "num_tokens": 7545391.0, "reward": 1.4140625, "reward_std": 0.679804801940918, "rewards/answer_reward/mean": 0.53125, "rewards/answer_reward/std": 0.507007360458374, "rewards/format_reward/mean": 0.8828125, "rewards/format_reward/std": 0.30443212389945984, "sampling/importance_sampling_ratio/max": 2.6186046600341797, "sampling/importance_sampling_ratio/mean": 0.8578096628189087, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9831582307815552, "sampling/sampling_logp_difference/mean": 0.020467961207032204, "step": 85, "step_time": 10.19673680467531 }, { "clip_ratio/high_max": 0.0018601190531626344, "clip_ratio/high_mean": 0.0004650297632906586, "clip_ratio/low_mean": 0.000433317429269664, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008983471925603226, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 216.375, "completions/mean_terminated_length": 216.375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.48994241282343864, "epoch": 0.4195121951219512, "frac_reward_zero_std": 0.875, "grad_norm": 0.061561450362205505, "kl": 0.0029056462371954694, "learning_rate": 5.268292682926829e-06, "loss": 0.0595, "num_tokens": 7643733.0, "reward": 1.9375, "reward_std": 0.24593468010425568, "rewards/answer_reward/mean": 0.9375, "rewards/answer_reward/std": 0.24593468010425568, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7056515216827393, "sampling/importance_sampling_ratio/mean": 1.0200022459030151, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6548042297363281, "sampling/sampling_logp_difference/mean": 0.02344439923763275, "step": 86, "step_time": 11.25922324648127 }, { "clip_ratio/high_max": 0.0010204081190750003, "clip_ratio/high_mean": 0.00025510202976875007, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025510202976875007, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 199.09375, "completions/mean_terminated_length": 193.1290283203125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.42809392511844635, "epoch": 0.424390243902439, "frac_reward_zero_std": 0.875, "grad_norm": 0.022602515295147896, "kl": 0.0037281062977854162, "learning_rate": 5.224390243902439e-06, "loss": 0.0562, "num_tokens": 7737118.0, "reward": 1.65625, "reward_std": 0.5453247427940369, "rewards/answer_reward/mean": 0.6875, "rewards/answer_reward/std": 0.4709290862083435, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.7219810485839844, "sampling/importance_sampling_ratio/mean": 0.8426911234855652, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.8988027572631836, "sampling/sampling_logp_difference/mean": 0.023013995960354805, "step": 87, "step_time": 10.907138023059815 }, { "clip_ratio/high_max": 0.0017450755694881082, "clip_ratio/high_mean": 0.00043626889237202704, "clip_ratio/low_mean": 0.0001832844573073089, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000619553349679336, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 209.625, "completions/mean_terminated_length": 209.625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.5040510408580303, "epoch": 0.4292682926829268, "frac_reward_zero_std": 0.875, "grad_norm": 0.017254803329706192, "kl": 0.004208734288113192, "learning_rate": 5.180487804878049e-06, "loss": -0.0029, "num_tokens": 7843300.0, "reward": 1.75, "reward_std": 0.4399413466453552, "rewards/answer_reward/mean": 0.75, "rewards/answer_reward/std": 0.4399413466453552, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9941741228103638, "sampling/importance_sampling_ratio/mean": 0.6190696954727173, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.848278284072876, "sampling/sampling_logp_difference/mean": 0.026435861364006996, "step": 88, "step_time": 11.069655584171414 }, { "clip_ratio/high_max": 0.0015647003310732543, "clip_ratio/high_mean": 0.0006472115055657923, "clip_ratio/low_mean": 0.0004426804807735607, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001089891986339353, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 218.71875, "completions/mean_terminated_length": 213.3870849609375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.46804945915937424, "epoch": 0.43414634146341463, "frac_reward_zero_std": 0.5625, "grad_norm": 0.073431096971035, "kl": 0.004327211674535647, "learning_rate": 5.136585365853658e-06, "loss": 0.0243, "num_tokens": 7918799.0, "reward": 1.6328125, "reward_std": 0.5312203764915466, "rewards/answer_reward/mean": 0.65625, "rewards/answer_reward/std": 0.4825586974620819, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.13258251547813416, "sampling/importance_sampling_ratio/max": 2.335550546646118, "sampling/importance_sampling_ratio/mean": 0.7561908960342407, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.891487717628479, "sampling/sampling_logp_difference/mean": 0.023092076182365417, "step": 89, "step_time": 10.340139248408377 }, { "clip_ratio/high_max": 0.0026431982405483723, "clip_ratio/high_mean": 0.0007990738959051669, "clip_ratio/low_mean": 0.00017170330102089792, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009707771969260648, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 210.46875, "completions/mean_terminated_length": 210.46875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.47758885845541954, "epoch": 0.43902439024390244, "frac_reward_zero_std": 0.75, "grad_norm": 0.044782668352127075, "kl": 0.0033476559910923243, "learning_rate": 5.092682926829268e-06, "loss": 0.0294, "num_tokens": 8021566.0, "reward": 1.625, "reward_std": 0.49186936020851135, "rewards/answer_reward/mean": 0.625, "rewards/answer_reward/std": 0.49186936020851135, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5267364978790283, "sampling/importance_sampling_ratio/mean": 0.9265998601913452, "sampling/importance_sampling_ratio/min": 0.1553952991962433, "sampling/sampling_logp_difference/max": 2.6863222122192383, "sampling/sampling_logp_difference/mean": 0.022852841764688492, "step": 90, "step_time": 10.524904581252486 }, { "clip_ratio/high_max": 0.0028263161657378078, "clip_ratio/high_mean": 0.0012859441485488787, "clip_ratio/low_mean": 0.0003209953647456132, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001606939513294492, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 228.34375, "completions/mean_terminated_length": 217.9666748046875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.4551536776125431, "epoch": 0.44390243902439025, "frac_reward_zero_std": 0.6875, "grad_norm": 0.04880724847316742, "kl": 0.0029533623601309955, "learning_rate": 5.0487804878048785e-06, "loss": -0.0834, "num_tokens": 8116615.0, "reward": 1.6640625, "reward_std": 0.5806329250335693, "rewards/answer_reward/mean": 0.71875, "rewards/answer_reward/std": 0.45680341124534607, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.21752247214317322, "sampling/importance_sampling_ratio/max": 2.359431028366089, "sampling/importance_sampling_ratio/mean": 0.9289252758026123, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0405874252319336, "sampling/sampling_logp_difference/mean": 0.0203053280711174, "step": 91, "step_time": 10.531819423194975 }, { "clip_ratio/high_max": 0.0020854398608207703, "clip_ratio/high_mean": 0.0006875833787489682, "clip_ratio/low_mean": 0.0003047876089112833, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009923709876602516, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 197.21875, "completions/mean_terminated_length": 197.21875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.47539298981428146, "epoch": 0.44878048780487806, "frac_reward_zero_std": 0.8125, "grad_norm": 0.056593868881464005, "kl": 0.0033072416845243424, "learning_rate": 5.0048780487804885e-06, "loss": 0.027, "num_tokens": 8217066.0, "reward": 1.84375, "reward_std": 0.3689020276069641, "rewards/answer_reward/mean": 0.84375, "rewards/answer_reward/std": 0.3689020276069641, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.732353448867798, "sampling/importance_sampling_ratio/mean": 0.9488453269004822, "sampling/importance_sampling_ratio/min": 0.17084568738937378, "sampling/sampling_logp_difference/max": 0.8150334358215332, "sampling/sampling_logp_difference/mean": 0.02247290313243866, "step": 92, "step_time": 11.247291225939989 }, { "clip_ratio/high_max": 0.003230353817343712, "clip_ratio/high_mean": 0.000807588454335928, "clip_ratio/low_mean": 0.0014973932557040825, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023049817100400105, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 208.34375, "completions/mean_terminated_length": 202.6774139404297, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.4683057926595211, "epoch": 0.45365853658536587, "frac_reward_zero_std": 0.625, "grad_norm": 0.061729904264211655, "kl": 0.003580713440896943, "learning_rate": 4.960975609756098e-06, "loss": -0.0247, "num_tokens": 8306843.0, "reward": 1.7109375, "reward_std": 0.5354730486869812, "rewards/answer_reward/mean": 0.75, "rewards/answer_reward/std": 0.4399413466453552, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.18082889914512634, "sampling/importance_sampling_ratio/max": 2.6496102809906006, "sampling/importance_sampling_ratio/mean": 0.8263420462608337, "sampling/importance_sampling_ratio/min": 0.1148439273238182, "sampling/sampling_logp_difference/max": 1.5440120697021484, "sampling/sampling_logp_difference/mean": 0.023087508976459503, "step": 93, "step_time": 10.740753654390574 }, { "clip_ratio/high_max": 0.002915193617809564, "clip_ratio/high_mean": 0.000728798404452391, "clip_ratio/low_mean": 0.0003955073916586116, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001124305788835045, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 223.15625, "completions/mean_terminated_length": 217.96774291992188, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.49954839050769806, "epoch": 0.4585365853658537, "frac_reward_zero_std": 0.625, "grad_norm": 0.047105107456445694, "kl": 0.0034967970568686724, "learning_rate": 4.9170731707317075e-06, "loss": -0.0341, "num_tokens": 8401996.0, "reward": 1.5859375, "reward_std": 0.5104132294654846, "rewards/answer_reward/mean": 0.59375, "rewards/answer_reward/std": 0.49899089336395264, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.095695972442627, "sampling/importance_sampling_ratio/mean": 0.806057333946228, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8557374477386475, "sampling/sampling_logp_difference/mean": 0.024296607822179794, "step": 94, "step_time": 10.563482139725238 }, { "clip_ratio/high_max": 0.003292530047474429, "clip_ratio/high_mean": 0.0010691955030779354, "clip_ratio/low_mean": 0.0007563241379102692, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018255196409882046, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 210.4375, "completions/mean_terminated_length": 210.4375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.4907991737127304, "epoch": 0.4634146341463415, "frac_reward_zero_std": 0.75, "grad_norm": 0.05608353018760681, "kl": 0.003239309706259519, "learning_rate": 4.873170731707317e-06, "loss": -0.0172, "num_tokens": 8499162.0, "reward": 1.8125, "reward_std": 0.3965577781200409, "rewards/answer_reward/mean": 0.8125, "rewards/answer_reward/std": 0.3965577781200409, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.1634464263916016, "sampling/importance_sampling_ratio/mean": 0.8793953657150269, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.105881690979004, "sampling/sampling_logp_difference/mean": 0.023072104901075363, "step": 95, "step_time": 11.195508044213057 }, { "clip_ratio/high_max": 0.004362425592262298, "clip_ratio/high_mean": 0.0013920533383497968, "clip_ratio/low_mean": 0.0005237840960035101, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019158374343533069, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 219.40625, "completions/mean_terminated_length": 214.09677124023438, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.47724034264683723, "epoch": 0.4682926829268293, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0945458933711052, "kl": 0.003655951237305999, "learning_rate": 4.829268292682927e-06, "loss": 0.0428, "num_tokens": 8586895.0, "reward": 1.8203125, "reward_std": 0.4410141110420227, "rewards/answer_reward/mean": 0.84375, "rewards/answer_reward/std": 0.3689020276069641, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.13258251547813416, "sampling/importance_sampling_ratio/max": 2.8365917205810547, "sampling/importance_sampling_ratio/mean": 0.9658568501472473, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.704138457775116, "sampling/sampling_logp_difference/mean": 0.022699303925037384, "step": 96, "step_time": 10.950943340547383 }, { "clip_ratio/high_max": 0.001443342596758157, "clip_ratio/high_mean": 0.00036083564918953925, "clip_ratio/low_mean": 0.00024416640371782705, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006050020529073663, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 222.90625, "completions/mean_terminated_length": 222.90625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.5080360323190689, "epoch": 0.47317073170731705, "frac_reward_zero_std": 0.6875, "grad_norm": 0.05043130740523338, "kl": 0.003999402892077342, "learning_rate": 4.785365853658537e-06, "loss": 0.0211, "num_tokens": 8675708.0, "reward": 1.7578125, "reward_std": 0.4282577931880951, "rewards/answer_reward/mean": 0.78125, "rewards/answer_reward/std": 0.420013427734375, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.13258251547813416, "sampling/importance_sampling_ratio/max": 2.9664976596832275, "sampling/importance_sampling_ratio/mean": 0.7789219617843628, "sampling/importance_sampling_ratio/min": 0.14035353064537048, "sampling/sampling_logp_difference/max": 0.8134393692016602, "sampling/sampling_logp_difference/mean": 0.023275621235370636, "step": 97, "step_time": 10.177360822912306 }, { "clip_ratio/high_max": 0.0034563980880193412, "clip_ratio/high_mean": 0.001050111444783397, "clip_ratio/low_mean": 0.0004221491180942394, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014722605556016788, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 243.5625, "completions/mean_terminated_length": 234.20001220703125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.44193674251437187, "epoch": 0.47804878048780486, "frac_reward_zero_std": 0.75, "grad_norm": 0.01877354271709919, "kl": 0.003897030052030459, "learning_rate": 4.741463414634146e-06, "loss": 0.0075, "num_tokens": 8770622.0, "reward": 1.7265625, "reward_std": 0.5585095882415771, "rewards/answer_reward/mean": 0.78125, "rewards/answer_reward/std": 0.420013427734375, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.21752247214317322, "sampling/importance_sampling_ratio/max": 2.518388032913208, "sampling/importance_sampling_ratio/mean": 0.6864780187606812, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.023024320602417, "sampling/sampling_logp_difference/mean": 0.020080845803022385, "step": 98, "step_time": 10.864618655759841 }, { "clip_ratio/high_max": 0.001923819538205862, "clip_ratio/high_mean": 0.0006095556891523302, "clip_ratio/low_mean": 0.0002857694635167718, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000895325152669102, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 213.59375, "completions/mean_terminated_length": 208.09677124023438, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.4617055393755436, "epoch": 0.48292682926829267, "frac_reward_zero_std": 0.75, "grad_norm": 0.030339037999510765, "kl": 0.003507912246277556, "learning_rate": 4.697560975609756e-06, "loss": -0.0363, "num_tokens": 8868737.0, "reward": 1.6875, "reward_std": 0.5350610613822937, "rewards/answer_reward/mean": 0.71875, "rewards/answer_reward/std": 0.45680341124534607, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 1.6665289402008057, "sampling/importance_sampling_ratio/mean": 0.7515348792076111, "sampling/importance_sampling_ratio/min": 0.20681850612163544, "sampling/sampling_logp_difference/max": 0.9155688285827637, "sampling/sampling_logp_difference/mean": 0.02345692552626133, "step": 99, "step_time": 11.170355724170804 }, { "clip_ratio/high_max": 0.0026964288554154336, "clip_ratio/high_mean": 0.0006741072138538584, "clip_ratio/low_mean": 0.00023763020726619288, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009117374211200513, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 227.25, "completions/mean_terminated_length": 222.19354248046875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.45347820222377777, "epoch": 0.4878048780487805, "frac_reward_zero_std": 0.75, "grad_norm": 0.039458584040403366, "kl": 0.003636793087935075, "learning_rate": 4.653658536585366e-06, "loss": 0.0376, "num_tokens": 8939931.0, "reward": 1.59375, "reward_std": 0.5599179267883301, "rewards/answer_reward/mean": 0.625, "rewards/answer_reward/std": 0.49186936020851135, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.3715121746063232, "sampling/importance_sampling_ratio/mean": 0.6750224828720093, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1543712615966797, "sampling/sampling_logp_difference/mean": 0.02053094655275345, "step": 100, "step_time": 10.468179046176374 }, { "clip_ratio/high_max": 0.0034591687144711614, "clip_ratio/high_mean": 0.0008647921786177903, "clip_ratio/low_mean": 0.0006940892490092665, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015588814276270568, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 203.625, "completions/mean_terminated_length": 203.625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.5068883001804352, "epoch": 0.4926829268292683, "frac_reward_zero_std": 0.8125, "grad_norm": 0.05366944521665573, "kl": 0.00340608402621001, "learning_rate": 4.609756097560976e-06, "loss": -0.0256, "num_tokens": 9031695.0, "reward": 1.84375, "reward_std": 0.3689020276069641, "rewards/answer_reward/mean": 0.84375, "rewards/answer_reward/std": 0.3689020276069641, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.3734490871429443, "sampling/importance_sampling_ratio/mean": 0.8445175886154175, "sampling/importance_sampling_ratio/min": 0.19141831994056702, "sampling/sampling_logp_difference/max": 1.3785693645477295, "sampling/sampling_logp_difference/mean": 0.023890763521194458, "step": 101, "step_time": 10.971125331241637 }, { "clip_ratio/high_max": 0.003029214160051197, "clip_ratio/high_mean": 0.0007573035400127992, "clip_ratio/low_mean": 0.000811911464552395, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015692150045651942, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 215.625, "completions/mean_terminated_length": 215.625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.5009712651371956, "epoch": 0.4975609756097561, "frac_reward_zero_std": 0.8125, "grad_norm": 0.062298521399497986, "kl": 0.0040636283229105175, "learning_rate": 4.565853658536585e-06, "loss": 0.0497, "num_tokens": 9120877.0, "reward": 1.71875, "reward_std": 0.45680341124534607, "rewards/answer_reward/mean": 0.71875, "rewards/answer_reward/std": 0.45680341124534607, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.3932013511657715, "sampling/importance_sampling_ratio/mean": 0.7262617945671082, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9279193878173828, "sampling/sampling_logp_difference/mean": 0.024571895599365234, "step": 102, "step_time": 11.2122079920955 }, { "clip_ratio/high_max": 0.0020691993995569646, "clip_ratio/high_mean": 0.0007405141514027491, "clip_ratio/low_mean": 0.000777995097450912, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015185092488536611, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 217.46875, "completions/mean_terminated_length": 206.36668395996094, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.4154631309211254, "epoch": 0.5024390243902439, "frac_reward_zero_std": 0.6875, "grad_norm": 0.031814295798540115, "kl": 0.005143697984749451, "learning_rate": 4.521951219512195e-06, "loss": -0.0569, "num_tokens": 9208352.0, "reward": 1.703125, "reward_std": 0.5765494704246521, "rewards/answer_reward/mean": 0.78125, "rewards/answer_reward/std": 0.420013427734375, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.24949544668197632, "sampling/importance_sampling_ratio/max": 2.823171615600586, "sampling/importance_sampling_ratio/mean": 0.8364578485488892, "sampling/importance_sampling_ratio/min": 0.11660793423652649, "sampling/sampling_logp_difference/max": 1.4307622909545898, "sampling/sampling_logp_difference/mean": 0.019061390310525894, "step": 103, "step_time": 10.77770367031917 }, { "clip_ratio/high_max": 0.0022522523067891598, "clip_ratio/high_mean": 0.0005630630766972899, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005630630766972899, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 198.6875, "completions/mean_terminated_length": 198.6875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.5059105902910233, "epoch": 0.5073170731707317, "frac_reward_zero_std": 0.9375, "grad_norm": 0.012009642086923122, "kl": 0.003522889281157404, "learning_rate": 4.478048780487805e-06, "loss": -0.0128, "num_tokens": 9310366.0, "reward": 1.84375, "reward_std": 0.3689020276069641, "rewards/answer_reward/mean": 0.84375, "rewards/answer_reward/std": 0.3689020276069641, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8698883056640625, "sampling/importance_sampling_ratio/mean": 0.895970344543457, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6851269006729126, "sampling/sampling_logp_difference/mean": 0.024666322395205498, "step": 104, "step_time": 10.658165893517435 }, { "clip_ratio/high_max": 0.0027775426860898733, "clip_ratio/high_mean": 0.0006943856715224683, "clip_ratio/low_mean": 0.00027980162121821195, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009741872709128074, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 230.21875, "completions/mean_terminated_length": 219.9666748046875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.4586643800139427, "epoch": 0.5121951219512195, "frac_reward_zero_std": 0.75, "grad_norm": 0.02510133571922779, "kl": 0.003927781945094466, "learning_rate": 4.434146341463415e-06, "loss": 0.0119, "num_tokens": 9394187.0, "reward": 1.71875, "reward_std": 0.507007360458374, "rewards/answer_reward/mean": 0.75, "rewards/answer_reward/std": 0.4399413466453552, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.12296734005212784, "sampling/importance_sampling_ratio/max": 2.5109217166900635, "sampling/importance_sampling_ratio/mean": 0.7595022916793823, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9189889430999756, "sampling/sampling_logp_difference/mean": 0.021806055679917336, "step": 105, "step_time": 10.741107920184731 }, { "clip_ratio/high_max": 0.0023434755858033895, "clip_ratio/high_mean": 0.0005858688964508474, "clip_ratio/low_mean": 0.00011281588376732543, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006986847802181728, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 240.375, "completions/mean_terminated_length": 230.80001831054688, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.4533752277493477, "epoch": 0.5170731707317073, "frac_reward_zero_std": 0.875, "grad_norm": 0.023528030142188072, "kl": 0.0034058827732224017, "learning_rate": 4.390243902439024e-06, "loss": 0.0079, "num_tokens": 9478317.0, "reward": 1.5, "reward_std": 0.6221709847450256, "rewards/answer_reward/mean": 0.5625, "rewards/answer_reward/std": 0.504016101360321, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "sampling/importance_sampling_ratio/max": 1.7741868495941162, "sampling/importance_sampling_ratio/mean": 0.5809928178787231, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9675875902175903, "sampling/sampling_logp_difference/mean": 0.02003411389887333, "step": 106, "step_time": 10.816990955267102 }, { "clip_ratio/high_max": 0.002154357935069129, "clip_ratio/high_mean": 0.0005385894837672822, "clip_ratio/low_mean": 0.0003177284525008872, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008563179362681694, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 219.15625, "completions/mean_terminated_length": 213.8386993408203, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.46243008598685265, "epoch": 0.5219512195121951, "frac_reward_zero_std": 0.625, "grad_norm": 0.05062209069728851, "kl": 0.0030750717560295016, "learning_rate": 4.346341463414634e-06, "loss": -0.0186, "num_tokens": 9563854.0, "reward": 1.625, "reward_std": 0.5388159155845642, "rewards/answer_reward/mean": 0.65625, "rewards/answer_reward/std": 0.4825586974620819, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.12296734005212784, "sampling/importance_sampling_ratio/max": 2.1464736461639404, "sampling/importance_sampling_ratio/mean": 0.6099013090133667, "sampling/importance_sampling_ratio/min": 0.07287362217903137, "sampling/sampling_logp_difference/max": 1.4211478233337402, "sampling/sampling_logp_difference/mean": 0.02165093645453453, "step": 107, "step_time": 10.410744367167354 }, { "clip_ratio/high_max": 0.0010245901066809893, "clip_ratio/high_mean": 0.0002561475266702473, "clip_ratio/low_mean": 0.00023275169223779812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004888992189080454, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 224.0, "completions/mean_terminated_length": 218.8386993408203, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.44984977319836617, "epoch": 0.526829268292683, "frac_reward_zero_std": 0.8125, "grad_norm": 0.0342860110104084, "kl": 0.0038403815706260502, "learning_rate": 4.302439024390244e-06, "loss": -0.0228, "num_tokens": 9643586.0, "reward": 1.796875, "reward_std": 0.43735596537590027, "rewards/answer_reward/mean": 0.8125, "rewards/answer_reward/std": 0.3965577781200409, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0883883461356163, "sampling/importance_sampling_ratio/max": 2.259862184524536, "sampling/importance_sampling_ratio/mean": 0.6345269680023193, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9070982933044434, "sampling/sampling_logp_difference/mean": 0.021662143990397453, "step": 108, "step_time": 11.185431754216552 }, { "clip_ratio/high_max": 0.001976073603145778, "clip_ratio/high_mean": 0.0004940184007864445, "clip_ratio/low_mean": 0.0004766064084833488, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009706248092697933, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 218.15625, "completions/mean_terminated_length": 218.15625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.537204310297966, "epoch": 0.5317073170731708, "frac_reward_zero_std": 0.75, "grad_norm": 0.04423275962471962, "kl": 0.0030827905575279146, "learning_rate": 4.258536585365854e-06, "loss": -0.0286, "num_tokens": 9733835.0, "reward": 1.6875, "reward_std": 0.4709290862083435, "rewards/answer_reward/mean": 0.6875, "rewards/answer_reward/std": 0.4709290862083435, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.1888351440429688, "sampling/importance_sampling_ratio/mean": 0.9907851219177246, "sampling/importance_sampling_ratio/min": 0.12779958546161652, "sampling/sampling_logp_difference/max": 0.9256432056427002, "sampling/sampling_logp_difference/mean": 0.023716870695352554, "step": 109, "step_time": 10.099970336072147 }, { "clip_ratio/high_max": 0.0014710002578794956, "clip_ratio/high_mean": 0.0005110986821819097, "clip_ratio/low_mean": 0.00028876848227810115, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007998671644600108, "completions/clipped_ratio": 0.09375, "completions/max_length": 384.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 234.09375, "completions/mean_terminated_length": 218.58621215820312, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.4140498898923397, "epoch": 0.5365853658536586, "frac_reward_zero_std": 0.625, "grad_norm": 0.037579748779535294, "kl": 0.003455500293057412, "learning_rate": 4.214634146341464e-06, "loss": -0.0501, "num_tokens": 9807832.0, "reward": 1.71875, "reward_std": 0.6342064142227173, "rewards/answer_reward/mean": 0.8125, "rewards/answer_reward/std": 0.3965577781200409, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "sampling/importance_sampling_ratio/max": 2.8250668048858643, "sampling/importance_sampling_ratio/mean": 1.0007641315460205, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3083149194717407, "sampling/sampling_logp_difference/mean": 0.01731729507446289, "step": 110, "step_time": 10.652048359625041 }, { "clip_ratio/high_max": 0.005276573763694614, "clip_ratio/high_mean": 0.0013191434409236535, "clip_ratio/low_mean": 0.0005677837980329059, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018869272244046442, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 234.40625, "completions/mean_terminated_length": 224.433349609375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.41416120156645775, "epoch": 0.5414634146341464, "frac_reward_zero_std": 0.625, "grad_norm": 0.0281074121594429, "kl": 0.0038687162450514734, "learning_rate": 4.170731707317074e-06, "loss": 0.0084, "num_tokens": 9895203.0, "reward": 1.5390625, "reward_std": 0.6027804613113403, "rewards/answer_reward/mean": 0.59375, "rewards/answer_reward/std": 0.49899089336395264, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.21752247214317322, "sampling/importance_sampling_ratio/max": 1.729832649230957, "sampling/importance_sampling_ratio/mean": 0.6059998869895935, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.1899821758270264, "sampling/sampling_logp_difference/mean": 0.01990685611963272, "step": 111, "step_time": 10.4667331520468 }, { "clip_ratio/high_max": 0.006085985864046961, "clip_ratio/high_mean": 0.0015214964660117403, "clip_ratio/low_mean": 0.0008812702144496143, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024027666804613546, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 221.25, "completions/mean_terminated_length": 221.25, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.4739711731672287, "epoch": 0.5463414634146342, "frac_reward_zero_std": 0.6875, "grad_norm": 0.039898503571748734, "kl": 0.0029072742036078125, "learning_rate": 4.126829268292683e-06, "loss": 0.0047, "num_tokens": 10001069.0, "reward": 1.84375, "reward_std": 0.3689020276069641, "rewards/answer_reward/mean": 0.84375, "rewards/answer_reward/std": 0.3689020276069641, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0548136234283447, "sampling/importance_sampling_ratio/mean": 0.7654436826705933, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9627161026000977, "sampling/sampling_logp_difference/mean": 0.02356475219130516, "step": 112, "step_time": 12.39280565874651 }, { "clip_ratio/high_max": 0.0037802170554641634, "clip_ratio/high_mean": 0.0009450542638660409, "clip_ratio/low_mean": 0.0005225171335041523, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014675713973701932, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 229.28125, "completions/mean_terminated_length": 218.9666748046875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.5093803778290749, "epoch": 0.551219512195122, "frac_reward_zero_std": 0.6875, "grad_norm": 0.04774067550897598, "kl": 0.0030522581073455513, "learning_rate": 4.082926829268293e-06, "loss": -0.0664, "num_tokens": 10091102.0, "reward": 1.515625, "reward_std": 0.5886613726615906, "rewards/answer_reward/mean": 0.5625, "rewards/answer_reward/std": 0.504016101360321, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.19507545232772827, "sampling/importance_sampling_ratio/max": 2.5854318141937256, "sampling/importance_sampling_ratio/mean": 0.7114850282669067, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8183166980743408, "sampling/sampling_logp_difference/mean": 0.02346753142774105, "step": 113, "step_time": 10.92712882021442 }, { "clip_ratio/high_max": 0.0046259483497124165, "clip_ratio/high_mean": 0.0014390545838978142, "clip_ratio/low_mean": 0.001063492993125692, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002502547562471591, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 230.53125, "completions/mean_terminated_length": 225.5806427001953, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.5068776197731495, "epoch": 0.5560975609756098, "frac_reward_zero_std": 0.5, "grad_norm": 0.07586206495761871, "kl": 0.0031996518664527684, "learning_rate": 4.039024390243903e-06, "loss": 0.0016, "num_tokens": 10197053.0, "reward": 1.703125, "reward_std": 0.48955830931663513, "rewards/answer_reward/mean": 0.71875, "rewards/answer_reward/std": 0.45680341124534607, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0883883461356163, "sampling/importance_sampling_ratio/max": 2.257267475128174, "sampling/importance_sampling_ratio/mean": 0.8596147894859314, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0596094131469727, "sampling/sampling_logp_difference/mean": 0.02311866357922554, "step": 114, "step_time": 10.546700282488018 }, { "clip_ratio/high_max": 0.005915228626690805, "clip_ratio/high_mean": 0.0014788071566727012, "clip_ratio/low_mean": 0.0007361335738096386, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022149407159304246, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 224.125, "completions/mean_terminated_length": 218.96774291992188, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.4833877235651016, "epoch": 0.5609756097560976, "frac_reward_zero_std": 0.6875, "grad_norm": 0.045443978160619736, "kl": 0.003565110295312479, "learning_rate": 3.995121951219513e-06, "loss": 0.0728, "num_tokens": 10285301.0, "reward": 1.75, "reward_std": 0.5080004930496216, "rewards/answer_reward/mean": 0.78125, "rewards/answer_reward/std": 0.420013427734375, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.324744462966919, "sampling/importance_sampling_ratio/mean": 0.755713701248169, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3630270957946777, "sampling/sampling_logp_difference/mean": 0.022455114871263504, "step": 115, "step_time": 10.68481420353055 }, { "clip_ratio/high_max": 0.004226940043736249, "clip_ratio/high_mean": 0.0012238473136676475, "clip_ratio/low_mean": 0.00016622339899186045, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001390070712659508, "completions/clipped_ratio": 0.09375, "completions/max_length": 384.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 218.4375, "completions/mean_terminated_length": 201.3103485107422, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.40984179824590683, "epoch": 0.5658536585365853, "frac_reward_zero_std": 0.6875, "grad_norm": 0.05216742679476738, "kl": 0.0031304286676459014, "learning_rate": 3.951219512195122e-06, "loss": -0.0299, "num_tokens": 10369077.0, "reward": 1.46875, "reward_std": 0.671271026134491, "rewards/answer_reward/mean": 0.5625, "rewards/answer_reward/std": 0.504016101360321, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "sampling/importance_sampling_ratio/max": 2.520874500274658, "sampling/importance_sampling_ratio/mean": 0.8263678550720215, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0991706848144531, "sampling/sampling_logp_difference/mean": 0.017945710569620132, "step": 116, "step_time": 11.567763729952276 }, { "clip_ratio/high_max": 0.00234023091616109, "clip_ratio/high_mean": 0.0005850577290402725, "clip_ratio/low_mean": 0.0006649597198702395, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001250017448910512, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 218.65625, "completions/mean_terminated_length": 218.65625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.4755069352686405, "epoch": 0.5707317073170731, "frac_reward_zero_std": 0.8125, "grad_norm": 0.023121824488043785, "kl": 0.003578370378818363, "learning_rate": 3.907317073170732e-06, "loss": -0.0138, "num_tokens": 10456690.0, "reward": 1.71875, "reward_std": 0.45680341124534607, "rewards/answer_reward/mean": 0.71875, "rewards/answer_reward/std": 0.45680341124534607, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7583324909210205, "sampling/importance_sampling_ratio/mean": 0.8466827273368835, "sampling/importance_sampling_ratio/min": 0.1014837995171547, "sampling/sampling_logp_difference/max": 0.5709977149963379, "sampling/sampling_logp_difference/mean": 0.022252600640058517, "step": 117, "step_time": 11.18006874481216 }, { "clip_ratio/high_max": 0.001953125, "clip_ratio/high_mean": 0.00048828125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00048828125, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 226.6875, "completions/mean_terminated_length": 221.61289978027344, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.4270494468510151, "epoch": 0.5756097560975609, "frac_reward_zero_std": 0.9375, "grad_norm": 0.018269840627908707, "kl": 0.003924514807295054, "learning_rate": 3.8634146341463415e-06, "loss": 0.0073, "num_tokens": 10547272.0, "reward": 1.75, "reward_std": 0.5080004930496216, "rewards/answer_reward/mean": 0.78125, "rewards/answer_reward/std": 0.420013427734375, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.1939961910247803, "sampling/importance_sampling_ratio/mean": 0.7488278150558472, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.6323447227478027, "sampling/sampling_logp_difference/mean": 0.02143348939716816, "step": 118, "step_time": 10.804527630563825 }, { "clip_ratio/high_max": 0.004367877379991114, "clip_ratio/high_mean": 0.001190239767311141, "clip_ratio/low_mean": 0.0009750968310981989, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021653365838574246, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 227.09375, "completions/mean_terminated_length": 222.03225708007812, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.5322628989815712, "epoch": 0.5804878048780487, "frac_reward_zero_std": 0.75, "grad_norm": 0.05473540723323822, "kl": 0.0033228147658519447, "learning_rate": 3.8195121951219515e-06, "loss": -0.0287, "num_tokens": 10662725.0, "reward": 1.765625, "reward_std": 0.4576302170753479, "rewards/answer_reward/mean": 0.78125, "rewards/answer_reward/std": 0.420013427734375, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0883883461356163, "sampling/importance_sampling_ratio/max": 2.531384229660034, "sampling/importance_sampling_ratio/mean": 0.8767915964126587, "sampling/importance_sampling_ratio/min": 0.11986883729696274, "sampling/sampling_logp_difference/max": 0.7191965579986572, "sampling/sampling_logp_difference/mean": 0.024603866040706635, "step": 119, "step_time": 11.137315727304667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00019171778694726527, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019171778694726527, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 224.875, "completions/mean_terminated_length": 214.2666778564453, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.4663756862282753, "epoch": 0.5853658536585366, "frac_reward_zero_std": 0.75, "grad_norm": 0.04449266567826271, "kl": 0.00401684595271945, "learning_rate": 3.775609756097561e-06, "loss": -0.0302, "num_tokens": 10745827.0, "reward": 1.5625, "reward_std": 0.6189220547676086, "rewards/answer_reward/mean": 0.625, "rewards/answer_reward/std": 0.49186936020851135, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "sampling/importance_sampling_ratio/max": 2.604609489440918, "sampling/importance_sampling_ratio/mean": 0.9018092751502991, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3791694641113281, "sampling/sampling_logp_difference/mean": 0.021657606586813927, "step": 120, "step_time": 10.465225329156965 }, { "clip_ratio/high_max": 0.009260594088118523, "clip_ratio/high_mean": 0.0030414619686780497, "clip_ratio/low_mean": 0.001276788898394443, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004318250867072493, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 221.125, "completions/mean_terminated_length": 221.125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.4963403381407261, "epoch": 0.5902439024390244, "frac_reward_zero_std": 0.625, "grad_norm": 0.036630306392908096, "kl": 0.0032677398121450096, "learning_rate": 3.7317073170731705e-06, "loss": 0.028, "num_tokens": 10847923.0, "reward": 1.6875, "reward_std": 0.4709290862083435, "rewards/answer_reward/mean": 0.6875, "rewards/answer_reward/std": 0.4709290862083435, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0588531494140625, "sampling/importance_sampling_ratio/mean": 0.7000945210456848, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.8319474458694458, "sampling/sampling_logp_difference/mean": 0.02460411563515663, "step": 121, "step_time": 11.072220541071147 }, { "clip_ratio/high_max": 0.0009293680195696652, "clip_ratio/high_mean": 0.0002323420048924163, "clip_ratio/low_mean": 0.0006901369197294116, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009224789246218279, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 224.0625, "completions/mean_terminated_length": 218.90321350097656, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.47787731885910034, "epoch": 0.5951219512195122, "frac_reward_zero_std": 0.75, "grad_norm": 0.03142237663269043, "kl": 0.0036227755481377244, "learning_rate": 3.6878048780487804e-06, "loss": 0.0089, "num_tokens": 10933545.0, "reward": 1.65625, "reward_std": 0.5453247427940369, "rewards/answer_reward/mean": 0.6875, "rewards/answer_reward/std": 0.4709290862083435, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.7272655963897705, "sampling/importance_sampling_ratio/mean": 0.916573703289032, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8603172302246094, "sampling/sampling_logp_difference/mean": 0.02060486376285553, "step": 122, "step_time": 10.540304779540747 }, { "clip_ratio/high_max": 0.003167753166053444, "clip_ratio/high_mean": 0.0009327040752395988, "clip_ratio/low_mean": 0.00013130252773407847, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001064006588421762, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 225.21875, "completions/mean_terminated_length": 220.09677124023438, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.4701153337955475, "epoch": 0.6, "frac_reward_zero_std": 0.6875, "grad_norm": 0.03809630870819092, "kl": 0.00421301374444738, "learning_rate": 3.6439024390243908e-06, "loss": 0.0141, "num_tokens": 11012568.0, "reward": 1.59375, "reward_std": 0.5599179267883301, "rewards/answer_reward/mean": 0.625, "rewards/answer_reward/std": 0.49186936020851135, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.0695483684539795, "sampling/importance_sampling_ratio/mean": 0.7202502489089966, "sampling/importance_sampling_ratio/min": 0.09134870022535324, "sampling/sampling_logp_difference/max": 1.0061416625976562, "sampling/sampling_logp_difference/mean": 0.021042723208665848, "step": 123, "step_time": 11.365232598036528 }, { "clip_ratio/high_max": 0.0006613756413571537, "clip_ratio/high_mean": 0.0001653439103392884, "clip_ratio/low_mean": 8.138021075865254e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024672412109794095, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 210.375, "completions/mean_terminated_length": 204.77418518066406, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.44315605983138084, "epoch": 0.6048780487804878, "frac_reward_zero_std": 0.875, "grad_norm": 0.03819752112030983, "kl": 0.003705405193613842, "learning_rate": 3.6000000000000003e-06, "loss": 0.0231, "num_tokens": 11088576.0, "reward": 1.90625, "reward_std": 0.39015093445777893, "rewards/answer_reward/mean": 0.9375, "rewards/answer_reward/std": 0.24593468010425568, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.209427833557129, "sampling/importance_sampling_ratio/mean": 0.7784409523010254, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2500107288360596, "sampling/sampling_logp_difference/mean": 0.021527979522943497, "step": 124, "step_time": 10.225236669648439 }, { "clip_ratio/high_max": 0.0018193323630839586, "clip_ratio/high_mean": 0.00045483309077098966, "clip_ratio/low_mean": 0.00013130252773407847, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005861356185050681, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 206.40625, "completions/mean_terminated_length": 206.40625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.46003472432494164, "epoch": 0.6097560975609756, "frac_reward_zero_std": 0.875, "grad_norm": 0.01813965104520321, "kl": 0.003409226337680593, "learning_rate": 3.55609756097561e-06, "loss": 0.0181, "num_tokens": 11172181.0, "reward": 1.6875, "reward_std": 0.4709290862083435, "rewards/answer_reward/mean": 0.6875, "rewards/answer_reward/std": 0.4709290862083435, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.056746482849121, "sampling/importance_sampling_ratio/mean": 0.5680859088897705, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9758151769638062, "sampling/sampling_logp_difference/mean": 0.022623809054493904, "step": 125, "step_time": 10.058586265426129 }, { "clip_ratio/high_max": 0.003505508473608643, "clip_ratio/high_mean": 0.0012168505927547812, "clip_ratio/low_mean": 0.0006495131674455479, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018663637602003291, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 231.59375, "completions/mean_terminated_length": 221.433349609375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.48817386850714684, "epoch": 0.6146341463414634, "frac_reward_zero_std": 0.5625, "grad_norm": 0.054380886256694794, "kl": 0.0036174336564727128, "learning_rate": 3.5121951219512197e-06, "loss": -0.0804, "num_tokens": 11270594.0, "reward": 1.546875, "reward_std": 0.5869463682174683, "rewards/answer_reward/mean": 0.59375, "rewards/answer_reward/std": 0.49899089336395264, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.19507545232772827, "sampling/importance_sampling_ratio/max": 2.810664176940918, "sampling/importance_sampling_ratio/mean": 0.7244150638580322, "sampling/importance_sampling_ratio/min": 0.11591929942369461, "sampling/sampling_logp_difference/max": 1.476397156715393, "sampling/sampling_logp_difference/mean": 0.02223159372806549, "step": 126, "step_time": 10.953866918105632 }, { "clip_ratio/high_max": 0.0028710802434943616, "clip_ratio/high_mean": 0.0007177700608735904, "clip_ratio/low_mean": 0.0001302083401242271, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008479784009978175, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 220.65625, "completions/mean_terminated_length": 220.65625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.46637769788503647, "epoch": 0.6195121951219512, "frac_reward_zero_std": 0.875, "grad_norm": 0.014444850385189056, "kl": 0.003776507204747759, "learning_rate": 3.4682926829268296e-06, "loss": 0.0233, "num_tokens": 11358853.0, "reward": 1.9375, "reward_std": 0.24593468010425568, "rewards/answer_reward/mean": 0.9375, "rewards/answer_reward/std": 0.24593468010425568, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9464449882507324, "sampling/importance_sampling_ratio/mean": 0.8962424993515015, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0486798286437988, "sampling/sampling_logp_difference/mean": 0.02154400572180748, "step": 127, "step_time": 10.519520456437021 }, { "clip_ratio/high_max": 0.001828515058150515, "clip_ratio/high_mean": 0.00045712876453762874, "clip_ratio/low_mean": 0.0006179961856105365, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010751249574241228, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 215.96875, "completions/mean_terminated_length": 215.96875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.46675438433885574, "epoch": 0.624390243902439, "frac_reward_zero_std": 0.75, "grad_norm": 1.4395884275436401, "kl": 0.1061126311251428, "learning_rate": 3.424390243902439e-06, "loss": -0.0059, "num_tokens": 11460156.0, "reward": 1.8125, "reward_std": 0.3965577781200409, "rewards/answer_reward/mean": 0.8125, "rewards/answer_reward/std": 0.3965577781200409, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.2055585384368896, "sampling/importance_sampling_ratio/mean": 0.823553740978241, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1067419052124023, "sampling/sampling_logp_difference/mean": 0.022794220596551895, "step": 128, "step_time": 11.482311175670475 }, { "clip_ratio/high_max": 0.0018533984548412263, "clip_ratio/high_mean": 0.0004633496137103066, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004633496137103066, "completions/clipped_ratio": 0.09375, "completions/max_length": 384.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 230.6875, "completions/mean_terminated_length": 214.8275909423828, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.4642071984708309, "epoch": 0.6292682926829268, "frac_reward_zero_std": 0.75, "grad_norm": 0.042642008513212204, "kl": 0.002690551817067899, "learning_rate": 3.380487804878049e-06, "loss": -0.021, "num_tokens": 11550158.0, "reward": 1.5078125, "reward_std": 0.655265212059021, "rewards/answer_reward/mean": 0.59375, "rewards/answer_reward/std": 0.49899089336395264, "rewards/format_reward/mean": 0.9140625, "rewards/format_reward/std": 0.27393028140068054, "sampling/importance_sampling_ratio/max": 2.8556809425354004, "sampling/importance_sampling_ratio/mean": 0.8294768333435059, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2607998847961426, "sampling/sampling_logp_difference/mean": 0.019794989377260208, "step": 129, "step_time": 10.867440742440522 }, { "clip_ratio/high_max": 0.004882233042735606, "clip_ratio/high_mean": 0.0014196028496371582, "clip_ratio/low_mean": 0.00019778481510002166, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016173876647371799, "completions/clipped_ratio": 0.125, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 249.40625, "completions/mean_terminated_length": 230.1785888671875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.3816651366651058, "epoch": 0.6341463414634146, "frac_reward_zero_std": 0.75, "grad_norm": 0.029238134622573853, "kl": 0.004339955165050924, "learning_rate": 3.3365853658536586e-06, "loss": -0.0113, "num_tokens": 11632851.0, "reward": 1.5, "reward_std": 0.7071067690849304, "rewards/answer_reward/mean": 0.625, "rewards/answer_reward/std": 0.49186936020851135, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3110854923725128, "sampling/importance_sampling_ratio/max": 2.8255879878997803, "sampling/importance_sampling_ratio/mean": 0.6691219210624695, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.142876386642456, "sampling/sampling_logp_difference/mean": 0.019072573632001877, "step": 130, "step_time": 11.07424188265577 }, { "clip_ratio/high_max": 0.0006510416860692203, "clip_ratio/high_mean": 0.00016276042151730508, "clip_ratio/low_mean": 0.00025510202976875007, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00041786245128605515, "completions/clipped_ratio": 0.125, "completions/max_length": 384.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 233.125, "completions/mean_terminated_length": 211.57144165039062, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.42053060978651047, "epoch": 0.6390243902439025, "frac_reward_zero_std": 0.875, "grad_norm": 0.007440975401550531, "kl": 0.00404884124873206, "learning_rate": 3.2926829268292685e-06, "loss": 0.0076, "num_tokens": 11723563.0, "reward": 1.546875, "reward_std": 0.6881412267684937, "rewards/answer_reward/mean": 0.65625, "rewards/answer_reward/std": 0.4825586974620819, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.30412155389785767, "sampling/importance_sampling_ratio/max": 2.5075881481170654, "sampling/importance_sampling_ratio/mean": 0.6842648983001709, "sampling/importance_sampling_ratio/min": 0.1368464231491089, "sampling/sampling_logp_difference/max": 1.730358600616455, "sampling/sampling_logp_difference/mean": 0.018715228885412216, "step": 131, "step_time": 10.914618720300496 }, { "clip_ratio/high_max": 0.003694373823236674, "clip_ratio/high_mean": 0.0009235934558091685, "clip_ratio/low_mean": 0.000378999269742053, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013025927255512215, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 231.09375, "completions/mean_terminated_length": 220.90000915527344, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.5026778057217598, "epoch": 0.6439024390243903, "frac_reward_zero_std": 0.6875, "grad_norm": 0.043961189687252045, "kl": 0.0033653820282779634, "learning_rate": 3.248780487804878e-06, "loss": 0.0244, "num_tokens": 11805812.0, "reward": 1.5625, "reward_std": 0.6189220547676086, "rewards/answer_reward/mean": 0.625, "rewards/answer_reward/std": 0.49186936020851135, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "sampling/importance_sampling_ratio/max": 2.094510793685913, "sampling/importance_sampling_ratio/mean": 0.9129068851470947, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.971165657043457, "sampling/sampling_logp_difference/mean": 0.02147020772099495, "step": 132, "step_time": 10.484863319899887 }, { "clip_ratio/high_max": 0.004804711497854441, "clip_ratio/high_mean": 0.0015614084113622084, "clip_ratio/low_mean": 0.0005983521550660953, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021597605664283037, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 238.21875, "completions/mean_terminated_length": 233.51612854003906, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.4460136108100414, "epoch": 0.6487804878048781, "frac_reward_zero_std": 0.6875, "grad_norm": 0.049723897129297256, "kl": 0.0024255999305751175, "learning_rate": 3.204878048780488e-06, "loss": -0.0367, "num_tokens": 11889023.0, "reward": 1.6875, "reward_std": 0.5350610613822937, "rewards/answer_reward/mean": 0.71875, "rewards/answer_reward/std": 0.45680341124534607, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.679922580718994, "sampling/importance_sampling_ratio/mean": 0.7380883097648621, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.863440990447998, "sampling/sampling_logp_difference/mean": 0.019735511392354965, "step": 133, "step_time": 11.252778528258204 }, { "clip_ratio/high_max": 0.0007204610737971961, "clip_ratio/high_mean": 0.00018011526844929904, "clip_ratio/low_mean": 0.00016108246927615255, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003411977377254516, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 206.125, "completions/mean_terminated_length": 206.125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.498601246625185, "epoch": 0.6536585365853659, "frac_reward_zero_std": 0.9375, "grad_norm": 0.06702587753534317, "kl": 0.002942003571661189, "learning_rate": 3.1609756097560974e-06, "loss": -0.0908, "num_tokens": 11988781.0, "reward": 1.59375, "reward_std": 0.498990923166275, "rewards/answer_reward/mean": 0.59375, "rewards/answer_reward/std": 0.49899089336395264, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.86968994140625, "sampling/importance_sampling_ratio/mean": 0.9838160276412964, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.591756820678711, "sampling/sampling_logp_difference/mean": 0.025535499677062035, "step": 134, "step_time": 11.168831198941916 }, { "clip_ratio/high_max": 0.0028986767574679106, "clip_ratio/high_mean": 0.0009513982440694235, "clip_ratio/low_mean": 8.138021075865254e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001032778454828076, "completions/clipped_ratio": 0.15625, "completions/max_length": 384.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 232.09375, "completions/mean_terminated_length": 203.9629669189453, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.4309198185801506, "epoch": 0.6585365853658537, "frac_reward_zero_std": 0.625, "grad_norm": 0.03864120692014694, "kl": 0.004309304436901584, "learning_rate": 3.1170731707317074e-06, "loss": -0.0264, "num_tokens": 12075874.0, "reward": 1.453125, "reward_std": 0.7335219383239746, "rewards/answer_reward/mean": 0.59375, "rewards/answer_reward/std": 0.49899089336395264, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3415895998477936, "sampling/importance_sampling_ratio/max": 2.3004274368286133, "sampling/importance_sampling_ratio/mean": 0.7384710311889648, "sampling/importance_sampling_ratio/min": 0.18541933596134186, "sampling/sampling_logp_difference/max": 1.1373701095581055, "sampling/sampling_logp_difference/mean": 0.02040836215019226, "step": 135, "step_time": 11.194369727745652 }, { "clip_ratio/high_max": 0.005996063293423504, "clip_ratio/high_mean": 0.001499015823355876, "clip_ratio/low_mean": 0.0006908060167916119, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002189821840147488, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 211.4375, "completions/mean_terminated_length": 211.4375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.47248073294758797, "epoch": 0.6634146341463415, "frac_reward_zero_std": 0.75, "grad_norm": 0.04862526059150696, "kl": 0.0037692947225878015, "learning_rate": 3.0731707317073173e-06, "loss": 0.0957, "num_tokens": 12167948.0, "reward": 1.875, "reward_std": 0.33601075410842896, "rewards/answer_reward/mean": 0.875, "rewards/answer_reward/std": 0.33601075410842896, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.245432138442993, "sampling/importance_sampling_ratio/mean": 0.9704145193099976, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.6443252563476562, "sampling/sampling_logp_difference/mean": 0.022767890244722366, "step": 136, "step_time": 10.50652605202049 }, { "clip_ratio/high_max": 0.00389592454303056, "clip_ratio/high_mean": 0.00097398113575764, "clip_ratio/low_mean": 0.0008930685216910206, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018670496574486606, "completions/clipped_ratio": 0.09375, "completions/max_length": 384.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 234.71875, "completions/mean_terminated_length": 219.27586364746094, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3827534466981888, "epoch": 0.6682926829268293, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10608888417482376, "kl": 0.004333636068622582, "learning_rate": 3.0292682926829272e-06, "loss": -0.0423, "num_tokens": 12267075.0, "reward": 1.65625, "reward_std": 0.6530017852783203, "rewards/answer_reward/mean": 0.75, "rewards/answer_reward/std": 0.4399413466453552, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "sampling/importance_sampling_ratio/max": 2.722170114517212, "sampling/importance_sampling_ratio/mean": 0.9099236130714417, "sampling/importance_sampling_ratio/min": 0.044184040278196335, "sampling/sampling_logp_difference/max": 2.6253199577331543, "sampling/sampling_logp_difference/mean": 0.017766209319233894, "step": 137, "step_time": 11.628058591391891 }, { "clip_ratio/high_max": 0.004756631446070969, "clip_ratio/high_mean": 0.0011891578615177423, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011891578615177423, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 228.59375, "completions/mean_terminated_length": 228.59375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.5243602283298969, "epoch": 0.6731707317073171, "frac_reward_zero_std": 0.8125, "grad_norm": 0.055757828056812286, "kl": 0.0035456281038932502, "learning_rate": 2.9853658536585367e-06, "loss": 0.0673, "num_tokens": 12351124.0, "reward": 1.71875, "reward_std": 0.45680341124534607, "rewards/answer_reward/mean": 0.71875, "rewards/answer_reward/std": 0.45680341124534607, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.767862319946289, "sampling/importance_sampling_ratio/mean": 0.9563580751419067, "sampling/importance_sampling_ratio/min": 0.006445834878832102, "sampling/sampling_logp_difference/max": 2.493523359298706, "sampling/sampling_logp_difference/mean": 0.023791082203388214, "step": 138, "step_time": 11.12048377096653 }, { "clip_ratio/high_max": 0.0025061554624699056, "clip_ratio/high_mean": 0.0006265388656174764, "clip_ratio/low_mean": 0.00042493898945394903, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010514778550714254, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 220.40625, "completions/mean_terminated_length": 215.1290283203125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.46175797283649445, "epoch": 0.6780487804878049, "frac_reward_zero_std": 0.75, "grad_norm": 0.039455242455005646, "kl": 0.0029219040006864816, "learning_rate": 2.9414634146341466e-06, "loss": -0.0467, "num_tokens": 12439315.0, "reward": 1.59375, "reward_std": 0.5599179267883301, "rewards/answer_reward/mean": 0.625, "rewards/answer_reward/std": 0.49186936020851135, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.883561611175537, "sampling/importance_sampling_ratio/mean": 0.883018970489502, "sampling/importance_sampling_ratio/min": 0.11348459869623184, "sampling/sampling_logp_difference/max": 0.7379424571990967, "sampling/sampling_logp_difference/mean": 0.020442981272935867, "step": 139, "step_time": 10.27398874470964 }, { "clip_ratio/high_max": 0.0015050054644234478, "clip_ratio/high_mean": 0.0005294376314850524, "clip_ratio/low_mean": 0.001167011185316369, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016964488168014213, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 215.90625, "completions/mean_terminated_length": 215.90625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.5045613572001457, "epoch": 0.6829268292682927, "frac_reward_zero_std": 0.625, "grad_norm": 0.3066153824329376, "kl": 0.04518254725553561, "learning_rate": 2.897560975609756e-06, "loss": 0.0613, "num_tokens": 12533770.0, "reward": 1.625, "reward_std": 0.49186936020851135, "rewards/answer_reward/mean": 0.625, "rewards/answer_reward/std": 0.49186936020851135, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.816474676132202, "sampling/importance_sampling_ratio/mean": 0.8181303143501282, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.568915605545044, "sampling/sampling_logp_difference/mean": 0.023272626101970673, "step": 140, "step_time": 10.978160293307155 }, { "clip_ratio/high_max": 0.00503623834811151, "clip_ratio/high_mean": 0.0012590595870278776, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012590595870278776, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 222.21875, "completions/mean_terminated_length": 211.433349609375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.4124786965548992, "epoch": 0.6878048780487804, "frac_reward_zero_std": 0.8125, "grad_norm": 0.04376315698027611, "kl": 0.004006432747701183, "learning_rate": 2.853658536585366e-06, "loss": 0.0237, "num_tokens": 12614955.0, "reward": 1.5390625, "reward_std": 0.6027804613113403, "rewards/answer_reward/mean": 0.59375, "rewards/answer_reward/std": 0.49899089336395264, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.21752247214317322, "sampling/importance_sampling_ratio/max": 2.7897555828094482, "sampling/importance_sampling_ratio/mean": 0.9077985882759094, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.9469695091247559, "sampling/sampling_logp_difference/mean": 0.020155835896730423, "step": 141, "step_time": 10.82123430725187 }, { "clip_ratio/high_max": 0.003968397853896022, "clip_ratio/high_mean": 0.0012801179545931518, "clip_ratio/low_mean": 0.00023242230963660404, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015125402715057135, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 222.59375, "completions/mean_terminated_length": 222.59375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.5102537907660007, "epoch": 0.6926829268292682, "frac_reward_zero_std": 0.75, "grad_norm": 0.028739968314766884, "kl": 0.003998424712335691, "learning_rate": 2.8097560975609756e-06, "loss": -0.0352, "num_tokens": 12701166.0, "reward": 1.8125, "reward_std": 0.3965577781200409, "rewards/answer_reward/mean": 0.8125, "rewards/answer_reward/std": 0.3965577781200409, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5964202880859375, "sampling/importance_sampling_ratio/mean": 0.7092758417129517, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7761952877044678, "sampling/sampling_logp_difference/mean": 0.022487960755825043, "step": 142, "step_time": 11.065863597672433 }, { "clip_ratio/high_max": 0.0017301483021583408, "clip_ratio/high_mean": 0.000566084090678487, "clip_ratio/low_mean": 0.0002530364436097443, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008191205342882313, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 227.28125, "completions/mean_terminated_length": 216.83334350585938, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.4699244871735573, "epoch": 0.697560975609756, "frac_reward_zero_std": 0.625, "grad_norm": 0.075437031686306, "kl": 0.0032705249614082277, "learning_rate": 2.7658536585365855e-06, "loss": -0.1119, "num_tokens": 12800559.0, "reward": 1.5703125, "reward_std": 0.5998466610908508, "rewards/answer_reward/mean": 0.625, "rewards/answer_reward/std": 0.49186936020851135, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.21752247214317322, "sampling/importance_sampling_ratio/max": 2.9849693775177, "sampling/importance_sampling_ratio/mean": 1.0019469261169434, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8746500015258789, "sampling/sampling_logp_difference/mean": 0.021881382912397385, "step": 143, "step_time": 11.326349475421011 }, { "clip_ratio/high_max": 0.0031508540268987417, "clip_ratio/high_mean": 0.0007877135067246854, "clip_ratio/low_mean": 0.00036376923526404426, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011514827492646873, "completions/clipped_ratio": 0.09375, "completions/max_length": 384.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 238.0, "completions/mean_terminated_length": 222.89654541015625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.48805396631360054, "epoch": 0.7024390243902439, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0328507125377655, "kl": 0.0029778576572425663, "learning_rate": 2.721951219512195e-06, "loss": -0.0203, "num_tokens": 12892757.0, "reward": 1.4765625, "reward_std": 0.6548804640769958, "rewards/answer_reward/mean": 0.5625, "rewards/answer_reward/std": 0.504016101360321, "rewards/format_reward/mean": 0.9140625, "rewards/format_reward/std": 0.27393028140068054, "sampling/importance_sampling_ratio/max": 2.15743088722229, "sampling/importance_sampling_ratio/mean": 0.5990031361579895, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0977864265441895, "sampling/sampling_logp_difference/mean": 0.021314745768904686, "step": 144, "step_time": 10.480542997829616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0003070119782933034, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003070119782933034, "completions/clipped_ratio": 0.09375, "completions/max_length": 384.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 242.90625, "completions/mean_terminated_length": 228.3103485107422, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.40603329613804817, "epoch": 0.7073170731707317, "frac_reward_zero_std": 0.75, "grad_norm": 0.04806957393884659, "kl": 0.009593348455382511, "learning_rate": 2.678048780487805e-06, "loss": -0.0613, "num_tokens": 12977768.0, "reward": 1.71875, "reward_std": 0.6342064142227173, "rewards/answer_reward/mean": 0.8125, "rewards/answer_reward/std": 0.3965577781200409, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2961445748806, "sampling/importance_sampling_ratio/max": 2.6840295791625977, "sampling/importance_sampling_ratio/mean": 1.0137289762496948, "sampling/importance_sampling_ratio/min": 0.02219373546540737, "sampling/sampling_logp_difference/max": 2.5798792839050293, "sampling/sampling_logp_difference/mean": 0.018330439925193787, "step": 145, "step_time": 11.116572443861514 }, { "clip_ratio/high_max": 0.00285246194107458, "clip_ratio/high_mean": 0.0008361469808733091, "clip_ratio/low_mean": 0.00016276042151730508, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009989074023906142, "completions/clipped_ratio": 0.15625, "completions/max_length": 384.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 239.40625, "completions/mean_terminated_length": 212.629638671875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.40028978139162064, "epoch": 0.7121951219512195, "frac_reward_zero_std": 0.6875, "grad_norm": 0.04945327341556549, "kl": 0.005656325462041423, "learning_rate": 2.6341463414634145e-06, "loss": -0.015, "num_tokens": 13071723.0, "reward": 1.2890625, "reward_std": 0.7159174084663391, "rewards/answer_reward/mean": 0.4375, "rewards/answer_reward/std": 0.504016101360321, "rewards/format_reward/mean": 0.8515625, "rewards/format_reward/std": 0.3527505695819855, "sampling/importance_sampling_ratio/max": 2.8069207668304443, "sampling/importance_sampling_ratio/mean": 0.920677661895752, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.0694570541381836, "sampling/sampling_logp_difference/mean": 0.018220534548163414, "step": 146, "step_time": 10.47071039583534 }, { "clip_ratio/high_max": 0.0014992537908256054, "clip_ratio/high_mean": 0.00037481344770640135, "clip_ratio/low_mean": 0.0006648323833360337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001039645831042435, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 213.625, "completions/mean_terminated_length": 202.2666778564453, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.442078597843647, "epoch": 0.7170731707317073, "frac_reward_zero_std": 0.6875, "grad_norm": 0.055851008743047714, "kl": 0.0032310287933796644, "learning_rate": 2.5902439024390244e-06, "loss": -0.0049, "num_tokens": 13156221.0, "reward": 1.5625, "reward_std": 0.6189220547676086, "rewards/answer_reward/mean": 0.625, "rewards/answer_reward/std": 0.49186936020851135, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "sampling/importance_sampling_ratio/max": 2.4473471641540527, "sampling/importance_sampling_ratio/mean": 0.9011058211326599, "sampling/importance_sampling_ratio/min": 0.08750143647193909, "sampling/sampling_logp_difference/max": 0.794292688369751, "sampling/sampling_logp_difference/mean": 0.01944834552705288, "step": 147, "step_time": 10.458699013106525 }, { "clip_ratio/high_max": 0.0018730179872363806, "clip_ratio/high_mean": 0.00046825449680909514, "clip_ratio/low_mean": 0.00042951702198479325, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008977715187938884, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 200.6875, "completions/mean_terminated_length": 194.77418518066406, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.428194060921669, "epoch": 0.7219512195121951, "frac_reward_zero_std": 0.8125, "grad_norm": 0.04639339819550514, "kl": 0.003817728327703662, "learning_rate": 2.546341463414634e-06, "loss": -0.049, "num_tokens": 13231133.0, "reward": 1.8125, "reward_std": 0.4709290862083435, "rewards/answer_reward/mean": 0.84375, "rewards/answer_reward/std": 0.3689020276069641, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.7002227306365967, "sampling/importance_sampling_ratio/mean": 0.8296744227409363, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.6325256824493408, "sampling/sampling_logp_difference/mean": 0.02209526114165783, "step": 148, "step_time": 10.671683336608112 }, { "clip_ratio/high_max": 0.0028769505443051457, "clip_ratio/high_mean": 0.0011856555647682399, "clip_ratio/low_mean": 0.0002530265337554738, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014386820839717984, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 224.25, "completions/mean_terminated_length": 224.25, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.485433392226696, "epoch": 0.7268292682926829, "frac_reward_zero_std": 0.6875, "grad_norm": 0.06338957697153091, "kl": 0.0033437050005886704, "learning_rate": 2.5024390243902442e-06, "loss": 0.0075, "num_tokens": 13319573.0, "reward": 1.71875, "reward_std": 0.45680341124534607, "rewards/answer_reward/mean": 0.71875, "rewards/answer_reward/std": 0.45680341124534607, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9252912998199463, "sampling/importance_sampling_ratio/mean": 0.9583618640899658, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.775826930999756, "sampling/sampling_logp_difference/mean": 0.023275969550013542, "step": 149, "step_time": 10.233081359416246 }, { "clip_ratio/high_max": 0.004544506780803204, "clip_ratio/high_mean": 0.001136126695200801, "clip_ratio/low_mean": 0.0006127166852820665, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017488433804828674, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 219.0, "completions/mean_terminated_length": 213.6774139404297, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.4609008617699146, "epoch": 0.7317073170731707, "frac_reward_zero_std": 0.6875, "grad_norm": 0.030725322663784027, "kl": 0.003498644451610744, "learning_rate": 2.4585365853658537e-06, "loss": -0.0182, "num_tokens": 13402819.0, "reward": 1.671875, "reward_std": 0.5017610192298889, "rewards/answer_reward/mean": 0.6875, "rewards/answer_reward/std": 0.4709290862083435, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0883883461356163, "sampling/importance_sampling_ratio/max": 2.5229005813598633, "sampling/importance_sampling_ratio/mean": 0.7899543046951294, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2096989154815674, "sampling/sampling_logp_difference/mean": 0.021923985332250595, "step": 150, "step_time": 10.573835325427353 }, { "clip_ratio/high_max": 0.0036820303357671946, "clip_ratio/high_mean": 0.0009205075839417987, "clip_ratio/low_mean": 0.00012450199574232101, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010450095796841197, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 201.71875, "completions/mean_terminated_length": 201.71875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.4674621485173702, "epoch": 0.7365853658536585, "frac_reward_zero_std": 0.75, "grad_norm": 0.05863359570503235, "kl": 0.002721148222917691, "learning_rate": 2.4146341463414637e-06, "loss": -0.0662, "num_tokens": 13498760.0, "reward": 1.75, "reward_std": 0.4399413466453552, "rewards/answer_reward/mean": 0.75, "rewards/answer_reward/std": 0.4399413466453552, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.962303638458252, "sampling/importance_sampling_ratio/mean": 0.9285770654678345, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3871065378189087, "sampling/sampling_logp_difference/mean": 0.021955575793981552, "step": 151, "step_time": 10.542595710605383 }, { "clip_ratio/high_max": 0.01114197145216167, "clip_ratio/high_mean": 0.0027854928630404174, "clip_ratio/low_mean": 0.001369824902212713, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004155317787081003, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 212.6875, "completions/mean_terminated_length": 212.6875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.46512043103575706, "epoch": 0.7414634146341463, "frac_reward_zero_std": 0.5, "grad_norm": 0.029148826375603676, "kl": 0.0038136374205350876, "learning_rate": 2.370731707317073e-06, "loss": -0.0074, "num_tokens": 13593720.0, "reward": 1.6796875, "reward_std": 0.4845781922340393, "rewards/answer_reward/mean": 0.6875, "rewards/answer_reward/std": 0.4709290862083435, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.159280300140381, "sampling/importance_sampling_ratio/mean": 0.6217820048332214, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1276429891586304, "sampling/sampling_logp_difference/mean": 0.022411873564124107, "step": 152, "step_time": 11.011414418928325 }, { "clip_ratio/high_max": 0.003220285288989544, "clip_ratio/high_mean": 0.000805071322247386, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000805071322247386, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 221.375, "completions/mean_terminated_length": 216.1290283203125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.4828587956726551, "epoch": 0.7463414634146341, "frac_reward_zero_std": 0.8125, "grad_norm": 0.05524183809757233, "kl": 0.0031579379574395716, "learning_rate": 2.326829268292683e-06, "loss": -0.0212, "num_tokens": 13675888.0, "reward": 1.8203125, "reward_std": 0.4410141110420227, "rewards/answer_reward/mean": 0.84375, "rewards/answer_reward/std": 0.3689020276069641, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.13258251547813416, "sampling/importance_sampling_ratio/max": 2.7531607151031494, "sampling/importance_sampling_ratio/mean": 0.9062055349349976, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.806251049041748, "sampling/sampling_logp_difference/mean": 0.021499255672097206, "step": 153, "step_time": 10.743210955522954 }, { "clip_ratio/high_max": 0.001696832594461739, "clip_ratio/high_mean": 0.00042420814861543477, "clip_ratio/low_mean": 0.0005368027559597977, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009610109191271476, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 223.1875, "completions/mean_terminated_length": 218.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.4395974203944206, "epoch": 0.751219512195122, "frac_reward_zero_std": 0.8125, "grad_norm": 0.02662387117743492, "kl": 0.006488846556749195, "learning_rate": 2.2829268292682926e-06, "loss": 0.0001, "num_tokens": 13776244.0, "reward": 1.84375, "reward_std": 0.4478893280029297, "rewards/answer_reward/mean": 0.875, "rewards/answer_reward/std": 0.33601075410842896, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.845818042755127, "sampling/importance_sampling_ratio/mean": 1.0668256282806396, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8599768877029419, "sampling/sampling_logp_difference/mean": 0.02020164765417576, "step": 154, "step_time": 10.642807276919484 }, { "clip_ratio/high_max": 0.002922117244452238, "clip_ratio/high_mean": 0.000928314111661166, "clip_ratio/low_mean": 0.00035823971848003566, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012865538301412016, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 224.28125, "completions/mean_terminated_length": 224.28125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.4709400311112404, "epoch": 0.7560975609756098, "frac_reward_zero_std": 0.75, "grad_norm": 0.04553566128015518, "kl": 0.0036637499288190156, "learning_rate": 2.2390243902439025e-06, "loss": -0.0515, "num_tokens": 13869725.0, "reward": 1.75, "reward_std": 0.4399413466453552, "rewards/answer_reward/mean": 0.75, "rewards/answer_reward/std": 0.4399413466453552, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.88288950920105, "sampling/importance_sampling_ratio/mean": 0.912500262260437, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9058674573898315, "sampling/sampling_logp_difference/mean": 0.022105611860752106, "step": 155, "step_time": 11.198091230820864 }, { "clip_ratio/high_max": 0.009194869664497674, "clip_ratio/high_mean": 0.00271540047833696, "clip_ratio/low_mean": 0.0009877693446469493, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037031698593636975, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 251.59375, "completions/mean_terminated_length": 242.7666778564453, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.48480816930532455, "epoch": 0.7609756097560976, "frac_reward_zero_std": 0.5, "grad_norm": 0.06312282383441925, "kl": 0.0030437585955951363, "learning_rate": 2.195121951219512e-06, "loss": 0.0052, "num_tokens": 13962088.0, "reward": 1.4609375, "reward_std": 0.6160140633583069, "rewards/answer_reward/mean": 0.53125, "rewards/answer_reward/std": 0.507007360458374, "rewards/format_reward/mean": 0.9296875, "rewards/format_reward/std": 0.24784860014915466, "sampling/importance_sampling_ratio/max": 2.7408976554870605, "sampling/importance_sampling_ratio/mean": 0.9173311591148376, "sampling/importance_sampling_ratio/min": 0.05823906138539314, "sampling/sampling_logp_difference/max": 0.8257522583007812, "sampling/sampling_logp_difference/mean": 0.02127200737595558, "step": 156, "step_time": 11.013755402062088 }, { "clip_ratio/high_max": 0.0027126000495627522, "clip_ratio/high_mean": 0.0006781500123906881, "clip_ratio/low_mean": 0.000981114324531518, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001659264336922206, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 209.125, "completions/mean_terminated_length": 209.125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.4421813003718853, "epoch": 0.7658536585365854, "frac_reward_zero_std": 0.75, "grad_norm": 0.05861254408955574, "kl": 0.0030077206902205944, "learning_rate": 2.151219512195122e-06, "loss": -0.0239, "num_tokens": 14055960.0, "reward": 1.6875, "reward_std": 0.4709290862083435, "rewards/answer_reward/mean": 0.6875, "rewards/answer_reward/std": 0.4709290862083435, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.959907293319702, "sampling/importance_sampling_ratio/mean": 0.9499596953392029, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.892148494720459, "sampling/sampling_logp_difference/mean": 0.020884938538074493, "step": 157, "step_time": 10.487126488704234 }, { "clip_ratio/high_max": 0.004838591790758073, "clip_ratio/high_mean": 0.0012096479476895183, "clip_ratio/low_mean": 0.0006283818947849795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018380298715783283, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 212.6875, "completions/mean_terminated_length": 212.6875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.4774930328130722, "epoch": 0.7707317073170732, "frac_reward_zero_std": 0.6875, "grad_norm": 0.054494407027959824, "kl": 0.003161880071274936, "learning_rate": 2.107317073170732e-06, "loss": -0.0382, "num_tokens": 14151572.0, "reward": 1.78125, "reward_std": 0.420013427734375, "rewards/answer_reward/mean": 0.78125, "rewards/answer_reward/std": 0.420013427734375, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.591606855392456, "sampling/importance_sampling_ratio/mean": 0.8475039005279541, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.8139610290527344, "sampling/sampling_logp_difference/mean": 0.023174606263637543, "step": 158, "step_time": 10.631651198957115 }, { "clip_ratio/high_max": 0.004130295128561556, "clip_ratio/high_mean": 0.001032573782140389, "clip_ratio/low_mean": 0.0005707633099518716, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016033370775403455, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 221.4375, "completions/mean_terminated_length": 216.19354248046875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.5265634618699551, "epoch": 0.775609756097561, "frac_reward_zero_std": 0.8125, "grad_norm": 0.02883065491914749, "kl": 0.005228572030318901, "learning_rate": 2.0634146341463414e-06, "loss": 0.01, "num_tokens": 14242364.0, "reward": 1.703125, "reward_std": 0.48955830931663513, "rewards/answer_reward/mean": 0.71875, "rewards/answer_reward/std": 0.45680341124534607, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0883883461356163, "sampling/importance_sampling_ratio/max": 2.900527000427246, "sampling/importance_sampling_ratio/mean": 0.8510293960571289, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.8001418113708496, "sampling/sampling_logp_difference/mean": 0.02458598092198372, "step": 159, "step_time": 10.879618687089533 }, { "clip_ratio/high_max": 0.0028781348082702607, "clip_ratio/high_mean": 0.0007195337020675652, "clip_ratio/low_mean": 0.0002498772882972844, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009694109903648496, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 226.4375, "completions/mean_terminated_length": 215.933349609375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3967212662100792, "epoch": 0.7804878048780488, "frac_reward_zero_std": 0.8125, "grad_norm": 0.02241561748087406, "kl": 0.003705541050294414, "learning_rate": 2.0195121951219513e-06, "loss": 0.0163, "num_tokens": 14334124.0, "reward": 1.53125, "reward_std": 0.6213603615760803, "rewards/answer_reward/mean": 0.59375, "rewards/answer_reward/std": 0.49899089336395264, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "sampling/importance_sampling_ratio/max": 2.3054981231689453, "sampling/importance_sampling_ratio/mean": 0.6920244097709656, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.933199405670166, "sampling/sampling_logp_difference/mean": 0.01838141679763794, "step": 160, "step_time": 10.618746706750244 }, { "clip_ratio/high_max": 0.004244438197929412, "clip_ratio/high_mean": 0.0014052704646019265, "clip_ratio/low_mean": 0.0007977636632858776, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002203034127887804, "completions/clipped_ratio": 0.09375, "completions/max_length": 384.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 229.4375, "completions/mean_terminated_length": 213.44827270507812, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.453995019197464, "epoch": 0.7853658536585366, "frac_reward_zero_std": 0.5625, "grad_norm": 0.05820666253566742, "kl": 0.004268685472197831, "learning_rate": 1.975609756097561e-06, "loss": -0.037, "num_tokens": 14420366.0, "reward": 1.65625, "reward_std": 0.5880188345909119, "rewards/answer_reward/mean": 0.71875, "rewards/answer_reward/std": 0.45680341124534607, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.21060587465763092, "sampling/importance_sampling_ratio/max": 2.4027657508850098, "sampling/importance_sampling_ratio/mean": 0.7757611274719238, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.85393226146698, "sampling/sampling_logp_difference/mean": 0.02145053818821907, "step": 161, "step_time": 10.427693421021104 }, { "clip_ratio/high_max": 0.0029885040130466223, "clip_ratio/high_mean": 0.0007471260032616556, "clip_ratio/low_mean": 0.00048553718079347163, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012326631840551272, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 224.71875, "completions/mean_terminated_length": 219.5806427001953, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.5160452947020531, "epoch": 0.7902439024390244, "frac_reward_zero_std": 0.625, "grad_norm": 0.04164901003241539, "kl": 0.005827239656355232, "learning_rate": 1.9317073170731708e-06, "loss": -0.0356, "num_tokens": 14504693.0, "reward": 1.5546875, "reward_std": 0.5740854144096375, "rewards/answer_reward/mean": 0.59375, "rewards/answer_reward/std": 0.49899089336395264, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.18082889914512634, "sampling/importance_sampling_ratio/max": 2.2849972248077393, "sampling/importance_sampling_ratio/mean": 0.779876708984375, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.9267232418060303, "sampling/sampling_logp_difference/mean": 0.023528866469860077, "step": 162, "step_time": 10.057746862526983 }, { "clip_ratio/high_max": 0.001974675862584263, "clip_ratio/high_mean": 0.0004936689656460658, "clip_ratio/low_mean": 0.0002775600805762224, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007712290462222882, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 219.40625, "completions/mean_terminated_length": 219.40625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.5289221368730068, "epoch": 0.7951219512195122, "frac_reward_zero_std": 0.875, "grad_norm": 0.05087749660015106, "kl": 0.00343395140953362, "learning_rate": 1.8878048780487805e-06, "loss": -0.0506, "num_tokens": 14572076.0, "reward": 1.8125, "reward_std": 0.3965577781200409, "rewards/answer_reward/mean": 0.8125, "rewards/answer_reward/std": 0.3965577781200409, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.967545747756958, "sampling/importance_sampling_ratio/mean": 0.8270659446716309, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.048293113708496, "sampling/sampling_logp_difference/mean": 0.023295924067497253, "step": 163, "step_time": 9.095140967518091 }, { "clip_ratio/high_max": 0.0019190263701602817, "clip_ratio/high_mean": 0.0004797565925400704, "clip_ratio/low_mean": 0.0006452891757362522, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011250457682763226, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 216.25, "completions/mean_terminated_length": 216.25, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.4857654497027397, "epoch": 0.8, "frac_reward_zero_std": 0.6875, "grad_norm": 0.07985271513462067, "kl": 0.004710580455139279, "learning_rate": 1.8439024390243902e-06, "loss": 0.0513, "num_tokens": 14666894.0, "reward": 1.65625, "reward_std": 0.4825586974620819, "rewards/answer_reward/mean": 0.65625, "rewards/answer_reward/std": 0.4825586974620819, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.308689594268799, "sampling/importance_sampling_ratio/mean": 0.9040347337722778, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.3873515129089355, "sampling/sampling_logp_difference/mean": 0.023477796465158463, "step": 164, "step_time": 10.798008191399276 }, { "clip_ratio/high_max": 0.002106875297613442, "clip_ratio/high_mean": 0.0006416085234377533, "clip_ratio/low_mean": 8.138021075865254e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007229887341964059, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 223.46875, "completions/mean_terminated_length": 212.7666778564453, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.45716242492198944, "epoch": 0.8048780487804879, "frac_reward_zero_std": 0.8125, "grad_norm": 0.04413546249270439, "kl": 0.004823508264962584, "learning_rate": 1.8000000000000001e-06, "loss": 0.0195, "num_tokens": 14756829.0, "reward": 1.6875, "reward_std": 0.5922891497612, "rewards/answer_reward/mean": 0.75, "rewards/answer_reward/std": 0.4399413466453552, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "sampling/importance_sampling_ratio/max": 2.4274868965148926, "sampling/importance_sampling_ratio/mean": 0.939383327960968, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2122303247451782, "sampling/sampling_logp_difference/mean": 0.020669518038630486, "step": 165, "step_time": 10.459069825243205 }, { "clip_ratio/high_max": 0.00569309457205236, "clip_ratio/high_mean": 0.0015877473342698067, "clip_ratio/low_mean": 0.0006931749594514258, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00228092230099719, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 211.3125, "completions/mean_terminated_length": 199.8000030517578, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.4729599766433239, "epoch": 0.8097560975609757, "frac_reward_zero_std": 0.625, "grad_norm": 0.04688924178481102, "kl": 0.005117002205224708, "learning_rate": 1.7560975609756098e-06, "loss": -0.0373, "num_tokens": 14854493.0, "reward": 1.703125, "reward_std": 0.55152827501297, "rewards/answer_reward/mean": 0.75, "rewards/answer_reward/std": 0.4399413466453552, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.19507545232772827, "sampling/importance_sampling_ratio/max": 2.942394256591797, "sampling/importance_sampling_ratio/mean": 0.7860126495361328, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.5675859451293945, "sampling/sampling_logp_difference/mean": 0.022123247385025024, "step": 166, "step_time": 10.35710391541943 }, { "clip_ratio/high_max": 0.0006983240018598735, "clip_ratio/high_mean": 0.00017458100046496838, "clip_ratio/low_mean": 0.0004635534714907408, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006381344719557092, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 220.46875, "completions/mean_terminated_length": 215.19354248046875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.4294270947575569, "epoch": 0.8146341463414634, "frac_reward_zero_std": 0.8125, "grad_norm": 0.031683824956417084, "kl": 0.0032821067434269935, "learning_rate": 1.7121951219512196e-06, "loss": -0.0018, "num_tokens": 14946446.0, "reward": 1.625, "reward_std": 0.5535807013511658, "rewards/answer_reward/mean": 0.65625, "rewards/answer_reward/std": 0.4825586974620819, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.78151273727417, "sampling/importance_sampling_ratio/mean": 0.8940967321395874, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1167418956756592, "sampling/sampling_logp_difference/mean": 0.020627832040190697, "step": 167, "step_time": 11.34063930530101 }, { "clip_ratio/high_max": 0.005653230415191501, "clip_ratio/high_mean": 0.0014133076037978753, "clip_ratio/low_mean": 0.0009790845797397196, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002392392125329934, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 228.21875, "completions/mean_terminated_length": 228.21875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.5320886373519897, "epoch": 0.8195121951219512, "frac_reward_zero_std": 0.6875, "grad_norm": 0.039519716054201126, "kl": 0.0039497674006270245, "learning_rate": 1.6682926829268293e-06, "loss": 0.0188, "num_tokens": 15044861.0, "reward": 1.65625, "reward_std": 0.4825586974620819, "rewards/answer_reward/mean": 0.65625, "rewards/answer_reward/std": 0.4825586974620819, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.2347655296325684, "sampling/importance_sampling_ratio/mean": 0.7667834758758545, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.6746830940246582, "sampling/sampling_logp_difference/mean": 0.024437204003334045, "step": 168, "step_time": 11.091352775227278 }, { "clip_ratio/high_max": 0.004256756161339581, "clip_ratio/high_mean": 0.0010641890403348953, "clip_ratio/low_mean": 0.0007061943761073053, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001770383445546031, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 225.15625, "completions/mean_terminated_length": 220.03225708007812, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.4418558329343796, "epoch": 0.824390243902439, "frac_reward_zero_std": 0.6875, "grad_norm": 0.03500165045261383, "kl": 0.0038856152386870235, "learning_rate": 1.624390243902439e-06, "loss": -0.0268, "num_tokens": 15131746.0, "reward": 1.625, "reward_std": 0.5535807013511658, "rewards/answer_reward/mean": 0.65625, "rewards/answer_reward/std": 0.4825586974620819, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.899585008621216, "sampling/importance_sampling_ratio/mean": 0.9141867756843567, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3395915031433105, "sampling/sampling_logp_difference/mean": 0.021318811923265457, "step": 169, "step_time": 10.48305617691949 }, { "clip_ratio/high_max": 0.0020239867735654116, "clip_ratio/high_mean": 0.0005059966933913529, "clip_ratio/low_mean": 0.00027901786961592734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007850145775591955, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 236.15625, "completions/mean_terminated_length": 236.15625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.4165119268000126, "epoch": 0.8292682926829268, "frac_reward_zero_std": 0.8125, "grad_norm": 0.035240646451711655, "kl": 0.0058187977119814605, "learning_rate": 1.5804878048780487e-06, "loss": -0.033, "num_tokens": 15217911.0, "reward": 1.84375, "reward_std": 0.3689020276069641, "rewards/answer_reward/mean": 0.84375, "rewards/answer_reward/std": 0.3689020276069641, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.6287336349487305, "sampling/importance_sampling_ratio/mean": 0.8505252599716187, "sampling/importance_sampling_ratio/min": 0.07618208974599838, "sampling/sampling_logp_difference/max": 1.0140711069107056, "sampling/sampling_logp_difference/mean": 0.019460199400782585, "step": 170, "step_time": 10.845775848254561 }, { "clip_ratio/high_max": 0.004382005048682913, "clip_ratio/high_mean": 0.0010955012621707283, "clip_ratio/low_mean": 0.0006886602059239522, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017841614608187228, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 224.78125, "completions/mean_terminated_length": 224.78125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.46056048199534416, "epoch": 0.8341463414634146, "frac_reward_zero_std": 0.6875, "grad_norm": 0.05784723907709122, "kl": 0.0031874341948423535, "learning_rate": 1.5365853658536586e-06, "loss": -0.0315, "num_tokens": 15309014.0, "reward": 1.59375, "reward_std": 0.498990923166275, "rewards/answer_reward/mean": 0.59375, "rewards/answer_reward/std": 0.49899089336395264, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.3229949474334717, "sampling/importance_sampling_ratio/mean": 0.8183152675628662, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.4058157205581665, "sampling/sampling_logp_difference/mean": 0.022365521639585495, "step": 171, "step_time": 11.49023184645921 }, { "clip_ratio/high_max": 0.005477624072227627, "clip_ratio/high_mean": 0.0013694060180569068, "clip_ratio/low_mean": 0.00021806551376357675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001587471560924314, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 215.875, "completions/mean_terminated_length": 210.45159912109375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.46616629511117935, "epoch": 0.8390243902439024, "frac_reward_zero_std": 0.5625, "grad_norm": 0.04392164945602417, "kl": 0.0033080412540584803, "learning_rate": 1.4926829268292684e-06, "loss": 0.069, "num_tokens": 15399322.0, "reward": 1.640625, "reward_std": 0.5117076635360718, "rewards/answer_reward/mean": 0.65625, "rewards/answer_reward/std": 0.4825586974620819, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0883883461356163, "sampling/importance_sampling_ratio/max": 2.1742324829101562, "sampling/importance_sampling_ratio/mean": 0.7761921286582947, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.696263074874878, "sampling/sampling_logp_difference/mean": 0.022831637412309647, "step": 172, "step_time": 10.035372076090425 }, { "clip_ratio/high_max": 0.0031591171864420176, "clip_ratio/high_mean": 0.0009034156682901084, "clip_ratio/low_mean": 0.00020424836839083582, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011076640366809443, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 205.84375, "completions/mean_terminated_length": 205.84375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.46773793175816536, "epoch": 0.8439024390243902, "frac_reward_zero_std": 0.75, "grad_norm": 0.03523130342364311, "kl": 0.002978018077556044, "learning_rate": 1.448780487804878e-06, "loss": -0.0264, "num_tokens": 15486593.0, "reward": 1.75, "reward_std": 0.4399413466453552, "rewards/answer_reward/mean": 0.75, "rewards/answer_reward/std": 0.4399413466453552, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7644588947296143, "sampling/importance_sampling_ratio/mean": 0.9399392008781433, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3304476737976074, "sampling/sampling_logp_difference/mean": 0.022491615265607834, "step": 173, "step_time": 11.184126393403858 }, { "clip_ratio/high_max": 0.0030360152013599873, "clip_ratio/high_mean": 0.0007590038003399968, "clip_ratio/low_mean": 0.0004032258002553135, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011622296005953103, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 219.46875, "completions/mean_terminated_length": 208.50001525878906, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.4764224998652935, "epoch": 0.848780487804878, "frac_reward_zero_std": 0.75, "grad_norm": 0.026516076177358627, "kl": 0.00541797315236181, "learning_rate": 1.4048780487804878e-06, "loss": 0.0072, "num_tokens": 15568228.0, "reward": 1.6328125, "reward_std": 0.5888218879699707, "rewards/answer_reward/mean": 0.6875, "rewards/answer_reward/std": 0.4709290862083435, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.21752247214317322, "sampling/importance_sampling_ratio/max": 2.8609964847564697, "sampling/importance_sampling_ratio/mean": 0.9248221516609192, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.3582427501678467, "sampling/sampling_logp_difference/mean": 0.02183512970805168, "step": 174, "step_time": 10.45904777571559 }, { "clip_ratio/high_max": 0.0039022971177473664, "clip_ratio/high_mean": 0.001140047999797389, "clip_ratio/low_mean": 0.0004799068265128881, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001619954826310277, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 199.21875, "completions/mean_terminated_length": 199.21875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.5122228041291237, "epoch": 0.8536585365853658, "frac_reward_zero_std": 0.625, "grad_norm": 0.07483609020709991, "kl": 0.003273438400356099, "learning_rate": 1.3609756097560975e-06, "loss": -0.0337, "num_tokens": 15661767.0, "reward": 1.625, "reward_std": 0.49186936020851135, "rewards/answer_reward/mean": 0.625, "rewards/answer_reward/std": 0.49186936020851135, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.468684196472168, "sampling/importance_sampling_ratio/mean": 0.8644235134124756, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0376434326171875, "sampling/sampling_logp_difference/mean": 0.02387034147977829, "step": 175, "step_time": 10.119488212745637 }, { "clip_ratio/high_max": 0.001479289960116148, "clip_ratio/high_mean": 0.00048730371054261923, "clip_ratio/low_mean": 0.0002398574652033858, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000727161175746005, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 222.53125, "completions/mean_terminated_length": 222.53125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.5416915528476238, "epoch": 0.8585365853658536, "frac_reward_zero_std": 0.8125, "grad_norm": 0.03379981592297554, "kl": 0.0034248526208102703, "learning_rate": 1.3170731707317072e-06, "loss": -0.0361, "num_tokens": 15749830.0, "reward": 1.78125, "reward_std": 0.420013427734375, "rewards/answer_reward/mean": 0.78125, "rewards/answer_reward/std": 0.420013427734375, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7789244651794434, "sampling/importance_sampling_ratio/mean": 0.7408767938613892, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.3820762634277344, "sampling/sampling_logp_difference/mean": 0.024976110085844994, "step": 176, "step_time": 10.646100924350321 }, { "clip_ratio/high_max": 0.004986484826076776, "clip_ratio/high_mean": 0.001246621206519194, "clip_ratio/low_mean": 0.00037097905442351475, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016176002463907935, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 223.96875, "completions/mean_terminated_length": 218.8064422607422, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.5043984092772007, "epoch": 0.8634146341463415, "frac_reward_zero_std": 0.75, "grad_norm": 0.040593381971120834, "kl": 0.003315436450066045, "learning_rate": 1.273170731707317e-06, "loss": 0.0499, "num_tokens": 15838097.0, "reward": 1.65625, "reward_std": 0.5453247427940369, "rewards/answer_reward/mean": 0.6875, "rewards/answer_reward/std": 0.4709290862083435, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.1517446041107178, "sampling/importance_sampling_ratio/mean": 0.7310981154441833, "sampling/importance_sampling_ratio/min": 0.07411627471446991, "sampling/sampling_logp_difference/max": 1.00062894821167, "sampling/sampling_logp_difference/mean": 0.02219741977751255, "step": 177, "step_time": 11.448397235944867 }, { "clip_ratio/high_max": 0.003805235493928194, "clip_ratio/high_mean": 0.0009513088734820485, "clip_ratio/low_mean": 0.0012900896181236021, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00224139846250182, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 220.65625, "completions/mean_terminated_length": 215.3870849609375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.44304509088397026, "epoch": 0.8682926829268293, "frac_reward_zero_std": 0.75, "grad_norm": 0.049161396920681, "kl": 0.0029954708588775247, "learning_rate": 1.2292682926829269e-06, "loss": -0.0613, "num_tokens": 15919774.0, "reward": 1.78125, "reward_std": 0.4908435642719269, "rewards/answer_reward/mean": 0.8125, "rewards/answer_reward/std": 0.3965577781200409, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.966403007507324, "sampling/importance_sampling_ratio/mean": 1.0567896366119385, "sampling/importance_sampling_ratio/min": 0.08224120736122131, "sampling/sampling_logp_difference/max": 0.798243522644043, "sampling/sampling_logp_difference/mean": 0.020214486867189407, "step": 178, "step_time": 10.48279583454132 }, { "clip_ratio/high_max": 0.0035502109676599503, "clip_ratio/high_mean": 0.0008875527419149876, "clip_ratio/low_mean": 0.000522026268299669, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014095790102146566, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 207.96875, "completions/mean_terminated_length": 207.96875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.46385154128074646, "epoch": 0.8731707317073171, "frac_reward_zero_std": 0.8125, "grad_norm": 0.06173566356301308, "kl": 0.0028507955430541188, "learning_rate": 1.1853658536585366e-06, "loss": 0.0836, "num_tokens": 16013503.0, "reward": 1.796875, "reward_std": 0.3987758457660675, "rewards/answer_reward/mean": 0.8125, "rewards/answer_reward/std": 0.3965577781200409, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0883883461356163, "sampling/importance_sampling_ratio/max": 2.8499789237976074, "sampling/importance_sampling_ratio/mean": 1.0519715547561646, "sampling/importance_sampling_ratio/min": 0.2590835690498352, "sampling/sampling_logp_difference/max": 0.7385328412055969, "sampling/sampling_logp_difference/mean": 0.021887125447392464, "step": 179, "step_time": 10.625448178965598 }, { "clip_ratio/high_max": 0.002364396466873586, "clip_ratio/high_mean": 0.0005910991167183965, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005910991167183965, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 203.5, "completions/mean_terminated_length": 197.6774139404297, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.4405072145164013, "epoch": 0.8780487804878049, "frac_reward_zero_std": 0.875, "grad_norm": 0.040941182523965836, "kl": 0.0044606749434024096, "learning_rate": 1.1414634146341463e-06, "loss": -0.0542, "num_tokens": 16093609.0, "reward": 1.84375, "reward_std": 0.4478893280029297, "rewards/answer_reward/mean": 0.875, "rewards/answer_reward/std": 0.33601075410842896, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.76668381690979, "sampling/importance_sampling_ratio/mean": 0.9184104204177856, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8794723749160767, "sampling/sampling_logp_difference/mean": 0.021443814039230347, "step": 180, "step_time": 10.007919522468 }, { "clip_ratio/high_max": 0.0028242127737030387, "clip_ratio/high_mean": 0.001045727141899988, "clip_ratio/low_mean": 0.0003289473825134337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013746745244134218, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 206.53125, "completions/mean_terminated_length": 206.53125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.48193415626883507, "epoch": 0.8829268292682927, "frac_reward_zero_std": 0.75, "grad_norm": 0.0686846673488617, "kl": 0.0036437779199332, "learning_rate": 1.097560975609756e-06, "loss": 0.068, "num_tokens": 16191858.0, "reward": 1.625, "reward_std": 0.49186936020851135, "rewards/answer_reward/mean": 0.625, "rewards/answer_reward/std": 0.49186936020851135, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.981576681137085, "sampling/importance_sampling_ratio/mean": 0.9696821570396423, "sampling/importance_sampling_ratio/min": 0.05335341393947601, "sampling/sampling_logp_difference/max": 1.029494285583496, "sampling/sampling_logp_difference/mean": 0.022601637989282608, "step": 181, "step_time": 11.076276910956949 }, { "clip_ratio/high_max": 0.0033581744646653533, "clip_ratio/high_mean": 0.0008395436161663383, "clip_ratio/low_mean": 0.00016108246927615255, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010006260854424909, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 203.96875, "completions/mean_terminated_length": 203.96875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.5076060481369495, "epoch": 0.8878048780487805, "frac_reward_zero_std": 0.625, "grad_norm": 0.07976002246141434, "kl": 0.0036637159646488726, "learning_rate": 1.053658536585366e-06, "loss": -0.0545, "num_tokens": 16270099.0, "reward": 1.75, "reward_std": 0.4399413466453552, "rewards/answer_reward/mean": 0.75, "rewards/answer_reward/std": 0.4399413466453552, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.6828994750976562, "sampling/importance_sampling_ratio/mean": 1.1108057498931885, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6022930145263672, "sampling/sampling_logp_difference/mean": 0.02332463301718235, "step": 182, "step_time": 9.850909248460084 }, { "clip_ratio/high_max": 0.0022831049282103777, "clip_ratio/high_mean": 0.0005707762320525944, "clip_ratio/low_mean": 0.00022482014901470393, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007955963810672984, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 211.96875, "completions/mean_terminated_length": 211.96875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.45372073724865913, "epoch": 0.8926829268292683, "frac_reward_zero_std": 0.8125, "grad_norm": 0.04267973452806473, "kl": 0.00301697023678571, "learning_rate": 1.0097560975609757e-06, "loss": 0.0632, "num_tokens": 16352204.0, "reward": 1.71875, "reward_std": 0.45680341124534607, "rewards/answer_reward/mean": 0.71875, "rewards/answer_reward/std": 0.45680341124534607, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5246293544769287, "sampling/importance_sampling_ratio/mean": 0.8890446424484253, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.646282434463501, "sampling/sampling_logp_difference/mean": 0.0204051174223423, "step": 183, "step_time": 10.250360297970474 }, { "clip_ratio/high_max": 0.0019633506890386343, "clip_ratio/high_mean": 0.0004908376722596586, "clip_ratio/low_mean": 0.0002890214091166854, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000779859081376344, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 219.28125, "completions/mean_terminated_length": 213.96774291992188, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.42499367892742157, "epoch": 0.8975609756097561, "frac_reward_zero_std": 0.875, "grad_norm": 0.01471929345279932, "kl": 0.0031238264054991305, "learning_rate": 9.658536585365854e-07, "loss": 0.0096, "num_tokens": 16442735.0, "reward": 1.78125, "reward_std": 0.4908435642719269, "rewards/answer_reward/mean": 0.8125, "rewards/answer_reward/std": 0.3965577781200409, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.816704034805298, "sampling/importance_sampling_ratio/mean": 0.9738296270370483, "sampling/importance_sampling_ratio/min": 0.07506652921438217, "sampling/sampling_logp_difference/max": 0.9794454574584961, "sampling/sampling_logp_difference/mean": 0.02144341915845871, "step": 184, "step_time": 11.803486650809646 }, { "clip_ratio/high_max": 0.0018275727634318173, "clip_ratio/high_mean": 0.0004568931908579543, "clip_ratio/low_mean": 0.0003948668309021741, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008517600217601284, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 204.0, "completions/mean_terminated_length": 204.0, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.44336048141121864, "epoch": 0.9024390243902439, "frac_reward_zero_std": 0.875, "grad_norm": 0.024026138707995415, "kl": 0.004125098901567981, "learning_rate": 9.219512195121951e-07, "loss": -0.0102, "num_tokens": 16532051.0, "reward": 1.75, "reward_std": 0.4399413466453552, "rewards/answer_reward/mean": 0.75, "rewards/answer_reward/std": 0.4399413466453552, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0991716384887695, "sampling/importance_sampling_ratio/mean": 0.6524126529693604, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8715963363647461, "sampling/sampling_logp_difference/mean": 0.021990150213241577, "step": 185, "step_time": 10.306848557665944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 8.138021075865254e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 8.138021075865254e-05, "completions/clipped_ratio": 0.125, "completions/max_length": 384.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 235.8125, "completions/mean_terminated_length": 214.6428680419922, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.44655970856547356, "epoch": 0.9073170731707317, "frac_reward_zero_std": 0.8125, "grad_norm": 0.03729933872818947, "kl": 0.004490631326916628, "learning_rate": 8.780487804878049e-07, "loss": -0.0373, "num_tokens": 16605891.0, "reward": 1.609375, "reward_std": 0.6807772517204285, "rewards/answer_reward/mean": 0.71875, "rewards/answer_reward/std": 0.45680341124534607, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.30412155389785767, "sampling/importance_sampling_ratio/max": 2.9105048179626465, "sampling/importance_sampling_ratio/mean": 0.8057036995887756, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9097874164581299, "sampling/sampling_logp_difference/mean": 0.019600480794906616, "step": 186, "step_time": 9.927827226929367 }, { "clip_ratio/high_max": 0.0026194853126071393, "clip_ratio/high_mean": 0.0006548713281517848, "clip_ratio/low_mean": 0.0004630187904695049, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011178901186212897, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 220.09375, "completions/mean_terminated_length": 214.8064422607422, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.4813721366226673, "epoch": 0.9121951219512195, "frac_reward_zero_std": 0.75, "grad_norm": 0.03278962895274162, "kl": 0.004051378404255956, "learning_rate": 8.341463414634146e-07, "loss": 0.0011, "num_tokens": 16690986.0, "reward": 1.625, "reward_std": 0.5535807013511658, "rewards/answer_reward/mean": 0.65625, "rewards/answer_reward/std": 0.4825586974620819, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.1369190216064453, "sampling/importance_sampling_ratio/mean": 0.7384707927703857, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.024813175201416, "sampling/sampling_logp_difference/mean": 0.023189779371023178, "step": 187, "step_time": 10.680635725148022 }, { "clip_ratio/high_max": 0.002061283419607207, "clip_ratio/high_mean": 0.0006613488876610063, "clip_ratio/low_mean": 0.0001806358341127634, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008419847217737697, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 234.28125, "completions/mean_terminated_length": 224.30001831054688, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.4507321007549763, "epoch": 0.9170731707317074, "frac_reward_zero_std": 0.75, "grad_norm": 0.02810896933078766, "kl": 0.006156625342555344, "learning_rate": 7.902439024390244e-07, "loss": -0.0222, "num_tokens": 16777761.0, "reward": 1.5625, "reward_std": 0.6189220547676086, "rewards/answer_reward/mean": 0.625, "rewards/answer_reward/std": 0.49186936020851135, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "sampling/importance_sampling_ratio/max": 2.283298969268799, "sampling/importance_sampling_ratio/mean": 0.6978024244308472, "sampling/importance_sampling_ratio/min": 0.07224693149328232, "sampling/sampling_logp_difference/max": 1.573683738708496, "sampling/sampling_logp_difference/mean": 0.021288525313138962, "step": 188, "step_time": 10.873648665379733 }, { "clip_ratio/high_max": 0.0015666797407902777, "clip_ratio/high_mean": 0.00039166993519756943, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00039166993519756943, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 218.15625, "completions/mean_terminated_length": 212.8064422607422, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.46297022327780724, "epoch": 0.9219512195121952, "frac_reward_zero_std": 0.8125, "grad_norm": 0.03461591899394989, "kl": 0.0057158072595484555, "learning_rate": 7.463414634146342e-07, "loss": 0.0528, "num_tokens": 16867664.0, "reward": 1.765625, "reward_std": 0.4576302170753479, "rewards/answer_reward/mean": 0.78125, "rewards/answer_reward/std": 0.420013427734375, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0883883461356163, "sampling/importance_sampling_ratio/max": 1.6362252235412598, "sampling/importance_sampling_ratio/mean": 0.5664652585983276, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.4337809085845947, "sampling/sampling_logp_difference/mean": 0.023402856662869453, "step": 189, "step_time": 11.320731544401497 }, { "clip_ratio/high_max": 0.0011848341673612595, "clip_ratio/high_mean": 0.00029620854184031487, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029620854184031487, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 224.75, "completions/mean_terminated_length": 224.75, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.42975563555955887, "epoch": 0.926829268292683, "frac_reward_zero_std": 0.875, "grad_norm": 0.03803783655166626, "kl": 0.0037852034147363156, "learning_rate": 7.024390243902439e-07, "loss": 0.0167, "num_tokens": 16973058.0, "reward": 1.8125, "reward_std": 0.3965577781200409, "rewards/answer_reward/mean": 0.8125, "rewards/answer_reward/std": 0.3965577781200409, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8005051612854004, "sampling/importance_sampling_ratio/mean": 1.040290117263794, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.102252721786499, "sampling/sampling_logp_difference/mean": 0.02113921195268631, "step": 190, "step_time": 11.182342760730535 }, { "clip_ratio/high_max": 0.00831304513849318, "clip_ratio/high_mean": 0.0021631797426380217, "clip_ratio/low_mean": 0.0006059268343960866, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002769106620689854, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 224.53125, "completions/mean_terminated_length": 219.3870849609375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.4595133662223816, "epoch": 0.9317073170731708, "frac_reward_zero_std": 0.75, "grad_norm": 0.02744545415043831, "kl": 0.003909450140781701, "learning_rate": 6.585365853658536e-07, "loss": -0.0147, "num_tokens": 17083263.0, "reward": 1.59375, "reward_std": 0.5599179267883301, "rewards/answer_reward/mean": 0.625, "rewards/answer_reward/std": 0.49186936020851135, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.826179027557373, "sampling/importance_sampling_ratio/mean": 0.820960283279419, "sampling/importance_sampling_ratio/min": 0.025086157023906708, "sampling/sampling_logp_difference/max": 2.2226483821868896, "sampling/sampling_logp_difference/mean": 0.021807601675391197, "step": 191, "step_time": 11.098394230473787 }, { "clip_ratio/high_max": 0.005538418656215072, "clip_ratio/high_mean": 0.001384604664053768, "clip_ratio/low_mean": 8.138021075865254e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014659848748124205, "completions/clipped_ratio": 0.125, "completions/max_length": 384.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 238.625, "completions/mean_terminated_length": 217.85714721679688, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.43495074287056923, "epoch": 0.9365853658536586, "frac_reward_zero_std": 0.5625, "grad_norm": 0.05996643751859665, "kl": 0.010258348018396646, "learning_rate": 6.146341463414634e-07, "loss": -0.0888, "num_tokens": 17153837.0, "reward": 1.4453125, "reward_std": 0.6404836177825928, "rewards/answer_reward/mean": 0.53125, "rewards/answer_reward/std": 0.507007360458374, "rewards/format_reward/mean": 0.9140625, "rewards/format_reward/std": 0.24271129071712494, "sampling/importance_sampling_ratio/max": 2.7956135272979736, "sampling/importance_sampling_ratio/mean": 0.6788397431373596, "sampling/importance_sampling_ratio/min": 0.06339692324399948, "sampling/sampling_logp_difference/max": 2.079192638397217, "sampling/sampling_logp_difference/mean": 0.019637133926153183, "step": 192, "step_time": 9.909145697485656 }, { "clip_ratio/high_max": 0.0010504202218726277, "clip_ratio/high_mean": 0.00026260505546815693, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00026260505546815693, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 235.09375, "completions/mean_terminated_length": 225.1666717529297, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.43838976696133614, "epoch": 0.9414634146341463, "frac_reward_zero_std": 0.8125, "grad_norm": 0.013177145272493362, "kl": 0.0073972616519313306, "learning_rate": 5.707317073170732e-07, "loss": -0.031, "num_tokens": 17236912.0, "reward": 1.609375, "reward_std": 0.5782952308654785, "rewards/answer_reward/mean": 0.65625, "rewards/answer_reward/std": 0.4825586974620819, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.19507545232772827, "sampling/importance_sampling_ratio/max": 2.7895686626434326, "sampling/importance_sampling_ratio/mean": 0.6865278482437134, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.3832662105560303, "sampling/sampling_logp_difference/mean": 0.02114027366042137, "step": 193, "step_time": 11.398583413101733 }, { "clip_ratio/high_max": 0.00042662114719860256, "clip_ratio/high_mean": 0.00010665528679965064, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00010665528679965064, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 219.375, "completions/mean_terminated_length": 214.06451416015625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.4564849063754082, "epoch": 0.9463414634146341, "frac_reward_zero_std": 0.8125, "grad_norm": 0.042264945805072784, "kl": 0.00322515208972618, "learning_rate": 5.26829268292683e-07, "loss": -0.0279, "num_tokens": 17324642.0, "reward": 1.75, "reward_std": 0.5080004930496216, "rewards/answer_reward/mean": 0.78125, "rewards/answer_reward/std": 0.420013427734375, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.1456665992736816, "sampling/importance_sampling_ratio/mean": 0.8515944480895996, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8581054210662842, "sampling/sampling_logp_difference/mean": 0.02103826031088829, "step": 194, "step_time": 11.28847742266953 }, { "clip_ratio/high_max": 0.003353333711856976, "clip_ratio/high_mean": 0.0009523845292278565, "clip_ratio/low_mean": 0.0007134207844501361, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001665805328229908, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 230.25, "completions/mean_terminated_length": 220.00001525878906, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.4520576409995556, "epoch": 0.9512195121951219, "frac_reward_zero_std": 0.625, "grad_norm": 0.04753180593252182, "kl": 0.0046316401567310095, "learning_rate": 4.829268292682927e-07, "loss": -0.0661, "num_tokens": 17410934.0, "reward": 1.5625, "reward_std": 0.5644009113311768, "rewards/answer_reward/mean": 0.59375, "rewards/answer_reward/std": 0.49899089336395264, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 1.617477536201477, "sampling/importance_sampling_ratio/mean": 0.6689938306808472, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8456025123596191, "sampling/sampling_logp_difference/mean": 0.02115941047668457, "step": 195, "step_time": 10.379010274074972 }, { "clip_ratio/high_max": 0.0023555761436000466, "clip_ratio/high_mean": 0.0007596590439788997, "clip_ratio/low_mean": 0.00016276042151730508, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009224194654962048, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 208.125, "completions/mean_terminated_length": 196.40000915527344, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.45349348336458206, "epoch": 0.9560975609756097, "frac_reward_zero_std": 0.75, "grad_norm": 0.06080297380685806, "kl": 0.0063120529521256685, "learning_rate": 4.3902439024390246e-07, "loss": 0.0727, "num_tokens": 17501030.0, "reward": 1.703125, "reward_std": 0.55152827501297, "rewards/answer_reward/mean": 0.75, "rewards/answer_reward/std": 0.4399413466453552, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.19507545232772827, "sampling/importance_sampling_ratio/max": 2.90635085105896, "sampling/importance_sampling_ratio/mean": 0.8226816654205322, "sampling/importance_sampling_ratio/min": 0.07022611796855927, "sampling/sampling_logp_difference/max": 1.2650408744812012, "sampling/sampling_logp_difference/mean": 0.021759407594799995, "step": 196, "step_time": 11.188433247152716 }, { "clip_ratio/high_max": 0.0036725992104038596, "clip_ratio/high_mean": 0.0009181498026009649, "clip_ratio/low_mean": 8.138021075865254e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009995300133596174, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 216.71875, "completions/mean_terminated_length": 211.32257080078125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.4431290104985237, "epoch": 0.9609756097560975, "frac_reward_zero_std": 0.75, "grad_norm": 0.03708234056830406, "kl": 0.009447437099879608, "learning_rate": 3.951219512195122e-07, "loss": -0.0391, "num_tokens": 17588345.0, "reward": 1.71875, "reward_std": 0.5226715207099915, "rewards/answer_reward/mean": 0.75, "rewards/answer_reward/std": 0.4399413466453552, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.528080463409424, "sampling/importance_sampling_ratio/mean": 0.963192343711853, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.5857353210449219, "sampling/sampling_logp_difference/mean": 0.02034963108599186, "step": 197, "step_time": 10.76899948483333 }, { "clip_ratio/high_max": 0.002101671852869913, "clip_ratio/high_mean": 0.0007207304806797765, "clip_ratio/low_mean": 0.0002372810267843306, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009580115074641071, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 239.4375, "completions/mean_terminated_length": 229.80001831054688, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.43860315904021263, "epoch": 0.9658536585365853, "frac_reward_zero_std": 0.6875, "grad_norm": 0.03257884457707405, "kl": 0.0027654510631691664, "learning_rate": 3.5121951219512195e-07, "loss": 0.0053, "num_tokens": 17684201.0, "reward": 1.6953125, "reward_std": 0.5705627202987671, "rewards/answer_reward/mean": 0.75, "rewards/answer_reward/std": 0.4399413466453552, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.21752247214317322, "sampling/importance_sampling_ratio/max": 2.4404547214508057, "sampling/importance_sampling_ratio/mean": 0.714219331741333, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.5862724781036377, "sampling/sampling_logp_difference/mean": 0.020519975572824478, "step": 198, "step_time": 10.457097836770117 }, { "clip_ratio/high_max": 0.0005707762320525944, "clip_ratio/high_mean": 0.000269212294369936, "clip_ratio/low_mean": 0.00022241992701310664, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004916322213830426, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 234.5, "completions/mean_terminated_length": 234.5, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.4728074334561825, "epoch": 0.9707317073170731, "frac_reward_zero_std": 0.875, "grad_norm": 0.04412536323070526, "kl": 0.0031479664030484855, "learning_rate": 3.073170731707317e-07, "loss": -0.0526, "num_tokens": 17774037.0, "reward": 1.7421875, "reward_std": 0.4556295573711395, "rewards/answer_reward/mean": 0.75, "rewards/answer_reward/std": 0.4399413466453552, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.4919586181640625, "sampling/importance_sampling_ratio/mean": 0.6762293577194214, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0089819431304932, "sampling/sampling_logp_difference/mean": 0.023380260914564133, "step": 199, "step_time": 11.409059608355165 }, { "clip_ratio/high_max": 0.004094912495929748, "clip_ratio/high_mean": 0.001023728123982437, "clip_ratio/low_mean": 9.585889347363263e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011195870174560696, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 224.625, "completions/mean_terminated_length": 214.00001525878906, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.43833550810813904, "epoch": 0.975609756097561, "frac_reward_zero_std": 0.75, "grad_norm": 0.04204738885164261, "kl": 0.004441671160748228, "learning_rate": 2.634146341463415e-07, "loss": -0.0678, "num_tokens": 17856071.0, "reward": 1.65625, "reward_std": 0.6015772223472595, "rewards/answer_reward/mean": 0.71875, "rewards/answer_reward/std": 0.45680341124534607, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "sampling/importance_sampling_ratio/max": 2.638582706451416, "sampling/importance_sampling_ratio/mean": 0.8104733228683472, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7748641967773438, "sampling/sampling_logp_difference/mean": 0.02084459736943245, "step": 200, "step_time": 10.808166905771941 }, { "clip_ratio/high_max": 0.004610990057699382, "clip_ratio/high_mean": 0.0015568280941806734, "clip_ratio/low_mean": 0.0001509661815362051, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017077942757168785, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 226.40625, "completions/mean_terminated_length": 221.32257080078125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.47282105684280396, "epoch": 0.9804878048780488, "frac_reward_zero_std": 0.6875, "grad_norm": 0.05886881425976753, "kl": 0.0031774196249898523, "learning_rate": 2.1951219512195123e-07, "loss": 0.0663, "num_tokens": 17947658.0, "reward": 1.625, "reward_std": 0.5535807013511658, "rewards/answer_reward/mean": 0.65625, "rewards/answer_reward/std": 0.4825586974620819, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.9894485473632812, "sampling/importance_sampling_ratio/mean": 0.8542252779006958, "sampling/importance_sampling_ratio/min": 0.05772608891129494, "sampling/sampling_logp_difference/max": 0.9505095481872559, "sampling/sampling_logp_difference/mean": 0.021834976971149445, "step": 201, "step_time": 10.612883982714266 }, { "clip_ratio/high_max": 0.0035791925038211048, "clip_ratio/high_mean": 0.0012530418898677453, "clip_ratio/low_mean": 0.00039811909664422274, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016511610156157985, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 237.46875, "completions/mean_terminated_length": 227.70001220703125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.447588037699461, "epoch": 0.9853658536585366, "frac_reward_zero_std": 0.625, "grad_norm": 0.041394852101802826, "kl": 0.003872697241604328, "learning_rate": 1.7560975609756097e-07, "loss": -0.0142, "num_tokens": 18022279.0, "reward": 1.5390625, "reward_std": 0.6027804613113403, "rewards/answer_reward/mean": 0.59375, "rewards/answer_reward/std": 0.49899089336395264, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.21752247214317322, "sampling/importance_sampling_ratio/max": 2.2482316493988037, "sampling/importance_sampling_ratio/mean": 0.7710789442062378, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7523748874664307, "sampling/sampling_logp_difference/mean": 0.020237356424331665, "step": 202, "step_time": 10.096908348612487 }, { "clip_ratio/high_max": 0.004537479020655155, "clip_ratio/high_mean": 0.0011343697551637888, "clip_ratio/low_mean": 0.000915427430300042, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002049797185463831, "completions/clipped_ratio": 0.03125, "completions/max_length": 384.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 213.125, "completions/mean_terminated_length": 207.61289978027344, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.5168952755630016, "epoch": 0.9902439024390244, "frac_reward_zero_std": 0.75, "grad_norm": 0.019319448620080948, "kl": 0.004126500047277659, "learning_rate": 1.3170731707317074e-07, "loss": -0.005, "num_tokens": 18113723.0, "reward": 1.65625, "reward_std": 0.5453247427940369, "rewards/answer_reward/mean": 0.6875, "rewards/answer_reward/std": 0.4709290862083435, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.1611924171447754, "sampling/importance_sampling_ratio/mean": 0.632793664932251, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9966816902160645, "sampling/sampling_logp_difference/mean": 0.024396179243922234, "step": 203, "step_time": 10.739579997956753 }, { "clip_ratio/high_max": 0.007450781704392284, "clip_ratio/high_mean": 0.0024411976482952014, "clip_ratio/low_mean": 0.0017306014487985522, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0041717990970937535, "completions/clipped_ratio": 0.0625, "completions/max_length": 384.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 218.1875, "completions/mean_terminated_length": 207.1333465576172, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.47251078486442566, "epoch": 0.9951219512195122, "frac_reward_zero_std": 0.4375, "grad_norm": 0.07291460782289505, "kl": 0.0030853954376652837, "learning_rate": 8.780487804878049e-08, "loss": -0.0345, "num_tokens": 18216359.0, "reward": 1.40625, "reward_std": 0.6148366928100586, "rewards/answer_reward/mean": 0.46875, "rewards/answer_reward/std": 0.507007360458374, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24593468010425568, "sampling/importance_sampling_ratio/max": 2.775588035583496, "sampling/importance_sampling_ratio/mean": 0.8540469408035278, "sampling/importance_sampling_ratio/min": 0.06364062428474426, "sampling/sampling_logp_difference/max": 0.7679234743118286, "sampling/sampling_logp_difference/mean": 0.022379405796527863, "step": 204, "step_time": 11.566578593570739 }, { "clip_ratio/high_max": 0.001054852269589901, "clip_ratio/high_mean": 0.00026371306739747524, "clip_ratio/low_mean": 0.0014204545877873898, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001684167655184865, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 204.1875, "completions/mean_terminated_length": 204.1875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.41701607406139374, "epoch": 1.0, "frac_reward_zero_std": 0.8125, "grad_norm": 0.04486149922013283, "kl": 0.0035870482970494777, "learning_rate": 4.3902439024390244e-08, "loss": -0.0335, "num_tokens": 18316893.0, "reward": 1.6875, "reward_std": 0.5350610613822937, "rewards/answer_reward/mean": 0.71875, "rewards/answer_reward/std": 0.45680341124534607, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 2.0152788162231445, "sampling/importance_sampling_ratio/mean": 0.6947116851806641, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.7783215045928955, "sampling/sampling_logp_difference/mean": 0.02238449454307556, "step": 205, "step_time": 10.744295610114932 } ], "logging_steps": 1, "max_steps": 205, "num_input_tokens_seen": 18316893, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }