{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03765060240963856, "eval_steps": 500, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1536.0, "completions/max_terminated_length": 1461.0, "completions/mean_length": 539.939453125, "completions/mean_terminated_length": 534.0687866210938, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.0007530120481927711, "frac_reward_zero_std": 0.0, "grad_norm": 0.24425463378429413, "learning_rate": 1e-06, "loss": 0.0483, "num_tokens": 322497.0, "reward": 18.535240173339844, "reward_std": 4.5626091957092285, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 18.535240173339844, "rewards/skywork_reward/std": 7.240255355834961, "step": 1 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1313.0, "completions/mean_length": 539.921875, "completions/mean_terminated_length": 537.9725952148438, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.0015060240963855422, "frac_reward_zero_std": 0.0, "grad_norm": 0.2527785897254944, "learning_rate": 1e-06, "loss": 0.0676, "num_tokens": 642601.0, "reward": 19.66384506225586, "reward_std": 4.405518531799316, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 19.66384506225586, "rewards/skywork_reward/std": 7.860788822174072, "step": 2 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1096.0, "completions/max_terminated_length": 1096.0, "completions/mean_length": 509.203125, "completions/mean_terminated_length": 509.203125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.002259036144578313, "frac_reward_zero_std": 0.0, "grad_norm": 0.23311284184455872, "learning_rate": 1e-06, "loss": 0.036, "num_tokens": 948897.0, "reward": 19.736499786376953, "reward_std": 4.317551136016846, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 19.736499786376953, "rewards/skywork_reward/std": 8.241270065307617, "step": 3 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1332.0, "completions/max_terminated_length": 1332.0, "completions/mean_length": 481.34765625, "completions/mean_terminated_length": 481.34765625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.0030120481927710845, "frac_reward_zero_std": 0.0, "grad_norm": 0.25114020705223083, "learning_rate": 1e-06, "loss": 0.037, "num_tokens": 1240675.0, "reward": 19.980758666992188, "reward_std": 4.497593879699707, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 19.980758666992188, "rewards/skywork_reward/std": 7.561793804168701, "step": 4 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1367.0, "completions/mean_length": 550.08984375, "completions/mean_terminated_length": 548.1604614257812, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.0037650602409638554, "frac_reward_zero_std": 0.0, "grad_norm": 0.27846774458885193, "learning_rate": 1e-06, "loss": 0.06, "num_tokens": 1567393.0, "reward": 15.45170783996582, "reward_std": 4.615124702453613, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 15.45170783996582, "rewards/skywork_reward/std": 7.330972194671631, "step": 5 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1536.0, "completions/max_terminated_length": 1435.0, "completions/mean_length": 466.416015625, "completions/mean_terminated_length": 462.2215881347656, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.004518072289156626, "frac_reward_zero_std": 0.0, "grad_norm": 0.26218029856681824, "learning_rate": 1e-06, "loss": 0.0554, "num_tokens": 1848294.0, "reward": 15.858068466186523, "reward_std": 4.51039981842041, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 15.858068466186523, "rewards/skywork_reward/std": 8.340972900390625, "step": 6 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1397.0, "completions/mean_length": 546.90625, "completions/mean_terminated_length": 544.9706420898438, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.005271084337349397, "frac_reward_zero_std": 0.0, "grad_norm": 0.2502232789993286, "learning_rate": 1e-06, "loss": 0.0183, "num_tokens": 2176214.0, "reward": 20.738807678222656, "reward_std": 4.506719589233398, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 20.738807678222656, "rewards/skywork_reward/std": 7.784599781036377, "step": 7 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1344.0, "completions/mean_length": 507.919921875, "completions/mean_terminated_length": 505.90802001953125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.006024096385542169, "frac_reward_zero_std": 0.0, "grad_norm": 0.24587783217430115, "learning_rate": 1e-06, "loss": 0.0108, "num_tokens": 2483485.0, "reward": 18.190261840820312, "reward_std": 4.080215930938721, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 18.190261840820312, "rewards/skywork_reward/std": 7.144257068634033, "step": 8 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1536.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 517.755859375, "completions/mean_terminated_length": 513.7627563476562, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.00677710843373494, "frac_reward_zero_std": 0.0, "grad_norm": 0.25550204515457153, "learning_rate": 1e-06, "loss": 0.0365, "num_tokens": 2791968.0, "reward": 18.123830795288086, "reward_std": 3.959782838821411, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 18.123830795288086, "rewards/skywork_reward/std": 8.163817405700684, "step": 9 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1385.0, "completions/mean_length": 469.869140625, "completions/mean_terminated_length": 467.78277587890625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.007530120481927711, "frac_reward_zero_std": 0.0, "grad_norm": 0.2836010456085205, "learning_rate": 1e-06, "loss": 0.0308, "num_tokens": 3079901.0, "reward": 19.304351806640625, "reward_std": 4.372314453125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 19.304351806640625, "rewards/skywork_reward/std": 7.716484069824219, "step": 10 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1485.0, "completions/max_terminated_length": 1485.0, "completions/mean_length": 500.615234375, "completions/mean_terminated_length": 500.615234375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.008283132530120483, "frac_reward_zero_std": 0.0, "grad_norm": 0.246844083070755, "learning_rate": 1e-06, "loss": 0.0386, "num_tokens": 3382520.0, "reward": 18.477981567382812, "reward_std": 4.108589172363281, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 18.477981567382812, "rewards/skywork_reward/std": 7.084549427032471, "step": 11 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1090.0, "completions/max_terminated_length": 1090.0, "completions/mean_length": 478.2578125, "completions/mean_terminated_length": 478.2578125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.009036144578313253, "frac_reward_zero_std": 0.0, "grad_norm": 0.2760355472564697, "learning_rate": 1e-06, "loss": 0.0437, "num_tokens": 3669020.0, "reward": 18.244699478149414, "reward_std": 4.524354457855225, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 18.244699478149414, "rewards/skywork_reward/std": 7.753305435180664, "step": 12 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1536.0, "completions/max_terminated_length": 1359.0, "completions/mean_length": 509.525390625, "completions/mean_terminated_length": 505.5000305175781, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.009789156626506024, "frac_reward_zero_std": 0.0, "grad_norm": 0.2421659678220749, "learning_rate": 1e-06, "loss": 0.0431, "num_tokens": 3973577.0, "reward": 18.84097671508789, "reward_std": 3.6833090782165527, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 18.84097671508789, "rewards/skywork_reward/std": 7.0365753173828125, "step": 13 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 502.75, "completions/mean_terminated_length": 502.75, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.010542168674698794, "frac_reward_zero_std": 0.0, "grad_norm": 0.24300047755241394, "learning_rate": 1e-06, "loss": 0.0095, "num_tokens": 4275273.0, "reward": 19.73338508605957, "reward_std": 3.7505850791931152, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 19.73338508605957, "rewards/skywork_reward/std": 7.851949691772461, "step": 14 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1212.0, "completions/mean_length": 495.01171875, "completions/mean_terminated_length": 492.97454833984375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.011295180722891566, "frac_reward_zero_std": 0.0, "grad_norm": 0.24820354580879211, "learning_rate": 1e-06, "loss": 0.041, "num_tokens": 4572607.0, "reward": 20.985137939453125, "reward_std": 3.9907798767089844, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 20.985137939453125, "rewards/skywork_reward/std": 7.5082783699035645, "step": 15 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1452.0, "completions/max_terminated_length": 1452.0, "completions/mean_length": 503.646484375, "completions/mean_terminated_length": 503.646484375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.012048192771084338, "frac_reward_zero_std": 0.0, "grad_norm": 0.25032636523246765, "learning_rate": 1e-06, "loss": 0.0271, "num_tokens": 4879242.0, "reward": 19.479820251464844, "reward_std": 3.9832570552825928, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 19.479820251464844, "rewards/skywork_reward/std": 7.7815022468566895, "step": 16 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1505.0, "completions/mean_length": 548.275390625, "completions/mean_terminated_length": 546.3424682617188, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.012801204819277108, "frac_reward_zero_std": 0.0, "grad_norm": 0.23289093375205994, "learning_rate": 1e-06, "loss": 0.0344, "num_tokens": 5206471.0, "reward": 18.09130859375, "reward_std": 4.67989444732666, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 18.09130859375, "rewards/skywork_reward/std": 8.046331405639648, "step": 17 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1333.0, "completions/max_terminated_length": 1333.0, "completions/mean_length": 500.513671875, "completions/mean_terminated_length": 500.513671875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.01355421686746988, "frac_reward_zero_std": 0.0, "grad_norm": 0.2743029296398163, "learning_rate": 1e-06, "loss": 0.0608, "num_tokens": 5508110.0, "reward": 19.004474639892578, "reward_std": 4.812801361083984, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 19.004474639892578, "rewards/skywork_reward/std": 7.56382942199707, "step": 18 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1300.0, "completions/mean_length": 493.69921875, "completions/mean_terminated_length": 491.65948486328125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.01430722891566265, "frac_reward_zero_std": 0.0, "grad_norm": 0.24759620428085327, "learning_rate": 1e-06, "loss": 0.0399, "num_tokens": 5802532.0, "reward": 20.18354034423828, "reward_std": 4.097359657287598, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 20.18354034423828, "rewards/skywork_reward/std": 6.95109224319458, "step": 19 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1536.0, "completions/max_terminated_length": 1506.0, "completions/mean_length": 540.8828125, "completions/mean_terminated_length": 536.9804077148438, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.015060240963855422, "frac_reward_zero_std": 0.0, "grad_norm": 0.22581446170806885, "learning_rate": 1e-06, "loss": 0.0349, "num_tokens": 6127912.0, "reward": 18.810096740722656, "reward_std": 4.296095848083496, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 18.810096740722656, "rewards/skywork_reward/std": 7.788106918334961, "step": 20 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1536.0, "completions/max_terminated_length": 1486.0, "completions/mean_length": 568.685546875, "completions/mean_terminated_length": 562.9843139648438, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.01581325301204819, "frac_reward_zero_std": 0.0, "grad_norm": 0.25190842151641846, "learning_rate": 1e-06, "loss": 0.031, "num_tokens": 6462007.0, "reward": 16.9500675201416, "reward_std": 3.961681842803955, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 16.9500675201416, "rewards/skywork_reward/std": 7.946998119354248, "step": 21 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1516.0, "completions/mean_length": 536.982421875, "completions/mean_terminated_length": 535.0274047851562, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.016566265060240965, "frac_reward_zero_std": 0.0, "grad_norm": 0.23905393481254578, "learning_rate": 1e-06, "loss": 0.0237, "num_tokens": 6777998.0, "reward": 19.38282012939453, "reward_std": 3.568166494369507, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 19.38282012939453, "rewards/skywork_reward/std": 8.19947624206543, "step": 22 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1442.0, "completions/mean_length": 564.58203125, "completions/mean_terminated_length": 562.6810302734375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.017319277108433735, "frac_reward_zero_std": 0.0, "grad_norm": 0.24202950298786163, "learning_rate": 1e-06, "loss": 0.0379, "num_tokens": 7113960.0, "reward": 17.542194366455078, "reward_std": 3.6086037158966064, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 17.542194366455078, "rewards/skywork_reward/std": 6.972787857055664, "step": 23 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1536.0, "completions/max_terminated_length": 1526.0, "completions/mean_length": 594.02734375, "completions/mean_terminated_length": 584.7376708984375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.018072289156626505, "frac_reward_zero_std": 0.0, "grad_norm": 0.23193013668060303, "learning_rate": 1e-06, "loss": 0.0177, "num_tokens": 7464662.0, "reward": 19.02389907836914, "reward_std": 4.250513076782227, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 19.02389907836914, "rewards/skywork_reward/std": 6.219736576080322, "step": 24 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1258.0, "completions/mean_length": 549.28125, "completions/mean_terminated_length": 547.3502807617188, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.01882530120481928, "frac_reward_zero_std": 0.0, "grad_norm": 0.24706406891345978, "learning_rate": 1e-06, "loss": 0.0544, "num_tokens": 7789846.0, "reward": 18.385107040405273, "reward_std": 4.854748249053955, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 18.385107040405273, "rewards/skywork_reward/std": 7.241014003753662, "step": 25 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1536.0, "completions/max_terminated_length": 1387.0, "completions/mean_length": 531.689453125, "completions/mean_terminated_length": 521.7850341796875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.01957831325301205, "frac_reward_zero_std": 0.0, "grad_norm": 0.2525196373462677, "learning_rate": 1e-06, "loss": 0.0523, "num_tokens": 8111927.0, "reward": 19.272109985351562, "reward_std": 3.762904167175293, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 19.272109985351562, "rewards/skywork_reward/std": 7.09940767288208, "step": 26 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1308.0, "completions/mean_length": 547.365234375, "completions/mean_terminated_length": 545.4305419921875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.02033132530120482, "frac_reward_zero_std": 0.0, "grad_norm": 0.2493782937526703, "learning_rate": 1e-06, "loss": 0.0507, "num_tokens": 8435202.0, "reward": 17.78737449645996, "reward_std": 4.015490531921387, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 17.78737449645996, "rewards/skywork_reward/std": 7.691453456878662, "step": 27 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1301.0, "completions/max_terminated_length": 1301.0, "completions/mean_length": 478.685546875, "completions/mean_terminated_length": 478.685546875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.02108433734939759, "frac_reward_zero_std": 0.0, "grad_norm": 0.26527339220046997, "learning_rate": 1e-06, "loss": 0.0339, "num_tokens": 8727633.0, "reward": 18.366004943847656, "reward_std": 3.85819411277771, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 18.366004943847656, "rewards/skywork_reward/std": 6.539942264556885, "step": 28 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1473.0, "completions/max_terminated_length": 1473.0, "completions/mean_length": 509.15234375, "completions/mean_terminated_length": 509.15234375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.021837349397590362, "frac_reward_zero_std": 0.0, "grad_norm": 0.2530669867992401, "learning_rate": 1e-06, "loss": 0.0226, "num_tokens": 9036511.0, "reward": 18.36314582824707, "reward_std": 3.6422839164733887, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 18.36314582824707, "rewards/skywork_reward/std": 7.377774715423584, "step": 29 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1452.0, "completions/mean_length": 523.609375, "completions/mean_terminated_length": 521.628173828125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.022590361445783132, "frac_reward_zero_std": 0.0, "grad_norm": 0.24128130078315735, "learning_rate": 1e-06, "loss": 0.0412, "num_tokens": 9345751.0, "reward": 20.08295440673828, "reward_std": 3.875840902328491, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 20.08295440673828, "rewards/skywork_reward/std": 7.567595958709717, "step": 30 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1536.0, "completions/max_terminated_length": 1503.0, "completions/mean_length": 516.138671875, "completions/mean_terminated_length": 512.1392211914062, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.023343373493975902, "frac_reward_zero_std": 0.0, "grad_norm": 0.2687889337539673, "learning_rate": 1e-06, "loss": 0.0406, "num_tokens": 9651646.0, "reward": 18.440399169921875, "reward_std": 4.034783363342285, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 18.440399169921875, "rewards/skywork_reward/std": 9.241204261779785, "step": 31 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1351.0, "completions/max_terminated_length": 1351.0, "completions/mean_length": 509.87109375, "completions/mean_terminated_length": 509.87109375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.024096385542168676, "frac_reward_zero_std": 0.0, "grad_norm": 0.2565000653266907, "learning_rate": 1e-06, "loss": 0.0341, "num_tokens": 9958156.0, "reward": 21.218032836914062, "reward_std": 3.8647594451904297, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 21.218032836914062, "rewards/skywork_reward/std": 7.692581653594971, "step": 32 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1272.0, "completions/mean_length": 483.21484375, "completions/mean_terminated_length": 481.15460205078125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.024849397590361446, "frac_reward_zero_std": 0.0, "grad_norm": 0.2624817490577698, "learning_rate": 1e-06, "loss": 0.0431, "num_tokens": 10249610.0, "reward": 19.79848861694336, "reward_std": 4.353243827819824, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 19.79848861694336, "rewards/skywork_reward/std": 7.000552654266357, "step": 33 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1237.0, "completions/max_terminated_length": 1237.0, "completions/mean_length": 479.90234375, "completions/mean_terminated_length": 479.90234375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.025602409638554216, "frac_reward_zero_std": 0.0, "grad_norm": 0.2779456377029419, "learning_rate": 1e-06, "loss": 0.0414, "num_tokens": 10539784.0, "reward": 18.208389282226562, "reward_std": 3.7550158500671387, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 18.208389282226562, "rewards/skywork_reward/std": 7.102542400360107, "step": 34 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1438.0, "completions/max_terminated_length": 1438.0, "completions/mean_length": 538.0859375, "completions/mean_terminated_length": 538.0859375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.02635542168674699, "frac_reward_zero_std": 0.0, "grad_norm": 0.27090051770210266, "learning_rate": 1e-06, "loss": 0.0616, "num_tokens": 10861060.0, "reward": 18.551515579223633, "reward_std": 4.775388717651367, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 18.551515579223633, "rewards/skywork_reward/std": 8.144341468811035, "step": 35 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1414.0, "completions/mean_length": 523.318359375, "completions/mean_terminated_length": 521.3366088867188, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.02710843373493976, "frac_reward_zero_std": 0.0, "grad_norm": 0.24933283030986786, "learning_rate": 1e-06, "loss": 0.0219, "num_tokens": 11174711.0, "reward": 19.23906707763672, "reward_std": 3.579415798187256, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 19.23906707763672, "rewards/skywork_reward/std": 6.7693352699279785, "step": 36 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1471.0, "completions/max_terminated_length": 1471.0, "completions/mean_length": 540.38671875, "completions/mean_terminated_length": 540.38671875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.02786144578313253, "frac_reward_zero_std": 0.0, "grad_norm": 0.2600509524345398, "learning_rate": 1e-06, "loss": 0.0552, "num_tokens": 11499629.0, "reward": 17.48117446899414, "reward_std": 4.680095672607422, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 17.48117446899414, "rewards/skywork_reward/std": 7.522308826446533, "step": 37 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1324.0, "completions/mean_length": 527.3359375, "completions/mean_terminated_length": 525.362060546875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.0286144578313253, "frac_reward_zero_std": 0.0, "grad_norm": 0.2569137215614319, "learning_rate": 1e-06, "loss": 0.0256, "num_tokens": 11812185.0, "reward": 19.502559661865234, "reward_std": 4.2003655433654785, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 19.502559661865234, "rewards/skywork_reward/std": 7.574954986572266, "step": 38 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1536.0, "completions/max_terminated_length": 1497.0, "completions/mean_length": 546.02734375, "completions/mean_terminated_length": 540.1925659179688, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.029367469879518073, "frac_reward_zero_std": 0.0, "grad_norm": 0.25866448879241943, "learning_rate": 1e-06, "loss": 0.0702, "num_tokens": 12136471.0, "reward": 19.45880889892578, "reward_std": 4.22914457321167, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 19.45880889892578, "rewards/skywork_reward/std": 7.417150497436523, "step": 39 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1536.0, "completions/max_terminated_length": 1300.0, "completions/mean_length": 549.212890625, "completions/mean_terminated_length": 543.3968505859375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.030120481927710843, "frac_reward_zero_std": 0.0, "grad_norm": 0.26242557168006897, "learning_rate": 1e-06, "loss": 0.0572, "num_tokens": 12468676.0, "reward": 18.719432830810547, "reward_std": 4.223837852478027, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 18.719432830810547, "rewards/skywork_reward/std": 7.981883525848389, "step": 40 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1536.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 592.703125, "completions/mean_terminated_length": 587.1434326171875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.030873493975903613, "frac_reward_zero_std": 0.0, "grad_norm": 0.23195688426494598, "learning_rate": 1e-06, "loss": 0.0277, "num_tokens": 12819564.0, "reward": 19.721298217773438, "reward_std": 4.098422527313232, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 19.721298217773438, "rewards/skywork_reward/std": 8.307689666748047, "step": 41 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1536.0, "completions/max_terminated_length": 1385.0, "completions/mean_length": 505.2734375, "completions/mean_terminated_length": 501.2314147949219, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.03162650602409638, "frac_reward_zero_std": 0.0, "grad_norm": 0.26936817169189453, "learning_rate": 1e-06, "loss": 0.0703, "num_tokens": 13120760.0, "reward": 19.512004852294922, "reward_std": 4.078932762145996, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 19.512004852294922, "rewards/skywork_reward/std": 7.664077281951904, "step": 42 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1478.0, "completions/mean_length": 525.787109375, "completions/mean_terminated_length": 523.8101806640625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.03237951807228916, "frac_reward_zero_std": 0.0, "grad_norm": 0.25450485944747925, "learning_rate": 1e-06, "loss": 0.0453, "num_tokens": 13435275.0, "reward": 19.4080810546875, "reward_std": 4.121971130371094, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 19.4080810546875, "rewards/skywork_reward/std": 8.246720314025879, "step": 43 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1536.0, "completions/max_terminated_length": 1514.0, "completions/mean_length": 598.4140625, "completions/mean_terminated_length": 573.9879760742188, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.03313253012048193, "frac_reward_zero_std": 0.0, "grad_norm": 0.22897975146770477, "learning_rate": 1e-06, "loss": 0.0552, "num_tokens": 13792623.0, "reward": 19.808815002441406, "reward_std": 4.206421375274658, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 19.808815002441406, "rewards/skywork_reward/std": 8.275568008422852, "step": 44 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1536.0, "completions/max_terminated_length": 1531.0, "completions/mean_length": 540.21875, "completions/mean_terminated_length": 512.224853515625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.0338855421686747, "frac_reward_zero_std": 0.0, "grad_norm": 0.26464077830314636, "learning_rate": 1e-06, "loss": 0.084, "num_tokens": 14114543.0, "reward": 19.398923873901367, "reward_std": 4.753540992736816, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 19.398923873901367, "rewards/skywork_reward/std": 8.964569091796875, "step": 45 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1331.0, "completions/mean_length": 550.970703125, "completions/mean_terminated_length": 543.2145385742188, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.03463855421686747, "frac_reward_zero_std": 0.0, "grad_norm": 0.26099711656570435, "learning_rate": 1e-06, "loss": 0.0685, "num_tokens": 14438880.0, "reward": 20.744400024414062, "reward_std": 4.4266676902771, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 20.744400024414062, "rewards/skywork_reward/std": 9.021385192871094, "step": 46 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1476.0, "completions/mean_length": 555.751953125, "completions/mean_terminated_length": 553.8336791992188, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.035391566265060244, "frac_reward_zero_std": 0.0, "grad_norm": 0.23918218910694122, "learning_rate": 1e-06, "loss": 0.0153, "num_tokens": 14773041.0, "reward": 18.19009017944336, "reward_std": 3.2478275299072266, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 18.19009017944336, "rewards/skywork_reward/std": 8.022202491760254, "step": 47 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1381.0, "completions/mean_length": 504.818359375, "completions/mean_terminated_length": 502.8003845214844, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.03614457831325301, "frac_reward_zero_std": 0.0, "grad_norm": 0.2639124095439911, "learning_rate": 1e-06, "loss": 0.0118, "num_tokens": 15076308.0, "reward": 18.888355255126953, "reward_std": 3.525162696838379, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 18.888355255126953, "rewards/skywork_reward/std": 7.233704090118408, "step": 48 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1536.0, "completions/max_terminated_length": 1477.0, "completions/mean_length": 558.080078125, "completions/mean_terminated_length": 552.3163452148438, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.036897590361445784, "frac_reward_zero_std": 0.0, "grad_norm": 0.2637381851673126, "learning_rate": 1e-06, "loss": 0.061, "num_tokens": 15408557.0, "reward": 18.518508911132812, "reward_std": 4.11885929107666, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 18.518508911132812, "rewards/skywork_reward/std": 7.327213287353516, "step": 49 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1536.0, "completions/max_terminated_length": 1511.0, "completions/mean_length": 519.8359375, "completions/mean_terminated_length": 503.7063903808594, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.03765060240963856, "frac_reward_zero_std": 0.0, "grad_norm": 0.2628968060016632, "learning_rate": 1e-06, "loss": 0.052, "num_tokens": 15719481.0, "reward": 19.84031105041504, "reward_std": 3.317889928817749, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/skywork_reward/mean": 19.84031105041504, "rewards/skywork_reward/std": 7.0079755783081055, "step": 50 } ], "logging_steps": 1, "max_steps": 1328, "num_input_tokens_seen": 15719481, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }