{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 218.84375, "epoch": 0.5714285714285714, "grad_norm": 16.413349151611328, "kl": 0.0, "learning_rate": 4.965903258506806e-07, "loss": -0.0, "reward": 3.463685939088464, "reward_std": 0.7034649187116884, "rewards/concensus_correctness_reward_func": 0.7219999879598618, "rewards/consensus_reward_func": 1.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.7888422352261841, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.15625, "rewards/xmlcount_reward_func": 0.7340937531553209, "step": 2 }, { "completion_length": 136.70833333333334, "epoch": 1.0, "grad_norm": 16.801883697509766, "kl": 0.11107732852300008, "learning_rate": 4.698684378016222e-07, "loss": 0.0001, "reward": 5.620846748352051, "reward_std": 0.8283557544151942, "rewards/concensus_correctness_reward_func": 1.700999992589156, "rewards/consensus_reward_func": 1.3333333333333333, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9094300220410029, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.2916666666666667, "rewards/xmlcount_reward_func": 1.1354166666666667, "step": 4 }, { "completion_length": 137.625, "epoch": 1.5714285714285714, "grad_norm": 1251.4542236328125, "kl": 29.201528461358976, "learning_rate": 4.193203929064353e-07, "loss": 0.0292, "reward": 6.083442434668541, "reward_std": 0.6032323502004147, "rewards/concensus_correctness_reward_func": 1.8181874863803387, "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9065674878656864, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 1.1711875014007092, "step": 6 }, { "completion_length": 141.95833333333334, "epoch": 2.0, "grad_norm": 80.33175659179688, "kl": 2238.1095382701606, "learning_rate": 3.5042385616324236e-07, "loss": 1.6786, "reward": 5.476725300153096, "reward_std": 0.5055693089962006, "rewards/concensus_correctness_reward_func": 1.338249978919824, "rewards/consensus_reward_func": 1.5833333333333333, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.08333333333333333, "rewards/question_recreation_reward_func": 0.9041003783543905, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3541666666666667, "rewards/xmlcount_reward_func": 1.2135416666666667, "step": 8 }, { "completion_length": 140.0625, "epoch": 2.571428571428571, "grad_norm": 5685.56640625, "kl": 73.77052622684278, "learning_rate": 2.706448363680831e-07, "loss": 0.0738, "reward": 6.193239971995354, "reward_std": 1.6658249166794121, "rewards/concensus_correctness_reward_func": 1.9971249867230654, "rewards/consensus_reward_func": 1.4375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9472400806844234, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 1.2019999995827675, "step": 10 }, { "completion_length": 127.08333333333333, "epoch": 3.0, "grad_norm": 9.540337562561035, "kl": 3.0621676202863455, "learning_rate": 1.886286282148002e-07, "loss": 0.0023, "reward": 5.709832270940145, "reward_std": 0.3538346770995607, "rewards/concensus_correctness_reward_func": 1.5369999806086223, "rewards/consensus_reward_func": 1.5833333333333333, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.988540584842364, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333333333333, "rewards/xmlcount_reward_func": 1.2051250040531158, "step": 12 }, { "completion_length": 132.15625, "epoch": 3.571428571428571, "grad_norm": 16.967500686645508, "kl": 11.426087894011289, "learning_rate": 1.1326296046939333e-07, "loss": 0.0114, "reward": 5.505586206912994, "reward_std": 0.27794025372713804, "rewards/concensus_correctness_reward_func": 1.4691874869167805, "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.9235549308359623, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 1.2065937519073486, "step": 14 }, { "completion_length": 134.25, "epoch": 4.0, "grad_norm": 16.20191192626953, "kl": 2.2744216828917465, "learning_rate": 5.271487265090163e-08, "loss": 0.0017, "reward": 6.086844245592753, "reward_std": 0.542460098862648, "rewards/concensus_correctness_reward_func": 1.7269999807079632, "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.16666666666666666, "rewards/question_recreation_reward_func": 0.9250942468643188, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 1.2055833339691162, "step": 16 }, { "completion_length": 149.78125, "epoch": 4.571428571428571, "grad_norm": 23.975969314575195, "kl": 21.27271423127968, "learning_rate": 1.3545689574841341e-08, "loss": 0.0213, "reward": 5.366471603512764, "reward_std": 0.9596925657242537, "rewards/concensus_correctness_reward_func": 1.4007499851286411, "rewards/consensus_reward_func": 1.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.9201591331511736, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 1.201812505722046, "step": 18 }, { "completion_length": 143.20833333333334, "epoch": 5.0, "grad_norm": 22.604459762573242, "kl": 0.7605764990051588, "learning_rate": 0.0, "loss": 0.0006, "reward": 6.170487980047862, "reward_std": 1.3884001096788172, "rewards/concensus_correctness_reward_func": 2.157916652659575, "rewards/consensus_reward_func": 1.4166666666666667, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.8883628845214844, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3333333333333333, "rewards/xmlcount_reward_func": 1.1242083335916202, "step": 20 }, { "epoch": 5.0, "step": 20, "total_flos": 0.0, "train_loss": 0.18189025411120383, "train_runtime": 97.0582, "train_samples_per_second": 3.297, "train_steps_per_second": 0.206 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }