{ "best_global_step": 46, "best_metric": 0.15980443358421326, "best_model_checkpoint": "./thinktank-prm/checkpoint-46", "epoch": 2.0, "eval_steps": 500, "global_step": 46, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "accuracy": 0.6, "epoch": 0.21739130434782608, "grad_norm": 49.35371017456055, "learning_rate": 8e-05, "loss": 1.2811368942260741, "margin": 0.199169921875, "max_reward": 2.334765625, "mean_reward": -1.283544921875, "min_reward": -5.059375, "num_tokens": 2572.0, "step": 5 }, { "accuracy": 0.45, "epoch": 0.43478260869565216, "grad_norm": 62.163856506347656, "learning_rate": 9.636363636363637e-05, "loss": 1.2909283638000488, "margin": -0.2004150390625, "max_reward": 2.146875, "mean_reward": -1.03646240234375, "min_reward": -4.05, "num_tokens": 5067.0, "step": 10 }, { "accuracy": 0.55, "epoch": 0.6521739130434783, "grad_norm": 51.77572250366211, "learning_rate": 9.181818181818183e-05, "loss": 0.8858256340026855, "margin": 0.2597198486328125, "max_reward": 1.7330078125, "mean_reward": -0.9123306274414062, "min_reward": -4.0484375, "num_tokens": 7178.0, "step": 15 }, { "accuracy": 0.8, "epoch": 0.8695652173913043, "grad_norm": 33.857173919677734, "learning_rate": 8.727272727272727e-05, "loss": 0.3597615957260132, "margin": 2.6916748046875, "max_reward": 4.940625, "mean_reward": 0.67430419921875, "min_reward": -3.065625, "num_tokens": 9399.0, "step": 20 }, { "epoch": 1.0, "eval_accuracy": 0.7638888905445734, "eval_loss": 0.4415476322174072, "eval_margin": 3.1962076822916665, "eval_max_reward": 6.135416666666667, "eval_mean_reward": 1.716891845067342, "eval_min_reward": -2.5227864583333335, "eval_num_tokens": 11027.0, "eval_runtime": 1.9074, "eval_samples_per_second": 12.058, "eval_steps_per_second": 3.146, "step": 23 }, { "accuracy": 0.8, "epoch": 1.0869565217391304, "grad_norm": 6.21420955657959, "learning_rate": 8.272727272727273e-05, "loss": 0.444346284866333, "margin": 4.0765228271484375, "max_reward": 7.60625, "mean_reward": 1.1567672729492187, "min_reward": -3.390625, "num_tokens": 11940.0, "step": 25 }, { "accuracy": 0.8, "epoch": 1.3043478260869565, "grad_norm": 42.05428695678711, "learning_rate": 7.818181818181818e-05, "loss": 0.2892508029937744, "margin": 4.1197021484375, "max_reward": 8.65, "mean_reward": 1.99346923828125, "min_reward": -2.23203125, "num_tokens": 14312.0, "step": 30 }, { "accuracy": 0.85, "epoch": 1.5217391304347827, "grad_norm": 44.804229736328125, "learning_rate": 7.363636363636364e-05, "loss": 0.1502391815185547, "margin": 5.14873046875, "max_reward": 9.75625, "mean_reward": 1.94619140625, "min_reward": -2.438671875, "num_tokens": 16670.0, "step": 35 }, { "accuracy": 0.9, "epoch": 1.7391304347826086, "grad_norm": 1.3212089538574219, "learning_rate": 6.90909090909091e-05, "loss": 0.24649481773376464, "margin": 6.7328125, "max_reward": 11.0, "mean_reward": 2.592578125, "min_reward": -3.328125, "num_tokens": 19020.0, "step": 40 }, { "accuracy": 0.9, "epoch": 1.9565217391304348, "grad_norm": 12.125640869140625, "learning_rate": 6.454545454545455e-05, "loss": 0.15210472345352172, "margin": 6.601220703125, "max_reward": 10.5375, "mean_reward": 2.4595458984375, "min_reward": -3.0890625, "num_tokens": 21618.0, "step": 45 }, { "epoch": 2.0, "eval_accuracy": 0.9166666666666666, "eval_loss": 0.15980443358421326, "eval_margin": 4.878390848636627, "eval_max_reward": 6.994791666666667, "eval_mean_reward": 1.7292073567708333, "eval_min_reward": -3.3043619791666665, "eval_num_tokens": 22054.0, "eval_runtime": 1.2993, "eval_samples_per_second": 17.702, "eval_steps_per_second": 4.618, "step": 46 } ], "logging_steps": 5, "max_steps": 115, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 66236348030976.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }