{ "best_global_step": 1000, "best_metric": 1.26480901, "best_model_checkpoint": "/home/user_00006_557dc2/shared-storage/yuki-home/output_model/Qwen3-4B-unsafe-simpo/v0-20260108-044607/checkpoint-1000", "epoch": 0.7185198491108317, "eval_steps": 200, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007185198491108317, "grad_norm": 7.691910743713379, "learning_rate": 7.142857142857144e-08, "logits/chosen": -1.3974609375, "logits/rejected": -0.91796875, "logps/chosen": -1.3876953125, "logps/rejected": -2.279296875, "loss": 1.79931640625, "nll_loss": 1.390625, "rewards/accuracies": 0.984375, "rewards/chosen": -2.775390625, "rewards/margins": 1.7783203125, "rewards/rejected": -4.55859375, "step": 1 }, { "epoch": 0.0035925992455541583, "grad_norm": 7.972429275512695, "learning_rate": 3.5714285714285716e-07, "logits/chosen": -1.50830078125, "logits/rejected": -1.2586669921875, "logps/chosen": -1.407470703125, "logps/rejected": -2.2744140625, "loss": 1.8402099609375, "nll_loss": 1.408935546875, "rewards/accuracies": 0.9921875, "rewards/chosen": -2.81494140625, "rewards/margins": 1.73388671875, "rewards/rejected": -4.548828125, "step": 5 }, { "epoch": 0.007185198491108317, "grad_norm": 8.690775871276855, "learning_rate": 7.142857142857143e-07, "logits/chosen": -1.495703101158142, "logits/rejected": -1.3515625, "logps/chosen": -1.4296875, "logps/rejected": -2.2699217796325684, "loss": 1.8765625, "nll_loss": 1.428125023841858, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.859375, "rewards/margins": 1.678125023841858, "rewards/rejected": -4.539843559265137, "step": 10 }, { "epoch": 0.010777797736662474, "grad_norm": 8.171070098876953, "learning_rate": 1.0714285714285714e-06, "logits/chosen": -1.637304663658142, "logits/rejected": -1.4182617664337158, "logps/chosen": -1.4216797351837158, "logps/rejected": -2.3140625953674316, "loss": 1.83095703125, "nll_loss": 1.419531226158142, "rewards/accuracies": 1.0, "rewards/chosen": -2.8433594703674316, "rewards/margins": 1.785742163658142, "rewards/rejected": -4.628125190734863, "step": 15 }, { "epoch": 0.014370396982216633, "grad_norm": 6.307018756866455, "learning_rate": 1.4285714285714286e-06, "logits/chosen": -1.5041992664337158, "logits/rejected": -1.28271484375, "logps/chosen": -1.416406273841858, "logps/rejected": -2.4859375953674316, "loss": 1.738623046875, "nll_loss": 1.417382836341858, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.832812547683716, "rewards/margins": 2.1382813453674316, "rewards/rejected": -4.971875190734863, "step": 20 }, { "epoch": 0.017962996227770794, "grad_norm": 2.4112226963043213, "learning_rate": 1.7857142857142859e-06, "logits/chosen": -1.5656249523162842, "logits/rejected": -1.4568359851837158, "logps/chosen": -1.412109375, "logps/rejected": -3.0628905296325684, "loss": 1.5588623046875, "nll_loss": 1.4113280773162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.82421875, "rewards/margins": 3.303515672683716, "rewards/rejected": -6.125781059265137, "step": 25 }, { "epoch": 0.02155559547332495, "grad_norm": 2.003995895385742, "learning_rate": 2.1428571428571427e-06, "logits/chosen": -1.511621117591858, "logits/rejected": -1.37841796875, "logps/chosen": -1.45703125, "logps/rejected": -3.813671827316284, "loss": 1.493701171875, "nll_loss": 1.451562523841858, "rewards/accuracies": 1.0, "rewards/chosen": -2.9140625, "rewards/margins": 4.713671684265137, "rewards/rejected": -7.627343654632568, "step": 30 }, { "epoch": 0.025148194718879108, "grad_norm": 1.9368467330932617, "learning_rate": 2.5e-06, "logits/chosen": -1.3904297351837158, "logits/rejected": -1.2985351085662842, "logps/chosen": -1.4650390148162842, "logps/rejected": -5.449999809265137, "loss": 1.457666015625, "nll_loss": 1.4529297351837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.9300780296325684, "rewards/margins": 7.96875, "rewards/rejected": -10.899999618530273, "step": 35 }, { "epoch": 0.028740793964433266, "grad_norm": 1.6902642250061035, "learning_rate": 2.8571428571428573e-06, "logits/chosen": -1.4148437976837158, "logits/rejected": -1.455175757408142, "logps/chosen": -1.476171851158142, "logps/rejected": -6.092968940734863, "loss": 1.472119140625, "nll_loss": 1.469140648841858, "rewards/accuracies": 1.0, "rewards/chosen": -2.952343702316284, "rewards/margins": 9.240625381469727, "rewards/rejected": -12.185937881469727, "step": 40 }, { "epoch": 0.032333393209987425, "grad_norm": 1.5812033414840698, "learning_rate": 3.2142857142857147e-06, "logits/chosen": -1.596289038658142, "logits/rejected": -1.608984351158142, "logps/chosen": -1.422460913658142, "logps/rejected": -5.978125095367432, "loss": 1.420361328125, "nll_loss": 1.418554663658142, "rewards/accuracies": 1.0, "rewards/chosen": -2.844921827316284, "rewards/margins": 9.104687690734863, "rewards/rejected": -11.956250190734863, "step": 45 }, { "epoch": 0.03592599245554159, "grad_norm": 1.5095185041427612, "learning_rate": 3.5714285714285718e-06, "logits/chosen": -1.5986328125, "logits/rejected": -1.5666992664337158, "logps/chosen": -1.4177734851837158, "logps/rejected": -6.051562309265137, "loss": 1.4094482421875, "nll_loss": 1.4089844226837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.8355469703674316, "rewards/margins": 9.264062881469727, "rewards/rejected": -12.103124618530273, "step": 50 }, { "epoch": 0.03951859170109574, "grad_norm": 1.468376874923706, "learning_rate": 3.928571428571429e-06, "logits/chosen": -1.4972655773162842, "logits/rejected": -1.5333983898162842, "logps/chosen": -1.4392578601837158, "logps/rejected": -6.267187595367432, "loss": 1.447802734375, "nll_loss": 1.4406249523162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.8785157203674316, "rewards/margins": 9.654687881469727, "rewards/rejected": -12.534375190734863, "step": 55 }, { "epoch": 0.0431111909466499, "grad_norm": 1.4280197620391846, "learning_rate": 4.2857142857142855e-06, "logits/chosen": -1.605078101158142, "logits/rejected": -1.675390601158142, "logps/chosen": -1.3914062976837158, "logps/rejected": -6.785937309265137, "loss": 1.3864501953125, "nll_loss": 1.3857421875, "rewards/accuracies": 1.0, "rewards/chosen": -2.7828125953674316, "rewards/margins": 10.787500381469727, "rewards/rejected": -13.571874618530273, "step": 60 }, { "epoch": 0.04670379019220406, "grad_norm": 1.2989500761032104, "learning_rate": 4.642857142857144e-06, "logits/chosen": -1.5071289539337158, "logits/rejected": -1.490234375, "logps/chosen": -1.407617211341858, "logps/rejected": -7.173437595367432, "loss": 1.418798828125, "nll_loss": 1.4171874523162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.815234422683716, "rewards/margins": 11.529687881469727, "rewards/rejected": -14.346875190734863, "step": 65 }, { "epoch": 0.050296389437758215, "grad_norm": 1.3736355304718018, "learning_rate": 5e-06, "logits/chosen": -1.48046875, "logits/rejected": -1.5500977039337158, "logps/chosen": -1.392578125, "logps/rejected": -7.403124809265137, "loss": 1.386376953125, "nll_loss": 1.386132836341858, "rewards/accuracies": 1.0, "rewards/chosen": -2.78515625, "rewards/margins": 12.017187118530273, "rewards/rejected": -14.806249618530273, "step": 70 }, { "epoch": 0.05388898868331238, "grad_norm": 1.5199470520019531, "learning_rate": 5.357142857142857e-06, "logits/chosen": -1.5632812976837158, "logits/rejected": -1.547949194908142, "logps/chosen": -1.421484351158142, "logps/rejected": -7.706250190734863, "loss": 1.419873046875, "nll_loss": 1.418554663658142, "rewards/accuracies": 1.0, "rewards/chosen": -2.842968702316284, "rewards/margins": 12.564062118530273, "rewards/rejected": -15.412500381469727, "step": 75 }, { "epoch": 0.05748158792886653, "grad_norm": 1.5772722959518433, "learning_rate": 5.7142857142857145e-06, "logits/chosen": -1.3556640148162842, "logits/rejected": -1.415136694908142, "logps/chosen": -1.3955078125, "logps/rejected": -7.82421875, "loss": 1.392333984375, "nll_loss": 1.3914062976837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.791015625, "rewards/margins": 12.859375, "rewards/rejected": -15.6484375, "step": 80 }, { "epoch": 0.061074187174420695, "grad_norm": 1.5475585460662842, "learning_rate": 6.071428571428571e-06, "logits/chosen": -1.4274413585662842, "logits/rejected": -1.354394555091858, "logps/chosen": -1.408789038658142, "logps/rejected": -8.210156440734863, "loss": 1.402734375, "nll_loss": 1.4025390148162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.817578077316284, "rewards/margins": 13.609375, "rewards/rejected": -16.420312881469727, "step": 85 }, { "epoch": 0.06466678641997485, "grad_norm": 1.5293960571289062, "learning_rate": 6.4285714285714295e-06, "logits/chosen": -1.3779296875, "logits/rejected": -1.29296875, "logps/chosen": -1.396875023841858, "logps/rejected": -8.625781059265137, "loss": 1.39951171875, "nll_loss": 1.3996093273162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.793750047683716, "rewards/margins": 14.464062690734863, "rewards/rejected": -17.251562118530273, "step": 90 }, { "epoch": 0.06825938566552901, "grad_norm": 1.4575022459030151, "learning_rate": 6.785714285714287e-06, "logits/chosen": -1.375756859779358, "logits/rejected": -1.258203148841858, "logps/chosen": -1.395117163658142, "logps/rejected": -8.856249809265137, "loss": 1.3924560546875, "nll_loss": 1.3917968273162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.790234327316284, "rewards/margins": 14.935937881469727, "rewards/rejected": -17.712499618530273, "step": 95 }, { "epoch": 0.07185198491108317, "grad_norm": 1.5032144784927368, "learning_rate": 7.1428571428571436e-06, "logits/chosen": -1.471093773841858, "logits/rejected": -1.3935546875, "logps/chosen": -1.3933594226837158, "logps/rejected": -8.979687690734863, "loss": 1.3912109375, "nll_loss": 1.390625, "rewards/accuracies": 1.0, "rewards/chosen": -2.7867188453674316, "rewards/margins": 15.15625, "rewards/rejected": -17.959375381469727, "step": 100 }, { "epoch": 0.07544458415663732, "grad_norm": 1.5966267585754395, "learning_rate": 7.500000000000001e-06, "logits/chosen": -1.4542968273162842, "logits/rejected": -1.3732421398162842, "logps/chosen": -1.391015648841858, "logps/rejected": -9.457812309265137, "loss": 1.3874267578125, "nll_loss": 1.3875000476837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.782031297683716, "rewards/margins": 16.140625, "rewards/rejected": -18.915624618530273, "step": 105 }, { "epoch": 0.07903718340219149, "grad_norm": 1.4725176095962524, "learning_rate": 7.857142857142858e-06, "logits/chosen": -1.33544921875, "logits/rejected": -1.059667944908142, "logps/chosen": -1.4220702648162842, "logps/rejected": -9.829687118530273, "loss": 1.416064453125, "nll_loss": 1.416015625, "rewards/accuracies": 1.0, "rewards/chosen": -2.8441405296325684, "rewards/margins": 16.817188262939453, "rewards/rejected": -19.659374237060547, "step": 110 }, { "epoch": 0.08262978264774565, "grad_norm": 1.5218441486358643, "learning_rate": 8.214285714285714e-06, "logits/chosen": -1.2913086414337158, "logits/rejected": -1.210595726966858, "logps/chosen": -1.399023413658142, "logps/rejected": -9.837499618530273, "loss": 1.40068359375, "nll_loss": 1.400390625, "rewards/accuracies": 1.0, "rewards/chosen": -2.798046827316284, "rewards/margins": 16.8828125, "rewards/rejected": -19.674999237060547, "step": 115 }, { "epoch": 0.0862223818932998, "grad_norm": 1.391550064086914, "learning_rate": 8.571428571428571e-06, "logits/chosen": -1.368261694908142, "logits/rejected": -1.1633789539337158, "logps/chosen": -1.3972656726837158, "logps/rejected": -10.154687881469727, "loss": 1.39423828125, "nll_loss": 1.393945336341858, "rewards/accuracies": 1.0, "rewards/chosen": -2.7945313453674316, "rewards/margins": 17.510936737060547, "rewards/rejected": -20.309375762939453, "step": 120 }, { "epoch": 0.08981498113885396, "grad_norm": 1.6060618162155151, "learning_rate": 8.92857142857143e-06, "logits/chosen": -1.3708984851837158, "logits/rejected": -1.2664062976837158, "logps/chosen": -1.389062523841858, "logps/rejected": -10.285937309265137, "loss": 1.382275390625, "nll_loss": 1.382226586341858, "rewards/accuracies": 1.0, "rewards/chosen": -2.778125047683716, "rewards/margins": 17.795312881469727, "rewards/rejected": -20.571874618530273, "step": 125 }, { "epoch": 0.09340758038440812, "grad_norm": 1.542303204536438, "learning_rate": 9.285714285714288e-06, "logits/chosen": -1.317285180091858, "logits/rejected": -1.21044921875, "logps/chosen": -1.389062523841858, "logps/rejected": -10.251562118530273, "loss": 1.38330078125, "nll_loss": 1.383398413658142, "rewards/accuracies": 1.0, "rewards/chosen": -2.778125047683716, "rewards/margins": 17.712499618530273, "rewards/rejected": -20.503124237060547, "step": 130 }, { "epoch": 0.09700017962996228, "grad_norm": 1.4002010822296143, "learning_rate": 9.642857142857144e-06, "logits/chosen": -1.293359398841858, "logits/rejected": -1.1779296398162842, "logps/chosen": -1.372460961341858, "logps/rejected": -10.571874618530273, "loss": 1.37607421875, "nll_loss": 1.3759765625, "rewards/accuracies": 1.0, "rewards/chosen": -2.744921922683716, "rewards/margins": 18.401561737060547, "rewards/rejected": -21.143749237060547, "step": 135 }, { "epoch": 0.10059277887551643, "grad_norm": 1.5104930400848389, "learning_rate": 1e-05, "logits/chosen": -1.39794921875, "logits/rejected": -1.223242163658142, "logps/chosen": -1.388085961341858, "logps/rejected": -10.771875381469727, "loss": 1.3880615234375, "nll_loss": 1.3878905773162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.776171922683716, "rewards/margins": 18.759374618530273, "rewards/rejected": -21.543750762939453, "step": 140 }, { "epoch": 0.1041853781210706, "grad_norm": 1.5133243799209595, "learning_rate": 9.999606481269841e-06, "logits/chosen": -1.351171851158142, "logits/rejected": -1.25146484375, "logps/chosen": -1.4013671875, "logps/rejected": -10.706250190734863, "loss": 1.3987060546875, "nll_loss": 1.399023413658142, "rewards/accuracies": 1.0, "rewards/chosen": -2.802734375, "rewards/margins": 18.610937118530273, "rewards/rejected": -21.412500381469727, "step": 145 }, { "epoch": 0.10777797736662476, "grad_norm": 1.489080548286438, "learning_rate": 9.99842598702216e-06, "logits/chosen": -1.4142577648162842, "logits/rejected": -1.235937476158142, "logps/chosen": -1.384374976158142, "logps/rejected": -11.0, "loss": 1.378076171875, "nll_loss": 1.378320336341858, "rewards/accuracies": 1.0, "rewards/chosen": -2.768749952316284, "rewards/margins": 19.243749618530273, "rewards/rejected": -22.0, "step": 150 }, { "epoch": 0.11137057661217892, "grad_norm": 1.5197798013687134, "learning_rate": 9.996458703075593e-06, "logits/chosen": -1.4298827648162842, "logits/rejected": -1.2919921875, "logps/chosen": -1.3855469226837158, "logps/rejected": -11.068750381469727, "loss": 1.377734375, "nll_loss": 1.377343773841858, "rewards/accuracies": 1.0, "rewards/chosen": -2.7710938453674316, "rewards/margins": 19.357812881469727, "rewards/rejected": -22.137500762939453, "step": 155 }, { "epoch": 0.11496317585773307, "grad_norm": 1.5488848686218262, "learning_rate": 9.993704939095376e-06, "logits/chosen": -1.422460913658142, "logits/rejected": -1.30029296875, "logps/chosen": -1.384179711341858, "logps/rejected": -10.862500190734863, "loss": 1.385693359375, "nll_loss": 1.385156273841858, "rewards/accuracies": 1.0, "rewards/chosen": -2.768359422683716, "rewards/margins": 18.954687118530273, "rewards/rejected": -21.725000381469727, "step": 160 }, { "epoch": 0.11855577510328723, "grad_norm": 1.5068778991699219, "learning_rate": 9.99016512854459e-06, "logits/chosen": -1.381445288658142, "logits/rejected": -1.295019507408142, "logps/chosen": -1.3611328601837158, "logps/rejected": -11.225000381469727, "loss": 1.3570068359375, "nll_loss": 1.3564453125, "rewards/accuracies": 1.0, "rewards/chosen": -2.7222657203674316, "rewards/margins": 19.7265625, "rewards/rejected": -22.450000762939453, "step": 165 }, { "epoch": 0.12214837434884139, "grad_norm": 1.4243464469909668, "learning_rate": 9.985839828615937e-06, "logits/chosen": -1.1414062976837158, "logits/rejected": -0.960205078125, "logps/chosen": -1.380468726158142, "logps/rejected": -11.578125, "loss": 1.3791015625, "nll_loss": 1.379492163658142, "rewards/accuracies": 1.0, "rewards/chosen": -2.760937452316284, "rewards/margins": 20.403125762939453, "rewards/rejected": -23.15625, "step": 170 }, { "epoch": 0.12574097359439554, "grad_norm": 1.5595279932022095, "learning_rate": 9.980729720144027e-06, "logits/chosen": -1.168554663658142, "logits/rejected": -1.041894555091858, "logps/chosen": -1.3818359375, "logps/rejected": -11.807812690734863, "loss": 1.382763671875, "nll_loss": 1.3830077648162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.763671875, "rewards/margins": 20.856250762939453, "rewards/rejected": -23.615625381469727, "step": 175 }, { "epoch": 0.1293335728399497, "grad_norm": 1.5964690446853638, "learning_rate": 9.974835607498224e-06, "logits/chosen": -1.3154296875, "logits/rejected": -1.1243164539337158, "logps/chosen": -1.3994140625, "logps/rejected": -12.473437309265137, "loss": 1.40009765625, "nll_loss": 1.400390625, "rewards/accuracies": 1.0, "rewards/chosen": -2.798828125, "rewards/margins": 22.140625, "rewards/rejected": -24.946874618530273, "step": 180 }, { "epoch": 0.13292617208550386, "grad_norm": 1.6358031034469604, "learning_rate": 9.968158418456013e-06, "logits/chosen": -1.194921851158142, "logits/rejected": -0.9388183355331421, "logps/chosen": -1.3546874523162842, "logps/rejected": -12.665624618530273, "loss": 1.357421875, "nll_loss": 1.3576171398162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.7093749046325684, "rewards/margins": 22.615625381469727, "rewards/rejected": -25.331249237060547, "step": 185 }, { "epoch": 0.13651877133105803, "grad_norm": 1.5263099670410156, "learning_rate": 9.960699204056978e-06, "logits/chosen": -1.212744116783142, "logits/rejected": -1.1403319835662842, "logps/chosen": -1.3955078125, "logps/rejected": -13.206250190734863, "loss": 1.389697265625, "nll_loss": 1.3896484375, "rewards/accuracies": 1.0, "rewards/chosen": -2.791015625, "rewards/margins": 23.618749618530273, "rewards/rejected": -26.412500381469727, "step": 190 }, { "epoch": 0.1401113705766122, "grad_norm": 1.5514755249023438, "learning_rate": 9.952459138437352e-06, "logits/chosen": -1.358984351158142, "logits/rejected": -1.283300757408142, "logps/chosen": -1.3703124523162842, "logps/rejected": -13.670312881469727, "loss": 1.36474609375, "nll_loss": 1.364648461341858, "rewards/accuracies": 1.0, "rewards/chosen": -2.7406249046325684, "rewards/margins": 24.59375, "rewards/rejected": -27.340625762939453, "step": 195 }, { "epoch": 0.14370396982216635, "grad_norm": 1.5811927318572998, "learning_rate": 9.943439518645193e-06, "logits/chosen": -1.274804711341858, "logits/rejected": -1.1195800304412842, "logps/chosen": -1.372460961341858, "logps/rejected": -13.676562309265137, "loss": 1.37119140625, "nll_loss": 1.370703101158142, "rewards/accuracies": 1.0, "rewards/chosen": -2.744921922683716, "rewards/margins": 24.603124618530273, "rewards/rejected": -27.353124618530273, "step": 200 }, { "epoch": 0.14370396982216635, "eval_logits/chosen": -1.5566233396530151, "eval_logits/rejected": -1.4717228412628174, "eval_logps/chosen": -1.3620713949203491, "eval_logps/rejected": -14.021018028259277, "eval_loss": 1.3645399808883667, "eval_nll_loss": 1.3635923862457275, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -2.7241427898406982, "eval_rewards/margins": 25.323009490966797, "eval_rewards/rejected": -28.042036056518555, "eval_runtime": 10.8197, "eval_samples_per_second": 83.182, "eval_steps_per_second": 10.444, "step": 200 }, { "epoch": 0.14729656906772048, "grad_norm": 1.4829566478729248, "learning_rate": 9.933641764436237e-06, "logits/chosen": -1.5146484375, "logits/rejected": -1.41015625, "logps/chosen": -1.350976586341858, "logps/rejected": -14.003125190734863, "loss": 1.3512939453125, "nll_loss": 1.352148413658142, "rewards/accuracies": 1.0, "rewards/chosen": -2.701953172683716, "rewards/margins": 25.315624237060547, "rewards/rejected": -28.006250381469727, "step": 205 }, { "epoch": 0.15088916831327465, "grad_norm": 1.488863468170166, "learning_rate": 9.923067418050399e-06, "logits/chosen": -1.4609375, "logits/rejected": -1.376367211341858, "logps/chosen": -1.3537108898162842, "logps/rejected": -14.432812690734863, "loss": 1.354248046875, "nll_loss": 1.354101538658142, "rewards/accuracies": 1.0, "rewards/chosen": -2.7074217796325684, "rewards/margins": 26.146875381469727, "rewards/rejected": -28.865625381469727, "step": 210 }, { "epoch": 0.1544817675588288, "grad_norm": 1.5745856761932373, "learning_rate": 9.911718143969024e-06, "logits/chosen": -1.5654296875, "logits/rejected": -1.530664086341858, "logps/chosen": -1.352929711341858, "logps/rejected": -14.587499618530273, "loss": 1.354638671875, "nll_loss": 1.354882836341858, "rewards/accuracies": 1.0, "rewards/chosen": -2.705859422683716, "rewards/margins": 26.459375381469727, "rewards/rejected": -29.174999237060547, "step": 215 }, { "epoch": 0.15807436680438297, "grad_norm": 1.6107984781265259, "learning_rate": 9.899595728652883e-06, "logits/chosen": -1.553320288658142, "logits/rejected": -1.609375, "logps/chosen": -1.37890625, "logps/rejected": -14.303125381469727, "loss": 1.370703125, "nll_loss": 1.3701171875, "rewards/accuracies": 1.0, "rewards/chosen": -2.7578125, "rewards/margins": 25.840625762939453, "rewards/rejected": -28.606250762939453, "step": 220 }, { "epoch": 0.16166696604993713, "grad_norm": 1.610357642173767, "learning_rate": 9.88670208026097e-06, "logits/chosen": -1.679101586341858, "logits/rejected": -1.7560546398162842, "logps/chosen": -1.3630859851837158, "logps/rejected": -14.837499618530273, "loss": 1.3674072265625, "nll_loss": 1.3662109375, "rewards/accuracies": 1.0, "rewards/chosen": -2.7261719703674316, "rewards/margins": 26.950000762939453, "rewards/rejected": -29.674999237060547, "step": 225 }, { "epoch": 0.1652595652954913, "grad_norm": 1.470034122467041, "learning_rate": 9.87303922835014e-06, "logits/chosen": -1.560937523841858, "logits/rejected": -1.6134765148162842, "logps/chosen": -1.363867163658142, "logps/rejected": -16.9140625, "loss": 1.370361328125, "nll_loss": 1.361718773841858, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.727734327316284, "rewards/margins": 31.109375, "rewards/rejected": -33.828125, "step": 230 }, { "epoch": 0.16885216454104546, "grad_norm": 1.502193808555603, "learning_rate": 9.858609323555646e-06, "logits/chosen": -1.701757788658142, "logits/rejected": -1.6632812023162842, "logps/chosen": -1.3712890148162842, "logps/rejected": -17.698436737060547, "loss": 1.372021484375, "nll_loss": 1.3712890148162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.7425780296325684, "rewards/margins": 32.65312576293945, "rewards/rejected": -35.396873474121094, "step": 235 }, { "epoch": 0.1724447637865996, "grad_norm": 1.6777915954589844, "learning_rate": 9.843414637252615e-06, "logits/chosen": -1.684960961341858, "logits/rejected": -1.758691430091858, "logps/chosen": -1.341406226158142, "logps/rejected": -18.637500762939453, "loss": 1.3465087890625, "nll_loss": 1.3458983898162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.682812452316284, "rewards/margins": 34.603126525878906, "rewards/rejected": -37.275001525878906, "step": 240 }, { "epoch": 0.17603736303215375, "grad_norm": 1.5214755535125732, "learning_rate": 9.827457561198507e-06, "logits/chosen": -1.894921898841858, "logits/rejected": -2.000195264816284, "logps/chosen": -1.377343773841858, "logps/rejected": -17.646875381469727, "loss": 1.369384765625, "nll_loss": 1.369531273841858, "rewards/accuracies": 1.0, "rewards/chosen": -2.754687547683716, "rewards/margins": 32.537498474121094, "rewards/rejected": -35.29375076293945, "step": 245 }, { "epoch": 0.17962996227770792, "grad_norm": 1.5618224143981934, "learning_rate": 9.810740607156647e-06, "logits/chosen": -1.814062476158142, "logits/rejected": -1.903710961341858, "logps/chosen": -1.337890625, "logps/rejected": -18.207813262939453, "loss": 1.3361572265625, "nll_loss": 1.335546851158142, "rewards/accuracies": 1.0, "rewards/chosen": -2.67578125, "rewards/margins": 33.734375, "rewards/rejected": -36.415626525878906, "step": 250 }, { "epoch": 0.18322256152326208, "grad_norm": 1.5485224723815918, "learning_rate": 9.793266406500847e-06, "logits/chosen": -1.803320288658142, "logits/rejected": -1.846093773841858, "logps/chosen": -1.3583984375, "logps/rejected": -18.203125, "loss": 1.357177734375, "nll_loss": 1.357031226158142, "rewards/accuracies": 1.0, "rewards/chosen": -2.716796875, "rewards/margins": 33.66875076293945, "rewards/rejected": -36.40625, "step": 255 }, { "epoch": 0.18681516076881624, "grad_norm": 1.4868297576904297, "learning_rate": 9.775037709801206e-06, "logits/chosen": -1.8380858898162842, "logits/rejected": -1.9230468273162842, "logps/chosen": -1.3859374523162842, "logps/rejected": -18.621875762939453, "loss": 1.38720703125, "nll_loss": 1.3875000476837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.7718749046325684, "rewards/margins": 34.46562576293945, "rewards/rejected": -37.243751525878906, "step": 260 }, { "epoch": 0.1904077600143704, "grad_norm": 1.4483191967010498, "learning_rate": 9.756057386391154e-06, "logits/chosen": -1.6541016101837158, "logits/rejected": -1.627539038658142, "logps/chosen": -1.3582031726837158, "logps/rejected": -18.401561737060547, "loss": 1.352978515625, "nll_loss": 1.3527343273162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.7164063453674316, "rewards/margins": 34.05937576293945, "rewards/rejected": -36.803123474121094, "step": 265 }, { "epoch": 0.19400035925992457, "grad_norm": 1.4131176471710205, "learning_rate": 9.736328423915797e-06, "logits/chosen": -1.6599609851837158, "logits/rejected": -1.72265625, "logps/chosen": -1.349218726158142, "logps/rejected": -18.081249237060547, "loss": 1.3497802734375, "nll_loss": 1.350000023841858, "rewards/accuracies": 1.0, "rewards/chosen": -2.698437452316284, "rewards/margins": 33.446876525878906, "rewards/rejected": -36.162498474121094, "step": 270 }, { "epoch": 0.19759295850547873, "grad_norm": 1.4289692640304565, "learning_rate": 9.715853927861643e-06, "logits/chosen": -1.825781226158142, "logits/rejected": -1.909765601158142, "logps/chosen": -1.3318359851837158, "logps/rejected": -18.801563262939453, "loss": 1.32822265625, "nll_loss": 1.328125, "rewards/accuracies": 1.0, "rewards/chosen": -2.6636719703674316, "rewards/margins": 34.94062423706055, "rewards/rejected": -37.603126525878906, "step": 275 }, { "epoch": 0.20118555775103286, "grad_norm": 1.3404064178466797, "learning_rate": 9.694637121067764e-06, "logits/chosen": -1.7648437023162842, "logits/rejected": -1.879296898841858, "logps/chosen": -1.346289038658142, "logps/rejected": -18.956249237060547, "loss": 1.339404296875, "nll_loss": 1.339257836341858, "rewards/accuracies": 1.0, "rewards/chosen": -2.692578077316284, "rewards/margins": 35.1875, "rewards/rejected": -37.912498474121094, "step": 280 }, { "epoch": 0.20477815699658702, "grad_norm": 1.467057466506958, "learning_rate": 9.67268134321851e-06, "logits/chosen": -1.755273461341858, "logits/rejected": -1.8513672351837158, "logps/chosen": -1.3359375, "logps/rejected": -19.698436737060547, "loss": 1.3359619140625, "nll_loss": 1.336328148841858, "rewards/accuracies": 1.0, "rewards/chosen": -2.671875, "rewards/margins": 36.72187423706055, "rewards/rejected": -39.396873474121094, "step": 285 }, { "epoch": 0.2083707562421412, "grad_norm": 1.470383882522583, "learning_rate": 9.649990050317806e-06, "logits/chosen": -1.8650391101837158, "logits/rejected": -1.7693359851837158, "logps/chosen": -1.347070336341858, "logps/rejected": -22.123437881469727, "loss": 1.3421875, "nll_loss": 1.3416016101837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.694140672683716, "rewards/margins": 41.556251525878906, "rewards/rejected": -44.24687576293945, "step": 290 }, { "epoch": 0.21196335548769535, "grad_norm": 1.3637882471084595, "learning_rate": 9.62656681414516e-06, "logits/chosen": -1.8605468273162842, "logits/rejected": -1.29638671875, "logps/chosen": -1.3552734851837158, "logps/rejected": -24.821874618530273, "loss": 1.3458984375, "nll_loss": 1.346289038658142, "rewards/accuracies": 1.0, "rewards/chosen": -2.7105469703674316, "rewards/margins": 46.92499923706055, "rewards/rejected": -49.64374923706055, "step": 295 }, { "epoch": 0.2155559547332495, "grad_norm": 1.3850106000900269, "learning_rate": 9.602415321693434e-06, "logits/chosen": -2.0318360328674316, "logits/rejected": -1.422460913658142, "logps/chosen": -1.3425781726837158, "logps/rejected": -25.299999237060547, "loss": 1.3366943359375, "nll_loss": 1.3371093273162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.6851563453674316, "rewards/margins": 47.943748474121094, "rewards/rejected": -50.599998474121094, "step": 300 }, { "epoch": 0.21914855397880367, "grad_norm": 1.4790948629379272, "learning_rate": 9.577539374588486e-06, "logits/chosen": -1.968164086341858, "logits/rejected": -1.3800780773162842, "logps/chosen": -1.3484375476837158, "logps/rejected": -26.568750381469727, "loss": 1.3474853515625, "nll_loss": 1.3474609851837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.6968750953674316, "rewards/margins": 50.42499923706055, "rewards/rejected": -53.13750076293945, "step": 305 }, { "epoch": 0.22274115322435784, "grad_norm": 1.4023866653442383, "learning_rate": 9.551942888490759e-06, "logits/chosen": -1.8865234851837158, "logits/rejected": -1.200781226158142, "logps/chosen": -1.3542969226837158, "logps/rejected": -25.659374237060547, "loss": 1.3539306640625, "nll_loss": 1.353906273841858, "rewards/accuracies": 1.0, "rewards/chosen": -2.7085938453674316, "rewards/margins": 48.625, "rewards/rejected": -51.318748474121094, "step": 310 }, { "epoch": 0.22633375246991197, "grad_norm": 1.3669973611831665, "learning_rate": 9.525629892478936e-06, "logits/chosen": -2.0208983421325684, "logits/rejected": -1.45556640625, "logps/chosen": -1.3400390148162842, "logps/rejected": -25.734375, "loss": 1.334619140625, "nll_loss": 1.334375023841858, "rewards/accuracies": 1.0, "rewards/chosen": -2.6800780296325684, "rewards/margins": 48.76874923706055, "rewards/rejected": -51.46875, "step": 315 }, { "epoch": 0.22992635171546613, "grad_norm": 1.4521005153656006, "learning_rate": 9.498604528415731e-06, "logits/chosen": -2.009765625, "logits/rejected": -1.3367187976837158, "logps/chosen": -1.3371093273162842, "logps/rejected": -25.662500381469727, "loss": 1.3375244140625, "nll_loss": 1.337499976158142, "rewards/accuracies": 1.0, "rewards/chosen": -2.6742186546325684, "rewards/margins": 48.618751525878906, "rewards/rejected": -51.32500076293945, "step": 320 }, { "epoch": 0.2335189509610203, "grad_norm": 1.4478468894958496, "learning_rate": 9.47087105029592e-06, "logits/chosen": -1.8263671398162842, "logits/rejected": -1.1667969226837158, "logps/chosen": -1.3591797351837158, "logps/rejected": -25.931249618530273, "loss": 1.355322265625, "nll_loss": 1.3556640148162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.7183594703674316, "rewards/margins": 49.17499923706055, "rewards/rejected": -51.86249923706055, "step": 325 }, { "epoch": 0.23711155020657446, "grad_norm": 1.379584550857544, "learning_rate": 9.442433823576741e-06, "logits/chosen": -1.768164038658142, "logits/rejected": -1.1067383289337158, "logps/chosen": -1.333984375, "logps/rejected": -25.903125762939453, "loss": 1.3345703125, "nll_loss": 1.3347656726837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.66796875, "rewards/margins": 49.13750076293945, "rewards/rejected": -51.806251525878906, "step": 330 }, { "epoch": 0.24070414945212862, "grad_norm": 1.4488558769226074, "learning_rate": 9.413297324490736e-06, "logits/chosen": -1.6134765148162842, "logits/rejected": -0.8900390863418579, "logps/chosen": -1.3361327648162842, "logps/rejected": -25.981250762939453, "loss": 1.33505859375, "nll_loss": 1.3349609375, "rewards/accuracies": 1.0, "rewards/chosen": -2.6722655296325684, "rewards/margins": 49.26874923706055, "rewards/rejected": -51.962501525878906, "step": 335 }, { "epoch": 0.24429674869768278, "grad_norm": 1.3693856000900269, "learning_rate": 9.38346613934115e-06, "logits/chosen": -1.6583983898162842, "logits/rejected": -0.969042956829071, "logps/chosen": -1.3273437023162842, "logps/rejected": -25.806249618530273, "loss": 1.329931640625, "nll_loss": 1.3298828601837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.6546874046325684, "rewards/margins": 48.95000076293945, "rewards/rejected": -51.61249923706055, "step": 340 }, { "epoch": 0.24788934794323694, "grad_norm": 1.406905174255371, "learning_rate": 9.352944963780024e-06, "logits/chosen": -1.476953148841858, "logits/rejected": -0.679394543170929, "logps/chosen": -1.3388671875, "logps/rejected": -26.012500762939453, "loss": 1.34248046875, "nll_loss": 1.3425781726837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.677734375, "rewards/margins": 49.34375, "rewards/rejected": -52.025001525878906, "step": 345 }, { "epoch": 0.2514819471887911, "grad_norm": 1.4613367319107056, "learning_rate": 9.321738602069057e-06, "logits/chosen": -1.4833984375, "logits/rejected": -0.7633301019668579, "logps/chosen": -1.321874976158142, "logps/rejected": -26.181249618530273, "loss": 1.32119140625, "nll_loss": 1.320703148841858, "rewards/accuracies": 1.0, "rewards/chosen": -2.643749952316284, "rewards/margins": 49.70624923706055, "rewards/rejected": -52.36249923706055, "step": 350 }, { "epoch": 0.25507454643434524, "grad_norm": 1.504093885421753, "learning_rate": 9.289851966323382e-06, "logits/chosen": -1.4500000476837158, "logits/rejected": -0.7582031488418579, "logps/chosen": -1.3220703601837158, "logps/rejected": -25.159374237060547, "loss": 1.3218017578125, "nll_loss": 1.321874976158142, "rewards/accuracies": 1.0, "rewards/chosen": -2.6441407203674316, "rewards/margins": 47.70000076293945, "rewards/rejected": -50.318748474121094, "step": 355 }, { "epoch": 0.2586671456798994, "grad_norm": 1.4803372621536255, "learning_rate": 9.257290075738365e-06, "logits/chosen": -1.639062523841858, "logits/rejected": -0.955078125, "logps/chosen": -1.3292968273162842, "logps/rejected": -25.696874618530273, "loss": 1.3293701171875, "nll_loss": 1.3292968273162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.6585936546325684, "rewards/margins": 48.75, "rewards/rejected": -51.39374923706055, "step": 360 }, { "epoch": 0.26225974492545356, "grad_norm": 1.4134585857391357, "learning_rate": 9.22405805579954e-06, "logits/chosen": -1.8039062023162842, "logits/rejected": -1.179296851158142, "logps/chosen": -1.3259766101837158, "logps/rejected": -25.478124618530273, "loss": 1.3216796875, "nll_loss": 1.3212890625, "rewards/accuracies": 1.0, "rewards/chosen": -2.6519532203674316, "rewards/margins": 48.33124923706055, "rewards/rejected": -50.95624923706055, "step": 365 }, { "epoch": 0.2658523441710077, "grad_norm": 1.4043936729431152, "learning_rate": 9.190161137475814e-06, "logits/chosen": -1.8455078601837158, "logits/rejected": -1.253515601158142, "logps/chosen": -1.3195312023162842, "logps/rejected": -25.584375381469727, "loss": 1.322607421875, "nll_loss": 1.322656273841858, "rewards/accuracies": 1.0, "rewards/chosen": -2.6390624046325684, "rewards/margins": 48.54375076293945, "rewards/rejected": -51.16875076293945, "step": 370 }, { "epoch": 0.2694449434165619, "grad_norm": 1.393306016921997, "learning_rate": 9.1556046563961e-06, "logits/chosen": -1.8347656726837158, "logits/rejected": -1.2158203125, "logps/chosen": -1.3279297351837158, "logps/rejected": -25.503124237060547, "loss": 1.3258544921875, "nll_loss": 1.326562523841858, "rewards/accuracies": 1.0, "rewards/chosen": -2.6558594703674316, "rewards/margins": 48.337501525878906, "rewards/rejected": -51.006248474121094, "step": 375 }, { "epoch": 0.27303754266211605, "grad_norm": 1.4320181608200073, "learning_rate": 9.120394052009412e-06, "logits/chosen": -1.757421851158142, "logits/rejected": -1.2019531726837158, "logps/chosen": -1.34375, "logps/rejected": -25.665624618530273, "loss": 1.349951171875, "nll_loss": 1.3498046398162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.6875, "rewards/margins": 48.64374923706055, "rewards/rejected": -51.33124923706055, "step": 380 }, { "epoch": 0.2766301419076702, "grad_norm": 1.3351263999938965, "learning_rate": 9.084534866728683e-06, "logits/chosen": -1.7912108898162842, "logits/rejected": -1.316015601158142, "logps/chosen": -1.323828101158142, "logps/rejected": -24.731250762939453, "loss": 1.3206298828125, "nll_loss": 1.3205077648162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.647656202316284, "rewards/margins": 46.837501525878906, "rewards/rejected": -49.462501525878906, "step": 385 }, { "epoch": 0.2802227411532244, "grad_norm": 1.3897342681884766, "learning_rate": 9.048032745058335e-06, "logits/chosen": -1.9580078125, "logits/rejected": -1.4689452648162842, "logps/chosen": -1.300390601158142, "logps/rejected": -25.350000381469727, "loss": 1.300146484375, "nll_loss": 1.300195336341858, "rewards/accuracies": 1.0, "rewards/chosen": -2.600781202316284, "rewards/margins": 48.099998474121094, "rewards/rejected": -50.70000076293945, "step": 390 }, { "epoch": 0.28381534039877854, "grad_norm": 1.3743433952331543, "learning_rate": 9.010893432705796e-06, "logits/chosen": -2.001953125, "logits/rejected": -1.5050780773162842, "logps/chosen": -1.328515648841858, "logps/rejected": -25.475000381469727, "loss": 1.327978515625, "nll_loss": 1.327734351158142, "rewards/accuracies": 1.0, "rewards/chosen": -2.657031297683716, "rewards/margins": 48.29999923706055, "rewards/rejected": -50.95000076293945, "step": 395 }, { "epoch": 0.2874079396443327, "grad_norm": 1.4429858922958374, "learning_rate": 8.973122775677078e-06, "logits/chosen": -1.842382788658142, "logits/rejected": -1.389062523841858, "logps/chosen": -1.3234374523162842, "logps/rejected": -25.459375381469727, "loss": 1.3240478515625, "nll_loss": 1.3234374523162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.6468749046325684, "rewards/margins": 48.29375076293945, "rewards/rejected": -50.91875076293945, "step": 400 }, { "epoch": 0.2874079396443327, "eval_logits/chosen": -1.6806554794311523, "eval_logits/rejected": -1.1412818431854248, "eval_logps/chosen": -1.320174217224121, "eval_logps/rejected": -25.540929794311523, "eval_loss": 1.3220659494400024, "eval_nll_loss": 1.3214877843856812, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -2.640348434448242, "eval_rewards/margins": 48.44247817993164, "eval_rewards/rejected": -51.08185958862305, "eval_runtime": 10.8056, "eval_samples_per_second": 83.29, "eval_steps_per_second": 10.458, "step": 400 }, { "epoch": 0.2910005388898868, "grad_norm": 1.3683401346206665, "learning_rate": 8.934726719356582e-06, "logits/chosen": -1.8271484375, "logits/rejected": -1.352148413658142, "logps/chosen": -1.3205077648162842, "logps/rejected": -26.159374237060547, "loss": 1.3176025390625, "nll_loss": 1.3173828125, "rewards/accuracies": 1.0, "rewards/chosen": -2.6410155296325684, "rewards/margins": 49.681251525878906, "rewards/rejected": -52.318748474121094, "step": 405 }, { "epoch": 0.29459313813544097, "grad_norm": 1.3627511262893677, "learning_rate": 8.895711307571235e-06, "logits/chosen": -1.7931640148162842, "logits/rejected": -1.351953148841858, "logps/chosen": -1.33984375, "logps/rejected": -25.140625, "loss": 1.3393798828125, "nll_loss": 1.33984375, "rewards/accuracies": 1.0, "rewards/chosen": -2.6796875, "rewards/margins": 47.59375, "rewards/rejected": -50.28125, "step": 410 }, { "epoch": 0.29818573738099513, "grad_norm": 1.3673158884048462, "learning_rate": 8.856082681639158e-06, "logits/chosen": -1.7703125476837158, "logits/rejected": -1.2882812023162842, "logps/chosen": -1.300390601158142, "logps/rejected": -25.515625, "loss": 1.303662109375, "nll_loss": 1.3035156726837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.600781202316284, "rewards/margins": 48.42499923706055, "rewards/rejected": -51.03125, "step": 415 }, { "epoch": 0.3017783366265493, "grad_norm": 1.3663487434387207, "learning_rate": 8.815847079402972e-06, "logits/chosen": -1.7468750476837158, "logits/rejected": -1.295312523841858, "logps/chosen": -1.3419921398162842, "logps/rejected": -25.240625381469727, "loss": 1.34345703125, "nll_loss": 1.3427734375, "rewards/accuracies": 1.0, "rewards/chosen": -2.6839842796325684, "rewards/margins": 47.8125, "rewards/rejected": -50.48125076293945, "step": 420 }, { "epoch": 0.30537093587210346, "grad_norm": 1.3990769386291504, "learning_rate": 8.77501083424792e-06, "logits/chosen": -1.883398413658142, "logits/rejected": -1.515625, "logps/chosen": -1.3359375, "logps/rejected": -24.953125, "loss": 1.33154296875, "nll_loss": 1.331445336341858, "rewards/accuracies": 1.0, "rewards/chosen": -2.671875, "rewards/margins": 47.23749923706055, "rewards/rejected": -49.90625, "step": 425 }, { "epoch": 0.3089635351176576, "grad_norm": 1.4307533502578735, "learning_rate": 8.733580374104936e-06, "logits/chosen": -1.919335961341858, "logits/rejected": -1.54296875, "logps/chosen": -1.3332030773162842, "logps/rejected": -25.087499618530273, "loss": 1.3327880859375, "nll_loss": 1.3322265148162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.6664061546325684, "rewards/margins": 47.51874923706055, "rewards/rejected": -50.17499923706055, "step": 430 }, { "epoch": 0.3125561343632118, "grad_norm": 1.3801665306091309, "learning_rate": 8.691562220438845e-06, "logits/chosen": -2.040234327316284, "logits/rejected": -1.6427733898162842, "logps/chosen": -1.317773461341858, "logps/rejected": -25.253124237060547, "loss": 1.3204345703125, "nll_loss": 1.320898413658142, "rewards/accuracies": 1.0, "rewards/chosen": -2.635546922683716, "rewards/margins": 47.881248474121094, "rewards/rejected": -50.506248474121094, "step": 435 }, { "epoch": 0.31614873360876594, "grad_norm": 1.367019772529602, "learning_rate": 8.648962987221837e-06, "logits/chosen": -2.056640625, "logits/rejected": -1.773828148841858, "logps/chosen": -1.338476538658142, "logps/rejected": -25.765625, "loss": 1.3358642578125, "nll_loss": 1.3351562023162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.676953077316284, "rewards/margins": 48.818748474121094, "rewards/rejected": -51.53125, "step": 440 }, { "epoch": 0.3197413328543201, "grad_norm": 1.303930640220642, "learning_rate": 8.605789379892378e-06, "logits/chosen": -2.1685547828674316, "logits/rejected": -1.9763672351837158, "logps/chosen": -1.318359375, "logps/rejected": -25.115625381469727, "loss": 1.316162109375, "nll_loss": 1.3162109851837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.63671875, "rewards/margins": 47.568748474121094, "rewards/rejected": -50.23125076293945, "step": 445 }, { "epoch": 0.32333393209987427, "grad_norm": 1.359526515007019, "learning_rate": 8.562048194299719e-06, "logits/chosen": -2.1402344703674316, "logits/rejected": -1.923828125, "logps/chosen": -1.3074219226837158, "logps/rejected": -24.325000762939453, "loss": 1.3082763671875, "nll_loss": 1.3087890148162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.6148438453674316, "rewards/margins": 46.01250076293945, "rewards/rejected": -48.650001525878906, "step": 450 }, { "epoch": 0.32692653134542843, "grad_norm": 1.3633463382720947, "learning_rate": 8.517746315634186e-06, "logits/chosen": -2.1171875, "logits/rejected": -1.8513672351837158, "logps/chosen": -1.3214843273162842, "logps/rejected": -25.231250762939453, "loss": 1.3213623046875, "nll_loss": 1.320898413658142, "rewards/accuracies": 1.0, "rewards/chosen": -2.6429686546325684, "rewards/margins": 47.837501525878906, "rewards/rejected": -50.462501525878906, "step": 455 }, { "epoch": 0.3305191305909826, "grad_norm": 1.4858362674713135, "learning_rate": 8.472890717343391e-06, "logits/chosen": -2.1703124046325684, "logits/rejected": -1.986328125, "logps/chosen": -1.3107421398162842, "logps/rejected": -25.128124237060547, "loss": 1.3078369140625, "nll_loss": 1.3074219226837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.6214842796325684, "rewards/margins": 47.650001525878906, "rewards/rejected": -50.256248474121094, "step": 460 }, { "epoch": 0.33411172983653675, "grad_norm": 1.299914002418518, "learning_rate": 8.427488460034567e-06, "logits/chosen": -2.121289014816284, "logits/rejected": -1.9025390148162842, "logps/chosen": -1.3078124523162842, "logps/rejected": -24.918750762939453, "loss": 1.3064208984375, "nll_loss": 1.306249976158142, "rewards/accuracies": 1.0, "rewards/chosen": -2.6156249046325684, "rewards/margins": 47.23125076293945, "rewards/rejected": -49.837501525878906, "step": 465 }, { "epoch": 0.3377043290820909, "grad_norm": 1.4685298204421997, "learning_rate": 8.381546690363174e-06, "logits/chosen": -2.049609422683716, "logits/rejected": -1.8035156726837158, "logps/chosen": -1.328710913658142, "logps/rejected": -24.306249618530273, "loss": 1.3246337890625, "nll_loss": 1.32421875, "rewards/accuracies": 1.0, "rewards/chosen": -2.657421827316284, "rewards/margins": 45.95624923706055, "rewards/rejected": -48.61249923706055, "step": 470 }, { "epoch": 0.3412969283276451, "grad_norm": 1.474596381187439, "learning_rate": 8.335072639907953e-06, "logits/chosen": -2.0572266578674316, "logits/rejected": -1.8097655773162842, "logps/chosen": -1.3097655773162842, "logps/rejected": -24.284374237060547, "loss": 1.3056640625, "nll_loss": 1.3054687976837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.6195311546325684, "rewards/margins": 45.98125076293945, "rewards/rejected": -48.568748474121094, "step": 475 }, { "epoch": 0.3448895275731992, "grad_norm": 1.4318324327468872, "learning_rate": 8.288073624032634e-06, "logits/chosen": -2.0667967796325684, "logits/rejected": -1.8214843273162842, "logps/chosen": -1.302343726158142, "logps/rejected": -23.834375381469727, "loss": 1.2990234375, "nll_loss": 1.2990233898162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.604687452316284, "rewards/margins": 45.087501525878906, "rewards/rejected": -47.66875076293945, "step": 480 }, { "epoch": 0.34848212681875335, "grad_norm": 1.429807424545288, "learning_rate": 8.240557040734434e-06, "logits/chosen": -1.9724609851837158, "logits/rejected": -1.695898413658142, "logps/chosen": -1.296875, "logps/rejected": -24.371875762939453, "loss": 1.2963134765625, "nll_loss": 1.296875, "rewards/accuracies": 1.0, "rewards/chosen": -2.59375, "rewards/margins": 46.14374923706055, "rewards/rejected": -48.743751525878906, "step": 485 }, { "epoch": 0.3520747260643075, "grad_norm": 1.4391751289367676, "learning_rate": 8.192530369479562e-06, "logits/chosen": -1.983984351158142, "logits/rejected": -1.6808593273162842, "logps/chosen": -1.321679711341858, "logps/rejected": -24.159374237060547, "loss": 1.315673828125, "nll_loss": 1.3152344226837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.643359422683716, "rewards/margins": 45.67499923706055, "rewards/rejected": -48.318748474121094, "step": 490 }, { "epoch": 0.35566732530986167, "grad_norm": 1.318945288658142, "learning_rate": 8.144001170025894e-06, "logits/chosen": -1.8820312023162842, "logits/rejected": -1.5671875476837158, "logps/chosen": -1.337499976158142, "logps/rejected": -23.293750762939453, "loss": 1.3283203125, "nll_loss": 1.3283202648162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.674999952316284, "rewards/margins": 43.931251525878906, "rewards/rejected": -46.587501525878906, "step": 495 }, { "epoch": 0.35925992455541583, "grad_norm": 1.341042399406433, "learning_rate": 8.094977081233006e-06, "logits/chosen": -1.8185546398162842, "logits/rejected": -1.5197265148162842, "logps/chosen": -1.3078124523162842, "logps/rejected": -24.037500381469727, "loss": 1.30478515625, "nll_loss": 1.3046875, "rewards/accuracies": 1.0, "rewards/chosen": -2.6156249046325684, "rewards/margins": 45.45624923706055, "rewards/rejected": -48.07500076293945, "step": 500 }, { "epoch": 0.36285252380097, "grad_norm": 1.449488639831543, "learning_rate": 8.045465819859766e-06, "logits/chosen": -1.7742187976837158, "logits/rejected": -1.476953148841858, "logps/chosen": -1.3240234851837158, "logps/rejected": -23.128124237060547, "loss": 1.32294921875, "nll_loss": 1.3234374523162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.6480469703674316, "rewards/margins": 43.587501525878906, "rewards/rejected": -46.256248474121094, "step": 505 }, { "epoch": 0.36644512304652416, "grad_norm": 1.3669012784957886, "learning_rate": 7.995475179349657e-06, "logits/chosen": -1.7960937023162842, "logits/rejected": -1.439843773841858, "logps/chosen": -1.3117187023162842, "logps/rejected": -23.606250762939453, "loss": 1.310302734375, "nll_loss": 1.310546875, "rewards/accuracies": 1.0, "rewards/chosen": -2.6234374046325684, "rewards/margins": 44.587501525878906, "rewards/rejected": -47.212501525878906, "step": 510 }, { "epoch": 0.3700377222920783, "grad_norm": 1.367836833000183, "learning_rate": 7.945013028604026e-06, "logits/chosen": -1.6884765625, "logits/rejected": -1.352929711341858, "logps/chosen": -1.323828101158142, "logps/rejected": -23.725000381469727, "loss": 1.320263671875, "nll_loss": 1.3201172351837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.647656202316284, "rewards/margins": 44.79375076293945, "rewards/rejected": -47.45000076293945, "step": 515 }, { "epoch": 0.3736303215376325, "grad_norm": 1.4297999143600464, "learning_rate": 7.894087310743468e-06, "logits/chosen": -1.7175781726837158, "logits/rejected": -1.3517577648162842, "logps/chosen": -1.3103516101837158, "logps/rejected": -23.831249237060547, "loss": 1.3116943359375, "nll_loss": 1.3117187023162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.6207032203674316, "rewards/margins": 45.025001525878906, "rewards/rejected": -47.662498474121094, "step": 520 }, { "epoch": 0.37722292078318664, "grad_norm": 1.2659492492675781, "learning_rate": 7.842706041857512e-06, "logits/chosen": -1.7373046875, "logits/rejected": -1.3507812023162842, "logps/chosen": -1.301367163658142, "logps/rejected": -23.96875, "loss": 1.2968994140625, "nll_loss": 1.296484351158142, "rewards/accuracies": 1.0, "rewards/chosen": -2.602734327316284, "rewards/margins": 45.32500076293945, "rewards/rejected": -47.9375, "step": 525 }, { "epoch": 0.3808155200287408, "grad_norm": 1.3162612915039062, "learning_rate": 7.790877309742833e-06, "logits/chosen": -1.7541015148162842, "logits/rejected": -1.385156273841858, "logps/chosen": -1.298242211341858, "logps/rejected": -24.165624618530273, "loss": 1.3, "nll_loss": 1.2999999523162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.596484422683716, "rewards/margins": 45.73749923706055, "rewards/rejected": -48.33124923706055, "step": 530 }, { "epoch": 0.38440811927429497, "grad_norm": 1.3984168767929077, "learning_rate": 7.73860927263017e-06, "logits/chosen": -1.751367211341858, "logits/rejected": -1.345703125, "logps/chosen": -1.2990233898162842, "logps/rejected": -24.496875762939453, "loss": 1.2975830078125, "nll_loss": 1.298437476158142, "rewards/accuracies": 1.0, "rewards/chosen": -2.5980467796325684, "rewards/margins": 46.41875076293945, "rewards/rejected": -48.993751525878906, "step": 535 }, { "epoch": 0.38800071851984913, "grad_norm": 1.3745523691177368, "learning_rate": 7.685910157900158e-06, "logits/chosen": -1.7451171875, "logits/rejected": -1.4044921398162842, "logps/chosen": -1.2980468273162842, "logps/rejected": -24.0, "loss": 1.2944580078125, "nll_loss": 1.294531226158142, "rewards/accuracies": 1.0, "rewards/chosen": -2.5960936546325684, "rewards/margins": 45.400001525878906, "rewards/rejected": -48.0, "step": 540 }, { "epoch": 0.3915933177654033, "grad_norm": 1.2834235429763794, "learning_rate": 7.632788260788285e-06, "logits/chosen": -1.7292969226837158, "logits/rejected": -1.339257836341858, "logps/chosen": -1.3103516101837158, "logps/rejected": -23.706249237060547, "loss": 1.30927734375, "nll_loss": 1.30859375, "rewards/accuracies": 1.0, "rewards/chosen": -2.6207032203674316, "rewards/margins": 44.82500076293945, "rewards/rejected": -47.412498474121094, "step": 545 }, { "epoch": 0.39518591701095745, "grad_norm": 1.4184428453445435, "learning_rate": 7.579251943079145e-06, "logits/chosen": -1.7482421398162842, "logits/rejected": -1.367773413658142, "logps/chosen": -1.302734375, "logps/rejected": -23.865625381469727, "loss": 1.3060791015625, "nll_loss": 1.306054711341858, "rewards/accuracies": 1.0, "rewards/chosen": -2.60546875, "rewards/margins": 45.162498474121094, "rewards/rejected": -47.73125076293945, "step": 550 }, { "epoch": 0.39877851625651156, "grad_norm": 1.274613618850708, "learning_rate": 7.525309631790244e-06, "logits/chosen": -1.5695312023162842, "logits/rejected": -1.064843773841858, "logps/chosen": -1.272851586341858, "logps/rejected": -23.746875762939453, "loss": 1.2704345703125, "nll_loss": 1.270117163658142, "rewards/accuracies": 1.0, "rewards/chosen": -2.545703172683716, "rewards/margins": 44.931251525878906, "rewards/rejected": -47.493751525878906, "step": 555 }, { "epoch": 0.4023711155020657, "grad_norm": 1.360320806503296, "learning_rate": 7.470969817845518e-06, "logits/chosen": -1.513671875, "logits/rejected": -1.148535132408142, "logps/chosen": -1.3123047351837158, "logps/rejected": -23.356250762939453, "loss": 1.3122802734375, "nll_loss": 1.311914086341858, "rewards/accuracies": 1.0, "rewards/chosen": -2.6246094703674316, "rewards/margins": 44.068748474121094, "rewards/rejected": -46.712501525878906, "step": 560 }, { "epoch": 0.4059637147476199, "grad_norm": 1.3916856050491333, "learning_rate": 7.416241054738801e-06, "logits/chosen": -1.3459961414337158, "logits/rejected": -0.8058105707168579, "logps/chosen": -1.291406273841858, "logps/rejected": -23.453125, "loss": 1.2943115234375, "nll_loss": 1.2941405773162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.582812547683716, "rewards/margins": 44.349998474121094, "rewards/rejected": -46.90625, "step": 565 }, { "epoch": 0.40955631399317405, "grad_norm": 1.3848360776901245, "learning_rate": 7.361131957187435e-06, "logits/chosen": -1.4708983898162842, "logits/rejected": -1.0579102039337158, "logps/chosen": -1.3025391101837158, "logps/rejected": -23.649999618530273, "loss": 1.2989990234375, "nll_loss": 1.299414038658142, "rewards/accuracies": 1.0, "rewards/chosen": -2.6050782203674316, "rewards/margins": 44.6875, "rewards/rejected": -47.29999923706055, "step": 570 }, { "epoch": 0.4131489132387282, "grad_norm": 1.3308842182159424, "learning_rate": 7.305651199776258e-06, "logits/chosen": -1.4827148914337158, "logits/rejected": -1.053613305091858, "logps/chosen": -1.3203125, "logps/rejected": -23.6875, "loss": 1.3232666015625, "nll_loss": 1.322851538658142, "rewards/accuracies": 1.0, "rewards/chosen": -2.640625, "rewards/margins": 44.743751525878906, "rewards/rejected": -47.375, "step": 575 }, { "epoch": 0.4167415124842824, "grad_norm": 1.4075103998184204, "learning_rate": 7.249807515592149e-06, "logits/chosen": -1.7177734375, "logits/rejected": -1.362695336341858, "logps/chosen": -1.298242211341858, "logps/rejected": -23.03125, "loss": 1.2957763671875, "nll_loss": 1.295312523841858, "rewards/accuracies": 1.0, "rewards/chosen": -2.596484422683716, "rewards/margins": 43.45624923706055, "rewards/rejected": -46.0625, "step": 580 }, { "epoch": 0.42033411172983653, "grad_norm": 1.3655961751937866, "learning_rate": 7.193609694849385e-06, "logits/chosen": -1.599023461341858, "logits/rejected": -1.16357421875, "logps/chosen": -1.332617163658142, "logps/rejected": -23.621875762939453, "loss": 1.3315185546875, "nll_loss": 1.33203125, "rewards/accuracies": 1.0, "rewards/chosen": -2.665234327316284, "rewards/margins": 44.54999923706055, "rewards/rejected": -47.243751525878906, "step": 585 }, { "epoch": 0.4239267109753907, "grad_norm": 1.35598886013031, "learning_rate": 7.137066583505987e-06, "logits/chosen": -1.426171898841858, "logits/rejected": -0.9120117425918579, "logps/chosen": -1.298437476158142, "logps/rejected": -23.600000381469727, "loss": 1.299609375, "nll_loss": 1.2990233898162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.596874952316284, "rewards/margins": 44.618751525878906, "rewards/rejected": -47.20000076293945, "step": 590 }, { "epoch": 0.42751931022094486, "grad_norm": 1.3101612329483032, "learning_rate": 7.080187081871307e-06, "logits/chosen": -1.4705078601837158, "logits/rejected": -0.941210925579071, "logps/chosen": -1.2958984375, "logps/rejected": -24.024999618530273, "loss": 1.2966796875, "nll_loss": 1.296875, "rewards/accuracies": 1.0, "rewards/chosen": -2.591796875, "rewards/margins": 45.45000076293945, "rewards/rejected": -48.04999923706055, "step": 595 }, { "epoch": 0.431111909466499, "grad_norm": 1.3423857688903809, "learning_rate": 7.022980143205046e-06, "logits/chosen": -1.289453148841858, "logits/rejected": -0.7322753667831421, "logps/chosen": -1.314453125, "logps/rejected": -23.478124618530273, "loss": 1.309228515625, "nll_loss": 1.309179663658142, "rewards/accuracies": 1.0, "rewards/chosen": -2.62890625, "rewards/margins": 44.34375, "rewards/rejected": -46.95624923706055, "step": 600 }, { "epoch": 0.431111909466499, "eval_logits/chosen": -1.5194966793060303, "eval_logits/rejected": -1.0023161172866821, "eval_logps/chosen": -1.2943168878555298, "eval_logps/rejected": -23.81637191772461, "eval_loss": 1.2964930534362793, "eval_nll_loss": 1.295423150062561, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -2.5886337757110596, "eval_rewards/margins": 45.035396575927734, "eval_rewards/rejected": -47.63274383544922, "eval_runtime": 10.8217, "eval_samples_per_second": 83.166, "eval_steps_per_second": 10.442, "step": 600 }, { "epoch": 0.4347045087120532, "grad_norm": 1.2765231132507324, "learning_rate": 6.965454772307948e-06, "logits/chosen": -1.4140625, "logits/rejected": -0.925341784954071, "logps/chosen": -1.322265625, "logps/rejected": -23.350000381469727, "loss": 1.322607421875, "nll_loss": 1.322851538658142, "rewards/accuracies": 1.0, "rewards/chosen": -2.64453125, "rewards/margins": 44.04375076293945, "rewards/rejected": -46.70000076293945, "step": 605 }, { "epoch": 0.43829710795760735, "grad_norm": 1.3667538166046143, "learning_rate": 6.907620024104377e-06, "logits/chosen": -1.4381835460662842, "logits/rejected": -0.9351562261581421, "logps/chosen": -1.318750023841858, "logps/rejected": -23.524999618530273, "loss": 1.308984375, "nll_loss": 1.309179663658142, "rewards/accuracies": 1.0, "rewards/chosen": -2.637500047683716, "rewards/margins": 44.40625, "rewards/rejected": -47.04999923706055, "step": 610 }, { "epoch": 0.4418897072031615, "grad_norm": 1.436664342880249, "learning_rate": 6.849485002216999e-06, "logits/chosen": -1.5244140625, "logits/rejected": -1.085790991783142, "logps/chosen": -1.310546875, "logps/rejected": -23.256250381469727, "loss": 1.3148193359375, "nll_loss": 1.314843773841858, "rewards/accuracies": 1.0, "rewards/chosen": -2.62109375, "rewards/margins": 43.912498474121094, "rewards/rejected": -46.51250076293945, "step": 615 }, { "epoch": 0.44548230644871567, "grad_norm": 1.3317304849624634, "learning_rate": 6.791058857533814e-06, "logits/chosen": -1.5632812976837158, "logits/rejected": -1.1470215320587158, "logps/chosen": -1.3005859851837158, "logps/rejected": -23.618749618530273, "loss": 1.29970703125, "nll_loss": 1.2996094226837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.6011719703674316, "rewards/margins": 44.64374923706055, "rewards/rejected": -47.23749923706055, "step": 620 }, { "epoch": 0.4490749056942698, "grad_norm": 1.2532631158828735, "learning_rate": 6.732350786767726e-06, "logits/chosen": -1.6130859851837158, "logits/rejected": -1.1335937976837158, "logps/chosen": -1.301171898841858, "logps/rejected": -23.109375, "loss": 1.29619140625, "nll_loss": 1.2957031726837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.602343797683716, "rewards/margins": 43.618751525878906, "rewards/rejected": -46.21875, "step": 625 }, { "epoch": 0.45266750493982394, "grad_norm": 1.250854730606079, "learning_rate": 6.673370031008919e-06, "logits/chosen": -1.5654296875, "logits/rejected": -1.1521484851837158, "logps/chosen": -1.282812476158142, "logps/rejected": -23.0, "loss": 1.276611328125, "nll_loss": 1.276757836341858, "rewards/accuracies": 1.0, "rewards/chosen": -2.565624952316284, "rewards/margins": 43.431251525878906, "rewards/rejected": -46.0, "step": 630 }, { "epoch": 0.4562601041853781, "grad_norm": 1.2643660306930542, "learning_rate": 6.614125874270235e-06, "logits/chosen": -1.543554663658142, "logits/rejected": -1.14111328125, "logps/chosen": -1.2732422351837158, "logps/rejected": -23.575000762939453, "loss": 1.2722412109375, "nll_loss": 1.271875023841858, "rewards/accuracies": 1.0, "rewards/chosen": -2.5464844703674316, "rewards/margins": 44.61249923706055, "rewards/rejected": -47.150001525878906, "step": 635 }, { "epoch": 0.45985270343093226, "grad_norm": 1.3363035917282104, "learning_rate": 6.554627642025807e-06, "logits/chosen": -1.471093773841858, "logits/rejected": -1.066259741783142, "logps/chosen": -1.307226538658142, "logps/rejected": -22.903125762939453, "loss": 1.3047607421875, "nll_loss": 1.305273413658142, "rewards/accuracies": 1.0, "rewards/chosen": -2.614453077316284, "rewards/margins": 43.162498474121094, "rewards/rejected": -45.806251525878906, "step": 640 }, { "epoch": 0.4634453026764864, "grad_norm": 1.3363221883773804, "learning_rate": 6.4948846997431545e-06, "logits/chosen": -1.734765648841858, "logits/rejected": -1.4826171398162842, "logps/chosen": -1.297460913658142, "logps/rejected": -23.121875762939453, "loss": 1.2979248046875, "nll_loss": 1.2976562976837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.594921827316284, "rewards/margins": 43.63750076293945, "rewards/rejected": -46.243751525878906, "step": 645 }, { "epoch": 0.4670379019220406, "grad_norm": 1.337382197380066, "learning_rate": 6.434906451408991e-06, "logits/chosen": -1.6960937976837158, "logits/rejected": -1.3728516101837158, "logps/chosen": -1.2878906726837158, "logps/rejected": -23.159374237060547, "loss": 1.279833984375, "nll_loss": 1.279687523841858, "rewards/accuracies": 1.0, "rewards/chosen": -2.5757813453674316, "rewards/margins": 43.73125076293945, "rewards/rejected": -46.318748474121094, "step": 650 }, { "epoch": 0.47063050116759475, "grad_norm": 1.34009850025177, "learning_rate": 6.374702338048966e-06, "logits/chosen": -1.6873047351837158, "logits/rejected": -1.373046875, "logps/chosen": -1.283203125, "logps/rejected": -23.418750762939453, "loss": 1.2853271484375, "nll_loss": 1.2853515148162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.56640625, "rewards/margins": 44.275001525878906, "rewards/rejected": -46.837501525878906, "step": 655 }, { "epoch": 0.4742231004131489, "grad_norm": 1.3014594316482544, "learning_rate": 6.314281836241573e-06, "logits/chosen": -1.604101538658142, "logits/rejected": -1.0714843273162842, "logps/chosen": -1.2931640148162842, "logps/rejected": -24.359375, "loss": 1.2911376953125, "nll_loss": 1.2902343273162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.5863280296325684, "rewards/margins": 46.162498474121094, "rewards/rejected": -48.71875, "step": 660 }, { "epoch": 0.4778156996587031, "grad_norm": 1.285325050354004, "learning_rate": 6.253654456626475e-06, "logits/chosen": -1.655859351158142, "logits/rejected": -0.611621081829071, "logps/chosen": -1.2919921875, "logps/rejected": -26.634374618530273, "loss": 1.292138671875, "nll_loss": 1.292382836341858, "rewards/accuracies": 1.0, "rewards/chosen": -2.583984375, "rewards/margins": 50.67499923706055, "rewards/rejected": -53.26874923706055, "step": 665 }, { "epoch": 0.48140829890425724, "grad_norm": 1.3492448329925537, "learning_rate": 6.192829742407442e-06, "logits/chosen": -1.5896484851837158, "logits/rejected": -0.4659423828125, "logps/chosen": -1.2726562023162842, "logps/rejected": -27.024999618530273, "loss": 1.2752197265625, "nll_loss": 1.2742187976837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.5453124046325684, "rewards/margins": 51.51874923706055, "rewards/rejected": -54.04999923706055, "step": 670 }, { "epoch": 0.4850008981498114, "grad_norm": 1.3275717496871948, "learning_rate": 6.131817267850198e-06, "logits/chosen": -1.566796898841858, "logits/rejected": -0.5628417730331421, "logps/chosen": -1.284765601158142, "logps/rejected": -26.453125, "loss": 1.2881103515625, "nll_loss": 1.2878906726837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.569531202316284, "rewards/margins": 50.318748474121094, "rewards/rejected": -52.90625, "step": 675 }, { "epoch": 0.48859349739536556, "grad_norm": 1.2872709035873413, "learning_rate": 6.070626636775349e-06, "logits/chosen": -1.6726562976837158, "logits/rejected": -0.7708984613418579, "logps/chosen": -1.283593773841858, "logps/rejected": -26.090625762939453, "loss": 1.2883056640625, "nll_loss": 1.2880859375, "rewards/accuracies": 1.0, "rewards/chosen": -2.567187547683716, "rewards/margins": 49.59375, "rewards/rejected": -52.181251525878906, "step": 680 }, { "epoch": 0.4921860966409197, "grad_norm": 1.3302466869354248, "learning_rate": 6.009267481046667e-06, "logits/chosen": -1.6228516101837158, "logits/rejected": -0.7652832269668579, "logps/chosen": -1.323828101158142, "logps/rejected": -26.209375381469727, "loss": 1.323291015625, "nll_loss": 1.3232421875, "rewards/accuracies": 1.0, "rewards/chosen": -2.647656202316284, "rewards/margins": 49.75, "rewards/rejected": -52.41875076293945, "step": 685 }, { "epoch": 0.4957786958864739, "grad_norm": 1.463944911956787, "learning_rate": 5.947749459054972e-06, "logits/chosen": -1.705468773841858, "logits/rejected": -0.858593761920929, "logps/chosen": -1.297460913658142, "logps/rejected": -25.778125762939453, "loss": 1.2973876953125, "nll_loss": 1.2976562976837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.594921827316284, "rewards/margins": 48.974998474121094, "rewards/rejected": -51.556251525878906, "step": 690 }, { "epoch": 0.49937129513202805, "grad_norm": 1.4560619592666626, "learning_rate": 5.8860822541978225e-06, "logits/chosen": -1.78515625, "logits/rejected": -0.955859363079071, "logps/chosen": -1.2726562023162842, "logps/rejected": -25.315624237060547, "loss": 1.2758544921875, "nll_loss": 1.2755858898162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.5453124046325684, "rewards/margins": 48.068748474121094, "rewards/rejected": -50.631248474121094, "step": 695 }, { "epoch": 0.5029638943775822, "grad_norm": 1.322197437286377, "learning_rate": 5.824275573355278e-06, "logits/chosen": -1.71484375, "logits/rejected": -0.90673828125, "logps/chosen": -1.292578101158142, "logps/rejected": -25.475000381469727, "loss": 1.2906005859375, "nll_loss": 1.290429711341858, "rewards/accuracies": 1.0, "rewards/chosen": -2.585156202316284, "rewards/margins": 48.381248474121094, "rewards/rejected": -50.95000076293945, "step": 700 }, { "epoch": 0.5065564936231364, "grad_norm": 1.3345097303390503, "learning_rate": 5.762339145361962e-06, "logits/chosen": -1.8083984851837158, "logits/rejected": -1.069921851158142, "logps/chosen": -1.2880859375, "logps/rejected": -25.4375, "loss": 1.2857421875, "nll_loss": 1.2861328125, "rewards/accuracies": 1.0, "rewards/chosen": -2.576171875, "rewards/margins": 48.29375076293945, "rewards/rejected": -50.875, "step": 705 }, { "epoch": 0.5101490928686905, "grad_norm": 1.2403104305267334, "learning_rate": 5.700282719475672e-06, "logits/chosen": -1.841406226158142, "logits/rejected": -1.2326171398162842, "logps/chosen": -1.30859375, "logps/rejected": -25.575000762939453, "loss": 1.31083984375, "nll_loss": 1.310937523841858, "rewards/accuracies": 1.0, "rewards/chosen": -2.6171875, "rewards/margins": 48.51250076293945, "rewards/rejected": -51.150001525878906, "step": 710 }, { "epoch": 0.5137416921142447, "grad_norm": 1.2611744403839111, "learning_rate": 5.638116063842774e-06, "logits/chosen": -1.895117163658142, "logits/rejected": -1.269628882408142, "logps/chosen": -1.273828148841858, "logps/rejected": -25.350000381469727, "loss": 1.274365234375, "nll_loss": 1.273828148841858, "rewards/accuracies": 1.0, "rewards/chosen": -2.547656297683716, "rewards/margins": 48.131248474121094, "rewards/rejected": -50.70000076293945, "step": 715 }, { "epoch": 0.5173342913597988, "grad_norm": 1.2506487369537354, "learning_rate": 5.575848963960621e-06, "logits/chosen": -1.8371093273162842, "logits/rejected": -1.1985352039337158, "logps/chosen": -1.2791016101837158, "logps/rejected": -25.475000381469727, "loss": 1.27314453125, "nll_loss": 1.2726562023162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.5582032203674316, "rewards/margins": 48.400001525878906, "rewards/rejected": -50.95000076293945, "step": 720 }, { "epoch": 0.520926890605353, "grad_norm": 1.3620471954345703, "learning_rate": 5.513491221137244e-06, "logits/chosen": -1.7062499523162842, "logits/rejected": -1.007910132408142, "logps/chosen": -1.2703125476837158, "logps/rejected": -25.246875762939453, "loss": 1.2715576171875, "nll_loss": 1.271093726158142, "rewards/accuracies": 1.0, "rewards/chosen": -2.5406250953674316, "rewards/margins": 47.931251525878906, "rewards/rejected": -50.493751525878906, "step": 725 }, { "epoch": 0.5245194898509071, "grad_norm": 1.4125722646713257, "learning_rate": 5.451052650948549e-06, "logits/chosen": -1.7374999523162842, "logits/rejected": -1.010351538658142, "logps/chosen": -1.2765624523162842, "logps/rejected": -24.746875762939453, "loss": 1.2767578125, "nll_loss": 1.2763671875, "rewards/accuracies": 1.0, "rewards/chosen": -2.5531249046325684, "rewards/margins": 46.943748474121094, "rewards/rejected": -49.493751525878906, "step": 730 }, { "epoch": 0.5281120890964612, "grad_norm": 1.3194482326507568, "learning_rate": 5.388543081693281e-06, "logits/chosen": -1.6375000476837158, "logits/rejected": -0.8878418207168579, "logps/chosen": -1.3035156726837158, "logps/rejected": -25.084375381469727, "loss": 1.299755859375, "nll_loss": 1.2998046875, "rewards/accuracies": 1.0, "rewards/chosen": -2.6070313453674316, "rewards/margins": 47.54375076293945, "rewards/rejected": -50.16875076293945, "step": 735 }, { "epoch": 0.5317046883420155, "grad_norm": 1.251672387123108, "learning_rate": 5.325972352845965e-06, "logits/chosen": -1.660546898841858, "logits/rejected": -0.9129883050918579, "logps/chosen": -1.2833983898162842, "logps/rejected": -25.365625381469727, "loss": 1.2735107421875, "nll_loss": 1.2732422351837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.5667967796325684, "rewards/margins": 48.17499923706055, "rewards/rejected": -50.73125076293945, "step": 740 }, { "epoch": 0.5352972875875696, "grad_norm": 1.2692995071411133, "learning_rate": 5.263350313508105e-06, "logits/chosen": -1.7023437023162842, "logits/rejected": -0.9541015625, "logps/chosen": -1.299414038658142, "logps/rejected": -25.112499237060547, "loss": 1.2959716796875, "nll_loss": 1.2951171398162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.598828077316284, "rewards/margins": 47.631248474121094, "rewards/rejected": -50.224998474121094, "step": 745 }, { "epoch": 0.5388898868331238, "grad_norm": 1.1914738416671753, "learning_rate": 5.200686820857862e-06, "logits/chosen": -1.7878906726837158, "logits/rejected": -1.0377929210662842, "logps/chosen": -1.291406273841858, "logps/rejected": -25.896875381469727, "loss": 1.2941650390625, "nll_loss": 1.294335961341858, "rewards/accuracies": 1.0, "rewards/chosen": -2.582812547683716, "rewards/margins": 49.1875, "rewards/rejected": -51.79375076293945, "step": 750 }, { "epoch": 0.5424824860786779, "grad_norm": 1.3020259141921997, "learning_rate": 5.137991738598457e-06, "logits/chosen": -1.6437499523162842, "logits/rejected": -0.744921863079071, "logps/chosen": -1.2978515625, "logps/rejected": -26.478124618530273, "loss": 1.2934326171875, "nll_loss": 1.293359398841858, "rewards/accuracies": 1.0, "rewards/chosen": -2.595703125, "rewards/margins": 50.35625076293945, "rewards/rejected": -52.95624923706055, "step": 755 }, { "epoch": 0.5460750853242321, "grad_norm": 1.2489492893218994, "learning_rate": 5.075274935405554e-06, "logits/chosen": -1.657812476158142, "logits/rejected": -0.737988293170929, "logps/chosen": -1.283203125, "logps/rejected": -25.987499237060547, "loss": 1.2807373046875, "nll_loss": 1.2800781726837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.56640625, "rewards/margins": 49.4375, "rewards/rejected": -51.974998474121094, "step": 760 }, { "epoch": 0.5496676845697862, "grad_norm": 1.2859312295913696, "learning_rate": 5.012546283373853e-06, "logits/chosen": -1.719140648841858, "logits/rejected": -0.7794433832168579, "logps/chosen": -1.2625000476837158, "logps/rejected": -26.325000762939453, "loss": 1.256689453125, "nll_loss": 1.2570312023162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.5250000953674316, "rewards/margins": 50.13750076293945, "rewards/rejected": -52.650001525878906, "step": 765 }, { "epoch": 0.5532602838153404, "grad_norm": 1.2959156036376953, "learning_rate": 4.949815656463151e-06, "logits/chosen": -1.6435546875, "logits/rejected": -0.6708984375, "logps/chosen": -1.298437476158142, "logps/rejected": -26.240625381469727, "loss": 1.296044921875, "nll_loss": 1.295507788658142, "rewards/accuracies": 1.0, "rewards/chosen": -2.596874952316284, "rewards/margins": 49.89374923706055, "rewards/rejected": -52.48125076293945, "step": 770 }, { "epoch": 0.5568528830608945, "grad_norm": 1.2257376909255981, "learning_rate": 4.887092928944109e-06, "logits/chosen": -1.7810547351837158, "logits/rejected": -0.9029296636581421, "logps/chosen": -1.291015625, "logps/rejected": -26.418750762939453, "loss": 1.2862548828125, "nll_loss": 1.2863280773162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.58203125, "rewards/margins": 50.25, "rewards/rejected": -52.837501525878906, "step": 775 }, { "epoch": 0.5604454823064488, "grad_norm": 1.384114146232605, "learning_rate": 4.824387973843957e-06, "logits/chosen": -1.536523461341858, "logits/rejected": -0.5414062738418579, "logps/chosen": -1.2625000476837158, "logps/rejected": -26.871875762939453, "loss": 1.2651611328125, "nll_loss": 1.265234351158142, "rewards/accuracies": 1.0, "rewards/chosen": -2.5250000953674316, "rewards/margins": 51.23125076293945, "rewards/rejected": -53.743751525878906, "step": 780 }, { "epoch": 0.5640380815520029, "grad_norm": 1.2737990617752075, "learning_rate": 4.761710661392416e-06, "logits/chosen": -1.5812499523162842, "logits/rejected": -0.62646484375, "logps/chosen": -1.285742163658142, "logps/rejected": -26.290624618530273, "loss": 1.285498046875, "nll_loss": 1.285742163658142, "rewards/accuracies": 1.0, "rewards/chosen": -2.571484327316284, "rewards/margins": 50.037498474121094, "rewards/rejected": -52.58124923706055, "step": 785 }, { "epoch": 0.5676306807975571, "grad_norm": 1.2511963844299316, "learning_rate": 4.699070857468052e-06, "logits/chosen": -1.6640625, "logits/rejected": -0.764843761920929, "logps/chosen": -1.2849609851837158, "logps/rejected": -26.553125381469727, "loss": 1.2810546875, "nll_loss": 1.28125, "rewards/accuracies": 1.0, "rewards/chosen": -2.5699219703674316, "rewards/margins": 50.51250076293945, "rewards/rejected": -53.10625076293945, "step": 790 }, { "epoch": 0.5712232800431112, "grad_norm": 1.2975844144821167, "learning_rate": 4.636478422045302e-06, "logits/chosen": -1.7570312023162842, "logits/rejected": -0.8634277582168579, "logps/chosen": -1.302734375, "logps/rejected": -26.465625762939453, "loss": 1.297509765625, "nll_loss": 1.2978515625, "rewards/accuracies": 1.0, "rewards/chosen": -2.60546875, "rewards/margins": 50.32500076293945, "rewards/rejected": -52.931251525878906, "step": 795 }, { "epoch": 0.5748158792886654, "grad_norm": 1.2981692552566528, "learning_rate": 4.573943207642452e-06, "logits/chosen": -1.7734375, "logits/rejected": -0.840771496295929, "logps/chosen": -1.292382836341858, "logps/rejected": -26.674999237060547, "loss": 1.2906982421875, "nll_loss": 1.291601538658142, "rewards/accuracies": 1.0, "rewards/chosen": -2.584765672683716, "rewards/margins": 50.787498474121094, "rewards/rejected": -53.349998474121094, "step": 800 }, { "epoch": 0.5748158792886654, "eval_logits/chosen": -1.6875, "eval_logits/rejected": -0.7205302715301514, "eval_logps/chosen": -1.2761338949203491, "eval_logps/rejected": -26.57632827758789, "eval_loss": 1.27776038646698, "eval_nll_loss": 1.2772400379180908, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -2.5522677898406982, "eval_rewards/margins": 50.59955596923828, "eval_rewards/rejected": -53.15265655517578, "eval_runtime": 10.7748, "eval_samples_per_second": 83.528, "eval_steps_per_second": 10.487, "step": 800 }, { "epoch": 0.5784084785342195, "grad_norm": 1.255006194114685, "learning_rate": 4.5114750577707606e-06, "logits/chosen": -1.762304663658142, "logits/rejected": -0.811816394329071, "logps/chosen": -1.2724609375, "logps/rejected": -26.815624237060547, "loss": 1.2694091796875, "nll_loss": 1.268945336341858, "rewards/accuracies": 1.0, "rewards/chosen": -2.544921875, "rewards/margins": 51.07500076293945, "rewards/rejected": -53.631248474121094, "step": 805 }, { "epoch": 0.5820010777797736, "grad_norm": 1.1863782405853271, "learning_rate": 4.449083805385037e-06, "logits/chosen": -1.833593726158142, "logits/rejected": -0.904589831829071, "logps/chosen": -1.271484375, "logps/rejected": -26.590625762939453, "loss": 1.269287109375, "nll_loss": 1.269140601158142, "rewards/accuracies": 1.0, "rewards/chosen": -2.54296875, "rewards/margins": 50.631248474121094, "rewards/rejected": -53.181251525878906, "step": 810 }, { "epoch": 0.5855936770253278, "grad_norm": 1.3515548706054688, "learning_rate": 4.386779271335845e-06, "logits/chosen": -1.686132788658142, "logits/rejected": -0.7086426019668579, "logps/chosen": -1.2814452648162842, "logps/rejected": -26.75, "loss": 1.2802001953125, "nll_loss": 1.280859351158142, "rewards/accuracies": 1.0, "rewards/chosen": -2.5628905296325684, "rewards/margins": 50.912498474121094, "rewards/rejected": -53.5, "step": 815 }, { "epoch": 0.5891862762708819, "grad_norm": 1.2749079465866089, "learning_rate": 4.3245712628236356e-06, "logits/chosen": -1.656835913658142, "logits/rejected": -0.7159668207168579, "logps/chosen": -1.2839844226837158, "logps/rejected": -26.215625762939453, "loss": 1.2838623046875, "nll_loss": 1.2794921398162842, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.5679688453674316, "rewards/margins": 49.85625076293945, "rewards/rejected": -52.431251525878906, "step": 820 }, { "epoch": 0.5927788755164362, "grad_norm": 1.3244684934616089, "learning_rate": 4.26246957185501e-06, "logits/chosen": -1.477148413658142, "logits/rejected": -0.528271496295929, "logps/chosen": -1.2804687023162842, "logps/rejected": -26.524999618530273, "loss": 1.273583984375, "nll_loss": 1.272851586341858, "rewards/accuracies": 1.0, "rewards/chosen": -2.5609374046325684, "rewards/margins": 50.5, "rewards/rejected": -53.04999923706055, "step": 825 }, { "epoch": 0.5963714747619903, "grad_norm": 1.2788718938827515, "learning_rate": 4.200483973701401e-06, "logits/chosen": -1.5330078601837158, "logits/rejected": -0.5139983892440796, "logps/chosen": -1.246484398841858, "logps/rejected": -26.596874237060547, "loss": 1.24169921875, "nll_loss": 1.241601586341858, "rewards/accuracies": 1.0, "rewards/chosen": -2.492968797683716, "rewards/margins": 50.71875, "rewards/rejected": -53.193748474121094, "step": 830 }, { "epoch": 0.5999640740075445, "grad_norm": 1.267411708831787, "learning_rate": 4.1386242253603555e-06, "logits/chosen": -1.518164038658142, "logits/rejected": -0.5548095703125, "logps/chosen": -1.286718726158142, "logps/rejected": -26.490625381469727, "loss": 1.2797607421875, "nll_loss": 1.2791016101837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.573437452316284, "rewards/margins": 50.41875076293945, "rewards/rejected": -52.98125076293945, "step": 835 }, { "epoch": 0.6035566732530986, "grad_norm": 1.2807753086090088, "learning_rate": 4.076900064019721e-06, "logits/chosen": -1.4646484851837158, "logits/rejected": -0.4119018614292145, "logps/chosen": -1.2705078125, "logps/rejected": -27.087499618530273, "loss": 1.2683349609375, "nll_loss": 1.268164038658142, "rewards/accuracies": 1.0, "rewards/chosen": -2.541015625, "rewards/margins": 51.631248474121094, "rewards/rejected": -54.17499923706055, "step": 840 }, { "epoch": 0.6071492724986528, "grad_norm": 1.3430705070495605, "learning_rate": 4.015321205524935e-06, "logits/chosen": -1.392480492591858, "logits/rejected": -0.38892823457717896, "logps/chosen": -1.2960937023162842, "logps/rejected": -26.215625762939453, "loss": 1.28765869140625, "nll_loss": 1.287695288658142, "rewards/accuracies": 1.0, "rewards/chosen": -2.5921874046325684, "rewards/margins": 49.849998474121094, "rewards/rejected": -52.431251525878906, "step": 845 }, { "epoch": 0.6107418717442069, "grad_norm": 1.2146618366241455, "learning_rate": 3.953897342849673e-06, "logits/chosen": -1.5070312023162842, "logits/rejected": -0.532958984375, "logps/chosen": -1.287695288658142, "logps/rejected": -26.765625, "loss": 1.286669921875, "nll_loss": 1.2869141101837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.575390577316284, "rewards/margins": 50.931251525878906, "rewards/rejected": -53.53125, "step": 850 }, { "epoch": 0.6143344709897611, "grad_norm": 1.2116889953613281, "learning_rate": 3.892638144570116e-06, "logits/chosen": -1.4646484851837158, "logits/rejected": -0.3893798887729645, "logps/chosen": -1.2980468273162842, "logps/rejected": -26.446874618530273, "loss": 1.2904541015625, "nll_loss": 1.291015625, "rewards/accuracies": 1.0, "rewards/chosen": -2.5960936546325684, "rewards/margins": 50.287498474121094, "rewards/rejected": -52.89374923706055, "step": 855 }, { "epoch": 0.6179270702353152, "grad_norm": 1.3148047924041748, "learning_rate": 3.8315532533430285e-06, "logits/chosen": -1.502343773841858, "logits/rejected": -0.4493408203125, "logps/chosen": -1.265625, "logps/rejected": -26.765625, "loss": 1.2690185546875, "nll_loss": 1.268164038658142, "rewards/accuracies": 1.0, "rewards/chosen": -2.53125, "rewards/margins": 50.993751525878906, "rewards/rejected": -53.53125, "step": 860 }, { "epoch": 0.6215196694808695, "grad_norm": 1.217235803604126, "learning_rate": 3.7706522843879435e-06, "logits/chosen": -1.576171875, "logits/rejected": -0.5768798589706421, "logps/chosen": -1.286523461341858, "logps/rejected": -26.453125, "loss": 1.283740234375, "nll_loss": 1.283203125, "rewards/accuracies": 1.0, "rewards/chosen": -2.573046922683716, "rewards/margins": 50.318748474121094, "rewards/rejected": -52.90625, "step": 865 }, { "epoch": 0.6251122687264236, "grad_norm": 1.3339595794677734, "learning_rate": 3.709944823973647e-06, "logits/chosen": -1.677734375, "logits/rejected": -0.764208972454071, "logps/chosen": -1.250390648841858, "logps/rejected": -25.868749618530273, "loss": 1.2469482421875, "nll_loss": 1.246484398841858, "rewards/accuracies": 1.0, "rewards/chosen": -2.500781297683716, "rewards/margins": 49.21875, "rewards/rejected": -51.73749923706055, "step": 870 }, { "epoch": 0.6287048679719778, "grad_norm": 1.2446101903915405, "learning_rate": 3.649440427909231e-06, "logits/chosen": -1.596289038658142, "logits/rejected": -0.608691394329071, "logps/chosen": -1.26171875, "logps/rejected": -26.415624618530273, "loss": 1.2564453125, "nll_loss": 1.256445288658142, "rewards/accuracies": 1.0, "rewards/chosen": -2.5234375, "rewards/margins": 50.29375076293945, "rewards/rejected": -52.83124923706055, "step": 875 }, { "epoch": 0.6322974672175319, "grad_norm": 1.3078398704528809, "learning_rate": 3.5891486200399413e-06, "logits/chosen": -1.7238280773162842, "logits/rejected": -0.79248046875, "logps/chosen": -1.264062523841858, "logps/rejected": -26.056249618530273, "loss": 1.2614013671875, "nll_loss": 1.261132836341858, "rewards/accuracies": 1.0, "rewards/chosen": -2.528125047683716, "rewards/margins": 49.587501525878906, "rewards/rejected": -52.11249923706055, "step": 880 }, { "epoch": 0.635890066463086, "grad_norm": 1.260400414466858, "learning_rate": 3.5290788907480454e-06, "logits/chosen": -1.6232421398162842, "logits/rejected": -0.6546875238418579, "logps/chosen": -1.283789038658142, "logps/rejected": -26.475000381469727, "loss": 1.276611328125, "nll_loss": 1.2771484851837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.567578077316284, "rewards/margins": 50.41875076293945, "rewards/rejected": -52.95000076293945, "step": 885 }, { "epoch": 0.6394826657086402, "grad_norm": 1.2450644969940186, "learning_rate": 3.469240695458983e-06, "logits/chosen": -1.543554663658142, "logits/rejected": -0.57275390625, "logps/chosen": -1.274023413658142, "logps/rejected": -26.978124618530273, "loss": 1.2771484375, "nll_loss": 1.2765624523162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.548046827316284, "rewards/margins": 51.381248474121094, "rewards/rejected": -53.95624923706055, "step": 890 }, { "epoch": 0.6430752649541943, "grad_norm": 1.3227910995483398, "learning_rate": 3.4096434531529986e-06, "logits/chosen": -1.651953101158142, "logits/rejected": -0.721923828125, "logps/chosen": -1.2712891101837158, "logps/rejected": -26.112499237060547, "loss": 1.27021484375, "nll_loss": 1.2697265148162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.5425782203674316, "rewards/margins": 49.67499923706055, "rewards/rejected": -52.224998474121094, "step": 895 }, { "epoch": 0.6466678641997485, "grad_norm": 1.2807458639144897, "learning_rate": 3.350296544882543e-06, "logits/chosen": -1.5812499523162842, "logits/rejected": -0.628613293170929, "logps/chosen": -1.2878906726837158, "logps/rejected": -26.515625, "loss": 1.28681640625, "nll_loss": 1.286718726158142, "rewards/accuracies": 1.0, "rewards/chosen": -2.5757813453674316, "rewards/margins": 50.41875076293945, "rewards/rejected": -53.03125, "step": 900 }, { "epoch": 0.6502604634453026, "grad_norm": 1.212979793548584, "learning_rate": 3.2912093122956046e-06, "logits/chosen": -1.6261718273162842, "logits/rejected": -0.6142822504043579, "logps/chosen": -1.285546898841858, "logps/rejected": -26.5, "loss": 1.2876708984375, "nll_loss": 1.287500023841858, "rewards/accuracies": 1.0, "rewards/chosen": -2.571093797683716, "rewards/margins": 50.41875076293945, "rewards/rejected": -53.0, "step": 905 }, { "epoch": 0.6538530626908569, "grad_norm": 1.2762136459350586, "learning_rate": 3.2323910561652798e-06, "logits/chosen": -1.6496093273162842, "logits/rejected": -0.6158202886581421, "logps/chosen": -1.283593773841858, "logps/rejected": -26.496875762939453, "loss": 1.278076171875, "nll_loss": 1.2783203125, "rewards/accuracies": 1.0, "rewards/chosen": -2.567187547683716, "rewards/margins": 50.41875076293945, "rewards/rejected": -52.993751525878906, "step": 910 }, { "epoch": 0.657445661936411, "grad_norm": 1.2103921175003052, "learning_rate": 3.1738510349257556e-06, "logits/chosen": -1.674414038658142, "logits/rejected": -0.624707043170929, "logps/chosen": -1.274999976158142, "logps/rejected": -26.903125762939453, "loss": 1.2763916015625, "nll_loss": 1.2761719226837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.549999952316284, "rewards/margins": 51.26874923706055, "rewards/rejected": -53.806251525878906, "step": 915 }, { "epoch": 0.6610382611819652, "grad_norm": 1.32221257686615, "learning_rate": 3.1155984632149565e-06, "logits/chosen": -1.6257812976837158, "logits/rejected": -0.573803722858429, "logps/chosen": -1.2763671875, "logps/rejected": -26.665624618530273, "loss": 1.26953125, "nll_loss": 1.2693359851837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.552734375, "rewards/margins": 50.787498474121094, "rewards/rejected": -53.33124923706055, "step": 920 }, { "epoch": 0.6646308604275193, "grad_norm": 1.2526887655258179, "learning_rate": 3.0576425104241047e-06, "logits/chosen": -1.602148413658142, "logits/rejected": -0.577197253704071, "logps/chosen": -1.300195336341858, "logps/rejected": -26.643749237060547, "loss": 1.300830078125, "nll_loss": 1.300195336341858, "rewards/accuracies": 1.0, "rewards/chosen": -2.600390672683716, "rewards/margins": 50.70000076293945, "rewards/rejected": -53.287498474121094, "step": 925 }, { "epoch": 0.6682234596730735, "grad_norm": 1.2841291427612305, "learning_rate": 2.9999922992543777e-06, "logits/chosen": -1.529296875, "logits/rejected": -0.4998779296875, "logps/chosen": -1.254296898841858, "logps/rejected": -26.975000381469727, "loss": 1.2544189453125, "nll_loss": 1.254492163658142, "rewards/accuracies": 1.0, "rewards/chosen": -2.508593797683716, "rewards/margins": 51.41875076293945, "rewards/rejected": -53.95000076293945, "step": 930 }, { "epoch": 0.6718160589186276, "grad_norm": 1.3011598587036133, "learning_rate": 2.9426569042809356e-06, "logits/chosen": -1.6492187976837158, "logits/rejected": -0.6298583745956421, "logps/chosen": -1.2800781726837158, "logps/rejected": -26.665624618530273, "loss": 1.2728759765625, "nll_loss": 1.272851586341858, "rewards/accuracies": 1.0, "rewards/chosen": -2.5601563453674316, "rewards/margins": 50.756248474121094, "rewards/rejected": -53.33124923706055, "step": 935 }, { "epoch": 0.6754086581641818, "grad_norm": 1.2587294578552246, "learning_rate": 2.8856453505245018e-06, "logits/chosen": -1.5681641101837158, "logits/rejected": -0.5076659917831421, "logps/chosen": -1.2658202648162842, "logps/rejected": -26.665624618530273, "loss": 1.2689697265625, "nll_loss": 1.2693359851837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.5316405296325684, "rewards/margins": 50.78125, "rewards/rejected": -53.33124923706055, "step": 940 }, { "epoch": 0.6790012574097359, "grad_norm": 1.2374063730239868, "learning_rate": 2.8289666120307773e-06, "logits/chosen": -1.556640625, "logits/rejected": -0.497589111328125, "logps/chosen": -1.2570312023162842, "logps/rejected": -26.837499618530273, "loss": 1.2567626953125, "nll_loss": 1.2560546398162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.5140624046325684, "rewards/margins": 51.16875076293945, "rewards/rejected": -53.67499923706055, "step": 945 }, { "epoch": 0.6825938566552902, "grad_norm": 1.2470556497573853, "learning_rate": 2.77262961045784e-06, "logits/chosen": -1.587304711341858, "logits/rejected": -0.5886474847793579, "logps/chosen": -1.2763671875, "logps/rejected": -26.393749237060547, "loss": 1.269775390625, "nll_loss": 1.2697265148162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.552734375, "rewards/margins": 50.224998474121094, "rewards/rejected": -52.787498474121094, "step": 950 }, { "epoch": 0.6861864559008443, "grad_norm": 1.296369194984436, "learning_rate": 2.7166432136718156e-06, "logits/chosen": -1.639257788658142, "logits/rejected": -0.591796875, "logps/chosen": -1.2705078125, "logps/rejected": -26.362499237060547, "loss": 1.267578125, "nll_loss": 1.2677733898162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.541015625, "rewards/margins": 50.181251525878906, "rewards/rejected": -52.724998474121094, "step": 955 }, { "epoch": 0.6897790551463984, "grad_norm": 1.2463890314102173, "learning_rate": 2.6610162343510183e-06, "logits/chosen": -1.56640625, "logits/rejected": -0.45904237031936646, "logps/chosen": -1.273046851158142, "logps/rejected": -26.859375, "loss": 1.269091796875, "nll_loss": 1.2683594226837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.546093702316284, "rewards/margins": 51.17499923706055, "rewards/rejected": -53.71875, "step": 960 }, { "epoch": 0.6933716543919526, "grad_norm": 1.202349066734314, "learning_rate": 2.6057574285987446e-06, "logits/chosen": -1.6279296875, "logits/rejected": -0.634960949420929, "logps/chosen": -1.2599608898162842, "logps/rejected": -26.549999237060547, "loss": 1.257861328125, "nll_loss": 1.257226586341858, "rewards/accuracies": 1.0, "rewards/chosen": -2.5199217796325684, "rewards/margins": 50.587501525878906, "rewards/rejected": -53.099998474121094, "step": 965 }, { "epoch": 0.6969642536375067, "grad_norm": 1.2501283884048462, "learning_rate": 2.5508754945650305e-06, "logits/chosen": -1.5939452648162842, "logits/rejected": -0.565478503704071, "logps/chosen": -1.25390625, "logps/rejected": -26.806249618530273, "loss": 1.2497314453125, "nll_loss": 1.25, "rewards/accuracies": 1.0, "rewards/chosen": -2.5078125, "rewards/margins": 51.125, "rewards/rejected": -53.61249923706055, "step": 970 }, { "epoch": 0.7005568528830609, "grad_norm": 1.2475359439849854, "learning_rate": 2.4963790710774683e-06, "logits/chosen": -1.560156226158142, "logits/rejected": -0.546191394329071, "logps/chosen": -1.2736327648162842, "logps/rejected": -26.125, "loss": 1.2764404296875, "nll_loss": 1.2771484851837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.5472655296325684, "rewards/margins": 49.67499923706055, "rewards/rejected": -52.25, "step": 975 }, { "epoch": 0.704149452128615, "grad_norm": 1.2980130910873413, "learning_rate": 2.4422767362814045e-06, "logits/chosen": -1.5988280773162842, "logits/rejected": -0.566479504108429, "logps/chosen": -1.2800781726837158, "logps/rejected": -25.646875381469727, "loss": 1.28154296875, "nll_loss": 1.281835913658142, "rewards/accuracies": 1.0, "rewards/chosen": -2.5601563453674316, "rewards/margins": 48.743751525878906, "rewards/rejected": -51.29375076293945, "step": 980 }, { "epoch": 0.7077420513741692, "grad_norm": 1.3116934299468994, "learning_rate": 2.3885770062896795e-06, "logits/chosen": -1.569921851158142, "logits/rejected": -0.5450683832168579, "logps/chosen": -1.290429711341858, "logps/rejected": -26.799999237060547, "loss": 1.2818359375, "nll_loss": 1.2820312976837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.580859422683716, "rewards/margins": 51.01250076293945, "rewards/rejected": -53.599998474121094, "step": 985 }, { "epoch": 0.7113346506197233, "grad_norm": 1.2527976036071777, "learning_rate": 2.3352883338421085e-06, "logits/chosen": -1.5283203125, "logits/rejected": -0.4849853515625, "logps/chosen": -1.2966797351837158, "logps/rejected": -26.659374237060547, "loss": 1.292138671875, "nll_loss": 1.2921874523162842, "rewards/accuracies": 1.0, "rewards/chosen": -2.5933594703674316, "rewards/margins": 50.71875, "rewards/rejected": -53.318748474121094, "step": 990 }, { "epoch": 0.7149272498652776, "grad_norm": 1.2639521360397339, "learning_rate": 2.2824191069749824e-06, "logits/chosen": -1.5343749523162842, "logits/rejected": -0.42864990234375, "logps/chosen": -1.279296875, "logps/rejected": -26.903125762939453, "loss": 1.2745361328125, "nll_loss": 1.275390625, "rewards/accuracies": 1.0, "rewards/chosen": -2.55859375, "rewards/margins": 51.224998474121094, "rewards/rejected": -53.806251525878906, "step": 995 }, { "epoch": 0.7185198491108317, "grad_norm": 1.2673225402832031, "learning_rate": 2.2299776477007073e-06, "logits/chosen": -1.5164062976837158, "logits/rejected": -0.4778381288051605, "logps/chosen": -1.2892577648162842, "logps/rejected": -26.696874618530273, "loss": 1.2930908203125, "nll_loss": 1.2927734851837158, "rewards/accuracies": 1.0, "rewards/chosen": -2.5785155296325684, "rewards/margins": 50.837501525878906, "rewards/rejected": -53.39374923706055, "step": 1000 }, { "epoch": 0.7185198491108317, "eval_logits/chosen": -1.4657771587371826, "eval_logits/rejected": -0.39356765151023865, "eval_logps/chosen": -1.2632743120193481, "eval_logps/rejected": -26.45464515686035, "eval_loss": 1.264809012413025, "eval_nll_loss": 1.2640348672866821, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -2.5265486240386963, "eval_rewards/margins": 50.38495635986328, "eval_rewards/rejected": -52.9092903137207, "eval_runtime": 10.811, "eval_samples_per_second": 83.249, "eval_steps_per_second": 10.452, "step": 1000 } ], "logging_steps": 5, "max_steps": 1392, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.9565035953061888e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }