{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 10624, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0037650602409638554, "grad_norm": 131.53591532717587, "learning_rate": 9.99152861445783e-07, "logits/chosen": -1.9904296398162842, "logits/rejected": -2.041015625, "logps/chosen": -478.5249938964844, "logps/rejected": -395.95001220703125, "loss": 0.6548, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -0.0077186585403978825, "rewards/margins": 0.10002288967370987, "rewards/rejected": -0.1077674850821495, "step": 10 }, { "epoch": 0.007530120481927711, "grad_norm": 92.5969997719306, "learning_rate": 9.98211596385542e-07, "logits/chosen": -1.864843726158142, "logits/rejected": -1.7822265625, "logps/chosen": -417.9624938964844, "logps/rejected": -347.9125061035156, "loss": 0.5991, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.22912903130054474, "rewards/margins": 0.3745971620082855, "rewards/rejected": -0.603350818157196, "step": 20 }, { "epoch": 0.011295180722891566, "grad_norm": 116.41588759909305, "learning_rate": 9.972703313253011e-07, "logits/chosen": -1.8123047351837158, "logits/rejected": -1.7999999523162842, "logps/chosen": -392.41876220703125, "logps/rejected": -335.5, "loss": 0.5599, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.454855352640152, "rewards/margins": 0.7464355230331421, "rewards/rejected": -1.201330542564392, "step": 30 }, { "epoch": 0.015060240963855422, "grad_norm": 84.63173895937301, "learning_rate": 9.963290662650602e-07, "logits/chosen": -2.006054639816284, "logits/rejected": -1.904882788658142, "logps/chosen": -422.3125, "logps/rejected": -326.04998779296875, "loss": 0.4999, "rewards/accuracies": 0.75, "rewards/chosen": -0.2895568907260895, "rewards/margins": 0.923583984375, "rewards/rejected": -1.213165283203125, "step": 40 }, { "epoch": 0.01882530120481928, "grad_norm": 103.89454602387097, "learning_rate": 9.953878012048193e-07, "logits/chosen": -1.994726538658142, "logits/rejected": -1.9035155773162842, "logps/chosen": -382.8500061035156, "logps/rejected": -350.0375061035156, "loss": 0.475, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.05167236179113388, "rewards/margins": 0.8580871820449829, "rewards/rejected": -0.9098175168037415, "step": 50 }, { "epoch": 0.022590361445783132, "grad_norm": 91.50124208673783, "learning_rate": 9.944465361445784e-07, "logits/chosen": -1.992773413658142, "logits/rejected": -1.9734375476837158, "logps/chosen": -401.13751220703125, "logps/rejected": -334.4125061035156, "loss": 0.4071, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.14934691786766052, "rewards/margins": 1.2671630382537842, "rewards/rejected": -1.4166991710662842, "step": 60 }, { "epoch": 0.02635542168674699, "grad_norm": 122.16140447324007, "learning_rate": 9.935052710843374e-07, "logits/chosen": -2.0111327171325684, "logits/rejected": -1.9591796398162842, "logps/chosen": -384.73126220703125, "logps/rejected": -363.76251220703125, "loss": 0.3969, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.257345587015152, "rewards/margins": 1.357214331626892, "rewards/rejected": -1.6149413585662842, "step": 70 }, { "epoch": 0.030120481927710843, "grad_norm": 79.72381257543091, "learning_rate": 9.925640060240963e-07, "logits/chosen": -1.9822266101837158, "logits/rejected": -1.985937476158142, "logps/chosen": -424.2250061035156, "logps/rejected": -343.2749938964844, "loss": 0.3727, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.18103942275047302, "rewards/margins": 1.476739525794983, "rewards/rejected": -1.6574218273162842, "step": 80 }, { "epoch": 0.0338855421686747, "grad_norm": 84.0539278883207, "learning_rate": 9.916227409638554e-07, "logits/chosen": -1.9660155773162842, "logits/rejected": -1.9423828125, "logps/chosen": -404.04998779296875, "logps/rejected": -342.3374938964844, "loss": 0.3576, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.3914856016635895, "rewards/margins": 1.8466308116912842, "rewards/rejected": -2.237499952316284, "step": 90 }, { "epoch": 0.03765060240963856, "grad_norm": 96.41169750870563, "learning_rate": 9.906814759036145e-07, "logits/chosen": -1.9997069835662842, "logits/rejected": -1.9289062023162842, "logps/chosen": -402.8500061035156, "logps/rejected": -347.0249938964844, "loss": 0.382, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.23468017578125, "rewards/margins": 1.6337158679962158, "rewards/rejected": -1.86865234375, "step": 100 }, { "epoch": 0.04141566265060241, "grad_norm": 38.572933049108855, "learning_rate": 9.897402108433735e-07, "logits/chosen": -2.128124952316284, "logits/rejected": -2.205859422683716, "logps/chosen": -433.5625, "logps/rejected": -338.6499938964844, "loss": 0.2868, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.5282958745956421, "rewards/margins": 2.2200684547424316, "rewards/rejected": -2.7491211891174316, "step": 110 }, { "epoch": 0.045180722891566265, "grad_norm": 81.06205222905237, "learning_rate": 9.887989457831324e-07, "logits/chosen": -2.1986327171325684, "logits/rejected": -2.161914110183716, "logps/chosen": -464.13751220703125, "logps/rejected": -380.07501220703125, "loss": 0.3417, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.6987670660018921, "rewards/margins": 1.9639160633087158, "rewards/rejected": -2.66259765625, "step": 120 }, { "epoch": 0.04894578313253012, "grad_norm": 69.00118056974482, "learning_rate": 9.878576807228915e-07, "logits/chosen": -2.1494140625, "logits/rejected": -2.1363282203674316, "logps/chosen": -414.54998779296875, "logps/rejected": -351.57501220703125, "loss": 0.3667, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.11570434272289276, "rewards/margins": 1.8508789539337158, "rewards/rejected": -1.9660155773162842, "step": 130 }, { "epoch": 0.05271084337349398, "grad_norm": 74.93908093698064, "learning_rate": 9.869164156626506e-07, "logits/chosen": -2.122851610183716, "logits/rejected": -2.0560545921325684, "logps/chosen": -411.25, "logps/rejected": -356.3374938964844, "loss": 0.3717, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.05319518968462944, "rewards/margins": 1.9933593273162842, "rewards/rejected": -2.0470213890075684, "step": 140 }, { "epoch": 0.05647590361445783, "grad_norm": 99.60815870383335, "learning_rate": 9.859751506024096e-07, "logits/chosen": -2.143749952316284, "logits/rejected": -2.094531297683716, "logps/chosen": -405.23748779296875, "logps/rejected": -358.32501220703125, "loss": 0.3607, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.47428590059280396, "rewards/margins": 2.165332078933716, "rewards/rejected": -2.6402344703674316, "step": 150 }, { "epoch": 0.060240963855421686, "grad_norm": 72.59702351072167, "learning_rate": 9.850338855421685e-07, "logits/chosen": -2.158007860183716, "logits/rejected": -2.1480469703674316, "logps/chosen": -392.1499938964844, "logps/rejected": -350.9125061035156, "loss": 0.3424, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.16802978515625, "rewards/margins": 1.97900390625, "rewards/rejected": -3.149218797683716, "step": 160 }, { "epoch": 0.06400602409638555, "grad_norm": 82.44408252892029, "learning_rate": 9.840926204819276e-07, "logits/chosen": -2.130078077316284, "logits/rejected": -2.1849608421325684, "logps/chosen": -464.7250061035156, "logps/rejected": -350.54998779296875, "loss": 0.2827, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.421142578125, "rewards/margins": 2.2367186546325684, "rewards/rejected": -3.660937547683716, "step": 170 }, { "epoch": 0.0677710843373494, "grad_norm": 64.30163955420714, "learning_rate": 9.831513554216867e-07, "logits/chosen": -2.1830077171325684, "logits/rejected": -2.134960889816284, "logps/chosen": -405.79998779296875, "logps/rejected": -365.75, "loss": 0.3767, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.9023498296737671, "rewards/margins": 2.1868042945861816, "rewards/rejected": -3.0908203125, "step": 180 }, { "epoch": 0.07153614457831325, "grad_norm": 91.60784500141546, "learning_rate": 9.822100903614458e-07, "logits/chosen": -2.133984327316284, "logits/rejected": -2.1240234375, "logps/chosen": -471.45001220703125, "logps/rejected": -395.1499938964844, "loss": 0.3508, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.5190795660018921, "rewards/margins": 2.346630811691284, "rewards/rejected": -2.8666014671325684, "step": 190 }, { "epoch": 0.07530120481927711, "grad_norm": 66.13954442870882, "learning_rate": 9.812688253012048e-07, "logits/chosen": -2.154492139816284, "logits/rejected": -2.107421875, "logps/chosen": -429.6875, "logps/rejected": -378.125, "loss": 0.2559, "rewards/accuracies": 0.875, "rewards/chosen": -0.4797729551792145, "rewards/margins": 2.631298780441284, "rewards/rejected": -3.1117186546325684, "step": 200 }, { "epoch": 0.07906626506024096, "grad_norm": 91.96139230091366, "learning_rate": 9.80327560240964e-07, "logits/chosen": -2.1429686546325684, "logits/rejected": -2.0902342796325684, "logps/chosen": -421.32501220703125, "logps/rejected": -385.5375061035156, "loss": 0.3834, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.577343761920929, "rewards/margins": 2.134033203125, "rewards/rejected": -2.7120604515075684, "step": 210 }, { "epoch": 0.08283132530120482, "grad_norm": 96.25168333646853, "learning_rate": 9.793862951807228e-07, "logits/chosen": -2.193554639816284, "logits/rejected": -2.12890625, "logps/chosen": -408.2250061035156, "logps/rejected": -342.45001220703125, "loss": 0.4237, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.9426513910293579, "rewards/margins": 2.182324171066284, "rewards/rejected": -3.1259765625, "step": 220 }, { "epoch": 0.08659638554216867, "grad_norm": 109.04340070220671, "learning_rate": 9.784450301204819e-07, "logits/chosen": -2.1666016578674316, "logits/rejected": -2.1861329078674316, "logps/chosen": -472.82501220703125, "logps/rejected": -378.875, "loss": 0.3004, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.886853039264679, "rewards/margins": 2.5367674827575684, "rewards/rejected": -3.4234375953674316, "step": 230 }, { "epoch": 0.09036144578313253, "grad_norm": 48.19341621444691, "learning_rate": 9.77503765060241e-07, "logits/chosen": -2.225390672683716, "logits/rejected": -2.2125000953674316, "logps/chosen": -393.29998779296875, "logps/rejected": -329.07501220703125, "loss": 0.3467, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.114038109779358, "rewards/margins": 2.449267625808716, "rewards/rejected": -3.5638670921325684, "step": 240 }, { "epoch": 0.09412650602409639, "grad_norm": 96.20729126927293, "learning_rate": 9.765625e-07, "logits/chosen": -2.242382764816284, "logits/rejected": -2.1923828125, "logps/chosen": -391.29998779296875, "logps/rejected": -342.6625061035156, "loss": 0.3236, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.567480444908142, "rewards/margins": 2.365917921066284, "rewards/rejected": -3.930468797683716, "step": 250 }, { "epoch": 0.09789156626506024, "grad_norm": 69.2808883584412, "learning_rate": 9.75621234939759e-07, "logits/chosen": -2.1246094703674316, "logits/rejected": -2.121875047683716, "logps/chosen": -405.6499938964844, "logps/rejected": -367.7250061035156, "loss": 0.3672, "rewards/accuracies": 0.8125, "rewards/chosen": -1.524023413658142, "rewards/margins": 2.287402391433716, "rewards/rejected": -3.810546875, "step": 260 }, { "epoch": 0.1016566265060241, "grad_norm": 110.11249137698765, "learning_rate": 9.74679969879518e-07, "logits/chosen": -2.155468702316284, "logits/rejected": -2.132617235183716, "logps/chosen": -423.875, "logps/rejected": -372.8999938964844, "loss": 0.3092, "rewards/accuracies": 0.84375, "rewards/chosen": -1.0773804187774658, "rewards/margins": 2.8239502906799316, "rewards/rejected": -3.9017577171325684, "step": 270 }, { "epoch": 0.10542168674698796, "grad_norm": 76.9176338328976, "learning_rate": 9.73738704819277e-07, "logits/chosen": -2.244140625, "logits/rejected": -2.232226610183716, "logps/chosen": -409.6875, "logps/rejected": -367.79998779296875, "loss": 0.2945, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.0732605457305908, "rewards/margins": 2.504101514816284, "rewards/rejected": -3.576171875, "step": 280 }, { "epoch": 0.1091867469879518, "grad_norm": 95.4040671209109, "learning_rate": 9.727974397590361e-07, "logits/chosen": -2.1943359375, "logits/rejected": -2.2603516578674316, "logps/chosen": -443.45001220703125, "logps/rejected": -402.75, "loss": 0.3013, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.0694091320037842, "rewards/margins": 2.55078125, "rewards/rejected": -3.6226563453674316, "step": 290 }, { "epoch": 0.11295180722891567, "grad_norm": 67.97217608858946, "learning_rate": 9.718561746987952e-07, "logits/chosen": -2.130859375, "logits/rejected": -2.0718750953674316, "logps/chosen": -456.0, "logps/rejected": -395.20001220703125, "loss": 0.414, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.639135718345642, "rewards/margins": 2.13623046875, "rewards/rejected": -3.7748045921325684, "step": 300 }, { "epoch": 0.11671686746987951, "grad_norm": 96.1563980693113, "learning_rate": 9.70914909638554e-07, "logits/chosen": -2.1458983421325684, "logits/rejected": -2.1361327171325684, "logps/chosen": -385.875, "logps/rejected": -376.45001220703125, "loss": 0.3879, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.352563500404358, "rewards/margins": 2.559326171875, "rewards/rejected": -3.9124999046325684, "step": 310 }, { "epoch": 0.12048192771084337, "grad_norm": 63.82596970388472, "learning_rate": 9.699736445783132e-07, "logits/chosen": -2.0257811546325684, "logits/rejected": -2.058398485183716, "logps/chosen": -442.8999938964844, "logps/rejected": -389.92498779296875, "loss": 0.3447, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.9002685546875, "rewards/margins": 2.6500487327575684, "rewards/rejected": -3.553515672683716, "step": 320 }, { "epoch": 0.12424698795180723, "grad_norm": 56.157993374251724, "learning_rate": 9.690323795180722e-07, "logits/chosen": -2.094921827316284, "logits/rejected": -2.0669922828674316, "logps/chosen": -435.375, "logps/rejected": -375.1000061035156, "loss": 0.2728, "rewards/accuracies": 0.875, "rewards/chosen": -0.6834472417831421, "rewards/margins": 2.8525633811950684, "rewards/rejected": -3.534960985183716, "step": 330 }, { "epoch": 0.1280120481927711, "grad_norm": 102.89509844921051, "learning_rate": 9.680911144578313e-07, "logits/chosen": -2.144335985183716, "logits/rejected": -2.177734375, "logps/chosen": -402.125, "logps/rejected": -359.07501220703125, "loss": 0.2919, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.0821411609649658, "rewards/margins": 2.9081053733825684, "rewards/rejected": -3.992968797683716, "step": 340 }, { "epoch": 0.13177710843373494, "grad_norm": 75.0863936913258, "learning_rate": 9.671498493975904e-07, "logits/chosen": -2.025390625, "logits/rejected": -2.0201172828674316, "logps/chosen": -481.2749938964844, "logps/rejected": -380.7250061035156, "loss": 0.3131, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.8079589605331421, "rewards/margins": 2.4765381813049316, "rewards/rejected": -3.2850584983825684, "step": 350 }, { "epoch": 0.1355421686746988, "grad_norm": 102.89726142748448, "learning_rate": 9.662085843373493e-07, "logits/chosen": -2.0990233421325684, "logits/rejected": -2.123046875, "logps/chosen": -458.82501220703125, "logps/rejected": -424.54998779296875, "loss": 0.2826, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.4655395448207855, "rewards/margins": 2.9281249046325684, "rewards/rejected": -3.3946290016174316, "step": 360 }, { "epoch": 0.13930722891566266, "grad_norm": 92.35719470715001, "learning_rate": 9.652673192771083e-07, "logits/chosen": -2.106250047683716, "logits/rejected": -2.090625047683716, "logps/chosen": -430.4375, "logps/rejected": -395.625, "loss": 0.3201, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.680432140827179, "rewards/margins": 2.4410400390625, "rewards/rejected": -3.1195311546325684, "step": 370 }, { "epoch": 0.1430722891566265, "grad_norm": 96.1788047426685, "learning_rate": 9.643260542168674e-07, "logits/chosen": -2.0667967796325684, "logits/rejected": -2.1175780296325684, "logps/chosen": -395.01251220703125, "logps/rejected": -383.375, "loss": 0.3075, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.1037170886993408, "rewards/margins": 2.7733397483825684, "rewards/rejected": -3.877148389816284, "step": 380 }, { "epoch": 0.14683734939759036, "grad_norm": 115.18548873894878, "learning_rate": 9.633847891566265e-07, "logits/chosen": -2.151562452316284, "logits/rejected": -2.0648436546325684, "logps/chosen": -452.0249938964844, "logps/rejected": -408.70001220703125, "loss": 0.299, "rewards/accuracies": 0.875, "rewards/chosen": -2.0213623046875, "rewards/margins": 2.5245118141174316, "rewards/rejected": -4.544140815734863, "step": 390 }, { "epoch": 0.15060240963855423, "grad_norm": 85.52137590632408, "learning_rate": 9.624435240963856e-07, "logits/chosen": -2.041796922683716, "logits/rejected": -2.160351514816284, "logps/chosen": -487.5375061035156, "logps/rejected": -407.04998779296875, "loss": 0.3018, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.5074341297149658, "rewards/margins": 2.924267530441284, "rewards/rejected": -4.433203220367432, "step": 400 }, { "epoch": 0.15436746987951808, "grad_norm": 131.93013883815055, "learning_rate": 9.615022590361447e-07, "logits/chosen": -2.1126952171325684, "logits/rejected": -2.0902342796325684, "logps/chosen": -457.17498779296875, "logps/rejected": -384.67498779296875, "loss": 0.3873, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.642822265625, "rewards/margins": 2.8291015625, "rewards/rejected": -4.471093654632568, "step": 410 }, { "epoch": 0.15813253012048192, "grad_norm": 34.759905441644435, "learning_rate": 9.605609939759035e-07, "logits/chosen": -2.133593797683716, "logits/rejected": -2.135937452316284, "logps/chosen": -433.36248779296875, "logps/rejected": -346.07501220703125, "loss": 0.2817, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.7135375738143921, "rewards/margins": 3.0513672828674316, "rewards/rejected": -3.7660155296325684, "step": 420 }, { "epoch": 0.16189759036144577, "grad_norm": 92.787456143489, "learning_rate": 9.596197289156626e-07, "logits/chosen": -2.080761671066284, "logits/rejected": -2.0552735328674316, "logps/chosen": -396.125, "logps/rejected": -397.57501220703125, "loss": 0.2639, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.8045867681503296, "rewards/margins": 2.6491456031799316, "rewards/rejected": -3.45458984375, "step": 430 }, { "epoch": 0.16566265060240964, "grad_norm": 67.55824240046263, "learning_rate": 9.586784638554217e-07, "logits/chosen": -2.075976610183716, "logits/rejected": -2.0425782203674316, "logps/chosen": -409.7250061035156, "logps/rejected": -365.63751220703125, "loss": 0.3355, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.846728503704071, "rewards/margins": 2.7286620140075684, "rewards/rejected": -3.5746092796325684, "step": 440 }, { "epoch": 0.1694277108433735, "grad_norm": 109.16791079744033, "learning_rate": 9.577371987951808e-07, "logits/chosen": -2.1416015625, "logits/rejected": -2.1722655296325684, "logps/chosen": -472.7749938964844, "logps/rejected": -410.625, "loss": 0.3064, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.3935546875, "rewards/margins": 3.002392530441284, "rewards/rejected": -4.3984375, "step": 450 }, { "epoch": 0.17319277108433734, "grad_norm": 30.547355406526414, "learning_rate": 9.567959337349396e-07, "logits/chosen": -2.1041016578674316, "logits/rejected": -2.073046922683716, "logps/chosen": -479.79998779296875, "logps/rejected": -455.4750061035156, "loss": 0.2575, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.6370728015899658, "rewards/margins": 3.0836424827575684, "rewards/rejected": -4.720898628234863, "step": 460 }, { "epoch": 0.1769578313253012, "grad_norm": 118.67921208598516, "learning_rate": 9.558546686746987e-07, "logits/chosen": -2.1568360328674316, "logits/rejected": -2.1611328125, "logps/chosen": -447.1499938964844, "logps/rejected": -381.75, "loss": 0.2987, "rewards/accuracies": 0.875, "rewards/chosen": -2.293701171875, "rewards/margins": 3.3521485328674316, "rewards/rejected": -5.647656440734863, "step": 470 }, { "epoch": 0.18072289156626506, "grad_norm": 94.8608541748225, "learning_rate": 9.549134036144578e-07, "logits/chosen": -2.162304639816284, "logits/rejected": -2.146484375, "logps/chosen": -426.38751220703125, "logps/rejected": -405.2749938964844, "loss": 0.2884, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.157727003097534, "rewards/margins": 3.8658690452575684, "rewards/rejected": -6.021874904632568, "step": 480 }, { "epoch": 0.1844879518072289, "grad_norm": 75.98985814954506, "learning_rate": 9.539721385542169e-07, "logits/chosen": -2.1263670921325684, "logits/rejected": -2.094921827316284, "logps/chosen": -436.07501220703125, "logps/rejected": -405.7250061035156, "loss": 0.2705, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.132031202316284, "rewards/margins": 3.349169969558716, "rewards/rejected": -5.482031345367432, "step": 490 }, { "epoch": 0.18825301204819278, "grad_norm": 67.50049034496736, "learning_rate": 9.530308734939758e-07, "logits/chosen": -2.1656250953674316, "logits/rejected": -2.127734422683716, "logps/chosen": -339.5625, "logps/rejected": -329.125, "loss": 0.2975, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.480615258216858, "rewards/margins": 2.9576172828674316, "rewards/rejected": -4.437109470367432, "step": 500 }, { "epoch": 0.19201807228915663, "grad_norm": 94.16672089672376, "learning_rate": 9.520896084337348e-07, "logits/chosen": -2.2408204078674316, "logits/rejected": -2.228710889816284, "logps/chosen": -436.04998779296875, "logps/rejected": -394.42498779296875, "loss": 0.3112, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.712890625, "rewards/margins": 3.1875, "rewards/rejected": -4.903515815734863, "step": 510 }, { "epoch": 0.19578313253012047, "grad_norm": 150.51201725913538, "learning_rate": 9.511483433734939e-07, "logits/chosen": -2.2099609375, "logits/rejected": -2.1880860328674316, "logps/chosen": -425.8999938964844, "logps/rejected": -396.45001220703125, "loss": 0.3955, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.218267798423767, "rewards/margins": 2.8587403297424316, "rewards/rejected": -4.075585842132568, "step": 520 }, { "epoch": 0.19954819277108435, "grad_norm": 109.986898512079, "learning_rate": 9.50207078313253e-07, "logits/chosen": -2.094531297683716, "logits/rejected": -2.0732421875, "logps/chosen": -444.8999938964844, "logps/rejected": -407.625, "loss": 0.3141, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6080077886581421, "rewards/margins": 3.382617235183716, "rewards/rejected": -3.991015672683716, "step": 530 }, { "epoch": 0.2033132530120482, "grad_norm": 86.54646733446407, "learning_rate": 9.492658132530121e-07, "logits/chosen": -2.1025390625, "logits/rejected": -2.1597657203674316, "logps/chosen": -493.3999938964844, "logps/rejected": -390.04998779296875, "loss": 0.3982, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.6875, "rewards/margins": 3.1336426734924316, "rewards/rejected": -3.822070360183716, "step": 540 }, { "epoch": 0.20707831325301204, "grad_norm": 81.92553435513977, "learning_rate": 9.48324548192771e-07, "logits/chosen": -2.1128907203674316, "logits/rejected": -2.1070313453674316, "logps/chosen": -420.8500061035156, "logps/rejected": -377.8999938964844, "loss": 0.34, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.831494152545929, "rewards/margins": 2.819140672683716, "rewards/rejected": -3.6529297828674316, "step": 550 }, { "epoch": 0.21084337349397592, "grad_norm": 29.507500567850798, "learning_rate": 9.473832831325301e-07, "logits/chosen": -2.1187500953674316, "logits/rejected": -2.0904297828674316, "logps/chosen": -441.625, "logps/rejected": -385.125, "loss": 0.3505, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.0130736827850342, "rewards/margins": 3.123046875, "rewards/rejected": -4.137499809265137, "step": 560 }, { "epoch": 0.21460843373493976, "grad_norm": 63.01169732988263, "learning_rate": 9.464420180722891e-07, "logits/chosen": -2.076171875, "logits/rejected": -2.0804686546325684, "logps/chosen": -420.0, "logps/rejected": -408.45001220703125, "loss": 0.2492, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.2921020984649658, "rewards/margins": 3.238330125808716, "rewards/rejected": -4.529687404632568, "step": 570 }, { "epoch": 0.2183734939759036, "grad_norm": 127.1291145300896, "learning_rate": 9.455007530120482e-07, "logits/chosen": -2.1166014671325684, "logits/rejected": -2.013867139816284, "logps/chosen": -349.25, "logps/rejected": -329.29998779296875, "loss": 0.4079, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.3963623046875, "rewards/margins": 2.671142578125, "rewards/rejected": -4.067968845367432, "step": 580 }, { "epoch": 0.22213855421686746, "grad_norm": 47.486986187915875, "learning_rate": 9.445594879518071e-07, "logits/chosen": -2.177929639816284, "logits/rejected": -2.1552734375, "logps/chosen": -376.5625, "logps/rejected": -360.54998779296875, "loss": 0.2613, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.47497254610061646, "rewards/margins": 2.896484375, "rewards/rejected": -3.374218702316284, "step": 590 }, { "epoch": 0.22590361445783133, "grad_norm": 73.37668084567073, "learning_rate": 9.436182228915662e-07, "logits/chosen": -2.115234375, "logits/rejected": -2.1044921875, "logps/chosen": -460.13751220703125, "logps/rejected": -384.57501220703125, "loss": 0.2482, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.262603759765625, "rewards/margins": 3.441601514816284, "rewards/rejected": -3.7037110328674316, "step": 600 }, { "epoch": 0.22966867469879518, "grad_norm": 113.4382556816779, "learning_rate": 9.426769578313253e-07, "logits/chosen": -2.137890577316284, "logits/rejected": -2.128124952316284, "logps/chosen": -436.8374938964844, "logps/rejected": -389.95001220703125, "loss": 0.2753, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.41334837675094604, "rewards/margins": 3.176806688308716, "rewards/rejected": -3.589343309402466, "step": 610 }, { "epoch": 0.23343373493975902, "grad_norm": 81.94270174347506, "learning_rate": 9.417356927710844e-07, "logits/chosen": -2.045117139816284, "logits/rejected": -2.0736327171325684, "logps/chosen": -418.125, "logps/rejected": -386.875, "loss": 0.2768, "rewards/accuracies": 0.875, "rewards/chosen": -0.502062976360321, "rewards/margins": 3.188671827316284, "rewards/rejected": -3.69140625, "step": 620 }, { "epoch": 0.2371987951807229, "grad_norm": 112.75738962682294, "learning_rate": 9.407944277108434e-07, "logits/chosen": -2.0570311546325684, "logits/rejected": -2.122265577316284, "logps/chosen": -474.8500061035156, "logps/rejected": -371.9750061035156, "loss": 0.3152, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.756451427936554, "rewards/margins": 2.835693359375, "rewards/rejected": -3.5907225608825684, "step": 630 }, { "epoch": 0.24096385542168675, "grad_norm": 84.06193257641714, "learning_rate": 9.398531626506023e-07, "logits/chosen": -2.1265625953674316, "logits/rejected": -2.1734375953674316, "logps/chosen": -408.88751220703125, "logps/rejected": -364.1000061035156, "loss": 0.2541, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.9083007574081421, "rewards/margins": 2.9198241233825684, "rewards/rejected": -3.830273389816284, "step": 640 }, { "epoch": 0.2447289156626506, "grad_norm": 87.2493995229064, "learning_rate": 9.389118975903614e-07, "logits/chosen": -2.0943360328674316, "logits/rejected": -2.138867139816284, "logps/chosen": -445.2250061035156, "logps/rejected": -377.5249938964844, "loss": 0.3065, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.703656017780304, "rewards/margins": 3.0354981422424316, "rewards/rejected": -3.737988233566284, "step": 650 }, { "epoch": 0.24849397590361447, "grad_norm": 110.92555832656788, "learning_rate": 9.379706325301205e-07, "logits/chosen": -2.171875, "logits/rejected": -2.1220703125, "logps/chosen": -413.0249938964844, "logps/rejected": -363.25, "loss": 0.3937, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.3126952648162842, "rewards/margins": 2.87548828125, "rewards/rejected": -4.188281059265137, "step": 660 }, { "epoch": 0.2522590361445783, "grad_norm": 110.14223712512388, "learning_rate": 9.370293674698795e-07, "logits/chosen": -2.1664061546325684, "logits/rejected": -2.033007860183716, "logps/chosen": -430.8999938964844, "logps/rejected": -361.0, "loss": 0.4127, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.251953125, "rewards/margins": 2.678466796875, "rewards/rejected": -3.926953077316284, "step": 670 }, { "epoch": 0.2560240963855422, "grad_norm": 31.327481736309334, "learning_rate": 9.360881024096385e-07, "logits/chosen": -2.074414014816284, "logits/rejected": -2.0630860328674316, "logps/chosen": -422.42498779296875, "logps/rejected": -364.0, "loss": 0.2463, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.5521606206893921, "rewards/margins": 3.109179735183716, "rewards/rejected": -3.66015625, "step": 680 }, { "epoch": 0.25978915662650603, "grad_norm": 39.577761960509086, "learning_rate": 9.351468373493976e-07, "logits/chosen": -2.073437452316284, "logits/rejected": -1.967382788658142, "logps/chosen": -415.48748779296875, "logps/rejected": -369.20001220703125, "loss": 0.2843, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.72003173828125, "rewards/margins": 3.052783250808716, "rewards/rejected": -3.7728514671325684, "step": 690 }, { "epoch": 0.2635542168674699, "grad_norm": 131.25798114503576, "learning_rate": 9.342055722891565e-07, "logits/chosen": -2.0044922828674316, "logits/rejected": -1.9480469226837158, "logps/chosen": -412.61248779296875, "logps/rejected": -430.6499938964844, "loss": 0.2563, "rewards/accuracies": 0.875, "rewards/chosen": -0.542675793170929, "rewards/margins": 3.39208984375, "rewards/rejected": -3.934765577316284, "step": 700 }, { "epoch": 0.26731927710843373, "grad_norm": 104.63365078210042, "learning_rate": 9.332643072289156e-07, "logits/chosen": -2.1751952171325684, "logits/rejected": -2.0855469703674316, "logps/chosen": -390.73748779296875, "logps/rejected": -338.125, "loss": 0.3774, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.480566382408142, "rewards/margins": 2.79345703125, "rewards/rejected": -4.273828029632568, "step": 710 }, { "epoch": 0.2710843373493976, "grad_norm": 48.42545771539419, "learning_rate": 9.323230421686746e-07, "logits/chosen": -2.0869140625, "logits/rejected": -2.0390625, "logps/chosen": -384.2749938964844, "logps/rejected": -362.1000061035156, "loss": 0.3171, "rewards/accuracies": 0.875, "rewards/chosen": -1.6393311023712158, "rewards/margins": 3.337597608566284, "rewards/rejected": -4.973046779632568, "step": 720 }, { "epoch": 0.2748493975903614, "grad_norm": 96.96009518661364, "learning_rate": 9.313817771084337e-07, "logits/chosen": -2.0433592796325684, "logits/rejected": -2.0201172828674316, "logps/chosen": -436.79998779296875, "logps/rejected": -403.625, "loss": 0.2834, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.5180175304412842, "rewards/margins": 3.1732420921325684, "rewards/rejected": -4.691015720367432, "step": 730 }, { "epoch": 0.2786144578313253, "grad_norm": 33.17145684759076, "learning_rate": 9.304405120481927e-07, "logits/chosen": -2.0179686546325684, "logits/rejected": -2.030078172683716, "logps/chosen": -451.3999938964844, "logps/rejected": -400.625, "loss": 0.247, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.57928466796875, "rewards/margins": 3.051025390625, "rewards/rejected": -4.629492282867432, "step": 740 }, { "epoch": 0.28237951807228917, "grad_norm": 106.4695750935859, "learning_rate": 9.294992469879518e-07, "logits/chosen": -2.0933594703674316, "logits/rejected": -2.0565428733825684, "logps/chosen": -411.42498779296875, "logps/rejected": -363.54998779296875, "loss": 0.3197, "rewards/accuracies": 0.875, "rewards/chosen": -1.394647240638733, "rewards/margins": 3.2420897483825684, "rewards/rejected": -4.63671875, "step": 750 }, { "epoch": 0.286144578313253, "grad_norm": 64.93709602362559, "learning_rate": 9.285579819277109e-07, "logits/chosen": -2.1353516578674316, "logits/rejected": -2.130078077316284, "logps/chosen": -356.8125, "logps/rejected": -367.125, "loss": 0.2505, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.387414574623108, "rewards/margins": 3.4292969703674316, "rewards/rejected": -4.815234184265137, "step": 760 }, { "epoch": 0.28990963855421686, "grad_norm": 36.25477795199984, "learning_rate": 9.276167168674698e-07, "logits/chosen": -2.062695264816284, "logits/rejected": -2.0523438453674316, "logps/chosen": -458.0, "logps/rejected": -414.375, "loss": 0.3013, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.1524658203125, "rewards/margins": 3.1875, "rewards/rejected": -4.340234279632568, "step": 770 }, { "epoch": 0.2936746987951807, "grad_norm": 91.70971853342344, "learning_rate": 9.266754518072288e-07, "logits/chosen": -2.216992139816284, "logits/rejected": -2.1669921875, "logps/chosen": -407.4750061035156, "logps/rejected": -403.79998779296875, "loss": 0.3494, "rewards/accuracies": 0.84375, "rewards/chosen": -1.269036889076233, "rewards/margins": 2.987548828125, "rewards/rejected": -4.253125190734863, "step": 780 }, { "epoch": 0.29743975903614456, "grad_norm": 101.28250722678908, "learning_rate": 9.257341867469879e-07, "logits/chosen": -2.169726610183716, "logits/rejected": -2.091992139816284, "logps/chosen": -445.2749938964844, "logps/rejected": -406.9750061035156, "loss": 0.3383, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.25030517578125, "rewards/margins": 3.1893553733825684, "rewards/rejected": -4.436913967132568, "step": 790 }, { "epoch": 0.30120481927710846, "grad_norm": 71.90842331530553, "learning_rate": 9.24792921686747e-07, "logits/chosen": -2.1978516578674316, "logits/rejected": -2.1333985328674316, "logps/chosen": -446.3999938964844, "logps/rejected": -405.1499938964844, "loss": 0.2109, "rewards/accuracies": 0.90625, "rewards/chosen": -1.3445312976837158, "rewards/margins": 3.631640672683716, "rewards/rejected": -4.975781440734863, "step": 800 }, { "epoch": 0.3049698795180723, "grad_norm": 85.47776561440223, "learning_rate": 9.23851656626506e-07, "logits/chosen": -2.0361328125, "logits/rejected": -2.0224609375, "logps/chosen": -368.0, "logps/rejected": -381.57501220703125, "loss": 0.341, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.658471703529358, "rewards/margins": 2.8340821266174316, "rewards/rejected": -4.491015434265137, "step": 810 }, { "epoch": 0.30873493975903615, "grad_norm": 90.30980906366864, "learning_rate": 9.22910391566265e-07, "logits/chosen": -2.170703172683716, "logits/rejected": -2.087890625, "logps/chosen": -430.7749938964844, "logps/rejected": -395.57501220703125, "loss": 0.3447, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.943872094154358, "rewards/margins": 2.837597608566284, "rewards/rejected": -4.782031059265137, "step": 820 }, { "epoch": 0.3125, "grad_norm": 53.46541437569864, "learning_rate": 9.219691265060241e-07, "logits/chosen": -2.1148438453674316, "logits/rejected": -2.151562452316284, "logps/chosen": -401.0375061035156, "logps/rejected": -379.54998779296875, "loss": 0.2089, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.6345703601837158, "rewards/margins": 3.517773389816284, "rewards/rejected": -5.151953220367432, "step": 830 }, { "epoch": 0.31626506024096385, "grad_norm": 34.08113004937894, "learning_rate": 9.210278614457831e-07, "logits/chosen": -2.1337890625, "logits/rejected": -2.273632764816284, "logps/chosen": -489.2124938964844, "logps/rejected": -405.9750061035156, "loss": 0.2095, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.27557373046875, "rewards/margins": 3.5888671875, "rewards/rejected": -4.862500190734863, "step": 840 }, { "epoch": 0.3200301204819277, "grad_norm": 23.904735229729347, "learning_rate": 9.20086596385542e-07, "logits/chosen": -2.061718702316284, "logits/rejected": -2.037304639816284, "logps/chosen": -449.79998779296875, "logps/rejected": -405.07501220703125, "loss": 0.3141, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.5504882335662842, "rewards/margins": 3.537109375, "rewards/rejected": -5.090624809265137, "step": 850 }, { "epoch": 0.32379518072289154, "grad_norm": 27.146650692484673, "learning_rate": 9.191453313253011e-07, "logits/chosen": -2.106250047683716, "logits/rejected": -2.1695313453674316, "logps/chosen": -471.125, "logps/rejected": -414.875, "loss": 0.2801, "rewards/accuracies": 0.875, "rewards/chosen": -1.421167016029358, "rewards/margins": 3.7818360328674316, "rewards/rejected": -5.203515529632568, "step": 860 }, { "epoch": 0.32756024096385544, "grad_norm": 38.53760721622766, "learning_rate": 9.182040662650602e-07, "logits/chosen": -2.088085889816284, "logits/rejected": -2.1099610328674316, "logps/chosen": -414.92498779296875, "logps/rejected": -419.7250061035156, "loss": 0.2922, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6373779773712158, "rewards/margins": 3.6640625, "rewards/rejected": -5.300195217132568, "step": 870 }, { "epoch": 0.3313253012048193, "grad_norm": 57.382840432113674, "learning_rate": 9.172628012048193e-07, "logits/chosen": -2.1904296875, "logits/rejected": -2.2490234375, "logps/chosen": -408.95001220703125, "logps/rejected": -322.42498779296875, "loss": 0.2491, "rewards/accuracies": 0.90625, "rewards/chosen": -0.961804211139679, "rewards/margins": 3.2837891578674316, "rewards/rejected": -4.245312690734863, "step": 880 }, { "epoch": 0.33509036144578314, "grad_norm": 58.01752443114554, "learning_rate": 9.163215361445783e-07, "logits/chosen": -2.1640625, "logits/rejected": -2.148242235183716, "logps/chosen": -424.57501220703125, "logps/rejected": -371.1000061035156, "loss": 0.3051, "rewards/accuracies": 0.875, "rewards/chosen": -1.4306640625, "rewards/margins": 3.4457030296325684, "rewards/rejected": -4.876367092132568, "step": 890 }, { "epoch": 0.338855421686747, "grad_norm": 49.56289800868724, "learning_rate": 9.153802710843373e-07, "logits/chosen": -2.2108397483825684, "logits/rejected": -2.1714844703674316, "logps/chosen": -450.7124938964844, "logps/rejected": -410.54998779296875, "loss": 0.2292, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.468603491783142, "rewards/margins": 3.7232422828674316, "rewards/rejected": -5.192187309265137, "step": 900 }, { "epoch": 0.34262048192771083, "grad_norm": 41.227455756263154, "learning_rate": 9.144390060240963e-07, "logits/chosen": -2.1781249046325684, "logits/rejected": -2.191210985183716, "logps/chosen": -416.375, "logps/rejected": -363.9624938964844, "loss": 0.284, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.36572265625, "rewards/margins": 3.052050828933716, "rewards/rejected": -4.41796875, "step": 910 }, { "epoch": 0.3463855421686747, "grad_norm": 68.16148336408327, "learning_rate": 9.134977409638554e-07, "logits/chosen": -2.08349609375, "logits/rejected": -2.202343702316284, "logps/chosen": -437.92498779296875, "logps/rejected": -363.6000061035156, "loss": 0.3101, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.420385718345642, "rewards/margins": 3.320483446121216, "rewards/rejected": -4.738476753234863, "step": 920 }, { "epoch": 0.3501506024096386, "grad_norm": 99.30346176438924, "learning_rate": 9.125564759036144e-07, "logits/chosen": -2.1109375953674316, "logits/rejected": -2.077929735183716, "logps/chosen": -416.1000061035156, "logps/rejected": -388.20001220703125, "loss": 0.3236, "rewards/accuracies": 0.84375, "rewards/chosen": -1.17425537109375, "rewards/margins": 3.061718702316284, "rewards/rejected": -4.236914157867432, "step": 930 }, { "epoch": 0.3539156626506024, "grad_norm": 92.96046743810562, "learning_rate": 9.116152108433734e-07, "logits/chosen": -2.187695264816284, "logits/rejected": -2.1337890625, "logps/chosen": -384.6000061035156, "logps/rejected": -348.5375061035156, "loss": 0.3619, "rewards/accuracies": 0.875, "rewards/chosen": -1.563330054283142, "rewards/margins": 2.861035108566284, "rewards/rejected": -4.426171779632568, "step": 940 }, { "epoch": 0.35768072289156627, "grad_norm": 65.88624779684952, "learning_rate": 9.106739457831325e-07, "logits/chosen": -2.1400389671325684, "logits/rejected": -2.214648485183716, "logps/chosen": -394.04998779296875, "logps/rejected": -362.2749938964844, "loss": 0.2712, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.3043091297149658, "rewards/margins": 3.262988328933716, "rewards/rejected": -4.567968845367432, "step": 950 }, { "epoch": 0.3614457831325301, "grad_norm": 99.42200898299515, "learning_rate": 9.097326807228916e-07, "logits/chosen": -2.1630859375, "logits/rejected": -2.160351514816284, "logps/chosen": -449.54998779296875, "logps/rejected": -399.07501220703125, "loss": 0.274, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.703710913658142, "rewards/margins": 3.349316358566284, "rewards/rejected": -5.057031154632568, "step": 960 }, { "epoch": 0.36521084337349397, "grad_norm": 87.8263575829758, "learning_rate": 9.087914156626506e-07, "logits/chosen": -2.1611328125, "logits/rejected": -2.178515672683716, "logps/chosen": -451.3999938964844, "logps/rejected": -391.375, "loss": 0.2673, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.0218749046325684, "rewards/margins": 3.2959961891174316, "rewards/rejected": -5.322265625, "step": 970 }, { "epoch": 0.3689759036144578, "grad_norm": 122.17249394763829, "learning_rate": 9.078501506024095e-07, "logits/chosen": -2.1390624046325684, "logits/rejected": -2.173632860183716, "logps/chosen": -457.375, "logps/rejected": -451.2749938964844, "loss": 0.243, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.128955125808716, "rewards/margins": 4.0166015625, "rewards/rejected": -6.147656440734863, "step": 980 }, { "epoch": 0.3727409638554217, "grad_norm": 74.4522254661928, "learning_rate": 9.069088855421686e-07, "logits/chosen": -2.2007813453674316, "logits/rejected": -2.2017579078674316, "logps/chosen": -431.6000061035156, "logps/rejected": -397.25, "loss": 0.2826, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.8095703125, "rewards/margins": 3.0933594703674316, "rewards/rejected": -5.901171684265137, "step": 990 }, { "epoch": 0.37650602409638556, "grad_norm": 101.27105095945237, "learning_rate": 9.059676204819276e-07, "logits/chosen": -2.2007813453674316, "logits/rejected": -2.140429735183716, "logps/chosen": -492.6499938964844, "logps/rejected": -445.95001220703125, "loss": 0.347, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.324658155441284, "rewards/margins": 3.331835985183716, "rewards/rejected": -5.658203125, "step": 1000 }, { "epoch": 0.3802710843373494, "grad_norm": 79.2999602375978, "learning_rate": 9.050263554216867e-07, "logits/chosen": -2.214648485183716, "logits/rejected": -2.206835985183716, "logps/chosen": -402.6000061035156, "logps/rejected": -388.04998779296875, "loss": 0.281, "rewards/accuracies": 0.84375, "rewards/chosen": -2.1468262672424316, "rewards/margins": 3.685351610183716, "rewards/rejected": -5.83203125, "step": 1010 }, { "epoch": 0.38403614457831325, "grad_norm": 107.22680960830169, "learning_rate": 9.040850903614458e-07, "logits/chosen": -2.1421875953674316, "logits/rejected": -2.157031297683716, "logps/chosen": -508.95001220703125, "logps/rejected": -391.8500061035156, "loss": 0.2632, "rewards/accuracies": 0.875, "rewards/chosen": -1.3499755859375, "rewards/margins": 3.982226610183716, "rewards/rejected": -5.330468654632568, "step": 1020 }, { "epoch": 0.3878012048192771, "grad_norm": 95.00467340752571, "learning_rate": 9.031438253012048e-07, "logits/chosen": -2.1236329078674316, "logits/rejected": -2.146289110183716, "logps/chosen": -440.4375, "logps/rejected": -398.5, "loss": 0.2568, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.5607421398162842, "rewards/margins": 3.5121092796325684, "rewards/rejected": -5.073437690734863, "step": 1030 }, { "epoch": 0.39156626506024095, "grad_norm": 50.72808512437591, "learning_rate": 9.022025602409638e-07, "logits/chosen": -2.0787110328674316, "logits/rejected": -2.0970702171325684, "logps/chosen": -440.7749938964844, "logps/rejected": -388.125, "loss": 0.3174, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.102636694908142, "rewards/margins": 3.2831053733825684, "rewards/rejected": -4.387109279632568, "step": 1040 }, { "epoch": 0.3953313253012048, "grad_norm": 53.48685041966083, "learning_rate": 9.012612951807228e-07, "logits/chosen": -2.1167969703674316, "logits/rejected": -2.109375, "logps/chosen": -380.9750061035156, "logps/rejected": -384.0249938964844, "loss": 0.2934, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.3595397472381592, "rewards/margins": 3.3030762672424316, "rewards/rejected": -4.66015625, "step": 1050 }, { "epoch": 0.3990963855421687, "grad_norm": 23.549198392752874, "learning_rate": 9.003200301204819e-07, "logits/chosen": -2.100390672683716, "logits/rejected": -2.0777344703674316, "logps/chosen": -380.6000061035156, "logps/rejected": -338.32501220703125, "loss": 0.2557, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.463525414466858, "rewards/margins": 3.70654296875, "rewards/rejected": -5.172656059265137, "step": 1060 }, { "epoch": 0.40286144578313254, "grad_norm": 93.30475943391482, "learning_rate": 8.99378765060241e-07, "logits/chosen": -2.1978516578674316, "logits/rejected": -2.1669921875, "logps/chosen": -409.54998779296875, "logps/rejected": -363.6000061035156, "loss": 0.3301, "rewards/accuracies": 0.84375, "rewards/chosen": -1.5463683605194092, "rewards/margins": 3.147167921066284, "rewards/rejected": -4.693359375, "step": 1070 }, { "epoch": 0.4066265060240964, "grad_norm": 145.5141153945925, "learning_rate": 8.984374999999999e-07, "logits/chosen": -2.1009764671325684, "logits/rejected": -2.102343797683716, "logps/chosen": -478.7749938964844, "logps/rejected": -421.7749938964844, "loss": 0.2705, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.3025634288787842, "rewards/margins": 3.3475584983825684, "rewards/rejected": -4.646484375, "step": 1080 }, { "epoch": 0.41039156626506024, "grad_norm": 43.90947091147853, "learning_rate": 8.97496234939759e-07, "logits/chosen": -2.228710889816284, "logits/rejected": -2.1806640625, "logps/chosen": -429.0, "logps/rejected": -373.7749938964844, "loss": 0.2289, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.195532202720642, "rewards/margins": 3.750293016433716, "rewards/rejected": -4.944140434265137, "step": 1090 }, { "epoch": 0.4141566265060241, "grad_norm": 73.98365699224516, "learning_rate": 8.965549698795181e-07, "logits/chosen": -2.087109327316284, "logits/rejected": -2.1841797828674316, "logps/chosen": -446.1499938964844, "logps/rejected": -351.9750061035156, "loss": 0.228, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.33624267578125, "rewards/margins": 3.658496141433716, "rewards/rejected": -4.993750095367432, "step": 1100 }, { "epoch": 0.41792168674698793, "grad_norm": 81.66712514310206, "learning_rate": 8.956137048192772e-07, "logits/chosen": -2.1566405296325684, "logits/rejected": -2.187695264816284, "logps/chosen": -414.45001220703125, "logps/rejected": -355.8500061035156, "loss": 0.2737, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.3431396484375, "rewards/margins": 3.3628907203674316, "rewards/rejected": -4.709570407867432, "step": 1110 }, { "epoch": 0.42168674698795183, "grad_norm": 53.81320996116208, "learning_rate": 8.94672439759036e-07, "logits/chosen": -2.228320360183716, "logits/rejected": -2.212695360183716, "logps/chosen": -457.3125, "logps/rejected": -401.7250061035156, "loss": 0.2357, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.659033179283142, "rewards/margins": 3.4029784202575684, "rewards/rejected": -5.064453125, "step": 1120 }, { "epoch": 0.4254518072289157, "grad_norm": 66.01899261498104, "learning_rate": 8.937311746987951e-07, "logits/chosen": -2.099609375, "logits/rejected": -2.140820264816284, "logps/chosen": -443.375, "logps/rejected": -372.625, "loss": 0.2826, "rewards/accuracies": 0.875, "rewards/chosen": -1.112146019935608, "rewards/margins": 3.727832078933716, "rewards/rejected": -4.838281154632568, "step": 1130 }, { "epoch": 0.4292168674698795, "grad_norm": 80.25992408315766, "learning_rate": 8.927899096385542e-07, "logits/chosen": -2.1005859375, "logits/rejected": -2.0962891578674316, "logps/chosen": -466.5625, "logps/rejected": -395.92498779296875, "loss": 0.316, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.6715942621231079, "rewards/margins": 3.4523682594299316, "rewards/rejected": -4.125, "step": 1140 }, { "epoch": 0.4329819277108434, "grad_norm": 74.77932322981695, "learning_rate": 8.918486445783133e-07, "logits/chosen": -2.166796922683716, "logits/rejected": -2.1332030296325684, "logps/chosen": -390.57501220703125, "logps/rejected": -373.54998779296875, "loss": 0.3132, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.1254394054412842, "rewards/margins": 3.466479539871216, "rewards/rejected": -4.58984375, "step": 1150 }, { "epoch": 0.4367469879518072, "grad_norm": 94.05911336412525, "learning_rate": 8.909073795180722e-07, "logits/chosen": -2.135546922683716, "logits/rejected": -2.1078124046325684, "logps/chosen": -449.13751220703125, "logps/rejected": -391.375, "loss": 0.2424, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.669897437095642, "rewards/margins": 3.819140672683716, "rewards/rejected": -5.486718654632568, "step": 1160 }, { "epoch": 0.44051204819277107, "grad_norm": 112.53051384812176, "learning_rate": 8.899661144578313e-07, "logits/chosen": -2.0912108421325684, "logits/rejected": -2.0160155296325684, "logps/chosen": -425.63751220703125, "logps/rejected": -397.6499938964844, "loss": 0.2703, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.546362280845642, "rewards/margins": 3.71923828125, "rewards/rejected": -5.266015529632568, "step": 1170 }, { "epoch": 0.4442771084337349, "grad_norm": 171.47131791265662, "learning_rate": 8.890248493975904e-07, "logits/chosen": -2.179492235183716, "logits/rejected": -2.108203172683716, "logps/chosen": -405.79998779296875, "logps/rejected": -380.75, "loss": 0.3975, "rewards/accuracies": 0.84375, "rewards/chosen": -2.628405809402466, "rewards/margins": 3.5389161109924316, "rewards/rejected": -6.164453029632568, "step": 1180 }, { "epoch": 0.4480421686746988, "grad_norm": 72.00542331654916, "learning_rate": 8.880835843373493e-07, "logits/chosen": -2.0361328125, "logits/rejected": -2.0494141578674316, "logps/chosen": -473.5249938964844, "logps/rejected": -427.04998779296875, "loss": 0.2864, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.8711426258087158, "rewards/margins": 3.8521485328674316, "rewards/rejected": -5.7216796875, "step": 1190 }, { "epoch": 0.45180722891566266, "grad_norm": 76.48372358092988, "learning_rate": 8.871423192771083e-07, "logits/chosen": -2.1611328125, "logits/rejected": -2.1626954078674316, "logps/chosen": -365.0249938964844, "logps/rejected": -324.57501220703125, "loss": 0.248, "rewards/accuracies": 0.875, "rewards/chosen": -2.049999952316284, "rewards/margins": 3.4012694358825684, "rewards/rejected": -5.449999809265137, "step": 1200 }, { "epoch": 0.4555722891566265, "grad_norm": 46.32868577979854, "learning_rate": 8.862010542168674e-07, "logits/chosen": -2.164843797683716, "logits/rejected": -2.165234327316284, "logps/chosen": -406.625, "logps/rejected": -373.29998779296875, "loss": 0.2473, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.7876708507537842, "rewards/margins": 3.9571290016174316, "rewards/rejected": -5.743750095367432, "step": 1210 }, { "epoch": 0.45933734939759036, "grad_norm": 68.2955499186239, "learning_rate": 8.852597891566265e-07, "logits/chosen": -2.210742235183716, "logits/rejected": -2.2376952171325684, "logps/chosen": -404.54998779296875, "logps/rejected": -355.20001220703125, "loss": 0.3781, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.792028784751892, "rewards/margins": 3.7393555641174316, "rewards/rejected": -5.530077934265137, "step": 1220 }, { "epoch": 0.4631024096385542, "grad_norm": 68.8444027528496, "learning_rate": 8.843185240963855e-07, "logits/chosen": -2.177539110183716, "logits/rejected": -2.1039061546325684, "logps/chosen": -424.7124938964844, "logps/rejected": -391.79998779296875, "loss": 0.227, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.606164574623108, "rewards/margins": 3.5916504859924316, "rewards/rejected": -5.200781345367432, "step": 1230 }, { "epoch": 0.46686746987951805, "grad_norm": 61.917794725334716, "learning_rate": 8.833772590361446e-07, "logits/chosen": -2.0835938453674316, "logits/rejected": -2.1214842796325684, "logps/chosen": -448.5249938964844, "logps/rejected": -413.29998779296875, "loss": 0.2349, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.4093506336212158, "rewards/margins": 3.8675780296325684, "rewards/rejected": -5.2734375, "step": 1240 }, { "epoch": 0.47063253012048195, "grad_norm": 55.79481515169431, "learning_rate": 8.824359939759036e-07, "logits/chosen": -2.1346678733825684, "logits/rejected": -2.1078124046325684, "logps/chosen": -388.4750061035156, "logps/rejected": -352.5249938964844, "loss": 0.2104, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -2.4501953125, "rewards/margins": 3.8076171875, "rewards/rejected": -6.262109279632568, "step": 1250 }, { "epoch": 0.4743975903614458, "grad_norm": 154.19277755622798, "learning_rate": 8.814947289156626e-07, "logits/chosen": -2.2738280296325684, "logits/rejected": -2.2660155296325684, "logps/chosen": -457.45001220703125, "logps/rejected": -439.29998779296875, "loss": 0.2895, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.2718262672424316, "rewards/margins": 3.920703172683716, "rewards/rejected": -6.1875, "step": 1260 }, { "epoch": 0.47816265060240964, "grad_norm": 57.76044709065727, "learning_rate": 8.805534638554216e-07, "logits/chosen": -2.1314454078674316, "logits/rejected": -2.157421827316284, "logps/chosen": -466.8500061035156, "logps/rejected": -392.2250061035156, "loss": 0.2738, "rewards/accuracies": 0.875, "rewards/chosen": -1.8100097179412842, "rewards/margins": 4.396288871765137, "rewards/rejected": -6.202343940734863, "step": 1270 }, { "epoch": 0.4819277108433735, "grad_norm": 180.006145357383, "learning_rate": 8.796121987951807e-07, "logits/chosen": -2.185351610183716, "logits/rejected": -2.116015672683716, "logps/chosen": -445.7250061035156, "logps/rejected": -443.07501220703125, "loss": 0.2596, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.5180907249450684, "rewards/margins": 4.090624809265137, "rewards/rejected": -6.609375, "step": 1280 }, { "epoch": 0.48569277108433734, "grad_norm": 54.55743879188633, "learning_rate": 8.786709337349397e-07, "logits/chosen": -2.1371092796325684, "logits/rejected": -2.211718797683716, "logps/chosen": -470.79998779296875, "logps/rejected": -385.75, "loss": 0.2212, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.2955565452575684, "rewards/margins": 4.206835746765137, "rewards/rejected": -6.501172065734863, "step": 1290 }, { "epoch": 0.4894578313253012, "grad_norm": 73.49489737256927, "learning_rate": 8.777296686746988e-07, "logits/chosen": -2.2724609375, "logits/rejected": -2.225781202316284, "logps/chosen": -388.51251220703125, "logps/rejected": -366.29998779296875, "loss": 0.2948, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7459838390350342, "rewards/margins": 3.388476610183716, "rewards/rejected": -5.138671875, "step": 1300 }, { "epoch": 0.4932228915662651, "grad_norm": 39.58315574917423, "learning_rate": 8.767884036144578e-07, "logits/chosen": -2.140429735183716, "logits/rejected": -2.101367235183716, "logps/chosen": -427.5, "logps/rejected": -426.5249938964844, "loss": 0.2656, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.793634057044983, "rewards/margins": 4.167236328125, "rewards/rejected": -5.957812309265137, "step": 1310 }, { "epoch": 0.49698795180722893, "grad_norm": 93.55499368100087, "learning_rate": 8.758471385542169e-07, "logits/chosen": -2.2398438453674316, "logits/rejected": -2.1591796875, "logps/chosen": -498.1000061035156, "logps/rejected": -435.7250061035156, "loss": 0.2266, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.7927734851837158, "rewards/margins": 4.1416015625, "rewards/rejected": -5.931640625, "step": 1320 }, { "epoch": 0.5007530120481928, "grad_norm": 71.20816745604236, "learning_rate": 8.749058734939759e-07, "logits/chosen": -2.1839842796325684, "logits/rejected": -2.169921875, "logps/chosen": -434.8374938964844, "logps/rejected": -418.82501220703125, "loss": 0.2475, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.789514183998108, "rewards/margins": 3.85107421875, "rewards/rejected": -5.640625, "step": 1330 }, { "epoch": 0.5045180722891566, "grad_norm": 70.08364250659972, "learning_rate": 8.739646084337348e-07, "logits/chosen": -2.2783203125, "logits/rejected": -2.325390577316284, "logps/chosen": -457.7749938964844, "logps/rejected": -370.7250061035156, "loss": 0.2426, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.8470458984375, "rewards/margins": 3.713574171066284, "rewards/rejected": -6.561327934265137, "step": 1340 }, { "epoch": 0.5082831325301205, "grad_norm": 136.8518853014668, "learning_rate": 8.730233433734939e-07, "logits/chosen": -2.2916016578674316, "logits/rejected": -2.30859375, "logps/chosen": -437.95001220703125, "logps/rejected": -408.82501220703125, "loss": 0.2507, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.508496046066284, "rewards/margins": 4.163378715515137, "rewards/rejected": -6.672265529632568, "step": 1350 }, { "epoch": 0.5120481927710844, "grad_norm": 85.77283266675055, "learning_rate": 8.72082078313253e-07, "logits/chosen": -2.3099608421325684, "logits/rejected": -2.293750047683716, "logps/chosen": -451.07501220703125, "logps/rejected": -402.2749938964844, "loss": 0.2395, "rewards/accuracies": 0.875, "rewards/chosen": -1.810034155845642, "rewards/margins": 3.968017578125, "rewards/rejected": -5.77734375, "step": 1360 }, { "epoch": 0.5158132530120482, "grad_norm": 100.5668102097304, "learning_rate": 8.711408132530121e-07, "logits/chosen": -2.3207030296325684, "logits/rejected": -2.2865233421325684, "logps/chosen": -446.07501220703125, "logps/rejected": -400.07501220703125, "loss": 0.3272, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.9567687511444092, "rewards/margins": 3.595507860183716, "rewards/rejected": -5.544921875, "step": 1370 }, { "epoch": 0.5195783132530121, "grad_norm": 72.52253506718826, "learning_rate": 8.70199548192771e-07, "logits/chosen": -2.1611328125, "logits/rejected": -2.1666016578674316, "logps/chosen": -401.2749938964844, "logps/rejected": -402.5249938964844, "loss": 0.2958, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.515380859375, "rewards/margins": 3.8828125, "rewards/rejected": -6.3916015625, "step": 1380 }, { "epoch": 0.5233433734939759, "grad_norm": 55.119621540585925, "learning_rate": 8.692582831325301e-07, "logits/chosen": -2.2974610328674316, "logits/rejected": -2.327929735183716, "logps/chosen": -457.13751220703125, "logps/rejected": -407.125, "loss": 0.2309, "rewards/accuracies": 0.90625, "rewards/chosen": -2.507128953933716, "rewards/margins": 4.006079196929932, "rewards/rejected": -6.513671875, "step": 1390 }, { "epoch": 0.5271084337349398, "grad_norm": 57.121228026111886, "learning_rate": 8.683170180722891e-07, "logits/chosen": -2.228710889816284, "logits/rejected": -2.1490235328674316, "logps/chosen": -437.6499938964844, "logps/rejected": -380.5249938964844, "loss": 0.2355, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.392627000808716, "rewards/margins": 3.782031297683716, "rewards/rejected": -6.17578125, "step": 1400 }, { "epoch": 0.5308734939759037, "grad_norm": 77.15648590831769, "learning_rate": 8.673757530120482e-07, "logits/chosen": -2.1839842796325684, "logits/rejected": -2.2134766578674316, "logps/chosen": -452.42498779296875, "logps/rejected": -419.875, "loss": 0.2925, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.1388792991638184, "rewards/margins": 3.8636717796325684, "rewards/rejected": -6.003320217132568, "step": 1410 }, { "epoch": 0.5346385542168675, "grad_norm": 90.1516715195836, "learning_rate": 8.664344879518071e-07, "logits/chosen": -2.3447265625, "logits/rejected": -2.2486329078674316, "logps/chosen": -423.8999938964844, "logps/rejected": -406.3500061035156, "loss": 0.2803, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.6853516101837158, "rewards/margins": 4.14453125, "rewards/rejected": -5.828125, "step": 1420 }, { "epoch": 0.5384036144578314, "grad_norm": 102.10429756339867, "learning_rate": 8.654932228915662e-07, "logits/chosen": -2.1888670921325684, "logits/rejected": -2.1982421875, "logps/chosen": -425.4125061035156, "logps/rejected": -367.29998779296875, "loss": 0.3084, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.086932420730591, "rewards/margins": 3.57275390625, "rewards/rejected": -5.662499904632568, "step": 1430 }, { "epoch": 0.5421686746987951, "grad_norm": 32.01791160850017, "learning_rate": 8.645519578313253e-07, "logits/chosen": -2.240039110183716, "logits/rejected": -2.142382860183716, "logps/chosen": -418.4375, "logps/rejected": -424.82501220703125, "loss": 0.3045, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.133105516433716, "rewards/margins": 3.765576124191284, "rewards/rejected": -5.895117282867432, "step": 1440 }, { "epoch": 0.545933734939759, "grad_norm": 98.97632967765213, "learning_rate": 8.636106927710844e-07, "logits/chosen": -2.1474609375, "logits/rejected": -2.1812500953674316, "logps/chosen": -448.07501220703125, "logps/rejected": -455.57501220703125, "loss": 0.2403, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.356982469558716, "rewards/margins": 4.522070407867432, "rewards/rejected": -6.883593559265137, "step": 1450 }, { "epoch": 0.5496987951807228, "grad_norm": 104.10095042988543, "learning_rate": 8.626694277108434e-07, "logits/chosen": -2.2396483421325684, "logits/rejected": -2.213671922683716, "logps/chosen": -485.63751220703125, "logps/rejected": -426.82501220703125, "loss": 0.2356, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.412707567214966, "rewards/margins": 4.013574123382568, "rewards/rejected": -6.426953315734863, "step": 1460 }, { "epoch": 0.5534638554216867, "grad_norm": 78.3420033035564, "learning_rate": 8.617281626506023e-07, "logits/chosen": -2.190624952316284, "logits/rejected": -2.194531202316284, "logps/chosen": -460.2875061035156, "logps/rejected": -413.07501220703125, "loss": 0.3285, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.35009765625, "rewards/margins": 4.200879096984863, "rewards/rejected": -6.549218654632568, "step": 1470 }, { "epoch": 0.5572289156626506, "grad_norm": 37.92120708506012, "learning_rate": 8.607868975903614e-07, "logits/chosen": -2.361523389816284, "logits/rejected": -2.3666014671325684, "logps/chosen": -411.86248779296875, "logps/rejected": -369.0, "loss": 0.2889, "rewards/accuracies": 0.84375, "rewards/chosen": -1.6479370594024658, "rewards/margins": 3.2236328125, "rewards/rejected": -4.868359565734863, "step": 1480 }, { "epoch": 0.5609939759036144, "grad_norm": 34.38971716676574, "learning_rate": 8.598456325301204e-07, "logits/chosen": -2.204882860183716, "logits/rejected": -2.193359375, "logps/chosen": -425.20001220703125, "logps/rejected": -368.3999938964844, "loss": 0.2442, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.238855004310608, "rewards/margins": 4.199023246765137, "rewards/rejected": -5.436327934265137, "step": 1490 }, { "epoch": 0.5647590361445783, "grad_norm": 125.0879486879323, "learning_rate": 8.589043674698795e-07, "logits/chosen": -2.1953125, "logits/rejected": -2.194531202316284, "logps/chosen": -438.5, "logps/rejected": -390.5249938964844, "loss": 0.3534, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.8596312999725342, "rewards/margins": 3.4546875953674316, "rewards/rejected": -5.3134765625, "step": 1500 }, { "epoch": 0.5685240963855421, "grad_norm": 87.79463054783885, "learning_rate": 8.579631024096385e-07, "logits/chosen": -2.2113280296325684, "logits/rejected": -2.2408204078674316, "logps/chosen": -428.20001220703125, "logps/rejected": -383.95001220703125, "loss": 0.2557, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.4449584484100342, "rewards/margins": 3.9007811546325684, "rewards/rejected": -5.342382907867432, "step": 1510 }, { "epoch": 0.572289156626506, "grad_norm": 104.09590721285699, "learning_rate": 8.570218373493976e-07, "logits/chosen": -2.2392578125, "logits/rejected": -2.287890672683716, "logps/chosen": -461.54998779296875, "logps/rejected": -395.3500061035156, "loss": 0.4087, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.270922899246216, "rewards/margins": 3.2861084938049316, "rewards/rejected": -5.5546875, "step": 1520 }, { "epoch": 0.5760542168674698, "grad_norm": 160.72084792073184, "learning_rate": 8.560805722891565e-07, "logits/chosen": -2.1703124046325684, "logits/rejected": -2.1556639671325684, "logps/chosen": -443.0, "logps/rejected": -425.20001220703125, "loss": 0.2907, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.8195922374725342, "rewards/margins": 3.379589796066284, "rewards/rejected": -5.198632717132568, "step": 1530 }, { "epoch": 0.5798192771084337, "grad_norm": 74.56703570337015, "learning_rate": 8.551393072289156e-07, "logits/chosen": -2.1851563453674316, "logits/rejected": -2.2279295921325684, "logps/chosen": -341.125, "logps/rejected": -322.875, "loss": 0.2844, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.5089019536972046, "rewards/margins": 3.3544921875, "rewards/rejected": -4.861718654632568, "step": 1540 }, { "epoch": 0.5835843373493976, "grad_norm": 154.37406193457736, "learning_rate": 8.541980421686747e-07, "logits/chosen": -2.096484422683716, "logits/rejected": -2.0673828125, "logps/chosen": -461.1625061035156, "logps/rejected": -426.6499938964844, "loss": 0.235, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.645117163658142, "rewards/margins": 3.9048829078674316, "rewards/rejected": -5.551953315734863, "step": 1550 }, { "epoch": 0.5873493975903614, "grad_norm": 64.37924527291008, "learning_rate": 8.532567771084337e-07, "logits/chosen": -2.1595702171325684, "logits/rejected": -2.199414014816284, "logps/chosen": -469.875, "logps/rejected": -379.7749938964844, "loss": 0.2251, "rewards/accuracies": 0.90625, "rewards/chosen": -1.32684326171875, "rewards/margins": 3.827880859375, "rewards/rejected": -5.15625, "step": 1560 }, { "epoch": 0.5911144578313253, "grad_norm": 87.59407199207232, "learning_rate": 8.523155120481927e-07, "logits/chosen": -2.2134766578674316, "logits/rejected": -2.2144532203674316, "logps/chosen": -394.45001220703125, "logps/rejected": -375.5249938964844, "loss": 0.2436, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.5836913585662842, "rewards/margins": 3.9173827171325684, "rewards/rejected": -5.500390529632568, "step": 1570 }, { "epoch": 0.5948795180722891, "grad_norm": 80.58077599371451, "learning_rate": 8.513742469879518e-07, "logits/chosen": -2.1050782203674316, "logits/rejected": -2.168164014816284, "logps/chosen": -468.17498779296875, "logps/rejected": -398.75, "loss": 0.2402, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.0927977561950684, "rewards/margins": 4.193310737609863, "rewards/rejected": -6.286718845367432, "step": 1580 }, { "epoch": 0.598644578313253, "grad_norm": 60.55104314579305, "learning_rate": 8.504329819277109e-07, "logits/chosen": -2.1644530296325684, "logits/rejected": -2.278515577316284, "logps/chosen": -436.5249938964844, "logps/rejected": -420.6499938964844, "loss": 0.351, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.519091844558716, "rewards/margins": 3.608691453933716, "rewards/rejected": -6.122265815734863, "step": 1590 }, { "epoch": 0.6024096385542169, "grad_norm": 85.01075474187786, "learning_rate": 8.494917168674698e-07, "logits/chosen": -2.31640625, "logits/rejected": -2.3046875, "logps/chosen": -410.67498779296875, "logps/rejected": -370.7749938964844, "loss": 0.2864, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.9972412586212158, "rewards/margins": 4.253320217132568, "rewards/rejected": -6.252734184265137, "step": 1600 }, { "epoch": 0.6061746987951807, "grad_norm": 74.70323831815915, "learning_rate": 8.485504518072288e-07, "logits/chosen": -2.2230467796325684, "logits/rejected": -2.242382764816284, "logps/chosen": -439.42498779296875, "logps/rejected": -382.17498779296875, "loss": 0.2828, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.9635009765625, "rewards/margins": 4.172558784484863, "rewards/rejected": -6.134375095367432, "step": 1610 }, { "epoch": 0.6099397590361446, "grad_norm": 60.87045668969478, "learning_rate": 8.476091867469879e-07, "logits/chosen": -2.2359375953674316, "logits/rejected": -2.273632764816284, "logps/chosen": -454.1499938964844, "logps/rejected": -401.2749938964844, "loss": 0.3096, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.411328077316284, "rewards/margins": 4.136523246765137, "rewards/rejected": -6.543749809265137, "step": 1620 }, { "epoch": 0.6137048192771084, "grad_norm": 46.02149841890831, "learning_rate": 8.46667921686747e-07, "logits/chosen": -2.2837891578674316, "logits/rejected": -2.205078125, "logps/chosen": -387.7875061035156, "logps/rejected": -423.6000061035156, "loss": 0.266, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.8931641578674316, "rewards/margins": 3.8617186546325684, "rewards/rejected": -6.754687309265137, "step": 1630 }, { "epoch": 0.6174698795180723, "grad_norm": 76.45429335382832, "learning_rate": 8.457266566265059e-07, "logits/chosen": -2.2750000953674316, "logits/rejected": -2.357421875, "logps/chosen": -403.5874938964844, "logps/rejected": -375.5249938964844, "loss": 0.3242, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.4883666038513184, "rewards/margins": 3.5682616233825684, "rewards/rejected": -6.057031154632568, "step": 1640 }, { "epoch": 0.6212349397590361, "grad_norm": 40.20294536213306, "learning_rate": 8.44785391566265e-07, "logits/chosen": -2.2744140625, "logits/rejected": -2.2445311546325684, "logps/chosen": -464.13751220703125, "logps/rejected": -388.92498779296875, "loss": 0.3617, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.6151366233825684, "rewards/margins": 3.9395508766174316, "rewards/rejected": -6.5537109375, "step": 1650 }, { "epoch": 0.625, "grad_norm": 48.16879391671048, "learning_rate": 8.438441265060241e-07, "logits/chosen": -2.26171875, "logits/rejected": -2.302929639816284, "logps/chosen": -464.5375061035156, "logps/rejected": -416.3500061035156, "loss": 0.1954, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.263378858566284, "rewards/margins": 4.046679496765137, "rewards/rejected": -6.313281059265137, "step": 1660 }, { "epoch": 0.6287650602409639, "grad_norm": 66.75552931984477, "learning_rate": 8.429028614457831e-07, "logits/chosen": -2.1978516578674316, "logits/rejected": -2.17578125, "logps/chosen": -408.9624938964844, "logps/rejected": -406.67498779296875, "loss": 0.3343, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.5042967796325684, "rewards/margins": 3.256542921066284, "rewards/rejected": -5.762499809265137, "step": 1670 }, { "epoch": 0.6325301204819277, "grad_norm": 98.26432188134017, "learning_rate": 8.41961596385542e-07, "logits/chosen": -2.287890672683716, "logits/rejected": -2.182421922683716, "logps/chosen": -437.57501220703125, "logps/rejected": -426.07501220703125, "loss": 0.2611, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.505126953125, "rewards/margins": 3.9659423828125, "rewards/rejected": -6.471484184265137, "step": 1680 }, { "epoch": 0.6362951807228916, "grad_norm": 122.09130033290188, "learning_rate": 8.410203313253011e-07, "logits/chosen": -2.1910157203674316, "logits/rejected": -2.2876954078674316, "logps/chosen": -429.95001220703125, "logps/rejected": -409.1499938964844, "loss": 0.2595, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.8434081077575684, "rewards/margins": 3.5865235328674316, "rewards/rejected": -6.434765815734863, "step": 1690 }, { "epoch": 0.6400602409638554, "grad_norm": 59.2530690305444, "learning_rate": 8.400790662650602e-07, "logits/chosen": -2.2650389671325684, "logits/rejected": -2.279296875, "logps/chosen": -469.0625, "logps/rejected": -424.29998779296875, "loss": 0.2688, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.0062499046325684, "rewards/margins": 4.017529487609863, "rewards/rejected": -7.024218559265137, "step": 1700 }, { "epoch": 0.6438253012048193, "grad_norm": 66.65065974075205, "learning_rate": 8.391378012048193e-07, "logits/chosen": -2.168750047683716, "logits/rejected": -2.3304686546325684, "logps/chosen": -427.98748779296875, "logps/rejected": -364.29998779296875, "loss": 0.2278, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.5552735328674316, "rewards/margins": 4.029101371765137, "rewards/rejected": -6.582812309265137, "step": 1710 }, { "epoch": 0.6475903614457831, "grad_norm": 46.55145337647766, "learning_rate": 8.381965361445783e-07, "logits/chosen": -2.181445360183716, "logits/rejected": -2.200390577316284, "logps/chosen": -414.2749938964844, "logps/rejected": -397.32501220703125, "loss": 0.2087, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.595410108566284, "rewards/margins": 3.8333983421325684, "rewards/rejected": -6.428515434265137, "step": 1720 }, { "epoch": 0.651355421686747, "grad_norm": 88.30298274237552, "learning_rate": 8.372552710843373e-07, "logits/chosen": -2.2142577171325684, "logits/rejected": -2.2339844703674316, "logps/chosen": -434.25, "logps/rejected": -391.25, "loss": 0.3482, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.309985399246216, "rewards/margins": 3.6229491233825684, "rewards/rejected": -5.934374809265137, "step": 1730 }, { "epoch": 0.6551204819277109, "grad_norm": 111.9463714506719, "learning_rate": 8.363140060240963e-07, "logits/chosen": -2.1031250953674316, "logits/rejected": -2.044140577316284, "logps/chosen": -443.1000061035156, "logps/rejected": -393.6000061035156, "loss": 0.3358, "rewards/accuracies": 0.84375, "rewards/chosen": -1.6714355945587158, "rewards/margins": 3.6903319358825684, "rewards/rejected": -5.363671779632568, "step": 1740 }, { "epoch": 0.6588855421686747, "grad_norm": 103.65676322326802, "learning_rate": 8.353727409638554e-07, "logits/chosen": -2.2242188453674316, "logits/rejected": -2.2607421875, "logps/chosen": -433.92498779296875, "logps/rejected": -415.6499938964844, "loss": 0.2707, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.732812523841858, "rewards/margins": 3.59912109375, "rewards/rejected": -5.332324028015137, "step": 1750 }, { "epoch": 0.6626506024096386, "grad_norm": 97.19755003514508, "learning_rate": 8.344314759036144e-07, "logits/chosen": -2.1546874046325684, "logits/rejected": -2.146484375, "logps/chosen": -403.875, "logps/rejected": -429.25, "loss": 0.3296, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.101025342941284, "rewards/margins": 3.568774461746216, "rewards/rejected": -5.672265529632568, "step": 1760 }, { "epoch": 0.6664156626506024, "grad_norm": 51.925446577041214, "learning_rate": 8.334902108433734e-07, "logits/chosen": -2.2249999046325684, "logits/rejected": -2.2542967796325684, "logps/chosen": -446.54998779296875, "logps/rejected": -385.04998779296875, "loss": 0.2709, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.956323266029358, "rewards/margins": 3.498730421066284, "rewards/rejected": -5.453906059265137, "step": 1770 }, { "epoch": 0.6701807228915663, "grad_norm": 62.846886956969236, "learning_rate": 8.325489457831325e-07, "logits/chosen": -2.194140672683716, "logits/rejected": -2.1695313453674316, "logps/chosen": -441.125, "logps/rejected": -420.5249938964844, "loss": 0.3217, "rewards/accuracies": 0.875, "rewards/chosen": -1.7685058116912842, "rewards/margins": 3.5760741233825684, "rewards/rejected": -5.348437309265137, "step": 1780 }, { "epoch": 0.6739457831325302, "grad_norm": 13.03513853505052, "learning_rate": 8.316076807228916e-07, "logits/chosen": -2.2276368141174316, "logits/rejected": -2.1742186546325684, "logps/chosen": -416.0, "logps/rejected": -372.82501220703125, "loss": 0.3054, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.232763648033142, "rewards/margins": 3.299072265625, "rewards/rejected": -4.531445503234863, "step": 1790 }, { "epoch": 0.677710843373494, "grad_norm": 29.5883204518918, "learning_rate": 8.306664156626506e-07, "logits/chosen": -2.204882860183716, "logits/rejected": -2.193554639816284, "logps/chosen": -416.88751220703125, "logps/rejected": -397.5874938964844, "loss": 0.2102, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.5256836414337158, "rewards/margins": 3.50927734375, "rewards/rejected": -5.037109375, "step": 1800 }, { "epoch": 0.6814759036144579, "grad_norm": 91.40913066359528, "learning_rate": 8.297251506024096e-07, "logits/chosen": -2.2738280296325684, "logits/rejected": -2.2728514671325684, "logps/chosen": -506.8999938964844, "logps/rejected": -441.20001220703125, "loss": 0.2488, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.368444800376892, "rewards/margins": 4.277148246765137, "rewards/rejected": -5.644921779632568, "step": 1810 }, { "epoch": 0.6852409638554217, "grad_norm": 42.78026905332134, "learning_rate": 8.287838855421686e-07, "logits/chosen": -2.15625, "logits/rejected": -2.165234327316284, "logps/chosen": -432.51251220703125, "logps/rejected": -410.0249938964844, "loss": 0.3575, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.9101073741912842, "rewards/margins": 4.08154296875, "rewards/rejected": -5.993359565734863, "step": 1820 }, { "epoch": 0.6890060240963856, "grad_norm": 84.5147791356096, "learning_rate": 8.278426204819276e-07, "logits/chosen": -2.1636719703674316, "logits/rejected": -2.1908202171325684, "logps/chosen": -420.6000061035156, "logps/rejected": -384.0, "loss": 0.2293, "rewards/accuracies": 0.90625, "rewards/chosen": -1.627197265625, "rewards/margins": 4.311132907867432, "rewards/rejected": -5.938281059265137, "step": 1830 }, { "epoch": 0.6927710843373494, "grad_norm": 42.19923123216441, "learning_rate": 8.269013554216867e-07, "logits/chosen": -2.150585889816284, "logits/rejected": -2.2080078125, "logps/chosen": -433.04998779296875, "logps/rejected": -376.42498779296875, "loss": 0.1693, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.213720679283142, "rewards/margins": 4.484375, "rewards/rejected": -5.69921875, "step": 1840 }, { "epoch": 0.6965361445783133, "grad_norm": 73.8373122328886, "learning_rate": 8.259600903614458e-07, "logits/chosen": -2.2587890625, "logits/rejected": -2.296679735183716, "logps/chosen": -430.2250061035156, "logps/rejected": -382.32501220703125, "loss": 0.2212, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.566491723060608, "rewards/margins": 4.150000095367432, "rewards/rejected": -5.713671684265137, "step": 1850 }, { "epoch": 0.7003012048192772, "grad_norm": 80.87741634329736, "learning_rate": 8.250188253012048e-07, "logits/chosen": -2.133593797683716, "logits/rejected": -2.1640625, "logps/chosen": -499.20001220703125, "logps/rejected": -434.54998779296875, "loss": 0.201, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.551049828529358, "rewards/margins": 4.296484470367432, "rewards/rejected": -5.848046779632568, "step": 1860 }, { "epoch": 0.704066265060241, "grad_norm": 56.08752450620536, "learning_rate": 8.240775602409638e-07, "logits/chosen": -2.252148389816284, "logits/rejected": -2.221484422683716, "logps/chosen": -440.2749938964844, "logps/rejected": -431.2250061035156, "loss": 0.2448, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.1741700172424316, "rewards/margins": 4.267870903015137, "rewards/rejected": -6.444921970367432, "step": 1870 }, { "epoch": 0.7078313253012049, "grad_norm": 34.71761018281237, "learning_rate": 8.231362951807228e-07, "logits/chosen": -2.229296922683716, "logits/rejected": -2.288281202316284, "logps/chosen": -411.2250061035156, "logps/rejected": -389.82501220703125, "loss": 0.2832, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.3283448219299316, "rewards/margins": 4.082812309265137, "rewards/rejected": -6.411718845367432, "step": 1880 }, { "epoch": 0.7115963855421686, "grad_norm": 91.28246239136305, "learning_rate": 8.221950301204819e-07, "logits/chosen": -2.2064452171325684, "logits/rejected": -2.211718797683716, "logps/chosen": -444.6312561035156, "logps/rejected": -416.1499938964844, "loss": 0.3004, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.043896436691284, "rewards/margins": 4.002343654632568, "rewards/rejected": -6.044921875, "step": 1890 }, { "epoch": 0.7153614457831325, "grad_norm": 135.73859050078866, "learning_rate": 8.21253765060241e-07, "logits/chosen": -2.2431640625, "logits/rejected": -2.229687452316284, "logps/chosen": -411.1875, "logps/rejected": -387.5249938964844, "loss": 0.304, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.9744415283203125, "rewards/margins": 3.898242235183716, "rewards/rejected": -5.867578029632568, "step": 1900 }, { "epoch": 0.7191265060240963, "grad_norm": 73.17030114020557, "learning_rate": 8.203124999999999e-07, "logits/chosen": -2.2652344703674316, "logits/rejected": -2.302734375, "logps/chosen": -360.42498779296875, "logps/rejected": -350.20001220703125, "loss": 0.3, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.1255736351013184, "rewards/margins": 3.590039014816284, "rewards/rejected": -5.714453220367432, "step": 1910 }, { "epoch": 0.7228915662650602, "grad_norm": 66.53487629382481, "learning_rate": 8.19371234939759e-07, "logits/chosen": -2.25390625, "logits/rejected": -2.3119139671325684, "logps/chosen": -435.2250061035156, "logps/rejected": -405.3999938964844, "loss": 0.1986, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.638916015625, "rewards/margins": 4.45654296875, "rewards/rejected": -6.098437309265137, "step": 1920 }, { "epoch": 0.7266566265060241, "grad_norm": 81.6332598144061, "learning_rate": 8.184299698795181e-07, "logits/chosen": -2.279296875, "logits/rejected": -2.3021483421325684, "logps/chosen": -427.54998779296875, "logps/rejected": -406.57501220703125, "loss": 0.3224, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.605993628501892, "rewards/margins": 3.9263672828674316, "rewards/rejected": -5.530468940734863, "step": 1930 }, { "epoch": 0.7304216867469879, "grad_norm": 48.203759493803815, "learning_rate": 8.174887048192772e-07, "logits/chosen": -2.2431640625, "logits/rejected": -2.2388672828674316, "logps/chosen": -392.8125, "logps/rejected": -373.98748779296875, "loss": 0.2448, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.349096655845642, "rewards/margins": 3.980224609375, "rewards/rejected": -5.328515529632568, "step": 1940 }, { "epoch": 0.7341867469879518, "grad_norm": 58.9379492648601, "learning_rate": 8.16547439759036e-07, "logits/chosen": -2.1865234375, "logits/rejected": -2.1861329078674316, "logps/chosen": -469.63751220703125, "logps/rejected": -381.5874938964844, "loss": 0.3254, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.0428099632263184, "rewards/margins": 3.760986328125, "rewards/rejected": -5.803515434265137, "step": 1950 }, { "epoch": 0.7379518072289156, "grad_norm": 80.51059309471277, "learning_rate": 8.156061746987951e-07, "logits/chosen": -2.2652344703674316, "logits/rejected": -2.1908202171325684, "logps/chosen": -384.17498779296875, "logps/rejected": -366.2250061035156, "loss": 0.3272, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.89599609375, "rewards/margins": 3.9332518577575684, "rewards/rejected": -5.832421779632568, "step": 1960 }, { "epoch": 0.7417168674698795, "grad_norm": 37.44159403569154, "learning_rate": 8.146649096385542e-07, "logits/chosen": -2.2181639671325684, "logits/rejected": -2.241406202316284, "logps/chosen": -370.75, "logps/rejected": -377.7250061035156, "loss": 0.2467, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.577966332435608, "rewards/margins": 3.93359375, "rewards/rejected": -5.510546684265137, "step": 1970 }, { "epoch": 0.7454819277108434, "grad_norm": 52.08499468061045, "learning_rate": 8.137236445783132e-07, "logits/chosen": -2.1117186546325684, "logits/rejected": -2.116992235183716, "logps/chosen": -408.07501220703125, "logps/rejected": -394.04998779296875, "loss": 0.2951, "rewards/accuracies": 0.875, "rewards/chosen": -1.6052734851837158, "rewards/margins": 3.986328125, "rewards/rejected": -5.594336032867432, "step": 1980 }, { "epoch": 0.7492469879518072, "grad_norm": 173.2094267605532, "learning_rate": 8.127823795180722e-07, "logits/chosen": -2.1087889671325684, "logits/rejected": -2.1791014671325684, "logps/chosen": -449.32501220703125, "logps/rejected": -402.29998779296875, "loss": 0.2667, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.439697265625, "rewards/margins": 3.905078172683716, "rewards/rejected": -5.346093654632568, "step": 1990 }, { "epoch": 0.7530120481927711, "grad_norm": 28.637427153876185, "learning_rate": 8.118411144578313e-07, "logits/chosen": -2.1656250953674316, "logits/rejected": -2.201367139816284, "logps/chosen": -428.61248779296875, "logps/rejected": -399.04998779296875, "loss": 0.2057, "rewards/accuracies": 0.90625, "rewards/chosen": -1.4736816883087158, "rewards/margins": 3.9639649391174316, "rewards/rejected": -5.440234184265137, "step": 2000 }, { "epoch": 0.7567771084337349, "grad_norm": 36.18603747980631, "learning_rate": 8.108998493975904e-07, "logits/chosen": -2.346874952316284, "logits/rejected": -2.3330078125, "logps/chosen": -397.70001220703125, "logps/rejected": -364.2749938964844, "loss": 0.3067, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.0194091796875, "rewards/margins": 3.634082078933716, "rewards/rejected": -5.654687404632568, "step": 2010 }, { "epoch": 0.7605421686746988, "grad_norm": 97.05020442969196, "learning_rate": 8.099585843373493e-07, "logits/chosen": -2.1376953125, "logits/rejected": -2.1607422828674316, "logps/chosen": -451.32501220703125, "logps/rejected": -408.8500061035156, "loss": 0.2846, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.4820799827575684, "rewards/margins": 3.792407274246216, "rewards/rejected": -6.273046970367432, "step": 2020 }, { "epoch": 0.7643072289156626, "grad_norm": 61.44182414015643, "learning_rate": 8.090173192771084e-07, "logits/chosen": -2.2880859375, "logits/rejected": -2.2720704078674316, "logps/chosen": -409.70001220703125, "logps/rejected": -442.42498779296875, "loss": 0.2771, "rewards/accuracies": 0.90625, "rewards/chosen": -2.865917921066284, "rewards/margins": 4.295922756195068, "rewards/rejected": -7.162499904632568, "step": 2030 }, { "epoch": 0.7680722891566265, "grad_norm": 90.43117375378077, "learning_rate": 8.080760542168674e-07, "logits/chosen": -2.174999952316284, "logits/rejected": -2.218945264816284, "logps/chosen": -436.5249938964844, "logps/rejected": -402.79998779296875, "loss": 0.2824, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.2536377906799316, "rewards/margins": 3.7154297828674316, "rewards/rejected": -5.967577934265137, "step": 2040 }, { "epoch": 0.7718373493975904, "grad_norm": 50.046249153412184, "learning_rate": 8.071347891566265e-07, "logits/chosen": -2.2789063453674316, "logits/rejected": -2.3193359375, "logps/chosen": -515.5250244140625, "logps/rejected": -412.8500061035156, "loss": 0.319, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.684375047683716, "rewards/margins": 3.8304686546325684, "rewards/rejected": -6.511328220367432, "step": 2050 }, { "epoch": 0.7756024096385542, "grad_norm": 73.14577018217848, "learning_rate": 8.061935240963855e-07, "logits/chosen": -2.1949219703674316, "logits/rejected": -2.245898485183716, "logps/chosen": -427.7749938964844, "logps/rejected": -398.3999938964844, "loss": 0.1641, "rewards/accuracies": 0.90625, "rewards/chosen": -2.446728467941284, "rewards/margins": 4.343945503234863, "rewards/rejected": -6.790625095367432, "step": 2060 }, { "epoch": 0.7793674698795181, "grad_norm": 91.90004603712033, "learning_rate": 8.052522590361446e-07, "logits/chosen": -2.298828125, "logits/rejected": -2.223828077316284, "logps/chosen": -450.29998779296875, "logps/rejected": -397.0, "loss": 0.2602, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.9801514148712158, "rewards/margins": 4.256249904632568, "rewards/rejected": -6.234375, "step": 2070 }, { "epoch": 0.7831325301204819, "grad_norm": 60.96101223509167, "learning_rate": 8.043109939759036e-07, "logits/chosen": -2.1830077171325684, "logits/rejected": -2.203320264816284, "logps/chosen": -453.82501220703125, "logps/rejected": -399.07501220703125, "loss": 0.3369, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.445214867591858, "rewards/margins": 3.7417969703674316, "rewards/rejected": -5.1875, "step": 2080 }, { "epoch": 0.7868975903614458, "grad_norm": 104.74103034234575, "learning_rate": 8.033697289156626e-07, "logits/chosen": -2.180859327316284, "logits/rejected": -2.1337890625, "logps/chosen": -380.01251220703125, "logps/rejected": -365.6499938964844, "loss": 0.3085, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.8293945789337158, "rewards/margins": 3.3695311546325684, "rewards/rejected": -5.198828220367432, "step": 2090 }, { "epoch": 0.7906626506024096, "grad_norm": 128.07360677237494, "learning_rate": 8.024284638554216e-07, "logits/chosen": -2.1919922828674316, "logits/rejected": -2.2701172828674316, "logps/chosen": -422.36248779296875, "logps/rejected": -373.11248779296875, "loss": 0.3635, "rewards/accuracies": 0.84375, "rewards/chosen": -1.9606444835662842, "rewards/margins": 3.321728467941284, "rewards/rejected": -5.2822265625, "step": 2100 }, { "epoch": 0.7944277108433735, "grad_norm": 62.06516907565759, "learning_rate": 8.014871987951807e-07, "logits/chosen": -2.239453077316284, "logits/rejected": -2.2367186546325684, "logps/chosen": -401.6000061035156, "logps/rejected": -380.8500061035156, "loss": 0.2411, "rewards/accuracies": 0.875, "rewards/chosen": -1.602197289466858, "rewards/margins": 3.954882860183716, "rewards/rejected": -5.557031154632568, "step": 2110 }, { "epoch": 0.7981927710843374, "grad_norm": 82.3505983520944, "learning_rate": 8.005459337349398e-07, "logits/chosen": -2.1802735328674316, "logits/rejected": -2.167187452316284, "logps/chosen": -388.32501220703125, "logps/rejected": -389.1875, "loss": 0.2791, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.385400414466858, "rewards/margins": 4.169140815734863, "rewards/rejected": -5.553320407867432, "step": 2120 }, { "epoch": 0.8019578313253012, "grad_norm": 83.16922240969257, "learning_rate": 7.996046686746987e-07, "logits/chosen": -2.0541014671325684, "logits/rejected": -2.1207032203674316, "logps/chosen": -397.63751220703125, "logps/rejected": -382.3999938964844, "loss": 0.2729, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.123205542564392, "rewards/margins": 3.905956983566284, "rewards/rejected": -5.031836032867432, "step": 2130 }, { "epoch": 0.8057228915662651, "grad_norm": 80.05012421912659, "learning_rate": 7.986634036144578e-07, "logits/chosen": -2.152539014816284, "logits/rejected": -2.142382860183716, "logps/chosen": -381.5375061035156, "logps/rejected": -350.4750061035156, "loss": 0.2322, "rewards/accuracies": 0.875, "rewards/chosen": -0.984057605266571, "rewards/margins": 3.9759764671325684, "rewards/rejected": -4.95703125, "step": 2140 }, { "epoch": 0.8094879518072289, "grad_norm": 65.00826425088366, "learning_rate": 7.977221385542169e-07, "logits/chosen": -2.0501952171325684, "logits/rejected": -2.037304639816284, "logps/chosen": -436.5, "logps/rejected": -391.125, "loss": 0.3098, "rewards/accuracies": 0.84375, "rewards/chosen": -0.775073230266571, "rewards/margins": 3.6166014671325684, "rewards/rejected": -4.392773628234863, "step": 2150 }, { "epoch": 0.8132530120481928, "grad_norm": 63.14394341832049, "learning_rate": 7.967808734939759e-07, "logits/chosen": -2.022656202316284, "logits/rejected": -2.106640577316284, "logps/chosen": -415.9750061035156, "logps/rejected": -385.1499938964844, "loss": 0.2306, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.142370581626892, "rewards/margins": 3.823046922683716, "rewards/rejected": -4.96484375, "step": 2160 }, { "epoch": 0.8170180722891566, "grad_norm": 99.96284056305113, "learning_rate": 7.958396084337348e-07, "logits/chosen": -2.0794920921325684, "logits/rejected": -2.0999999046325684, "logps/chosen": -400.3374938964844, "logps/rejected": -407.17498779296875, "loss": 0.4031, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.309326171875, "rewards/margins": 3.659289598464966, "rewards/rejected": -4.9677734375, "step": 2170 }, { "epoch": 0.8207831325301205, "grad_norm": 53.843086816069786, "learning_rate": 7.948983433734939e-07, "logits/chosen": -2.1900391578674316, "logits/rejected": -2.1919922828674316, "logps/chosen": -467.7749938964844, "logps/rejected": -402.9750061035156, "loss": 0.2872, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.264135718345642, "rewards/margins": 3.8086915016174316, "rewards/rejected": -5.071484565734863, "step": 2180 }, { "epoch": 0.8245481927710844, "grad_norm": 104.61627212557983, "learning_rate": 7.93957078313253e-07, "logits/chosen": -2.164355516433716, "logits/rejected": -2.214062452316284, "logps/chosen": -437.8999938964844, "logps/rejected": -407.07501220703125, "loss": 0.3468, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.327856421470642, "rewards/margins": 3.512158155441284, "rewards/rejected": -4.837890625, "step": 2190 }, { "epoch": 0.8283132530120482, "grad_norm": 36.680227522223596, "learning_rate": 7.930158132530121e-07, "logits/chosen": -2.177734375, "logits/rejected": -2.2152342796325684, "logps/chosen": -422.54998779296875, "logps/rejected": -397.25, "loss": 0.2817, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.363134741783142, "rewards/margins": 3.6421875953674316, "rewards/rejected": -5.004101753234863, "step": 2200 }, { "epoch": 0.8320783132530121, "grad_norm": 109.44952885909207, "learning_rate": 7.92074548192771e-07, "logits/chosen": -2.276171922683716, "logits/rejected": -2.2548828125, "logps/chosen": -416.0375061035156, "logps/rejected": -375.20001220703125, "loss": 0.3341, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.4935424327850342, "rewards/margins": 3.582275390625, "rewards/rejected": -5.076757907867432, "step": 2210 }, { "epoch": 0.8358433734939759, "grad_norm": 53.49300596037489, "learning_rate": 7.911332831325301e-07, "logits/chosen": -2.216015577316284, "logits/rejected": -2.166796922683716, "logps/chosen": -385.7250061035156, "logps/rejected": -381.3500061035156, "loss": 0.2174, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.655908226966858, "rewards/margins": 3.9443359375, "rewards/rejected": -5.600781440734863, "step": 2220 }, { "epoch": 0.8396084337349398, "grad_norm": 44.28004518013996, "learning_rate": 7.901920180722891e-07, "logits/chosen": -2.3306641578674316, "logits/rejected": -2.299609422683716, "logps/chosen": -419.45001220703125, "logps/rejected": -385.5249938964844, "loss": 0.2814, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.7302124500274658, "rewards/margins": 3.587158203125, "rewards/rejected": -5.316504001617432, "step": 2230 }, { "epoch": 0.8433734939759037, "grad_norm": 75.78467533912237, "learning_rate": 7.892507530120482e-07, "logits/chosen": -2.225390672683716, "logits/rejected": -2.305859327316284, "logps/chosen": -394.79998779296875, "logps/rejected": -380.0375061035156, "loss": 0.3179, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.7059204578399658, "rewards/margins": 3.8305907249450684, "rewards/rejected": -5.534570217132568, "step": 2240 }, { "epoch": 0.8471385542168675, "grad_norm": 45.34661443851996, "learning_rate": 7.883094879518072e-07, "logits/chosen": -2.203906297683716, "logits/rejected": -2.166796922683716, "logps/chosen": -437.4750061035156, "logps/rejected": -414.625, "loss": 0.2082, "rewards/accuracies": 0.90625, "rewards/chosen": -1.7566649913787842, "rewards/margins": 4.502148628234863, "rewards/rejected": -6.255859375, "step": 2250 }, { "epoch": 0.8509036144578314, "grad_norm": 125.10049140813433, "learning_rate": 7.873682228915662e-07, "logits/chosen": -2.189257860183716, "logits/rejected": -2.149609327316284, "logps/chosen": -407.73748779296875, "logps/rejected": -381.54998779296875, "loss": 0.3548, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7558104991912842, "rewards/margins": 3.6841797828674316, "rewards/rejected": -5.444140434265137, "step": 2260 }, { "epoch": 0.8546686746987951, "grad_norm": 116.29623654419734, "learning_rate": 7.864269578313253e-07, "logits/chosen": -2.200976610183716, "logits/rejected": -2.2621092796325684, "logps/chosen": -467.04998779296875, "logps/rejected": -394.67498779296875, "loss": 0.2738, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.334619164466858, "rewards/margins": 4.431738376617432, "rewards/rejected": -5.764843940734863, "step": 2270 }, { "epoch": 0.858433734939759, "grad_norm": 102.5051179026946, "learning_rate": 7.854856927710844e-07, "logits/chosen": -2.123242139816284, "logits/rejected": -2.173632860183716, "logps/chosen": -452.7124938964844, "logps/rejected": -409.5249938964844, "loss": 0.2335, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.563745141029358, "rewards/margins": 3.996044874191284, "rewards/rejected": -5.565234184265137, "step": 2280 }, { "epoch": 0.8621987951807228, "grad_norm": 28.572709672856643, "learning_rate": 7.845444277108434e-07, "logits/chosen": -2.190234422683716, "logits/rejected": -2.207226514816284, "logps/chosen": -444.875, "logps/rejected": -380.2250061035156, "loss": 0.2355, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.364526391029358, "rewards/margins": 4.466894626617432, "rewards/rejected": -5.8359375, "step": 2290 }, { "epoch": 0.8659638554216867, "grad_norm": 66.7341096771834, "learning_rate": 7.836031626506023e-07, "logits/chosen": -2.1585936546325684, "logits/rejected": -2.2132811546325684, "logps/chosen": -393.6000061035156, "logps/rejected": -369.75, "loss": 0.3132, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.079833984375, "rewards/margins": 3.9537110328674316, "rewards/rejected": -5.032422065734863, "step": 2300 }, { "epoch": 0.8697289156626506, "grad_norm": 62.54331921282039, "learning_rate": 7.826618975903614e-07, "logits/chosen": -2.032031297683716, "logits/rejected": -2.0335936546325684, "logps/chosen": -409.20001220703125, "logps/rejected": -384.6499938964844, "loss": 0.3281, "rewards/accuracies": 0.84375, "rewards/chosen": -1.1747314929962158, "rewards/margins": 3.9647459983825684, "rewards/rejected": -5.143164157867432, "step": 2310 }, { "epoch": 0.8734939759036144, "grad_norm": 127.5770791364032, "learning_rate": 7.817206325301204e-07, "logits/chosen": -2.1566405296325684, "logits/rejected": -2.125781297683716, "logps/chosen": -383.3999938964844, "logps/rejected": -364.6499938964844, "loss": 0.3012, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.879480004310608, "rewards/margins": 3.7074217796325684, "rewards/rejected": -5.5869140625, "step": 2320 }, { "epoch": 0.8772590361445783, "grad_norm": 92.85886980999535, "learning_rate": 7.807793674698795e-07, "logits/chosen": -2.2164063453674316, "logits/rejected": -2.1923828125, "logps/chosen": -418.4624938964844, "logps/rejected": -412.0, "loss": 0.207, "rewards/accuracies": 0.9375, "rewards/chosen": -2.02587890625, "rewards/margins": 4.212011814117432, "rewards/rejected": -6.236718654632568, "step": 2330 }, { "epoch": 0.8810240963855421, "grad_norm": 55.785007605140855, "learning_rate": 7.798381024096386e-07, "logits/chosen": -2.0990233421325684, "logits/rejected": -2.1558594703674316, "logps/chosen": -521.5250244140625, "logps/rejected": -429.67498779296875, "loss": 0.2669, "rewards/accuracies": 0.875, "rewards/chosen": -1.65948486328125, "rewards/margins": 4.378027439117432, "rewards/rejected": -6.037109375, "step": 2340 }, { "epoch": 0.884789156626506, "grad_norm": 56.57564492139891, "learning_rate": 7.788968373493976e-07, "logits/chosen": -2.169726610183716, "logits/rejected": -2.177734375, "logps/chosen": -476.125, "logps/rejected": -389.8500061035156, "loss": 0.3144, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.9027831554412842, "rewards/margins": 4.3310546875, "rewards/rejected": -6.2353515625, "step": 2350 }, { "epoch": 0.8885542168674698, "grad_norm": 116.05497792705081, "learning_rate": 7.779555722891565e-07, "logits/chosen": -2.1558594703674316, "logits/rejected": -2.135546922683716, "logps/chosen": -433.6000061035156, "logps/rejected": -415.45001220703125, "loss": 0.3279, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.2577881813049316, "rewards/margins": 4.081835746765137, "rewards/rejected": -6.344140529632568, "step": 2360 }, { "epoch": 0.8923192771084337, "grad_norm": 88.31264958274684, "learning_rate": 7.770143072289156e-07, "logits/chosen": -2.098437547683716, "logits/rejected": -2.1201171875, "logps/chosen": -469.2749938964844, "logps/rejected": -437.92498779296875, "loss": 0.2903, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.6334471702575684, "rewards/margins": 4.435449123382568, "rewards/rejected": -7.071484565734863, "step": 2370 }, { "epoch": 0.8960843373493976, "grad_norm": 49.55958089633511, "learning_rate": 7.760730421686747e-07, "logits/chosen": -2.1167969703674316, "logits/rejected": -2.11376953125, "logps/chosen": -495.2250061035156, "logps/rejected": -404.7250061035156, "loss": 0.2998, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.7323241233825684, "rewards/margins": 3.6617188453674316, "rewards/rejected": -6.398828029632568, "step": 2380 }, { "epoch": 0.8998493975903614, "grad_norm": 98.66370868308631, "learning_rate": 7.751317771084337e-07, "logits/chosen": -2.173632860183716, "logits/rejected": -2.1357421875, "logps/chosen": -423.92498779296875, "logps/rejected": -405.125, "loss": 0.2752, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.8661131858825684, "rewards/margins": 3.492480516433716, "rewards/rejected": -6.358984470367432, "step": 2390 }, { "epoch": 0.9036144578313253, "grad_norm": 71.63020996624344, "learning_rate": 7.741905120481927e-07, "logits/chosen": -2.126757860183716, "logits/rejected": -2.2144532203674316, "logps/chosen": -430.8125, "logps/rejected": -410.4624938964844, "loss": 0.1734, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.5889649391174316, "rewards/margins": 4.207617282867432, "rewards/rejected": -6.792578220367432, "step": 2400 }, { "epoch": 0.9073795180722891, "grad_norm": 37.883797980028234, "learning_rate": 7.732492469879518e-07, "logits/chosen": -2.102734327316284, "logits/rejected": -2.148242235183716, "logps/chosen": -412.625, "logps/rejected": -382.32501220703125, "loss": 0.3466, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.7710938453674316, "rewards/margins": 3.945117235183716, "rewards/rejected": -6.718359470367432, "step": 2410 }, { "epoch": 0.911144578313253, "grad_norm": 69.09274332354535, "learning_rate": 7.723079819277109e-07, "logits/chosen": -2.1322264671325684, "logits/rejected": -2.140625, "logps/chosen": -447.79998779296875, "logps/rejected": -415.1499938964844, "loss": 0.2154, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.8382811546325684, "rewards/margins": 4.336523532867432, "rewards/rejected": -7.175000190734863, "step": 2420 }, { "epoch": 0.9149096385542169, "grad_norm": 78.49347837115606, "learning_rate": 7.713667168674698e-07, "logits/chosen": -2.0721678733825684, "logits/rejected": -2.1185545921325684, "logps/chosen": -485.4375, "logps/rejected": -438.17498779296875, "loss": 0.2164, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -2.733593702316284, "rewards/margins": 4.685644626617432, "rewards/rejected": -7.420312404632568, "step": 2430 }, { "epoch": 0.9186746987951807, "grad_norm": 180.80019247981, "learning_rate": 7.704254518072288e-07, "logits/chosen": -2.075976610183716, "logits/rejected": -2.0908203125, "logps/chosen": -481.4624938964844, "logps/rejected": -443.375, "loss": 0.3957, "rewards/accuracies": 0.84375, "rewards/chosen": -3.091259717941284, "rewards/margins": 3.7608399391174316, "rewards/rejected": -6.850390434265137, "step": 2440 }, { "epoch": 0.9224397590361446, "grad_norm": 115.3631228168688, "learning_rate": 7.694841867469879e-07, "logits/chosen": -2.1490235328674316, "logits/rejected": -2.1060547828674316, "logps/chosen": -420.5375061035156, "logps/rejected": -409.25, "loss": 0.2618, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.6142210960388184, "rewards/margins": 4.400781154632568, "rewards/rejected": -7.017187595367432, "step": 2450 }, { "epoch": 0.9262048192771084, "grad_norm": 95.5790815182828, "learning_rate": 7.68542921686747e-07, "logits/chosen": -2.149218797683716, "logits/rejected": -2.1455078125, "logps/chosen": -437.9125061035156, "logps/rejected": -406.54998779296875, "loss": 0.3042, "rewards/accuracies": 0.8125, "rewards/chosen": -3.4984374046325684, "rewards/margins": 4.423828125, "rewards/rejected": -7.921093940734863, "step": 2460 }, { "epoch": 0.9299698795180723, "grad_norm": 120.18179165232532, "learning_rate": 7.67601656626506e-07, "logits/chosen": -2.141406297683716, "logits/rejected": -2.177539110183716, "logps/chosen": -444.75, "logps/rejected": -421.92498779296875, "loss": 0.3084, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.28070068359375, "rewards/margins": 4.127295017242432, "rewards/rejected": -7.41015625, "step": 2470 }, { "epoch": 0.9337349397590361, "grad_norm": 20.614892800581636, "learning_rate": 7.66660391566265e-07, "logits/chosen": -2.1724610328674316, "logits/rejected": -2.1158204078674316, "logps/chosen": -430.8999938964844, "logps/rejected": -393.54998779296875, "loss": 0.3273, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.763964891433716, "rewards/margins": 4.06689453125, "rewards/rejected": -6.831250190734863, "step": 2480 }, { "epoch": 0.9375, "grad_norm": 34.03678726583991, "learning_rate": 7.657191265060241e-07, "logits/chosen": -2.2203125953674316, "logits/rejected": -2.255859375, "logps/chosen": -434.32501220703125, "logps/rejected": -379.875, "loss": 0.2782, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.3963866233825684, "rewards/margins": 3.942187547683716, "rewards/rejected": -6.336328029632568, "step": 2490 }, { "epoch": 0.9412650602409639, "grad_norm": 49.38862706704195, "learning_rate": 7.647778614457831e-07, "logits/chosen": -2.1382813453674316, "logits/rejected": -2.1519532203674316, "logps/chosen": -383.82501220703125, "logps/rejected": -382.125, "loss": 0.2176, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.033642530441284, "rewards/margins": 4.150976657867432, "rewards/rejected": -6.183984279632568, "step": 2500 }, { "epoch": 0.9450301204819277, "grad_norm": 33.35118237739162, "learning_rate": 7.638365963855421e-07, "logits/chosen": -2.159960985183716, "logits/rejected": -2.2455077171325684, "logps/chosen": -489.5249938964844, "logps/rejected": -413.0, "loss": 0.311, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.67669677734375, "rewards/margins": 3.9810547828674316, "rewards/rejected": -5.655859470367432, "step": 2510 }, { "epoch": 0.9487951807228916, "grad_norm": 38.865796553098285, "learning_rate": 7.628953313253011e-07, "logits/chosen": -2.187695264816284, "logits/rejected": -2.2113280296325684, "logps/chosen": -398.61248779296875, "logps/rejected": -393.3500061035156, "loss": 0.2427, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.7671020030975342, "rewards/margins": 3.971874952316284, "rewards/rejected": -5.741015434265137, "step": 2520 }, { "epoch": 0.9525602409638554, "grad_norm": 118.37448324080988, "learning_rate": 7.619540662650602e-07, "logits/chosen": -2.081249952316284, "logits/rejected": -2.099316358566284, "logps/chosen": -438.67498779296875, "logps/rejected": -436.67498779296875, "loss": 0.2596, "rewards/accuracies": 0.875, "rewards/chosen": -1.6367919445037842, "rewards/margins": 4.216210842132568, "rewards/rejected": -5.850390434265137, "step": 2530 }, { "epoch": 0.9563253012048193, "grad_norm": 69.04080314480599, "learning_rate": 7.610128012048193e-07, "logits/chosen": -2.1714844703674316, "logits/rejected": -2.197070360183716, "logps/chosen": -408.92498779296875, "logps/rejected": -379.75, "loss": 0.2834, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.5866577625274658, "rewards/margins": 3.890331983566284, "rewards/rejected": -5.474413871765137, "step": 2540 }, { "epoch": 0.9600903614457831, "grad_norm": 76.62912705699492, "learning_rate": 7.600715361445783e-07, "logits/chosen": -2.2027344703674316, "logits/rejected": -2.1923828125, "logps/chosen": -391.04998779296875, "logps/rejected": -397.20001220703125, "loss": 0.2674, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.9059569835662842, "rewards/margins": 3.57763671875, "rewards/rejected": -5.482031345367432, "step": 2550 }, { "epoch": 0.963855421686747, "grad_norm": 18.33619309115644, "learning_rate": 7.591302710843373e-07, "logits/chosen": -2.1728515625, "logits/rejected": -2.224804639816284, "logps/chosen": -431.5, "logps/rejected": -438.2250061035156, "loss": 0.223, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.1110596656799316, "rewards/margins": 4.386816501617432, "rewards/rejected": -6.498437404632568, "step": 2560 }, { "epoch": 0.9676204819277109, "grad_norm": 144.03717720091728, "learning_rate": 7.581890060240963e-07, "logits/chosen": -2.091992139816284, "logits/rejected": -2.158398389816284, "logps/chosen": -470.0249938964844, "logps/rejected": -396.45001220703125, "loss": 0.3892, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.1910643577575684, "rewards/margins": 4.182568550109863, "rewards/rejected": -6.3720703125, "step": 2570 }, { "epoch": 0.9713855421686747, "grad_norm": 90.60833645659875, "learning_rate": 7.572477409638554e-07, "logits/chosen": -2.197070360183716, "logits/rejected": -2.267578125, "logps/chosen": -405.17498779296875, "logps/rejected": -390.125, "loss": 0.2283, "rewards/accuracies": 0.90625, "rewards/chosen": -1.663330078125, "rewards/margins": 4.199999809265137, "rewards/rejected": -5.865624904632568, "step": 2580 }, { "epoch": 0.9751506024096386, "grad_norm": 56.025060264277215, "learning_rate": 7.563064759036144e-07, "logits/chosen": -2.208789110183716, "logits/rejected": -2.2275390625, "logps/chosen": -407.57501220703125, "logps/rejected": -384.7250061035156, "loss": 0.2535, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.976721167564392, "rewards/margins": 4.302734375, "rewards/rejected": -6.275390625, "step": 2590 }, { "epoch": 0.9789156626506024, "grad_norm": 100.00198943716327, "learning_rate": 7.553652108433735e-07, "logits/chosen": -2.2544922828674316, "logits/rejected": -2.2279295921325684, "logps/chosen": -428.57501220703125, "logps/rejected": -411.67498779296875, "loss": 0.2433, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.9423339366912842, "rewards/margins": 4.581152439117432, "rewards/rejected": -6.526953220367432, "step": 2600 }, { "epoch": 0.9826807228915663, "grad_norm": 77.99940278599016, "learning_rate": 7.544239457831325e-07, "logits/chosen": -2.2041015625, "logits/rejected": -2.240039110183716, "logps/chosen": -459.7250061035156, "logps/rejected": -401.79998779296875, "loss": 0.2457, "rewards/accuracies": 0.875, "rewards/chosen": -1.494042992591858, "rewards/margins": 4.529492378234863, "rewards/rejected": -6.025586128234863, "step": 2610 }, { "epoch": 0.9864457831325302, "grad_norm": 51.425008974249046, "learning_rate": 7.534826807228915e-07, "logits/chosen": -2.164843797683716, "logits/rejected": -2.1937499046325684, "logps/chosen": -446.0249938964844, "logps/rejected": -405.86248779296875, "loss": 0.3019, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7052490711212158, "rewards/margins": 3.96923828125, "rewards/rejected": -5.676562309265137, "step": 2620 }, { "epoch": 0.990210843373494, "grad_norm": 64.45148648540383, "learning_rate": 7.525414156626506e-07, "logits/chosen": -2.2025389671325684, "logits/rejected": -2.1763672828674316, "logps/chosen": -416.9624938964844, "logps/rejected": -410.45001220703125, "loss": 0.2863, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.9085571765899658, "rewards/margins": 3.8433594703674316, "rewards/rejected": -5.750781059265137, "step": 2630 }, { "epoch": 0.9939759036144579, "grad_norm": 40.879199642329645, "learning_rate": 7.516001506024096e-07, "logits/chosen": -2.100390672683716, "logits/rejected": -2.1509766578674316, "logps/chosen": -459.6499938964844, "logps/rejected": -394.125, "loss": 0.2579, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.6545531749725342, "rewards/margins": 4.626367092132568, "rewards/rejected": -6.282812595367432, "step": 2640 }, { "epoch": 0.9977409638554217, "grad_norm": 59.98598909241057, "learning_rate": 7.506588855421686e-07, "logits/chosen": -2.1981444358825684, "logits/rejected": -2.1919922828674316, "logps/chosen": -428.9375, "logps/rejected": -401.25, "loss": 0.2651, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.6555664539337158, "rewards/margins": 4.279052734375, "rewards/rejected": -5.927734375, "step": 2650 }, { "epoch": 1.0015060240963856, "grad_norm": 34.97011926560312, "learning_rate": 7.497176204819276e-07, "logits/chosen": -2.1224608421325684, "logits/rejected": -2.1763672828674316, "logps/chosen": -400.8500061035156, "logps/rejected": -363.5249938964844, "loss": 0.1826, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.9047362804412842, "rewards/margins": 4.1220703125, "rewards/rejected": -6.023828029632568, "step": 2660 }, { "epoch": 1.0052710843373494, "grad_norm": 39.139915001546065, "learning_rate": 7.487763554216867e-07, "logits/chosen": -2.1949219703674316, "logits/rejected": -2.2818360328674316, "logps/chosen": -460.0249938964844, "logps/rejected": -386.2749938964844, "loss": 0.0837, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.3486816883087158, "rewards/margins": 5.373827934265137, "rewards/rejected": -6.73046875, "step": 2670 }, { "epoch": 1.0090361445783131, "grad_norm": 17.172694438867197, "learning_rate": 7.478350903614458e-07, "logits/chosen": -2.1884765625, "logits/rejected": -2.212890625, "logps/chosen": -409.125, "logps/rejected": -376.95001220703125, "loss": 0.0651, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.6854979991912842, "rewards/margins": 5.237109184265137, "rewards/rejected": -6.928515434265137, "step": 2680 }, { "epoch": 1.0128012048192772, "grad_norm": 15.715030586136296, "learning_rate": 7.468938253012049e-07, "logits/chosen": -2.238085985183716, "logits/rejected": -2.2679686546325684, "logps/chosen": -436.2250061035156, "logps/rejected": -427.625, "loss": 0.0687, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.0873045921325684, "rewards/margins": 5.745312690734863, "rewards/rejected": -7.836718559265137, "step": 2690 }, { "epoch": 1.016566265060241, "grad_norm": 28.29758653554626, "learning_rate": 7.459525602409638e-07, "logits/chosen": -2.2798829078674316, "logits/rejected": -2.2183594703674316, "logps/chosen": -429.8125, "logps/rejected": -420.86248779296875, "loss": 0.0577, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.8884766101837158, "rewards/margins": 5.748046875, "rewards/rejected": -7.63671875, "step": 2700 }, { "epoch": 1.0203313253012047, "grad_norm": 38.52182787805743, "learning_rate": 7.450112951807228e-07, "logits/chosen": -2.213085889816284, "logits/rejected": -2.281054735183716, "logps/chosen": -436.2749938964844, "logps/rejected": -415.5, "loss": 0.0744, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.9893310070037842, "rewards/margins": 5.750390529632568, "rewards/rejected": -7.741406440734863, "step": 2710 }, { "epoch": 1.0240963855421688, "grad_norm": 11.55091269515102, "learning_rate": 7.440700301204819e-07, "logits/chosen": -2.348828077316284, "logits/rejected": -2.408398389816284, "logps/chosen": -419.2124938964844, "logps/rejected": -432.5249938964844, "loss": 0.0539, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.471484422683716, "rewards/margins": 6.152929782867432, "rewards/rejected": -8.627344131469727, "step": 2720 }, { "epoch": 1.0278614457831325, "grad_norm": 5.498846480910144, "learning_rate": 7.43128765060241e-07, "logits/chosen": -2.4222655296325684, "logits/rejected": -2.481640577316284, "logps/chosen": -455.2749938964844, "logps/rejected": -476.92498779296875, "loss": 0.0448, "rewards/accuracies": 1.0, "rewards/chosen": -2.1492919921875, "rewards/margins": 6.530468940734863, "rewards/rejected": -8.682812690734863, "step": 2730 }, { "epoch": 1.0316265060240963, "grad_norm": 17.681592244618823, "learning_rate": 7.421874999999999e-07, "logits/chosen": -2.40234375, "logits/rejected": -2.4652342796325684, "logps/chosen": -433.38751220703125, "logps/rejected": -423.20001220703125, "loss": 0.0723, "rewards/accuracies": 0.96875, "rewards/chosen": -2.7181639671325684, "rewards/margins": 6.478906154632568, "rewards/rejected": -9.200780868530273, "step": 2740 }, { "epoch": 1.0353915662650603, "grad_norm": 29.14375863480098, "learning_rate": 7.41246234939759e-07, "logits/chosen": -2.4410157203674316, "logits/rejected": -2.484570264816284, "logps/chosen": -454.82501220703125, "logps/rejected": -441.3500061035156, "loss": 0.0759, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.122363328933716, "rewards/margins": 6.213281154632568, "rewards/rejected": -9.336718559265137, "step": 2750 }, { "epoch": 1.0391566265060241, "grad_norm": 17.97997752937906, "learning_rate": 7.403049698795181e-07, "logits/chosen": -2.3453125953674316, "logits/rejected": -2.41015625, "logps/chosen": -411.8500061035156, "logps/rejected": -469.0249938964844, "loss": 0.0733, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.46002197265625, "rewards/margins": 6.644140720367432, "rewards/rejected": -9.108593940734863, "step": 2760 }, { "epoch": 1.042921686746988, "grad_norm": 16.48330604315409, "learning_rate": 7.393637048192772e-07, "logits/chosen": -2.421093702316284, "logits/rejected": -2.5224609375, "logps/chosen": -402.17498779296875, "logps/rejected": -403.25, "loss": 0.0816, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.3724243640899658, "rewards/margins": 5.713476657867432, "rewards/rejected": -7.086328029632568, "step": 2770 }, { "epoch": 1.0466867469879517, "grad_norm": 48.084887017446775, "learning_rate": 7.38422439759036e-07, "logits/chosen": -2.423828125, "logits/rejected": -2.4818358421325684, "logps/chosen": -444.3999938964844, "logps/rejected": -416.2749938964844, "loss": 0.0482, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.332250952720642, "rewards/margins": 6.478906154632568, "rewards/rejected": -7.80859375, "step": 2780 }, { "epoch": 1.0504518072289157, "grad_norm": 38.295242248125525, "learning_rate": 7.374811746987951e-07, "logits/chosen": -2.429882764816284, "logits/rejected": -2.4361329078674316, "logps/chosen": -452.13751220703125, "logps/rejected": -449.04998779296875, "loss": 0.1225, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -2.161083936691284, "rewards/margins": 6.597265720367432, "rewards/rejected": -8.760156631469727, "step": 2790 }, { "epoch": 1.0542168674698795, "grad_norm": 10.71649487178938, "learning_rate": 7.365399096385542e-07, "logits/chosen": -2.4439454078674316, "logits/rejected": -2.461132764816284, "logps/chosen": -428.3500061035156, "logps/rejected": -413.6499938964844, "loss": 0.1144, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.40081787109375, "rewards/margins": 6.574999809265137, "rewards/rejected": -8.978906631469727, "step": 2800 }, { "epoch": 1.0579819277108433, "grad_norm": 6.922151376131191, "learning_rate": 7.355986445783132e-07, "logits/chosen": -2.514843702316284, "logits/rejected": -2.5552735328674316, "logps/chosen": -445.3999938964844, "logps/rejected": -430.1499938964844, "loss": 0.0607, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.455615282058716, "rewards/margins": 6.948437690734863, "rewards/rejected": -9.403905868530273, "step": 2810 }, { "epoch": 1.0617469879518073, "grad_norm": 17.716708326354407, "learning_rate": 7.346573795180723e-07, "logits/chosen": -2.4136719703674316, "logits/rejected": -2.4644532203674316, "logps/chosen": -445.9750061035156, "logps/rejected": -417.70001220703125, "loss": 0.0434, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.70428466796875, "rewards/margins": 6.689453125, "rewards/rejected": -9.392969131469727, "step": 2820 }, { "epoch": 1.0655120481927711, "grad_norm": 8.725297939848499, "learning_rate": 7.337161144578313e-07, "logits/chosen": -2.394335985183716, "logits/rejected": -2.539257764816284, "logps/chosen": -474.375, "logps/rejected": -444.875, "loss": 0.0521, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.45953369140625, "rewards/margins": 6.529687404632568, "rewards/rejected": -8.990625381469727, "step": 2830 }, { "epoch": 1.069277108433735, "grad_norm": 1.7039794856874941, "learning_rate": 7.327748493975904e-07, "logits/chosen": -2.4175782203674316, "logits/rejected": -2.545117139816284, "logps/chosen": -456.79998779296875, "logps/rejected": -427.29998779296875, "loss": 0.0563, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.646533250808716, "rewards/margins": 6.752343654632568, "rewards/rejected": -9.399218559265137, "step": 2840 }, { "epoch": 1.0730421686746987, "grad_norm": 35.01081145131901, "learning_rate": 7.318335843373493e-07, "logits/chosen": -2.3912110328674316, "logits/rejected": -2.4945311546325684, "logps/chosen": -525.8125, "logps/rejected": -446.625, "loss": 0.0513, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.21923828125, "rewards/margins": 7.391406059265137, "rewards/rejected": -9.611719131469727, "step": 2850 }, { "epoch": 1.0768072289156627, "grad_norm": 3.4823158261555327, "learning_rate": 7.308923192771084e-07, "logits/chosen": -2.466796875, "logits/rejected": -2.586718797683716, "logps/chosen": -452.6499938964844, "logps/rejected": -426.8999938964844, "loss": 0.0847, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.7595946788787842, "rewards/margins": 7.018359184265137, "rewards/rejected": -8.774218559265137, "step": 2860 }, { "epoch": 1.0805722891566265, "grad_norm": 19.88234518092511, "learning_rate": 7.299510542168674e-07, "logits/chosen": -2.388671875, "logits/rejected": -2.4427733421325684, "logps/chosen": -481.0249938964844, "logps/rejected": -427.6499938964844, "loss": 0.0976, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -2.681933641433716, "rewards/margins": 6.595312595367432, "rewards/rejected": -9.282812118530273, "step": 2870 }, { "epoch": 1.0843373493975903, "grad_norm": 23.559486528134435, "learning_rate": 7.290097891566265e-07, "logits/chosen": -2.473828077316284, "logits/rejected": -2.4906249046325684, "logps/chosen": -472.20001220703125, "logps/rejected": -463.29998779296875, "loss": 0.049, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.0853028297424316, "rewards/margins": 7.02734375, "rewards/rejected": -10.110156059265137, "step": 2880 }, { "epoch": 1.0881024096385543, "grad_norm": 12.908923576498568, "learning_rate": 7.280685240963855e-07, "logits/chosen": -2.372851610183716, "logits/rejected": -2.419921875, "logps/chosen": -439.29998779296875, "logps/rejected": -437.0249938964844, "loss": 0.066, "rewards/accuracies": 0.96875, "rewards/chosen": -2.85498046875, "rewards/margins": 6.760937690734863, "rewards/rejected": -9.615625381469727, "step": 2890 }, { "epoch": 1.091867469879518, "grad_norm": 17.245961513818337, "learning_rate": 7.271272590361446e-07, "logits/chosen": -2.3681640625, "logits/rejected": -2.429882764816284, "logps/chosen": -423.5, "logps/rejected": -407.8999938964844, "loss": 0.0747, "rewards/accuracies": 0.96875, "rewards/chosen": -2.6109557151794434, "rewards/margins": 5.849218845367432, "rewards/rejected": -8.461718559265137, "step": 2900 }, { "epoch": 1.095632530120482, "grad_norm": 52.260711061516474, "learning_rate": 7.261859939759037e-07, "logits/chosen": -2.4039063453674316, "logits/rejected": -2.4818358421325684, "logps/chosen": -422.0, "logps/rejected": -423.375, "loss": 0.041, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.849206566810608, "rewards/margins": 7.10546875, "rewards/rejected": -8.954297065734863, "step": 2910 }, { "epoch": 1.0993975903614457, "grad_norm": 7.595302936740082, "learning_rate": 7.252447289156625e-07, "logits/chosen": -2.5230469703674316, "logits/rejected": -2.5796875953674316, "logps/chosen": -428.61248779296875, "logps/rejected": -399.04998779296875, "loss": 0.0593, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.323779344558716, "rewards/margins": 6.658593654632568, "rewards/rejected": -8.98046875, "step": 2920 }, { "epoch": 1.1031626506024097, "grad_norm": 13.982075156398155, "learning_rate": 7.243034638554216e-07, "logits/chosen": -2.4613280296325684, "logits/rejected": -2.6451172828674316, "logps/chosen": -392.0249938964844, "logps/rejected": -423.8500061035156, "loss": 0.1006, "rewards/accuracies": 0.96875, "rewards/chosen": -2.268798828125, "rewards/margins": 6.1015625, "rewards/rejected": -8.370312690734863, "step": 2930 }, { "epoch": 1.1069277108433735, "grad_norm": 22.13295780617572, "learning_rate": 7.233621987951807e-07, "logits/chosen": -2.4388670921325684, "logits/rejected": -2.513867139816284, "logps/chosen": -494.2250061035156, "logps/rejected": -465.625, "loss": 0.0425, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.110107421875, "rewards/margins": 7.458593845367432, "rewards/rejected": -9.568750381469727, "step": 2940 }, { "epoch": 1.1106927710843373, "grad_norm": 70.45287242879155, "learning_rate": 7.224209337349398e-07, "logits/chosen": -2.541015625, "logits/rejected": -2.6742186546325684, "logps/chosen": -402.1000061035156, "logps/rejected": -431.125, "loss": 0.1283, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.113757371902466, "rewards/margins": 6.5341796875, "rewards/rejected": -9.646875381469727, "step": 2950 }, { "epoch": 1.1144578313253013, "grad_norm": 8.641265337653335, "learning_rate": 7.214796686746987e-07, "logits/chosen": -2.510546922683716, "logits/rejected": -2.556835889816284, "logps/chosen": -413.0375061035156, "logps/rejected": -410.29998779296875, "loss": 0.0671, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.080127000808716, "rewards/margins": 7.086133003234863, "rewards/rejected": -10.16796875, "step": 2960 }, { "epoch": 1.118222891566265, "grad_norm": 20.341061528743797, "learning_rate": 7.205384036144578e-07, "logits/chosen": -2.3607420921325684, "logits/rejected": -2.4146485328674316, "logps/chosen": -469.6499938964844, "logps/rejected": -469.29998779296875, "loss": 0.0463, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.6838622093200684, "rewards/margins": 6.380468845367432, "rewards/rejected": -9.0625, "step": 2970 }, { "epoch": 1.1219879518072289, "grad_norm": 36.79411332064457, "learning_rate": 7.195971385542169e-07, "logits/chosen": -2.39453125, "logits/rejected": -2.4644532203674316, "logps/chosen": -425.70001220703125, "logps/rejected": -444.0249938964844, "loss": 0.0829, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.604480028152466, "rewards/margins": 6.633203029632568, "rewards/rejected": -9.236719131469727, "step": 2980 }, { "epoch": 1.1257530120481927, "grad_norm": 32.75308782628263, "learning_rate": 7.186558734939759e-07, "logits/chosen": -2.3958983421325684, "logits/rejected": -2.5015625953674316, "logps/chosen": -441.04998779296875, "logps/rejected": -423.6499938964844, "loss": 0.086, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.7381348609924316, "rewards/margins": 6.879296779632568, "rewards/rejected": -9.621874809265137, "step": 2990 }, { "epoch": 1.1295180722891567, "grad_norm": 24.67927737803069, "learning_rate": 7.177146084337348e-07, "logits/chosen": -2.4251952171325684, "logits/rejected": -2.484570264816284, "logps/chosen": -440.3999938964844, "logps/rejected": -447.7250061035156, "loss": 0.0565, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.9546875953674316, "rewards/margins": 6.260546684265137, "rewards/rejected": -10.21484375, "step": 3000 }, { "epoch": 1.1332831325301205, "grad_norm": 48.565817118031404, "learning_rate": 7.167733433734939e-07, "logits/chosen": -2.479296922683716, "logits/rejected": -2.561328172683716, "logps/chosen": -434.6000061035156, "logps/rejected": -420.04998779296875, "loss": 0.0409, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.717578172683716, "rewards/margins": 7.123827934265137, "rewards/rejected": -10.83984375, "step": 3010 }, { "epoch": 1.1370481927710843, "grad_norm": 33.421980182143464, "learning_rate": 7.15832078313253e-07, "logits/chosen": -2.457812547683716, "logits/rejected": -2.5091795921325684, "logps/chosen": -385.4624938964844, "logps/rejected": -410.67498779296875, "loss": 0.0749, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.831835985183716, "rewards/margins": 6.598437309265137, "rewards/rejected": -10.427343368530273, "step": 3020 }, { "epoch": 1.1408132530120483, "grad_norm": 22.33918633552018, "learning_rate": 7.148908132530121e-07, "logits/chosen": -2.384765625, "logits/rejected": -2.503124952316284, "logps/chosen": -485.3999938964844, "logps/rejected": -467.54998779296875, "loss": 0.055, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.767382860183716, "rewards/margins": 7.659375190734863, "rewards/rejected": -11.434374809265137, "step": 3030 }, { "epoch": 1.144578313253012, "grad_norm": 44.71552077431291, "learning_rate": 7.13949548192771e-07, "logits/chosen": -2.513476610183716, "logits/rejected": -2.5777344703674316, "logps/chosen": -409.2749938964844, "logps/rejected": -452.67498779296875, "loss": 0.0503, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.9520020484924316, "rewards/margins": 7.391015529632568, "rewards/rejected": -10.345312118530273, "step": 3040 }, { "epoch": 1.1483433734939759, "grad_norm": 50.15448310605343, "learning_rate": 7.130082831325301e-07, "logits/chosen": -2.6187500953674316, "logits/rejected": -2.6343750953674316, "logps/chosen": -462.0375061035156, "logps/rejected": -431.2250061035156, "loss": 0.0384, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.6143555641174316, "rewards/margins": 7.090234279632568, "rewards/rejected": -9.70703125, "step": 3050 }, { "epoch": 1.1521084337349397, "grad_norm": 32.399755985281764, "learning_rate": 7.120670180722891e-07, "logits/chosen": -2.457226514816284, "logits/rejected": -2.526171922683716, "logps/chosen": -405.11248779296875, "logps/rejected": -402.25, "loss": 0.075, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.709033250808716, "rewards/margins": 6.454687595367432, "rewards/rejected": -9.164843559265137, "step": 3060 }, { "epoch": 1.1558734939759037, "grad_norm": 7.344113105408142, "learning_rate": 7.111257530120482e-07, "logits/chosen": -2.536914110183716, "logits/rejected": -2.654101610183716, "logps/chosen": -484.67498779296875, "logps/rejected": -442.3999938964844, "loss": 0.0782, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.9876952171325684, "rewards/margins": 7.150000095367432, "rewards/rejected": -10.127344131469727, "step": 3070 }, { "epoch": 1.1596385542168675, "grad_norm": 29.597500042261306, "learning_rate": 7.101844879518072e-07, "logits/chosen": -2.4525389671325684, "logits/rejected": -2.5220704078674316, "logps/chosen": -440.36248779296875, "logps/rejected": -451.32501220703125, "loss": 0.1115, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -2.830883741378784, "rewards/margins": 7.646874904632568, "rewards/rejected": -10.478124618530273, "step": 3080 }, { "epoch": 1.1634036144578312, "grad_norm": 88.05755150737848, "learning_rate": 7.092432228915662e-07, "logits/chosen": -2.511523485183716, "logits/rejected": -2.577929735183716, "logps/chosen": -430.5, "logps/rejected": -444.79998779296875, "loss": 0.0795, "rewards/accuracies": 0.96875, "rewards/chosen": -2.7520995140075684, "rewards/margins": 7.085156440734863, "rewards/rejected": -9.832812309265137, "step": 3090 }, { "epoch": 1.1671686746987953, "grad_norm": 21.505816285289608, "learning_rate": 7.083019578313253e-07, "logits/chosen": -2.5884766578674316, "logits/rejected": -2.6332030296325684, "logps/chosen": -430.7250061035156, "logps/rejected": -436.0249938964844, "loss": 0.069, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.5870118141174316, "rewards/margins": 7.173437595367432, "rewards/rejected": -10.753125190734863, "step": 3100 }, { "epoch": 1.170933734939759, "grad_norm": 3.348640099671434, "learning_rate": 7.073606927710843e-07, "logits/chosen": -2.477734327316284, "logits/rejected": -2.5308594703674316, "logps/chosen": -398.20001220703125, "logps/rejected": -405.3999938964844, "loss": 0.084, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.024975538253784, "rewards/margins": 6.700390815734863, "rewards/rejected": -9.734375, "step": 3110 }, { "epoch": 1.1746987951807228, "grad_norm": 23.28711279863308, "learning_rate": 7.064194277108434e-07, "logits/chosen": -2.4654297828674316, "logits/rejected": -2.510937452316284, "logps/chosen": -420.9750061035156, "logps/rejected": -417.4750061035156, "loss": 0.0515, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.6318602561950684, "rewards/margins": 7.573046684265137, "rewards/rejected": -10.202343940734863, "step": 3120 }, { "epoch": 1.1784638554216866, "grad_norm": 73.73791306074662, "learning_rate": 7.054781626506023e-07, "logits/chosen": -2.4869141578674316, "logits/rejected": -2.5980467796325684, "logps/chosen": -416.625, "logps/rejected": -428.32501220703125, "loss": 0.1034, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -2.788525342941284, "rewards/margins": 6.833203315734863, "rewards/rejected": -9.621874809265137, "step": 3130 }, { "epoch": 1.1822289156626506, "grad_norm": 48.33414750173185, "learning_rate": 7.045368975903614e-07, "logits/chosen": -2.4574217796325684, "logits/rejected": -2.4853515625, "logps/chosen": -460.51251220703125, "logps/rejected": -463.1499938964844, "loss": 0.0512, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.5204100608825684, "rewards/margins": 7.164453029632568, "rewards/rejected": -10.6796875, "step": 3140 }, { "epoch": 1.1859939759036144, "grad_norm": 65.53613566418547, "learning_rate": 7.035956325301204e-07, "logits/chosen": -2.378124952316284, "logits/rejected": -2.5054688453674316, "logps/chosen": -462.45001220703125, "logps/rejected": -439.07501220703125, "loss": 0.0362, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.021679639816284, "rewards/margins": 8.115625381469727, "rewards/rejected": -11.140625, "step": 3150 }, { "epoch": 1.1897590361445782, "grad_norm": 20.452630390605428, "learning_rate": 7.026543674698795e-07, "logits/chosen": -2.5542969703674316, "logits/rejected": -2.619921922683716, "logps/chosen": -416.375, "logps/rejected": -440.625, "loss": 0.0358, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.5357422828674316, "rewards/margins": 7.967968940734863, "rewards/rejected": -10.498437881469727, "step": 3160 }, { "epoch": 1.1935240963855422, "grad_norm": 53.19626419678922, "learning_rate": 7.017131024096386e-07, "logits/chosen": -2.513476610183716, "logits/rejected": -2.586132764816284, "logps/chosen": -417.70001220703125, "logps/rejected": -456.5, "loss": 0.0791, "rewards/accuracies": 0.96875, "rewards/chosen": -2.4529175758361816, "rewards/margins": 7.122754096984863, "rewards/rejected": -9.574999809265137, "step": 3170 }, { "epoch": 1.197289156626506, "grad_norm": 22.83452232153238, "learning_rate": 7.007718373493976e-07, "logits/chosen": -2.4736328125, "logits/rejected": -2.6044921875, "logps/chosen": -475.95001220703125, "logps/rejected": -450.20001220703125, "loss": 0.05, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.2942872047424316, "rewards/margins": 7.211328029632568, "rewards/rejected": -9.509374618530273, "step": 3180 }, { "epoch": 1.2010542168674698, "grad_norm": 106.35133602288056, "learning_rate": 6.998305722891565e-07, "logits/chosen": -2.5224609375, "logits/rejected": -2.558398485183716, "logps/chosen": -368.7250061035156, "logps/rejected": -412.42498779296875, "loss": 0.1624, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.608996629714966, "rewards/margins": 6.684668064117432, "rewards/rejected": -9.287500381469727, "step": 3190 }, { "epoch": 1.2048192771084336, "grad_norm": 57.67574471058866, "learning_rate": 6.988893072289156e-07, "logits/chosen": -2.6455078125, "logits/rejected": -2.637890577316284, "logps/chosen": -398.0249938964844, "logps/rejected": -395.375, "loss": 0.043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.188732862472534, "rewards/margins": 6.828906059265137, "rewards/rejected": -9.021875381469727, "step": 3200 }, { "epoch": 1.2085843373493976, "grad_norm": 29.15183283819991, "learning_rate": 6.979480421686747e-07, "logits/chosen": -2.55859375, "logits/rejected": -2.645703077316284, "logps/chosen": -451.6499938964844, "logps/rejected": -430.7250061035156, "loss": 0.0289, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.9913086891174316, "rewards/margins": 7.163281440734863, "rewards/rejected": -10.151562690734863, "step": 3210 }, { "epoch": 1.2123493975903614, "grad_norm": 13.124747399618963, "learning_rate": 6.970067771084337e-07, "logits/chosen": -2.637890577316284, "logits/rejected": -2.801562547683716, "logps/chosen": -436.42498779296875, "logps/rejected": -432.75, "loss": 0.1026, "rewards/accuracies": 0.96875, "rewards/chosen": -3.187744140625, "rewards/margins": 7.035937309265137, "rewards/rejected": -10.220312118530273, "step": 3220 }, { "epoch": 1.2161144578313252, "grad_norm": 16.834795632456768, "learning_rate": 6.960655120481927e-07, "logits/chosen": -2.6146483421325684, "logits/rejected": -2.7554688453674316, "logps/chosen": -446.875, "logps/rejected": -434.2250061035156, "loss": 0.0656, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.53369140625, "rewards/margins": 7.055859565734863, "rewards/rejected": -10.5859375, "step": 3230 }, { "epoch": 1.2198795180722892, "grad_norm": 8.63617110876984, "learning_rate": 6.951242469879518e-07, "logits/chosen": -2.5560545921325684, "logits/rejected": -2.677929639816284, "logps/chosen": -420.07501220703125, "logps/rejected": -446.6499938964844, "loss": 0.0598, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.8397459983825684, "rewards/margins": 7.237890720367432, "rewards/rejected": -11.079687118530273, "step": 3240 }, { "epoch": 1.223644578313253, "grad_norm": 123.40670408740394, "learning_rate": 6.941829819277109e-07, "logits/chosen": -2.560742139816284, "logits/rejected": -2.6175780296325684, "logps/chosen": -481.29998779296875, "logps/rejected": -467.57501220703125, "loss": 0.0554, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.2862305641174316, "rewards/margins": 7.461718559265137, "rewards/rejected": -10.74609375, "step": 3250 }, { "epoch": 1.2274096385542168, "grad_norm": 42.69790098373359, "learning_rate": 6.932417168674697e-07, "logits/chosen": -2.5826172828674316, "logits/rejected": -2.62890625, "logps/chosen": -411.6000061035156, "logps/rejected": -440.1499938964844, "loss": 0.0609, "rewards/accuracies": 0.96875, "rewards/chosen": -3.147656202316284, "rewards/margins": 7.234765529632568, "rewards/rejected": -10.381250381469727, "step": 3260 }, { "epoch": 1.2311746987951806, "grad_norm": 19.828306606369527, "learning_rate": 6.923004518072288e-07, "logits/chosen": -2.6244139671325684, "logits/rejected": -2.5873045921325684, "logps/chosen": -424.9375, "logps/rejected": -456.7749938964844, "loss": 0.0553, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.8990235328674316, "rewards/margins": 7.923437595367432, "rewards/rejected": -11.825780868530273, "step": 3270 }, { "epoch": 1.2349397590361446, "grad_norm": 83.29811679216284, "learning_rate": 6.913591867469879e-07, "logits/chosen": -2.4957032203674316, "logits/rejected": -2.5693359375, "logps/chosen": -466.7749938964844, "logps/rejected": -466.57501220703125, "loss": 0.1098, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.358984470367432, "rewards/margins": 7.114062309265137, "rewards/rejected": -11.473437309265137, "step": 3280 }, { "epoch": 1.2387048192771084, "grad_norm": 75.17531530467373, "learning_rate": 6.90417921686747e-07, "logits/chosen": -2.593554735183716, "logits/rejected": -2.718554735183716, "logps/chosen": -478.57501220703125, "logps/rejected": -484.82501220703125, "loss": 0.0834, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.7138671875, "rewards/margins": 7.163866996765137, "rewards/rejected": -11.878125190734863, "step": 3290 }, { "epoch": 1.2424698795180722, "grad_norm": 161.67024840323512, "learning_rate": 6.89476656626506e-07, "logits/chosen": -2.635937452316284, "logits/rejected": -2.6630859375, "logps/chosen": -446.4375, "logps/rejected": -424.95001220703125, "loss": 0.0967, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.101806640625, "rewards/margins": 6.805078029632568, "rewards/rejected": -10.910937309265137, "step": 3300 }, { "epoch": 1.2462349397590362, "grad_norm": 10.71700561204402, "learning_rate": 6.88535391566265e-07, "logits/chosen": -2.5423827171325684, "logits/rejected": -2.676562547683716, "logps/chosen": -437.3500061035156, "logps/rejected": -450.3999938964844, "loss": 0.035, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.0884766578674316, "rewards/margins": 7.428124904632568, "rewards/rejected": -10.515625, "step": 3310 }, { "epoch": 1.25, "grad_norm": 43.792932742045544, "learning_rate": 6.875941265060241e-07, "logits/chosen": -2.5328125953674316, "logits/rejected": -2.6664061546325684, "logps/chosen": -372.9375, "logps/rejected": -400.07501220703125, "loss": 0.0499, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.830029249191284, "rewards/margins": 7.358202934265137, "rewards/rejected": -10.185937881469727, "step": 3320 }, { "epoch": 1.2537650602409638, "grad_norm": 29.79171990197859, "learning_rate": 6.866528614457831e-07, "logits/chosen": -2.6107420921325684, "logits/rejected": -2.6255860328674316, "logps/chosen": -452.04998779296875, "logps/rejected": -468.32501220703125, "loss": 0.0746, "rewards/accuracies": 0.96875, "rewards/chosen": -2.7961182594299316, "rewards/margins": 7.554296970367432, "rewards/rejected": -10.34375, "step": 3330 }, { "epoch": 1.2575301204819276, "grad_norm": 51.22370842346554, "learning_rate": 6.857115963855421e-07, "logits/chosen": -2.509960889816284, "logits/rejected": -2.528515577316284, "logps/chosen": -403.25, "logps/rejected": -410.8500061035156, "loss": 0.0716, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.976513624191284, "rewards/margins": 7.08984375, "rewards/rejected": -10.064844131469727, "step": 3340 }, { "epoch": 1.2612951807228916, "grad_norm": 11.662948532873147, "learning_rate": 6.847703313253011e-07, "logits/chosen": -2.530468702316284, "logits/rejected": -2.5999999046325684, "logps/chosen": -447.5625, "logps/rejected": -451.375, "loss": 0.0399, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.0491700172424316, "rewards/margins": 7.600390434265137, "rewards/rejected": -10.649999618530273, "step": 3350 }, { "epoch": 1.2650602409638554, "grad_norm": 1.181504092918282, "learning_rate": 6.838290662650602e-07, "logits/chosen": -2.478320360183716, "logits/rejected": -2.542773485183716, "logps/chosen": -422.29998779296875, "logps/rejected": -437.375, "loss": 0.0529, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.438525438308716, "rewards/margins": 7.426171779632568, "rewards/rejected": -9.869921684265137, "step": 3360 }, { "epoch": 1.2688253012048194, "grad_norm": 5.798596051187987, "learning_rate": 6.828878012048193e-07, "logits/chosen": -2.5189452171325684, "logits/rejected": -2.622265577316284, "logps/chosen": -457.0, "logps/rejected": -437.9750061035156, "loss": 0.0433, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.40771484375, "rewards/margins": 7.066796779632568, "rewards/rejected": -9.48046875, "step": 3370 }, { "epoch": 1.2725903614457832, "grad_norm": 64.66748577712261, "learning_rate": 6.819465361445783e-07, "logits/chosen": -2.5669922828674316, "logits/rejected": -2.6636719703674316, "logps/chosen": -444.0, "logps/rejected": -437.3500061035156, "loss": 0.0606, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.8519530296325684, "rewards/margins": 6.830468654632568, "rewards/rejected": -9.684374809265137, "step": 3380 }, { "epoch": 1.276355421686747, "grad_norm": 39.44523145013274, "learning_rate": 6.810052710843374e-07, "logits/chosen": -2.5390625, "logits/rejected": -2.6373047828674316, "logps/chosen": -455.625, "logps/rejected": -442.875, "loss": 0.0565, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.1905760765075684, "rewards/margins": 7.233984470367432, "rewards/rejected": -10.421093940734863, "step": 3390 }, { "epoch": 1.2801204819277108, "grad_norm": 2.939601302386116, "learning_rate": 6.800640060240963e-07, "logits/chosen": -2.569140672683716, "logits/rejected": -2.6390624046325684, "logps/chosen": -503.5249938964844, "logps/rejected": -490.7250061035156, "loss": 0.0479, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.293011426925659, "rewards/margins": 8.02734375, "rewards/rejected": -11.321093559265137, "step": 3400 }, { "epoch": 1.2838855421686746, "grad_norm": 23.874871281664962, "learning_rate": 6.791227409638553e-07, "logits/chosen": -2.572460889816284, "logits/rejected": -2.668750047683716, "logps/chosen": -449.26251220703125, "logps/rejected": -439.6499938964844, "loss": 0.0681, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.9701170921325684, "rewards/margins": 7.544921875, "rewards/rejected": -11.517969131469727, "step": 3410 }, { "epoch": 1.2876506024096386, "grad_norm": 80.66240317403383, "learning_rate": 6.781814759036144e-07, "logits/chosen": -2.5316405296325684, "logits/rejected": -2.657421827316284, "logps/chosen": -436.42498779296875, "logps/rejected": -431.20001220703125, "loss": 0.1018, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.073144435882568, "rewards/margins": 6.517968654632568, "rewards/rejected": -10.582812309265137, "step": 3420 }, { "epoch": 1.2914156626506024, "grad_norm": 28.841749466164632, "learning_rate": 6.772402108433735e-07, "logits/chosen": -2.5824217796325684, "logits/rejected": -2.681640625, "logps/chosen": -431.4375, "logps/rejected": -444.45001220703125, "loss": 0.0648, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.650097608566284, "rewards/margins": 6.871874809265137, "rewards/rejected": -10.520312309265137, "step": 3430 }, { "epoch": 1.2951807228915664, "grad_norm": 2.4521537130358153, "learning_rate": 6.762989457831325e-07, "logits/chosen": -2.434765577316284, "logits/rejected": -2.5810546875, "logps/chosen": -482.1000061035156, "logps/rejected": -456.75, "loss": 0.0906, "rewards/accuracies": 0.96875, "rewards/chosen": -3.742871046066284, "rewards/margins": 7.363965034484863, "rewards/rejected": -11.104687690734863, "step": 3440 }, { "epoch": 1.2989457831325302, "grad_norm": 51.19825419265053, "learning_rate": 6.753576807228915e-07, "logits/chosen": -2.4697265625, "logits/rejected": -2.4749999046325684, "logps/chosen": -474.29998779296875, "logps/rejected": -501.875, "loss": 0.049, "rewards/accuracies": 0.96875, "rewards/chosen": -3.102734327316284, "rewards/margins": 7.606249809265137, "rewards/rejected": -10.71484375, "step": 3450 }, { "epoch": 1.302710843373494, "grad_norm": 60.3189627183134, "learning_rate": 6.744164156626506e-07, "logits/chosen": -2.486328125, "logits/rejected": -2.467968702316284, "logps/chosen": -445.4750061035156, "logps/rejected": -457.07501220703125, "loss": 0.0625, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.659960985183716, "rewards/margins": 7.461328029632568, "rewards/rejected": -11.115625381469727, "step": 3460 }, { "epoch": 1.3064759036144578, "grad_norm": 7.974630606944625, "learning_rate": 6.734751506024096e-07, "logits/chosen": -2.506640672683716, "logits/rejected": -2.4710936546325684, "logps/chosen": -449.8999938964844, "logps/rejected": -452.375, "loss": 0.035, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.154980421066284, "rewards/margins": 7.608984470367432, "rewards/rejected": -10.7578125, "step": 3470 }, { "epoch": 1.3102409638554218, "grad_norm": 20.855121441829915, "learning_rate": 6.725338855421686e-07, "logits/chosen": -2.5966796875, "logits/rejected": -2.7476563453674316, "logps/chosen": -474.2124938964844, "logps/rejected": -414.42498779296875, "loss": 0.0442, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.4915528297424316, "rewards/margins": 6.793749809265137, "rewards/rejected": -9.284375190734863, "step": 3480 }, { "epoch": 1.3140060240963856, "grad_norm": 23.79861074507387, "learning_rate": 6.715926204819276e-07, "logits/chosen": -2.5054688453674316, "logits/rejected": -2.6722655296325684, "logps/chosen": -484.32501220703125, "logps/rejected": -411.0, "loss": 0.089, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.222583055496216, "rewards/margins": 7.185937404632568, "rewards/rejected": -9.407031059265137, "step": 3490 }, { "epoch": 1.3177710843373494, "grad_norm": 20.638068257033847, "learning_rate": 6.706513554216867e-07, "logits/chosen": -2.541015625, "logits/rejected": -2.569531202316284, "logps/chosen": -445.625, "logps/rejected": -455.375, "loss": 0.057, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.864990234375, "rewards/margins": 7.706250190734863, "rewards/rejected": -10.571874618530273, "step": 3500 }, { "epoch": 1.3215361445783134, "grad_norm": 26.984086057783067, "learning_rate": 6.697100903614458e-07, "logits/chosen": -2.4716796875, "logits/rejected": -2.5111327171325684, "logps/chosen": -461.57501220703125, "logps/rejected": -457.5, "loss": 0.0897, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.548144578933716, "rewards/margins": 6.9375, "rewards/rejected": -10.485937118530273, "step": 3510 }, { "epoch": 1.3253012048192772, "grad_norm": 44.46577254203196, "learning_rate": 6.687688253012049e-07, "logits/chosen": -2.421875, "logits/rejected": -2.4990234375, "logps/chosen": -459.1000061035156, "logps/rejected": -463.1000061035156, "loss": 0.0454, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.4273438453674316, "rewards/margins": 7.701952934265137, "rewards/rejected": -11.125, "step": 3520 }, { "epoch": 1.329066265060241, "grad_norm": 44.601345019526846, "learning_rate": 6.678275602409638e-07, "logits/chosen": -2.5689454078674316, "logits/rejected": -2.6107420921325684, "logps/chosen": -460.4750061035156, "logps/rejected": -476.04998779296875, "loss": 0.0571, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.096728563308716, "rewards/margins": 7.210156440734863, "rewards/rejected": -10.306249618530273, "step": 3530 }, { "epoch": 1.3328313253012047, "grad_norm": 16.104844943901792, "learning_rate": 6.668862951807228e-07, "logits/chosen": -2.5003905296325684, "logits/rejected": -2.604687452316284, "logps/chosen": -468.5625, "logps/rejected": -446.82501220703125, "loss": 0.055, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.81494140625, "rewards/margins": 7.577343940734863, "rewards/rejected": -11.399999618530273, "step": 3540 }, { "epoch": 1.3365963855421688, "grad_norm": 25.287772385868998, "learning_rate": 6.659450301204819e-07, "logits/chosen": -2.5931639671325684, "logits/rejected": -2.6285157203674316, "logps/chosen": -441.2749938964844, "logps/rejected": -451.29998779296875, "loss": 0.0667, "rewards/accuracies": 0.96875, "rewards/chosen": -3.405322313308716, "rewards/margins": 7.898046970367432, "rewards/rejected": -11.299219131469727, "step": 3550 }, { "epoch": 1.3403614457831325, "grad_norm": 13.444504246009974, "learning_rate": 6.65003765060241e-07, "logits/chosen": -2.5830078125, "logits/rejected": -2.6015625, "logps/chosen": -461.6000061035156, "logps/rejected": -450.3500061035156, "loss": 0.0972, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.782910108566284, "rewards/margins": 7.659375190734863, "rewards/rejected": -11.449999809265137, "step": 3560 }, { "epoch": 1.3441265060240963, "grad_norm": 5.355584645777043, "learning_rate": 6.640624999999999e-07, "logits/chosen": -2.519335985183716, "logits/rejected": -2.55859375, "logps/chosen": -399.5249938964844, "logps/rejected": -432.8999938964844, "loss": 0.0301, "rewards/accuracies": 1.0, "rewards/chosen": -3.5419921875, "rewards/margins": 7.980859279632568, "rewards/rejected": -11.5234375, "step": 3570 }, { "epoch": 1.3478915662650603, "grad_norm": 9.641446569075818, "learning_rate": 6.63121234939759e-07, "logits/chosen": -2.490234375, "logits/rejected": -2.5912108421325684, "logps/chosen": -470.9750061035156, "logps/rejected": -493.95001220703125, "loss": 0.0509, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.814160108566284, "rewards/margins": 7.9765625, "rewards/rejected": -11.794530868530273, "step": 3580 }, { "epoch": 1.3516566265060241, "grad_norm": 43.98021395901864, "learning_rate": 6.621799698795181e-07, "logits/chosen": -2.4453125, "logits/rejected": -2.586132764816284, "logps/chosen": -481.4624938964844, "logps/rejected": -492.79998779296875, "loss": 0.0756, "rewards/accuracies": 0.96875, "rewards/chosen": -3.90087890625, "rewards/margins": 7.765038967132568, "rewards/rejected": -11.66796875, "step": 3590 }, { "epoch": 1.355421686746988, "grad_norm": 19.18310247908315, "learning_rate": 6.612387048192771e-07, "logits/chosen": -2.512890577316284, "logits/rejected": -2.584765672683716, "logps/chosen": -429.375, "logps/rejected": -444.7250061035156, "loss": 0.0487, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.0616455078125, "rewards/margins": 7.676171779632568, "rewards/rejected": -10.732812881469727, "step": 3600 }, { "epoch": 1.3591867469879517, "grad_norm": 15.742440366101246, "learning_rate": 6.60297439759036e-07, "logits/chosen": -2.536914110183716, "logits/rejected": -2.717968702316284, "logps/chosen": -425.23748779296875, "logps/rejected": -405.8500061035156, "loss": 0.0255, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.143115282058716, "rewards/margins": 7.555859565734863, "rewards/rejected": -10.700780868530273, "step": 3610 }, { "epoch": 1.3629518072289157, "grad_norm": 93.28802639229129, "learning_rate": 6.593561746987951e-07, "logits/chosen": -2.591992139816284, "logits/rejected": -2.619335889816284, "logps/chosen": -501.8500061035156, "logps/rejected": -472.7250061035156, "loss": 0.0511, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.553027391433716, "rewards/margins": 7.584374904632568, "rewards/rejected": -11.142187118530273, "step": 3620 }, { "epoch": 1.3667168674698795, "grad_norm": 3.301929780273248, "learning_rate": 6.584149096385542e-07, "logits/chosen": -2.536328077316284, "logits/rejected": -2.654101610183716, "logps/chosen": -437.42498779296875, "logps/rejected": -460.04998779296875, "loss": 0.061, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.448779344558716, "rewards/margins": 7.708593845367432, "rewards/rejected": -11.15625, "step": 3630 }, { "epoch": 1.3704819277108433, "grad_norm": 9.6314133070424, "learning_rate": 6.574736445783132e-07, "logits/chosen": -2.5474610328674316, "logits/rejected": -2.5494141578674316, "logps/chosen": -458.70001220703125, "logps/rejected": -467.04998779296875, "loss": 0.0677, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.4644775390625, "rewards/margins": 7.853515625, "rewards/rejected": -11.318750381469727, "step": 3640 }, { "epoch": 1.3742469879518073, "grad_norm": 70.33511995045596, "learning_rate": 6.565323795180723e-07, "logits/chosen": -2.494921922683716, "logits/rejected": -2.555859327316284, "logps/chosen": -441.625, "logps/rejected": -458.2749938964844, "loss": 0.0451, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.958056688308716, "rewards/margins": 7.319531440734863, "rewards/rejected": -10.282812118530273, "step": 3650 }, { "epoch": 1.3780120481927711, "grad_norm": 52.73352873370777, "learning_rate": 6.555911144578313e-07, "logits/chosen": -2.587695360183716, "logits/rejected": -2.6031250953674316, "logps/chosen": -449.54998779296875, "logps/rejected": -441.875, "loss": 0.0412, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.6532225608825684, "rewards/margins": 8.100390434265137, "rewards/rejected": -11.75390625, "step": 3660 }, { "epoch": 1.381777108433735, "grad_norm": 40.477594218515435, "learning_rate": 6.546498493975904e-07, "logits/chosen": -2.607617139816284, "logits/rejected": -2.604296922683716, "logps/chosen": -400.92498779296875, "logps/rejected": -440.67498779296875, "loss": 0.0721, "rewards/accuracies": 0.96875, "rewards/chosen": -2.965136766433716, "rewards/margins": 7.782812595367432, "rewards/rejected": -10.753125190734863, "step": 3670 }, { "epoch": 1.3855421686746987, "grad_norm": 49.743722657781205, "learning_rate": 6.537085843373493e-07, "logits/chosen": -2.574414014816284, "logits/rejected": -2.7027344703674316, "logps/chosen": -454.7250061035156, "logps/rejected": -442.75, "loss": 0.0772, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.4505858421325684, "rewards/margins": 7.561718940734863, "rewards/rejected": -11.015625, "step": 3680 }, { "epoch": 1.3893072289156627, "grad_norm": 39.52742898915378, "learning_rate": 6.527673192771084e-07, "logits/chosen": -2.5775389671325684, "logits/rejected": -2.582226514816284, "logps/chosen": -464.0874938964844, "logps/rejected": -430.7749938964844, "loss": 0.0494, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.633227586746216, "rewards/margins": 7.466015815734863, "rewards/rejected": -11.092187881469727, "step": 3690 }, { "epoch": 1.3930722891566265, "grad_norm": 38.3520284120663, "learning_rate": 6.518260542168674e-07, "logits/chosen": -2.4683594703674316, "logits/rejected": -2.674023389816284, "logps/chosen": -492.125, "logps/rejected": -486.82501220703125, "loss": 0.0638, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.5205931663513184, "rewards/margins": 7.911718845367432, "rewards/rejected": -11.438281059265137, "step": 3700 }, { "epoch": 1.3968373493975903, "grad_norm": 28.789624370817684, "learning_rate": 6.508847891566265e-07, "logits/chosen": -2.587109327316284, "logits/rejected": -2.6156249046325684, "logps/chosen": -534.7249755859375, "logps/rejected": -497.20001220703125, "loss": 0.0461, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.142822265625, "rewards/margins": 7.580468654632568, "rewards/rejected": -10.717187881469727, "step": 3710 }, { "epoch": 1.4006024096385543, "grad_norm": 45.06416714559433, "learning_rate": 6.499435240963855e-07, "logits/chosen": -2.6507811546325684, "logits/rejected": -2.753124952316284, "logps/chosen": -421.875, "logps/rejected": -453.54998779296875, "loss": 0.0587, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.9198241233825684, "rewards/margins": 7.201952934265137, "rewards/rejected": -11.119140625, "step": 3720 }, { "epoch": 1.404367469879518, "grad_norm": 14.098742805696181, "learning_rate": 6.490022590361446e-07, "logits/chosen": -2.5453124046325684, "logits/rejected": -2.6644530296325684, "logps/chosen": -457.4750061035156, "logps/rejected": -465.95001220703125, "loss": 0.0505, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.086010694503784, "rewards/margins": 7.72265625, "rewards/rejected": -10.807812690734863, "step": 3730 }, { "epoch": 1.408132530120482, "grad_norm": 58.86354551810719, "learning_rate": 6.480609939759037e-07, "logits/chosen": -2.6675782203674316, "logits/rejected": -2.6937499046325684, "logps/chosen": -430.4125061035156, "logps/rejected": -433.67498779296875, "loss": 0.168, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -2.367431640625, "rewards/margins": 7.507616996765137, "rewards/rejected": -9.875, "step": 3740 }, { "epoch": 1.4118975903614457, "grad_norm": 20.934516694482152, "learning_rate": 6.471197289156625e-07, "logits/chosen": -2.6392579078674316, "logits/rejected": -2.679882764816284, "logps/chosen": -444.57501220703125, "logps/rejected": -423.25, "loss": 0.042, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.4974608421325684, "rewards/margins": 7.608984470367432, "rewards/rejected": -11.110937118530273, "step": 3750 }, { "epoch": 1.4156626506024097, "grad_norm": 41.36988366347056, "learning_rate": 6.461784638554216e-07, "logits/chosen": -2.5423827171325684, "logits/rejected": -2.6781249046325684, "logps/chosen": -436.875, "logps/rejected": -431.8999938964844, "loss": 0.0934, "rewards/accuracies": 0.96875, "rewards/chosen": -3.299853563308716, "rewards/margins": 7.494140625, "rewards/rejected": -10.797656059265137, "step": 3760 }, { "epoch": 1.4194277108433735, "grad_norm": 67.25746187817829, "learning_rate": 6.452371987951807e-07, "logits/chosen": -2.5091795921325684, "logits/rejected": -2.689257860183716, "logps/chosen": -484.5249938964844, "logps/rejected": -435.7250061035156, "loss": 0.0419, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.595776319503784, "rewards/margins": 7.927734375, "rewards/rejected": -10.526562690734863, "step": 3770 }, { "epoch": 1.4231927710843373, "grad_norm": 50.50682198792438, "learning_rate": 6.442959337349398e-07, "logits/chosen": -2.5927734375, "logits/rejected": -2.650390625, "logps/chosen": -389.9125061035156, "logps/rejected": -406.0, "loss": 0.0783, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.088134765625, "rewards/margins": 6.74609375, "rewards/rejected": -9.834765434265137, "step": 3780 }, { "epoch": 1.4269578313253013, "grad_norm": 6.200236108059595, "learning_rate": 6.433546686746987e-07, "logits/chosen": -2.6332030296325684, "logits/rejected": -2.6949219703674316, "logps/chosen": -472.875, "logps/rejected": -473.67498779296875, "loss": 0.0454, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.2378907203674316, "rewards/margins": 8.29296875, "rewards/rejected": -11.528124809265137, "step": 3790 }, { "epoch": 1.430722891566265, "grad_norm": 7.883752995802512, "learning_rate": 6.424134036144578e-07, "logits/chosen": -2.5220704078674316, "logits/rejected": -2.5648436546325684, "logps/chosen": -458.17498779296875, "logps/rejected": -483.75, "loss": 0.0436, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.676464796066284, "rewards/margins": 7.587500095367432, "rewards/rejected": -11.264062881469727, "step": 3800 }, { "epoch": 1.4344879518072289, "grad_norm": 2.206098571563086, "learning_rate": 6.414721385542169e-07, "logits/chosen": -2.5337891578674316, "logits/rejected": -2.6263670921325684, "logps/chosen": -471.86248779296875, "logps/rejected": -449.5, "loss": 0.0548, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.081933498382568, "rewards/margins": 7.706835746765137, "rewards/rejected": -11.784375190734863, "step": 3810 }, { "epoch": 1.4382530120481927, "grad_norm": 9.205828554264535, "learning_rate": 6.405308734939759e-07, "logits/chosen": -2.5703125, "logits/rejected": -2.619140625, "logps/chosen": -420.625, "logps/rejected": -459.3999938964844, "loss": 0.0387, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.409472703933716, "rewards/margins": 7.784375190734863, "rewards/rejected": -11.194531440734863, "step": 3820 }, { "epoch": 1.4420180722891567, "grad_norm": 39.69756785301525, "learning_rate": 6.395896084337348e-07, "logits/chosen": -2.78515625, "logits/rejected": -2.8134765625, "logps/chosen": -432.7250061035156, "logps/rejected": -418.75, "loss": 0.0808, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.452538967132568, "rewards/margins": 7.113671779632568, "rewards/rejected": -11.563281059265137, "step": 3830 }, { "epoch": 1.4457831325301205, "grad_norm": 6.017331218027887, "learning_rate": 6.386483433734939e-07, "logits/chosen": -2.6685547828674316, "logits/rejected": -2.7601561546325684, "logps/chosen": -448.1499938964844, "logps/rejected": -443.3999938964844, "loss": 0.0408, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.23974609375, "rewards/margins": 7.655077934265137, "rewards/rejected": -12.885937690734863, "step": 3840 }, { "epoch": 1.4495481927710843, "grad_norm": 36.617340604310385, "learning_rate": 6.37707078313253e-07, "logits/chosen": -2.582812547683716, "logits/rejected": -2.8062500953674316, "logps/chosen": -464.79998779296875, "logps/rejected": -432.7250061035156, "loss": 0.0439, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.762499809265137, "rewards/margins": 7.829297065734863, "rewards/rejected": -12.59375, "step": 3850 }, { "epoch": 1.4533132530120483, "grad_norm": 29.66782946019375, "learning_rate": 6.367658132530121e-07, "logits/chosen": -2.6279296875, "logits/rejected": -2.7279295921325684, "logps/chosen": -477.5249938964844, "logps/rejected": -461.1000061035156, "loss": 0.1016, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.15478515625, "rewards/margins": 7.589062690734863, "rewards/rejected": -11.744531631469727, "step": 3860 }, { "epoch": 1.457078313253012, "grad_norm": 23.637425338136225, "learning_rate": 6.358245481927711e-07, "logits/chosen": -2.622265577316284, "logits/rejected": -2.6996092796325684, "logps/chosen": -450.79998779296875, "logps/rejected": -447.3500061035156, "loss": 0.1058, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.608569145202637, "rewards/margins": 7.263281345367432, "rewards/rejected": -11.872655868530273, "step": 3870 }, { "epoch": 1.4608433734939759, "grad_norm": 10.033748890797614, "learning_rate": 6.348832831325301e-07, "logits/chosen": -2.630078077316284, "logits/rejected": -2.59765625, "logps/chosen": -494.54998779296875, "logps/rejected": -480.07501220703125, "loss": 0.0832, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.62744140625, "rewards/margins": 7.579687595367432, "rewards/rejected": -11.20703125, "step": 3880 }, { "epoch": 1.4646084337349397, "grad_norm": 33.513538764639975, "learning_rate": 6.339420180722891e-07, "logits/chosen": -2.5425782203674316, "logits/rejected": -2.649609327316284, "logps/chosen": -472.73748779296875, "logps/rejected": -451.0, "loss": 0.0833, "rewards/accuracies": 0.96875, "rewards/chosen": -3.3888182640075684, "rewards/margins": 7.67578125, "rewards/rejected": -11.068750381469727, "step": 3890 }, { "epoch": 1.4683734939759037, "grad_norm": 45.65936612836268, "learning_rate": 6.330007530120481e-07, "logits/chosen": -2.6578125953674316, "logits/rejected": -2.772656202316284, "logps/chosen": -401.54998779296875, "logps/rejected": -395.70001220703125, "loss": 0.0878, "rewards/accuracies": 0.96875, "rewards/chosen": -3.36669921875, "rewards/margins": 7.103515625, "rewards/rejected": -10.471094131469727, "step": 3900 }, { "epoch": 1.4721385542168675, "grad_norm": 4.258375808004731, "learning_rate": 6.320594879518072e-07, "logits/chosen": -2.545703172683716, "logits/rejected": -2.6273436546325684, "logps/chosen": -412.86248779296875, "logps/rejected": -433.95001220703125, "loss": 0.0573, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.148608446121216, "rewards/margins": 7.631640434265137, "rewards/rejected": -10.77734375, "step": 3910 }, { "epoch": 1.4759036144578312, "grad_norm": 83.30489907817751, "learning_rate": 6.311182228915662e-07, "logits/chosen": -2.5712890625, "logits/rejected": -2.688671827316284, "logps/chosen": -421.125, "logps/rejected": -440.3500061035156, "loss": 0.0703, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.311816453933716, "rewards/margins": 7.261328220367432, "rewards/rejected": -10.572656631469727, "step": 3920 }, { "epoch": 1.4796686746987953, "grad_norm": 43.71102570312305, "learning_rate": 6.301769578313253e-07, "logits/chosen": -2.6128907203674316, "logits/rejected": -2.7054686546325684, "logps/chosen": -443.3500061035156, "logps/rejected": -442.7749938964844, "loss": 0.0539, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.2099609375, "rewards/margins": 7.422656059265137, "rewards/rejected": -11.639062881469727, "step": 3930 }, { "epoch": 1.483433734939759, "grad_norm": 3.7428398122326922, "learning_rate": 6.292356927710843e-07, "logits/chosen": -2.5263671875, "logits/rejected": -2.6791014671325684, "logps/chosen": -492.1499938964844, "logps/rejected": -477.04998779296875, "loss": 0.0373, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.566650390625, "rewards/margins": 7.701562404632568, "rewards/rejected": -12.265625, "step": 3940 }, { "epoch": 1.4871987951807228, "grad_norm": 60.8916247004124, "learning_rate": 6.282944277108434e-07, "logits/chosen": -2.616015672683716, "logits/rejected": -2.678906202316284, "logps/chosen": -448.2250061035156, "logps/rejected": -448.17498779296875, "loss": 0.088, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.430761814117432, "rewards/margins": 7.880468845367432, "rewards/rejected": -12.315625190734863, "step": 3950 }, { "epoch": 1.4909638554216866, "grad_norm": 7.868140545907799, "learning_rate": 6.273531626506024e-07, "logits/chosen": -2.592968702316284, "logits/rejected": -2.6900391578674316, "logps/chosen": -444.54998779296875, "logps/rejected": -456.79998779296875, "loss": 0.0779, "rewards/accuracies": 0.96875, "rewards/chosen": -4.006689548492432, "rewards/margins": 8.578906059265137, "rewards/rejected": -12.589062690734863, "step": 3960 }, { "epoch": 1.4947289156626506, "grad_norm": 33.34963413953978, "learning_rate": 6.264118975903614e-07, "logits/chosen": -2.6136717796325684, "logits/rejected": -2.661328077316284, "logps/chosen": -482.07501220703125, "logps/rejected": -486.6000061035156, "loss": 0.123, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.334082126617432, "rewards/margins": 8.160547256469727, "rewards/rejected": -12.490625381469727, "step": 3970 }, { "epoch": 1.4984939759036144, "grad_norm": 8.36863938274027, "learning_rate": 6.254706325301204e-07, "logits/chosen": -2.5884766578674316, "logits/rejected": -2.5990233421325684, "logps/chosen": -501.29998779296875, "logps/rejected": -531.5999755859375, "loss": 0.0486, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.7865233421325684, "rewards/margins": 8.591405868530273, "rewards/rejected": -12.3828125, "step": 3980 }, { "epoch": 1.5022590361445785, "grad_norm": 82.13240030162474, "learning_rate": 6.245293674698795e-07, "logits/chosen": -2.6265625953674316, "logits/rejected": -2.6119141578674316, "logps/chosen": -422.73748779296875, "logps/rejected": -451.125, "loss": 0.0989, "rewards/accuracies": 0.9375, "rewards/chosen": -4.84619140625, "rewards/margins": 7.000781059265137, "rewards/rejected": -11.846875190734863, "step": 3990 }, { "epoch": 1.5060240963855422, "grad_norm": 31.747182111054197, "learning_rate": 6.235881024096386e-07, "logits/chosen": -2.580273389816284, "logits/rejected": -2.694140672683716, "logps/chosen": -451.63751220703125, "logps/rejected": -465.1000061035156, "loss": 0.0929, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.274560451507568, "rewards/margins": 7.585351467132568, "rewards/rejected": -11.854687690734863, "step": 4000 }, { "epoch": 1.509789156626506, "grad_norm": 55.852069856664876, "learning_rate": 6.226468373493976e-07, "logits/chosen": -2.5736327171325684, "logits/rejected": -2.5865235328674316, "logps/chosen": -424.98748779296875, "logps/rejected": -446.92498779296875, "loss": 0.0592, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.82177734375, "rewards/margins": 7.433203220367432, "rewards/rejected": -12.256250381469727, "step": 4010 }, { "epoch": 1.5135542168674698, "grad_norm": 14.769291886001314, "learning_rate": 6.217055722891565e-07, "logits/chosen": -2.5132813453674316, "logits/rejected": -2.6546874046325684, "logps/chosen": -442.0249938964844, "logps/rejected": -436.20001220703125, "loss": 0.0583, "rewards/accuracies": 0.96875, "rewards/chosen": -4.053418159484863, "rewards/margins": 7.698828220367432, "rewards/rejected": -11.752344131469727, "step": 4020 }, { "epoch": 1.5173192771084336, "grad_norm": 9.06400381775196, "learning_rate": 6.207643072289156e-07, "logits/chosen": -2.5869140625, "logits/rejected": -2.6500000953674316, "logps/chosen": -472.38751220703125, "logps/rejected": -451.375, "loss": 0.0515, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.374218940734863, "rewards/margins": 7.4453125, "rewards/rejected": -11.81640625, "step": 4030 }, { "epoch": 1.5210843373493976, "grad_norm": 19.24834353507291, "learning_rate": 6.198230421686747e-07, "logits/chosen": -2.520312547683716, "logits/rejected": -2.609570264816284, "logps/chosen": -462.07501220703125, "logps/rejected": -459.67498779296875, "loss": 0.0709, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.25, "rewards/margins": 7.398828029632568, "rewards/rejected": -11.649999618530273, "step": 4040 }, { "epoch": 1.5248493975903614, "grad_norm": 10.752955158573327, "learning_rate": 6.188817771084338e-07, "logits/chosen": -2.6664061546325684, "logits/rejected": -2.779296875, "logps/chosen": -445.67498779296875, "logps/rejected": -432.0, "loss": 0.0477, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.761523485183716, "rewards/margins": 8.074999809265137, "rewards/rejected": -11.83203125, "step": 4050 }, { "epoch": 1.5286144578313254, "grad_norm": 45.71078232062782, "learning_rate": 6.179405120481927e-07, "logits/chosen": -2.5458984375, "logits/rejected": -2.6722655296325684, "logps/chosen": -493.92498779296875, "logps/rejected": -494.17498779296875, "loss": 0.0602, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.473437547683716, "rewards/margins": 7.881640434265137, "rewards/rejected": -11.350000381469727, "step": 4060 }, { "epoch": 1.5323795180722892, "grad_norm": 2.189357886423785, "learning_rate": 6.169992469879518e-07, "logits/chosen": -2.6943359375, "logits/rejected": -2.7015624046325684, "logps/chosen": -481.95001220703125, "logps/rejected": -471.5249938964844, "loss": 0.0599, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.775585889816284, "rewards/margins": 7.3046875, "rewards/rejected": -11.080469131469727, "step": 4070 }, { "epoch": 1.536144578313253, "grad_norm": 36.33011732539955, "learning_rate": 6.160579819277109e-07, "logits/chosen": -2.576171875, "logits/rejected": -2.7699217796325684, "logps/chosen": -432.42498779296875, "logps/rejected": -408.42498779296875, "loss": 0.061, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.104882717132568, "rewards/margins": 7.182421684265137, "rewards/rejected": -11.284375190734863, "step": 4080 }, { "epoch": 1.5399096385542168, "grad_norm": 42.94943041764584, "learning_rate": 6.151167168674698e-07, "logits/chosen": -2.491406202316284, "logits/rejected": -2.570507764816284, "logps/chosen": -458.51251220703125, "logps/rejected": -457.625, "loss": 0.0847, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.452343702316284, "rewards/margins": 7.7412109375, "rewards/rejected": -11.185155868530273, "step": 4090 }, { "epoch": 1.5436746987951806, "grad_norm": 17.310528999554855, "learning_rate": 6.141754518072288e-07, "logits/chosen": -2.6285157203674316, "logits/rejected": -2.685742139816284, "logps/chosen": -437.125, "logps/rejected": -431.8500061035156, "loss": 0.0678, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.793993949890137, "rewards/margins": 7.286328315734863, "rewards/rejected": -12.074999809265137, "step": 4100 }, { "epoch": 1.5474397590361446, "grad_norm": 28.813571119086024, "learning_rate": 6.132341867469879e-07, "logits/chosen": -2.6529297828674316, "logits/rejected": -2.684375047683716, "logps/chosen": -454.20001220703125, "logps/rejected": -474.25, "loss": 0.0561, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.323974609375, "rewards/margins": 7.885546684265137, "rewards/rejected": -13.203125, "step": 4110 }, { "epoch": 1.5512048192771084, "grad_norm": 31.521295877631257, "learning_rate": 6.12292921686747e-07, "logits/chosen": -2.6781249046325684, "logits/rejected": -2.722851514816284, "logps/chosen": -483.625, "logps/rejected": -459.79998779296875, "loss": 0.138, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.253808498382568, "rewards/margins": 7.289843559265137, "rewards/rejected": -12.543749809265137, "step": 4120 }, { "epoch": 1.5549698795180724, "grad_norm": 68.48867613503656, "learning_rate": 6.11351656626506e-07, "logits/chosen": -2.595703125, "logits/rejected": -2.6949219703674316, "logps/chosen": -456.45001220703125, "logps/rejected": -446.5, "loss": 0.0474, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.584326267242432, "rewards/margins": 7.818749904632568, "rewards/rejected": -12.400781631469727, "step": 4130 }, { "epoch": 1.5587349397590362, "grad_norm": 4.794907913514871, "learning_rate": 6.10410391566265e-07, "logits/chosen": -2.698046922683716, "logits/rejected": -2.762500047683716, "logps/chosen": -443.3999938964844, "logps/rejected": -451.7749938964844, "loss": 0.0551, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.888671875, "rewards/margins": 7.94921875, "rewards/rejected": -11.8359375, "step": 4140 }, { "epoch": 1.5625, "grad_norm": 14.986245649724607, "learning_rate": 6.094691265060241e-07, "logits/chosen": -2.535937547683716, "logits/rejected": -2.5814452171325684, "logps/chosen": -445.75, "logps/rejected": -452.2250061035156, "loss": 0.0248, "rewards/accuracies": 1.0, "rewards/chosen": -2.681640625, "rewards/margins": 8.403124809265137, "rewards/rejected": -11.079687118530273, "step": 4150 }, { "epoch": 1.5662650602409638, "grad_norm": 74.2980403823142, "learning_rate": 6.085278614457831e-07, "logits/chosen": -2.6460938453674316, "logits/rejected": -2.6822266578674316, "logps/chosen": -477.0, "logps/rejected": -496.8500061035156, "loss": 0.0636, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.316113471984863, "rewards/margins": 7.83984375, "rewards/rejected": -12.149999618530273, "step": 4160 }, { "epoch": 1.5700301204819276, "grad_norm": 7.789914030598397, "learning_rate": 6.075865963855421e-07, "logits/chosen": -2.638671875, "logits/rejected": -2.7289061546325684, "logps/chosen": -439.42498779296875, "logps/rejected": -447.3500061035156, "loss": 0.0241, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.197167873382568, "rewards/margins": 7.978906154632568, "rewards/rejected": -12.173437118530273, "step": 4170 }, { "epoch": 1.5737951807228916, "grad_norm": 60.3731887697034, "learning_rate": 6.066453313253012e-07, "logits/chosen": -2.6484375, "logits/rejected": -2.7154297828674316, "logps/chosen": -476.8500061035156, "logps/rejected": -447.32501220703125, "loss": 0.076, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.505175590515137, "rewards/margins": 7.350390434265137, "rewards/rejected": -11.853906631469727, "step": 4180 }, { "epoch": 1.5775602409638554, "grad_norm": 16.148121248067074, "learning_rate": 6.057040662650602e-07, "logits/chosen": -2.786914110183716, "logits/rejected": -2.7945313453674316, "logps/chosen": -442.1000061035156, "logps/rejected": -452.0249938964844, "loss": 0.0894, "rewards/accuracies": 0.96875, "rewards/chosen": -4.564843654632568, "rewards/margins": 7.3046875, "rewards/rejected": -11.868749618530273, "step": 4190 }, { "epoch": 1.5813253012048194, "grad_norm": 26.964736466003842, "learning_rate": 6.047628012048193e-07, "logits/chosen": -2.875781297683716, "logits/rejected": -2.9378905296325684, "logps/chosen": -422.7749938964844, "logps/rejected": -455.54998779296875, "loss": 0.0481, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.933398246765137, "rewards/margins": 7.73046875, "rewards/rejected": -12.66015625, "step": 4200 }, { "epoch": 1.5850903614457832, "grad_norm": 35.94114526285303, "learning_rate": 6.038215361445783e-07, "logits/chosen": -2.683789014816284, "logits/rejected": -2.813671827316284, "logps/chosen": -507.25, "logps/rejected": -469.45001220703125, "loss": 0.0405, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.085741996765137, "rewards/margins": 8.425000190734863, "rewards/rejected": -12.506250381469727, "step": 4210 }, { "epoch": 1.588855421686747, "grad_norm": 26.302754206458413, "learning_rate": 6.028802710843374e-07, "logits/chosen": -2.6820311546325684, "logits/rejected": -2.740234375, "logps/chosen": -458.70001220703125, "logps/rejected": -472.6000061035156, "loss": 0.0392, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.633008003234863, "rewards/margins": 7.526562690734863, "rewards/rejected": -12.159375190734863, "step": 4220 }, { "epoch": 1.5926204819277108, "grad_norm": 150.97924586592652, "learning_rate": 6.019390060240963e-07, "logits/chosen": -2.755078077316284, "logits/rejected": -2.842578172683716, "logps/chosen": -487.0, "logps/rejected": -492.75, "loss": 0.131, "rewards/accuracies": 0.96875, "rewards/chosen": -4.603125095367432, "rewards/margins": 7.443749904632568, "rewards/rejected": -12.042187690734863, "step": 4230 }, { "epoch": 1.5963855421686746, "grad_norm": 26.27100691692193, "learning_rate": 6.009977409638553e-07, "logits/chosen": -2.8082032203674316, "logits/rejected": -2.788867235183716, "logps/chosen": -461.5249938964844, "logps/rejected": -451.79998779296875, "loss": 0.1101, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.210351467132568, "rewards/margins": 7.313281059265137, "rewards/rejected": -11.530468940734863, "step": 4240 }, { "epoch": 1.6001506024096386, "grad_norm": 16.992698132133086, "learning_rate": 6.000564759036144e-07, "logits/chosen": -2.6666016578674316, "logits/rejected": -2.8089842796325684, "logps/chosen": -440.1499938964844, "logps/rejected": -467.5, "loss": 0.0413, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.157421827316284, "rewards/margins": 7.914843559265137, "rewards/rejected": -11.071874618530273, "step": 4250 }, { "epoch": 1.6039156626506024, "grad_norm": 35.063931529609455, "learning_rate": 5.991152108433735e-07, "logits/chosen": -2.5443358421325684, "logits/rejected": -2.630664110183716, "logps/chosen": -444.6499938964844, "logps/rejected": -439.6000061035156, "loss": 0.1347, "rewards/accuracies": 0.9375, "rewards/chosen": -3.1292481422424316, "rewards/margins": 7.024609565734863, "rewards/rejected": -10.158984184265137, "step": 4260 }, { "epoch": 1.6076807228915664, "grad_norm": 84.70136265360728, "learning_rate": 5.981739457831325e-07, "logits/chosen": -2.642382860183716, "logits/rejected": -2.74609375, "logps/chosen": -454.8500061035156, "logps/rejected": -453.3999938964844, "loss": 0.0722, "rewards/accuracies": 0.96875, "rewards/chosen": -3.077392578125, "rewards/margins": 7.602343559265137, "rewards/rejected": -10.678906440734863, "step": 4270 }, { "epoch": 1.6114457831325302, "grad_norm": 55.171073441056784, "learning_rate": 5.972326807228915e-07, "logits/chosen": -2.628124952316284, "logits/rejected": -2.6556639671325684, "logps/chosen": -398.79998779296875, "logps/rejected": -438.0249938964844, "loss": 0.0478, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.8305420875549316, "rewards/margins": 6.974609375, "rewards/rejected": -9.810937881469727, "step": 4280 }, { "epoch": 1.615210843373494, "grad_norm": 52.43793339622279, "learning_rate": 5.962914156626506e-07, "logits/chosen": -2.488476514816284, "logits/rejected": -2.625, "logps/chosen": -430.375, "logps/rejected": -464.2250061035156, "loss": 0.055, "rewards/accuracies": 0.96875, "rewards/chosen": -3.1659178733825684, "rewards/margins": 7.290625095367432, "rewards/rejected": -10.457812309265137, "step": 4290 }, { "epoch": 1.6189759036144578, "grad_norm": 16.064122437703613, "learning_rate": 5.953501506024096e-07, "logits/chosen": -2.573925733566284, "logits/rejected": -2.6640625, "logps/chosen": -468.57501220703125, "logps/rejected": -426.1499938964844, "loss": 0.098, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.862988233566284, "rewards/margins": 7.012499809265137, "rewards/rejected": -9.873437881469727, "step": 4300 }, { "epoch": 1.6227409638554215, "grad_norm": 86.05623151489898, "learning_rate": 5.944088855421687e-07, "logits/chosen": -2.6595702171325684, "logits/rejected": -2.7464842796325684, "logps/chosen": -460.1000061035156, "logps/rejected": -436.92498779296875, "loss": 0.0687, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.425488233566284, "rewards/margins": 7.310351371765137, "rewards/rejected": -10.737500190734863, "step": 4310 }, { "epoch": 1.6265060240963856, "grad_norm": 5.514496691549843, "learning_rate": 5.934676204819276e-07, "logits/chosen": -2.622851610183716, "logits/rejected": -2.6714844703674316, "logps/chosen": -454.5, "logps/rejected": -459.0249938964844, "loss": 0.0512, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.73486328125, "rewards/margins": 7.369531154632568, "rewards/rejected": -11.108593940734863, "step": 4320 }, { "epoch": 1.6302710843373494, "grad_norm": 26.07744570006893, "learning_rate": 5.925263554216867e-07, "logits/chosen": -2.6048827171325684, "logits/rejected": -2.6634764671325684, "logps/chosen": -445.6499938964844, "logps/rejected": -456.875, "loss": 0.0725, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.731249809265137, "rewards/margins": 7.432421684265137, "rewards/rejected": -12.168749809265137, "step": 4330 }, { "epoch": 1.6340361445783134, "grad_norm": 40.997939063092446, "learning_rate": 5.915850903614458e-07, "logits/chosen": -2.7386717796325684, "logits/rejected": -2.721874952316284, "logps/chosen": -416.6499938964844, "logps/rejected": -424.0, "loss": 0.0593, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.900048732757568, "rewards/margins": 7.489843845367432, "rewards/rejected": -12.387499809265137, "step": 4340 }, { "epoch": 1.6378012048192772, "grad_norm": 63.74780773921167, "learning_rate": 5.906438253012049e-07, "logits/chosen": -2.662890672683716, "logits/rejected": -2.6587891578674316, "logps/chosen": -434.6000061035156, "logps/rejected": -448.07501220703125, "loss": 0.0938, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.371679782867432, "rewards/margins": 7.622265815734863, "rewards/rejected": -11.995312690734863, "step": 4350 }, { "epoch": 1.641566265060241, "grad_norm": 18.07092594223021, "learning_rate": 5.897025602409638e-07, "logits/chosen": -2.6048827171325684, "logits/rejected": -2.6611328125, "logps/chosen": -390.8999938964844, "logps/rejected": -403.0249938964844, "loss": 0.0444, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.8056640625, "rewards/margins": 7.480078220367432, "rewards/rejected": -11.28515625, "step": 4360 }, { "epoch": 1.6453313253012047, "grad_norm": 45.235129977557825, "learning_rate": 5.887612951807228e-07, "logits/chosen": -2.5455079078674316, "logits/rejected": -2.646289110183716, "logps/chosen": -463.45001220703125, "logps/rejected": -477.3999938964844, "loss": 0.0821, "rewards/accuracies": 0.96875, "rewards/chosen": -3.6153807640075684, "rewards/margins": 7.742578029632568, "rewards/rejected": -11.359375, "step": 4370 }, { "epoch": 1.6490963855421685, "grad_norm": 10.431322540016366, "learning_rate": 5.878200301204819e-07, "logits/chosen": -2.568554639816284, "logits/rejected": -2.724414110183716, "logps/chosen": -445.9750061035156, "logps/rejected": -420.82501220703125, "loss": 0.0624, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.841015577316284, "rewards/margins": 7.174218654632568, "rewards/rejected": -11.017187118530273, "step": 4380 }, { "epoch": 1.6528614457831325, "grad_norm": 59.46538425462773, "learning_rate": 5.868787650602409e-07, "logits/chosen": -2.493945360183716, "logits/rejected": -2.6703124046325684, "logps/chosen": -479.04998779296875, "logps/rejected": -452.8999938964844, "loss": 0.0541, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.9955077171325684, "rewards/margins": 7.592577934265137, "rewards/rejected": -11.586718559265137, "step": 4390 }, { "epoch": 1.6566265060240963, "grad_norm": 19.076844220928155, "learning_rate": 5.859375e-07, "logits/chosen": -2.6429686546325684, "logits/rejected": -2.711132764816284, "logps/chosen": -436.88751220703125, "logps/rejected": -440.7749938964844, "loss": 0.0997, "rewards/accuracies": 0.96875, "rewards/chosen": -3.921557664871216, "rewards/margins": 7.588281154632568, "rewards/rejected": -11.509374618530273, "step": 4400 }, { "epoch": 1.6603915662650603, "grad_norm": 6.96534930858601, "learning_rate": 5.84996234939759e-07, "logits/chosen": -2.7408204078674316, "logits/rejected": -2.814648389816284, "logps/chosen": -480.6625061035156, "logps/rejected": -448.95001220703125, "loss": 0.0492, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.639697313308716, "rewards/margins": 7.421875, "rewards/rejected": -11.059374809265137, "step": 4410 }, { "epoch": 1.6641566265060241, "grad_norm": 89.58891006692788, "learning_rate": 5.840549698795181e-07, "logits/chosen": -2.489062547683716, "logits/rejected": -2.560742139816284, "logps/chosen": -469.0, "logps/rejected": -491.2250061035156, "loss": 0.0453, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.8475584983825684, "rewards/margins": 8.032031059265137, "rewards/rejected": -11.875, "step": 4420 }, { "epoch": 1.667921686746988, "grad_norm": 26.343666309605585, "learning_rate": 5.831137048192771e-07, "logits/chosen": -2.639453172683716, "logits/rejected": -2.7554688453674316, "logps/chosen": -424.4125061035156, "logps/rejected": -449.67498779296875, "loss": 0.0543, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.7129883766174316, "rewards/margins": 7.757421970367432, "rewards/rejected": -11.474218368530273, "step": 4430 }, { "epoch": 1.6716867469879517, "grad_norm": 60.411059035194995, "learning_rate": 5.821724397590361e-07, "logits/chosen": -2.5921874046325684, "logits/rejected": -2.6988282203674316, "logps/chosen": -472.875, "logps/rejected": -469.7250061035156, "loss": 0.0799, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.399706840515137, "rewards/margins": 7.823046684265137, "rewards/rejected": -12.221094131469727, "step": 4440 }, { "epoch": 1.6754518072289155, "grad_norm": 35.38846915550998, "learning_rate": 5.812311746987951e-07, "logits/chosen": -2.650390625, "logits/rejected": -2.765625, "logps/chosen": -450.45001220703125, "logps/rejected": -468.70001220703125, "loss": 0.0468, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.158886909484863, "rewards/margins": 8.518750190734863, "rewards/rejected": -12.676562309265137, "step": 4450 }, { "epoch": 1.6792168674698795, "grad_norm": 36.97330770224889, "learning_rate": 5.802899096385542e-07, "logits/chosen": -2.5804686546325684, "logits/rejected": -2.757617235183716, "logps/chosen": -459.3500061035156, "logps/rejected": -460.8999938964844, "loss": 0.1, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.2451171875, "rewards/margins": 7.122656345367432, "rewards/rejected": -12.372655868530273, "step": 4460 }, { "epoch": 1.6829819277108435, "grad_norm": 77.22247378593761, "learning_rate": 5.793486445783132e-07, "logits/chosen": -2.5093750953674316, "logits/rejected": -2.6908202171325684, "logps/chosen": -525.0750122070312, "logps/rejected": -461.875, "loss": 0.0703, "rewards/accuracies": 0.96875, "rewards/chosen": -4.673730373382568, "rewards/margins": 8.172656059265137, "rewards/rejected": -12.845312118530273, "step": 4470 }, { "epoch": 1.6867469879518073, "grad_norm": 12.857826203555105, "learning_rate": 5.784073795180723e-07, "logits/chosen": -2.531054735183716, "logits/rejected": -2.6126952171325684, "logps/chosen": -485.2749938964844, "logps/rejected": -476.0, "loss": 0.1028, "rewards/accuracies": 0.96875, "rewards/chosen": -4.158398628234863, "rewards/margins": 8.514062881469727, "rewards/rejected": -12.673437118530273, "step": 4480 }, { "epoch": 1.6905120481927711, "grad_norm": 32.16202810922273, "learning_rate": 5.774661144578313e-07, "logits/chosen": -2.494335889816284, "logits/rejected": -2.658203125, "logps/chosen": -429.2124938964844, "logps/rejected": -459.57501220703125, "loss": 0.0578, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.27099609375, "rewards/margins": 7.807031154632568, "rewards/rejected": -11.072656631469727, "step": 4490 }, { "epoch": 1.694277108433735, "grad_norm": 58.49701298819829, "learning_rate": 5.765248493975904e-07, "logits/chosen": -2.6478514671325684, "logits/rejected": -2.669921875, "logps/chosen": -456.01251220703125, "logps/rejected": -473.32501220703125, "loss": 0.0724, "rewards/accuracies": 0.96875, "rewards/chosen": -3.90673828125, "rewards/margins": 7.338281154632568, "rewards/rejected": -11.247655868530273, "step": 4500 }, { "epoch": 1.6980421686746987, "grad_norm": 77.14279486668802, "learning_rate": 5.755835843373493e-07, "logits/chosen": -2.6068358421325684, "logits/rejected": -2.6947264671325684, "logps/chosen": -463.0, "logps/rejected": -446.1499938964844, "loss": 0.0566, "rewards/accuracies": 0.96875, "rewards/chosen": -3.3140625953674316, "rewards/margins": 7.426953315734863, "rewards/rejected": -10.741406440734863, "step": 4510 }, { "epoch": 1.7018072289156625, "grad_norm": 78.23101968093022, "learning_rate": 5.746423192771084e-07, "logits/chosen": -2.5390625, "logits/rejected": -2.620898485183716, "logps/chosen": -434.2875061035156, "logps/rejected": -477.17498779296875, "loss": 0.0914, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.573925733566284, "rewards/margins": 8.0859375, "rewards/rejected": -11.653124809265137, "step": 4520 }, { "epoch": 1.7055722891566265, "grad_norm": 38.58459646850101, "learning_rate": 5.737010542168675e-07, "logits/chosen": -2.5824217796325684, "logits/rejected": -2.6572265625, "logps/chosen": -441.125, "logps/rejected": -438.95001220703125, "loss": 0.0977, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.228808641433716, "rewards/margins": 7.258203029632568, "rewards/rejected": -10.4765625, "step": 4530 }, { "epoch": 1.7093373493975905, "grad_norm": 19.060589638914923, "learning_rate": 5.727597891566265e-07, "logits/chosen": -2.4677734375, "logits/rejected": -2.607421875, "logps/chosen": -457.3500061035156, "logps/rejected": -431.1499938964844, "loss": 0.0779, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.8738770484924316, "rewards/margins": 7.832421779632568, "rewards/rejected": -10.70703125, "step": 4540 }, { "epoch": 1.7131024096385543, "grad_norm": 2.869372074376057, "learning_rate": 5.718185240963855e-07, "logits/chosen": -2.534960985183716, "logits/rejected": -2.482226610183716, "logps/chosen": -489.29998779296875, "logps/rejected": -466.5, "loss": 0.0689, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.054931640625, "rewards/margins": 7.784765720367432, "rewards/rejected": -10.84375, "step": 4550 }, { "epoch": 1.716867469879518, "grad_norm": 40.85943708875954, "learning_rate": 5.708772590361446e-07, "logits/chosen": -2.43359375, "logits/rejected": -2.583984375, "logps/chosen": -472.42498779296875, "logps/rejected": -459.8500061035156, "loss": 0.0363, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.3650879859924316, "rewards/margins": 7.969531059265137, "rewards/rejected": -11.326562881469727, "step": 4560 }, { "epoch": 1.720632530120482, "grad_norm": 119.65621796807187, "learning_rate": 5.699359939759037e-07, "logits/chosen": -2.516796827316284, "logits/rejected": -2.618945360183716, "logps/chosen": -437.9125061035156, "logps/rejected": -445.3500061035156, "loss": 0.0749, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.700976610183716, "rewards/margins": 7.790234565734863, "rewards/rejected": -11.491406440734863, "step": 4570 }, { "epoch": 1.7243975903614457, "grad_norm": 31.222302883786522, "learning_rate": 5.689947289156625e-07, "logits/chosen": -2.507617235183716, "logits/rejected": -2.578125, "logps/chosen": -467.4750061035156, "logps/rejected": -427.5, "loss": 0.0678, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.741943359375, "rewards/margins": 7.687890529632568, "rewards/rejected": -11.422656059265137, "step": 4580 }, { "epoch": 1.7281626506024095, "grad_norm": 20.329121098622988, "learning_rate": 5.680534638554216e-07, "logits/chosen": -2.504687547683716, "logits/rejected": -2.5550780296325684, "logps/chosen": -490.45001220703125, "logps/rejected": -486.70001220703125, "loss": 0.0424, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.588000535964966, "rewards/margins": 8.0, "rewards/rejected": -11.590624809265137, "step": 4590 }, { "epoch": 1.7319277108433735, "grad_norm": 6.592136509671126, "learning_rate": 5.671121987951807e-07, "logits/chosen": -2.628124952316284, "logits/rejected": -2.66015625, "logps/chosen": -409.4125061035156, "logps/rejected": -447.70001220703125, "loss": 0.0881, "rewards/accuracies": 0.96875, "rewards/chosen": -4.239941596984863, "rewards/margins": 7.557812690734863, "rewards/rejected": -11.795312881469727, "step": 4600 }, { "epoch": 1.7356927710843375, "grad_norm": 17.654896635022006, "learning_rate": 5.661709337349398e-07, "logits/chosen": -2.6187500953674316, "logits/rejected": -2.6025390625, "logps/chosen": -441.57501220703125, "logps/rejected": -526.2000122070312, "loss": 0.0745, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.114062309265137, "rewards/margins": 8.700780868530273, "rewards/rejected": -12.814844131469727, "step": 4610 }, { "epoch": 1.7394578313253013, "grad_norm": 75.16972815499815, "learning_rate": 5.652296686746987e-07, "logits/chosen": -2.5433592796325684, "logits/rejected": -2.623828172683716, "logps/chosen": -481.88751220703125, "logps/rejected": -468.95001220703125, "loss": 0.0983, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.7945313453674316, "rewards/margins": 7.525000095367432, "rewards/rejected": -11.328125, "step": 4620 }, { "epoch": 1.743222891566265, "grad_norm": 33.88312345823206, "learning_rate": 5.642884036144578e-07, "logits/chosen": -2.6001954078674316, "logits/rejected": -2.748828172683716, "logps/chosen": -447.3999938964844, "logps/rejected": -458.79998779296875, "loss": 0.0581, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.149706840515137, "rewards/margins": 7.810937404632568, "rewards/rejected": -11.960156440734863, "step": 4630 }, { "epoch": 1.7469879518072289, "grad_norm": 36.46079712219618, "learning_rate": 5.633471385542169e-07, "logits/chosen": -2.6044921875, "logits/rejected": -2.705859422683716, "logps/chosen": -474.125, "logps/rejected": -441.625, "loss": 0.0695, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.255468845367432, "rewards/margins": 8.358593940734863, "rewards/rejected": -12.611719131469727, "step": 4640 }, { "epoch": 1.7507530120481927, "grad_norm": 11.010764465728421, "learning_rate": 5.624058734939759e-07, "logits/chosen": -2.6128907203674316, "logits/rejected": -2.7152342796325684, "logps/chosen": -482.92498779296875, "logps/rejected": -471.17498779296875, "loss": 0.0784, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.035742282867432, "rewards/margins": 8.342187881469727, "rewards/rejected": -12.381250381469727, "step": 4650 }, { "epoch": 1.7545180722891565, "grad_norm": 30.25082830527258, "learning_rate": 5.614646084337349e-07, "logits/chosen": -2.5074219703674316, "logits/rejected": -2.6734375953674316, "logps/chosen": -462.04998779296875, "logps/rejected": -483.70001220703125, "loss": 0.065, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.356249809265137, "rewards/margins": 7.807421684265137, "rewards/rejected": -12.162500381469727, "step": 4660 }, { "epoch": 1.7582831325301205, "grad_norm": 39.749024952654274, "learning_rate": 5.605233433734939e-07, "logits/chosen": -2.6402344703674316, "logits/rejected": -2.6884765625, "logps/chosen": -487.6499938964844, "logps/rejected": -528.0, "loss": 0.0424, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.219336032867432, "rewards/margins": 7.737890720367432, "rewards/rejected": -12.962499618530273, "step": 4670 }, { "epoch": 1.7620481927710845, "grad_norm": 12.699700909633515, "learning_rate": 5.59582078313253e-07, "logits/chosen": -2.671093702316284, "logits/rejected": -2.7730469703674316, "logps/chosen": -551.0125122070312, "logps/rejected": -523.6500244140625, "loss": 0.0553, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.885156154632568, "rewards/margins": 8.764843940734863, "rewards/rejected": -13.6484375, "step": 4680 }, { "epoch": 1.7658132530120483, "grad_norm": 53.825335654441915, "learning_rate": 5.586408132530121e-07, "logits/chosen": -2.616406202316284, "logits/rejected": -2.7757811546325684, "logps/chosen": -446.625, "logps/rejected": -446.0249938964844, "loss": 0.1012, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.785058498382568, "rewards/margins": 8.249608993530273, "rewards/rejected": -13.03125, "step": 4690 }, { "epoch": 1.769578313253012, "grad_norm": 14.866476377376303, "learning_rate": 5.576995481927711e-07, "logits/chosen": -2.4410157203674316, "logits/rejected": -2.457812547683716, "logps/chosen": -468.29998779296875, "logps/rejected": -493.1000061035156, "loss": 0.0395, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.37841796875, "rewards/margins": 8.667187690734863, "rewards/rejected": -13.053125381469727, "step": 4700 }, { "epoch": 1.7733433734939759, "grad_norm": 21.00061317392406, "learning_rate": 5.567582831325301e-07, "logits/chosen": -2.60546875, "logits/rejected": -2.7054686546325684, "logps/chosen": -452.67498779296875, "logps/rejected": -454.75, "loss": 0.1512, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.8053221702575684, "rewards/margins": 7.6318359375, "rewards/rejected": -11.435155868530273, "step": 4710 }, { "epoch": 1.7771084337349397, "grad_norm": 43.417215417351066, "learning_rate": 5.558170180722891e-07, "logits/chosen": -2.580859422683716, "logits/rejected": -2.6343750953674316, "logps/chosen": -449.9375, "logps/rejected": -451.70001220703125, "loss": 0.0454, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.172558784484863, "rewards/margins": 8.243359565734863, "rewards/rejected": -12.417187690734863, "step": 4720 }, { "epoch": 1.7808734939759037, "grad_norm": 33.750789450512904, "learning_rate": 5.548757530120481e-07, "logits/chosen": -2.5693359375, "logits/rejected": -2.6148438453674316, "logps/chosen": -502.20001220703125, "logps/rejected": -478.79998779296875, "loss": 0.096, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.029980659484863, "rewards/margins": 8.01953125, "rewards/rejected": -13.051562309265137, "step": 4730 }, { "epoch": 1.7846385542168675, "grad_norm": 2.5125495882374347, "learning_rate": 5.539344879518072e-07, "logits/chosen": -2.526562452316284, "logits/rejected": -2.556445360183716, "logps/chosen": -437.9750061035156, "logps/rejected": -477.0249938964844, "loss": 0.0624, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.103906154632568, "rewards/margins": 8.08349609375, "rewards/rejected": -12.188281059265137, "step": 4740 }, { "epoch": 1.7884036144578315, "grad_norm": 14.114359660620044, "learning_rate": 5.529932228915663e-07, "logits/chosen": -2.678515672683716, "logits/rejected": -2.7242188453674316, "logps/chosen": -498.42498779296875, "logps/rejected": -479.5, "loss": 0.052, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.029296875, "rewards/margins": 8.44921875, "rewards/rejected": -13.483593940734863, "step": 4750 }, { "epoch": 1.7921686746987953, "grad_norm": 14.865396967616082, "learning_rate": 5.520519578313253e-07, "logits/chosen": -2.729687452316284, "logits/rejected": -2.748242139816284, "logps/chosen": -444.0, "logps/rejected": -452.1499938964844, "loss": 0.1249, "rewards/accuracies": 0.9375, "rewards/chosen": -5.28759765625, "rewards/margins": 7.757031440734863, "rewards/rejected": -13.040624618530273, "step": 4760 }, { "epoch": 1.795933734939759, "grad_norm": 6.833457066140748, "learning_rate": 5.511106927710843e-07, "logits/chosen": -2.7802734375, "logits/rejected": -2.7632813453674316, "logps/chosen": -505.7749938964844, "logps/rejected": -491.875, "loss": 0.0535, "rewards/accuracies": 0.96875, "rewards/chosen": -5.485156059265137, "rewards/margins": 8.06640625, "rewards/rejected": -13.553125381469727, "step": 4770 }, { "epoch": 1.7996987951807228, "grad_norm": 165.5460714726704, "learning_rate": 5.501694277108434e-07, "logits/chosen": -2.561328172683716, "logits/rejected": -2.6468749046325684, "logps/chosen": -422.57501220703125, "logps/rejected": -442.8500061035156, "loss": 0.0765, "rewards/accuracies": 0.96875, "rewards/chosen": -5.311327934265137, "rewards/margins": 7.649218559265137, "rewards/rejected": -12.957812309265137, "step": 4780 }, { "epoch": 1.8034638554216866, "grad_norm": 56.06264761821045, "learning_rate": 5.492281626506024e-07, "logits/chosen": -2.70703125, "logits/rejected": -2.6822266578674316, "logps/chosen": -436.79998779296875, "logps/rejected": -457.2749938964844, "loss": 0.0943, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.566796779632568, "rewards/margins": 7.671093940734863, "rewards/rejected": -13.245312690734863, "step": 4790 }, { "epoch": 1.8072289156626506, "grad_norm": 14.258401816790656, "learning_rate": 5.482868975903614e-07, "logits/chosen": -2.6371092796325684, "logits/rejected": -2.762890577316284, "logps/chosen": -487.875, "logps/rejected": -455.625, "loss": 0.0446, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.701171875, "rewards/margins": 8.326171875, "rewards/rejected": -13.02734375, "step": 4800 }, { "epoch": 1.8109939759036144, "grad_norm": 4.348209109716069, "learning_rate": 5.473456325301204e-07, "logits/chosen": -2.728320360183716, "logits/rejected": -2.7437500953674316, "logps/chosen": -459.95001220703125, "logps/rejected": -495.79998779296875, "loss": 0.0721, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.580078125, "rewards/margins": 7.672265529632568, "rewards/rejected": -12.252344131469727, "step": 4810 }, { "epoch": 1.8147590361445785, "grad_norm": 51.83787004644817, "learning_rate": 5.464043674698795e-07, "logits/chosen": -2.6099610328674316, "logits/rejected": -2.7691407203674316, "logps/chosen": -486.82501220703125, "logps/rejected": -458.8999938964844, "loss": 0.1063, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.170117378234863, "rewards/margins": 7.833593845367432, "rewards/rejected": -12.005468368530273, "step": 4820 }, { "epoch": 1.8185240963855422, "grad_norm": 31.82871332705769, "learning_rate": 5.454631024096386e-07, "logits/chosen": -2.7021484375, "logits/rejected": -2.7216796875, "logps/chosen": -402.92498779296875, "logps/rejected": -408.2749938964844, "loss": 0.092, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.1015625, "rewards/margins": 7.619921684265137, "rewards/rejected": -11.723437309265137, "step": 4830 }, { "epoch": 1.822289156626506, "grad_norm": 8.90529383557445, "learning_rate": 5.445218373493977e-07, "logits/chosen": -2.687304735183716, "logits/rejected": -2.7822265625, "logps/chosen": -482.04998779296875, "logps/rejected": -465.54998779296875, "loss": 0.0291, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.474413871765137, "rewards/margins": 8.259374618530273, "rewards/rejected": -12.739062309265137, "step": 4840 }, { "epoch": 1.8260542168674698, "grad_norm": 69.12485945711767, "learning_rate": 5.435805722891565e-07, "logits/chosen": -2.6791014671325684, "logits/rejected": -2.7886719703674316, "logps/chosen": -443.7250061035156, "logps/rejected": -446.1000061035156, "loss": 0.0965, "rewards/accuracies": 0.96875, "rewards/chosen": -4.205273628234863, "rewards/margins": 8.409765243530273, "rewards/rejected": -12.610937118530273, "step": 4850 }, { "epoch": 1.8298192771084336, "grad_norm": 76.18742208393154, "learning_rate": 5.426393072289156e-07, "logits/chosen": -2.654492139816284, "logits/rejected": -2.6910157203674316, "logps/chosen": -470.9750061035156, "logps/rejected": -476.29998779296875, "loss": 0.0942, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.786181449890137, "rewards/margins": 7.931640625, "rewards/rejected": -12.717187881469727, "step": 4860 }, { "epoch": 1.8335843373493976, "grad_norm": 5.767597226332029, "learning_rate": 5.416980421686747e-07, "logits/chosen": -2.612109422683716, "logits/rejected": -2.742968797683716, "logps/chosen": -500.8125, "logps/rejected": -481.5249938964844, "loss": 0.023, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.785742282867432, "rewards/margins": 7.994531154632568, "rewards/rejected": -12.7890625, "step": 4870 }, { "epoch": 1.8373493975903614, "grad_norm": 2.3953144824221586, "learning_rate": 5.407567771084337e-07, "logits/chosen": -2.5628905296325684, "logits/rejected": -2.6044921875, "logps/chosen": -449.32501220703125, "logps/rejected": -472.8500061035156, "loss": 0.0284, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.460741996765137, "rewards/margins": 8.196093559265137, "rewards/rejected": -12.657812118530273, "step": 4880 }, { "epoch": 1.8411144578313254, "grad_norm": 119.12086612200277, "learning_rate": 5.398155120481927e-07, "logits/chosen": -2.614453077316284, "logits/rejected": -2.645703077316284, "logps/chosen": -457.3500061035156, "logps/rejected": -481.8500061035156, "loss": 0.0859, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.7752685546875, "rewards/margins": 8.499608993530273, "rewards/rejected": -13.276562690734863, "step": 4890 }, { "epoch": 1.8448795180722892, "grad_norm": 5.528323513222679, "learning_rate": 5.388742469879518e-07, "logits/chosen": -2.6429686546325684, "logits/rejected": -2.646289110183716, "logps/chosen": -439.7875061035156, "logps/rejected": -482.3999938964844, "loss": 0.1244, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.338476657867432, "rewards/margins": 8.231249809265137, "rewards/rejected": -13.567187309265137, "step": 4900 }, { "epoch": 1.848644578313253, "grad_norm": 28.980070960519615, "learning_rate": 5.379329819277109e-07, "logits/chosen": -2.6783204078674316, "logits/rejected": -2.749218702316284, "logps/chosen": -486.3500061035156, "logps/rejected": -519.2999877929688, "loss": 0.0667, "rewards/accuracies": 0.96875, "rewards/chosen": -4.844140529632568, "rewards/margins": 9.078906059265137, "rewards/rejected": -13.9140625, "step": 4910 }, { "epoch": 1.8524096385542168, "grad_norm": 4.7039935694447905, "learning_rate": 5.369917168674698e-07, "logits/chosen": -2.607421875, "logits/rejected": -2.6791014671325684, "logps/chosen": -435.625, "logps/rejected": -429.1499938964844, "loss": 0.0385, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.412890434265137, "rewards/margins": 8.505468368530273, "rewards/rejected": -13.915624618530273, "step": 4920 }, { "epoch": 1.8561746987951806, "grad_norm": 24.341926071662087, "learning_rate": 5.360504518072288e-07, "logits/chosen": -2.7220702171325684, "logits/rejected": -2.7464842796325684, "logps/chosen": -457.42498779296875, "logps/rejected": -465.79998779296875, "loss": 0.0793, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.390234470367432, "rewards/margins": 8.103124618530273, "rewards/rejected": -13.4921875, "step": 4930 }, { "epoch": 1.8599397590361446, "grad_norm": 9.140149556129895, "learning_rate": 5.351091867469879e-07, "logits/chosen": -2.6312499046325684, "logits/rejected": -2.7330079078674316, "logps/chosen": -497.54998779296875, "logps/rejected": -500.20001220703125, "loss": 0.0391, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.076562404632568, "rewards/margins": 8.455469131469727, "rewards/rejected": -14.534375190734863, "step": 4940 }, { "epoch": 1.8637048192771084, "grad_norm": 82.83588071989257, "learning_rate": 5.34167921686747e-07, "logits/chosen": -2.651562452316284, "logits/rejected": -2.758007764816284, "logps/chosen": -496.8999938964844, "logps/rejected": -495.29998779296875, "loss": 0.1335, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.441308498382568, "rewards/margins": 8.079687118530273, "rewards/rejected": -13.5234375, "step": 4950 }, { "epoch": 1.8674698795180724, "grad_norm": 5.8720778358361185, "learning_rate": 5.33226656626506e-07, "logits/chosen": -2.644726514816284, "logits/rejected": -2.7744140625, "logps/chosen": -501.82501220703125, "logps/rejected": -470.17498779296875, "loss": 0.0227, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.623242378234863, "rewards/margins": 8.453906059265137, "rewards/rejected": -14.079687118530273, "step": 4960 }, { "epoch": 1.8712349397590362, "grad_norm": 19.04531688904279, "learning_rate": 5.32285391566265e-07, "logits/chosen": -2.6947264671325684, "logits/rejected": -2.711718797683716, "logps/chosen": -423.1499938964844, "logps/rejected": -446.625, "loss": 0.086, "rewards/accuracies": 0.96875, "rewards/chosen": -4.96826171875, "rewards/margins": 7.596484184265137, "rewards/rejected": -12.560937881469727, "step": 4970 }, { "epoch": 1.875, "grad_norm": 11.584412123793571, "learning_rate": 5.313441265060241e-07, "logits/chosen": -2.722851514816284, "logits/rejected": -2.7919921875, "logps/chosen": -446.6000061035156, "logps/rejected": -450.0249938964844, "loss": 0.0531, "rewards/accuracies": 0.96875, "rewards/chosen": -3.796398878097534, "rewards/margins": 8.082812309265137, "rewards/rejected": -11.875, "step": 4980 }, { "epoch": 1.8787650602409638, "grad_norm": 91.61671213555458, "learning_rate": 5.304028614457831e-07, "logits/chosen": -2.6744141578674316, "logits/rejected": -2.8121094703674316, "logps/chosen": -411.2749938964844, "logps/rejected": -385.79998779296875, "loss": 0.0726, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.6270508766174316, "rewards/margins": 7.585546970367432, "rewards/rejected": -11.213281631469727, "step": 4990 }, { "epoch": 1.8825301204819276, "grad_norm": 17.147479559577665, "learning_rate": 5.294615963855421e-07, "logits/chosen": -2.653515577316284, "logits/rejected": -2.770703077316284, "logps/chosen": -482.0, "logps/rejected": -447.6000061035156, "loss": 0.1144, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.114599704742432, "rewards/margins": 7.618554592132568, "rewards/rejected": -11.735937118530273, "step": 5000 }, { "epoch": 1.8862951807228916, "grad_norm": 11.44653834176385, "learning_rate": 5.285203313253012e-07, "logits/chosen": -2.7138671875, "logits/rejected": -2.7939453125, "logps/chosen": -444.54998779296875, "logps/rejected": -445.2749938964844, "loss": 0.1043, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.185156345367432, "rewards/margins": 7.594531059265137, "rewards/rejected": -12.779687881469727, "step": 5010 }, { "epoch": 1.8900602409638554, "grad_norm": 64.19678111315162, "learning_rate": 5.275790662650602e-07, "logits/chosen": -2.6410155296325684, "logits/rejected": -2.7455077171325684, "logps/chosen": -475.3999938964844, "logps/rejected": -442.375, "loss": 0.0846, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.853271484375, "rewards/margins": 7.576562404632568, "rewards/rejected": -12.425000190734863, "step": 5020 }, { "epoch": 1.8938253012048194, "grad_norm": 29.1287556139155, "learning_rate": 5.266378012048193e-07, "logits/chosen": -2.6197266578674316, "logits/rejected": -2.7035155296325684, "logps/chosen": -505.375, "logps/rejected": -514.8499755859375, "loss": 0.0684, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.683203220367432, "rewards/margins": 8.358593940734863, "rewards/rejected": -13.05078125, "step": 5030 }, { "epoch": 1.8975903614457832, "grad_norm": 15.740064137674924, "learning_rate": 5.256965361445783e-07, "logits/chosen": -2.575000047683716, "logits/rejected": -2.6703124046325684, "logps/chosen": -498.3500061035156, "logps/rejected": -491.04998779296875, "loss": 0.0408, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.9306640625, "rewards/margins": 7.903906345367432, "rewards/rejected": -12.829687118530273, "step": 5040 }, { "epoch": 1.901355421686747, "grad_norm": 8.198537139904708, "learning_rate": 5.247552710843374e-07, "logits/chosen": -2.705273389816284, "logits/rejected": -2.739062547683716, "logps/chosen": -492.54998779296875, "logps/rejected": -501.54998779296875, "loss": 0.0793, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.110302925109863, "rewards/margins": 8.571093559265137, "rewards/rejected": -13.684374809265137, "step": 5050 }, { "epoch": 1.9051204819277108, "grad_norm": 61.299280415767726, "learning_rate": 5.238140060240963e-07, "logits/chosen": -2.5716795921325684, "logits/rejected": -2.641406297683716, "logps/chosen": -438.42498779296875, "logps/rejected": -457.6499938964844, "loss": 0.0398, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.739941596984863, "rewards/margins": 8.216405868530273, "rewards/rejected": -12.959375381469727, "step": 5060 }, { "epoch": 1.9088855421686746, "grad_norm": 6.985570859634671, "learning_rate": 5.228727409638553e-07, "logits/chosen": -2.572070360183716, "logits/rejected": -2.7181639671325684, "logps/chosen": -425.57501220703125, "logps/rejected": -421.6499938964844, "loss": 0.0322, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.830371141433716, "rewards/margins": 8.138280868530273, "rewards/rejected": -11.965624809265137, "step": 5070 }, { "epoch": 1.9126506024096386, "grad_norm": 45.850227648599486, "learning_rate": 5.219314759036144e-07, "logits/chosen": -2.651562452316284, "logits/rejected": -2.697070360183716, "logps/chosen": -424.75, "logps/rejected": -437.67498779296875, "loss": 0.0863, "rewards/accuracies": 0.96875, "rewards/chosen": -3.904589891433716, "rewards/margins": 7.579297065734863, "rewards/rejected": -11.478124618530273, "step": 5080 }, { "epoch": 1.9164156626506024, "grad_norm": 23.25756528213225, "learning_rate": 5.209902108433735e-07, "logits/chosen": -2.7367186546325684, "logits/rejected": -2.7542967796325684, "logps/chosen": -430.04998779296875, "logps/rejected": -428.04998779296875, "loss": 0.0608, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.800097703933716, "rewards/margins": 7.637499809265137, "rewards/rejected": -11.435937881469727, "step": 5090 }, { "epoch": 1.9201807228915664, "grad_norm": 13.869680630625695, "learning_rate": 5.200489457831326e-07, "logits/chosen": -2.6673827171325684, "logits/rejected": -2.7074217796325684, "logps/chosen": -471.875, "logps/rejected": -482.75, "loss": 0.0632, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.5203857421875, "rewards/margins": 8.103515625, "rewards/rejected": -11.624218940734863, "step": 5100 }, { "epoch": 1.9239457831325302, "grad_norm": 17.260023538293694, "learning_rate": 5.191076807228915e-07, "logits/chosen": -2.746289014816284, "logits/rejected": -2.8316407203674316, "logps/chosen": -375.04998779296875, "logps/rejected": -409.54998779296875, "loss": 0.0679, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.412304639816284, "rewards/margins": 8.002734184265137, "rewards/rejected": -11.421093940734863, "step": 5110 }, { "epoch": 1.927710843373494, "grad_norm": 42.12743730189279, "learning_rate": 5.181664156626506e-07, "logits/chosen": -2.823437452316284, "logits/rejected": -2.895312547683716, "logps/chosen": -422.3999938964844, "logps/rejected": -438.2749938964844, "loss": 0.0794, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.609375, "rewards/margins": 7.551562309265137, "rewards/rejected": -11.157031059265137, "step": 5120 }, { "epoch": 1.9314759036144578, "grad_norm": 24.615375660848716, "learning_rate": 5.172251506024096e-07, "logits/chosen": -2.6781249046325684, "logits/rejected": -2.7416014671325684, "logps/chosen": -444.8500061035156, "logps/rejected": -443.5, "loss": 0.0381, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.71923828125, "rewards/margins": 8.103124618530273, "rewards/rejected": -11.8203125, "step": 5130 }, { "epoch": 1.9352409638554215, "grad_norm": 22.768161294553504, "learning_rate": 5.162838855421687e-07, "logits/chosen": -2.757031202316284, "logits/rejected": -2.817187547683716, "logps/chosen": -445.5249938964844, "logps/rejected": -422.625, "loss": 0.0594, "rewards/accuracies": 0.96875, "rewards/chosen": -4.131445407867432, "rewards/margins": 7.65625, "rewards/rejected": -11.785937309265137, "step": 5140 }, { "epoch": 1.9390060240963856, "grad_norm": 81.72259362617366, "learning_rate": 5.153426204819276e-07, "logits/chosen": -2.6435546875, "logits/rejected": -2.7730469703674316, "logps/chosen": -473.6499938964844, "logps/rejected": -456.1499938964844, "loss": 0.0749, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.406054496765137, "rewards/margins": 8.37890625, "rewards/rejected": -12.785937309265137, "step": 5150 }, { "epoch": 1.9427710843373494, "grad_norm": 2.870322854248886, "learning_rate": 5.144013554216867e-07, "logits/chosen": -2.6019530296325684, "logits/rejected": -2.6683592796325684, "logps/chosen": -478.3500061035156, "logps/rejected": -475.7250061035156, "loss": 0.0349, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.793359279632568, "rewards/margins": 8.237500190734863, "rewards/rejected": -13.026562690734863, "step": 5160 }, { "epoch": 1.9465361445783134, "grad_norm": 49.91688769524012, "learning_rate": 5.134600903614458e-07, "logits/chosen": -2.613085985183716, "logits/rejected": -2.681445360183716, "logps/chosen": -461.2749938964844, "logps/rejected": -489.54998779296875, "loss": 0.0586, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.806054592132568, "rewards/margins": 8.075390815734863, "rewards/rejected": -12.885937690734863, "step": 5170 }, { "epoch": 1.9503012048192772, "grad_norm": 1.1599130644831173, "learning_rate": 5.125188253012049e-07, "logits/chosen": -2.6820311546325684, "logits/rejected": -2.8031249046325684, "logps/chosen": -489.54998779296875, "logps/rejected": -475.8500061035156, "loss": 0.0753, "rewards/accuracies": 0.96875, "rewards/chosen": -5.104199409484863, "rewards/margins": 8.723437309265137, "rewards/rejected": -13.832812309265137, "step": 5180 }, { "epoch": 1.954066265060241, "grad_norm": 7.87318305319562, "learning_rate": 5.115775602409638e-07, "logits/chosen": -2.6787109375, "logits/rejected": -2.7855467796325684, "logps/chosen": -413.45001220703125, "logps/rejected": -472.75, "loss": 0.066, "rewards/accuracies": 0.96875, "rewards/chosen": -5.394140720367432, "rewards/margins": 8.408984184265137, "rewards/rejected": -13.803125381469727, "step": 5190 }, { "epoch": 1.9578313253012047, "grad_norm": 48.956674131458826, "learning_rate": 5.106362951807228e-07, "logits/chosen": -2.7064452171325684, "logits/rejected": -2.7865233421325684, "logps/chosen": -443.32501220703125, "logps/rejected": -477.07501220703125, "loss": 0.0999, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.133398532867432, "rewards/margins": 8.423437118530273, "rewards/rejected": -13.560937881469727, "step": 5200 }, { "epoch": 1.9615963855421685, "grad_norm": 3.025439345515415, "learning_rate": 5.096950301204819e-07, "logits/chosen": -2.5990233421325684, "logits/rejected": -2.713671922683716, "logps/chosen": -498.75, "logps/rejected": -483.45001220703125, "loss": 0.0671, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.648193359375, "rewards/margins": 8.876562118530273, "rewards/rejected": -13.524999618530273, "step": 5210 }, { "epoch": 1.9653614457831325, "grad_norm": 15.68569216473401, "learning_rate": 5.087537650602409e-07, "logits/chosen": -2.6158204078674316, "logits/rejected": -2.7138671875, "logps/chosen": -447.3374938964844, "logps/rejected": -527.5, "loss": 0.0679, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.552343845367432, "rewards/margins": 8.678906440734863, "rewards/rejected": -13.223437309265137, "step": 5220 }, { "epoch": 1.9691265060240963, "grad_norm": 17.197970760264468, "learning_rate": 5.078125e-07, "logits/chosen": -2.5806641578674316, "logits/rejected": -2.709765672683716, "logps/chosen": -474.57501220703125, "logps/rejected": -480.3500061035156, "loss": 0.0383, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.921954393386841, "rewards/margins": 8.392187118530273, "rewards/rejected": -12.31640625, "step": 5230 }, { "epoch": 1.9728915662650603, "grad_norm": 31.789709282145644, "learning_rate": 5.06871234939759e-07, "logits/chosen": -2.551562547683716, "logits/rejected": -2.695117235183716, "logps/chosen": -414.2124938964844, "logps/rejected": -422.9750061035156, "loss": 0.072, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.7154297828674316, "rewards/margins": 7.504296779632568, "rewards/rejected": -11.221875190734863, "step": 5240 }, { "epoch": 1.9766566265060241, "grad_norm": 66.09119649638215, "learning_rate": 5.059299698795181e-07, "logits/chosen": -2.687695264816284, "logits/rejected": -2.825000047683716, "logps/chosen": -437.7250061035156, "logps/rejected": -435.32501220703125, "loss": 0.0805, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -3.364697217941284, "rewards/margins": 8.123827934265137, "rewards/rejected": -11.4921875, "step": 5250 }, { "epoch": 1.980421686746988, "grad_norm": 53.35752124076411, "learning_rate": 5.049887048192771e-07, "logits/chosen": -2.621289014816284, "logits/rejected": -2.721874952316284, "logps/chosen": -426.1000061035156, "logps/rejected": -413.70001220703125, "loss": 0.0991, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.063281059265137, "rewards/margins": 7.799609184265137, "rewards/rejected": -11.864062309265137, "step": 5260 }, { "epoch": 1.9841867469879517, "grad_norm": 15.079724320921486, "learning_rate": 5.040474397590361e-07, "logits/chosen": -2.5634765625, "logits/rejected": -2.637890577316284, "logps/chosen": -442.3500061035156, "logps/rejected": -450.8999938964844, "loss": 0.065, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.487890720367432, "rewards/margins": 8.299219131469727, "rewards/rejected": -12.791406631469727, "step": 5270 }, { "epoch": 1.9879518072289155, "grad_norm": 52.81543858590121, "learning_rate": 5.031061746987951e-07, "logits/chosen": -2.607421875, "logits/rejected": -2.728710889816284, "logps/chosen": -419.6499938964844, "logps/rejected": -464.0, "loss": 0.0651, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.057226657867432, "rewards/margins": 7.989453315734863, "rewards/rejected": -13.050000190734863, "step": 5280 }, { "epoch": 1.9917168674698795, "grad_norm": 119.49685759075845, "learning_rate": 5.021649096385542e-07, "logits/chosen": -2.611132860183716, "logits/rejected": -2.6675782203674316, "logps/chosen": -455.57501220703125, "logps/rejected": -470.04998779296875, "loss": 0.0689, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.967187404632568, "rewards/margins": 7.967577934265137, "rewards/rejected": -12.932812690734863, "step": 5290 }, { "epoch": 1.9954819277108435, "grad_norm": 46.61916807436691, "learning_rate": 5.012236445783132e-07, "logits/chosen": -2.66796875, "logits/rejected": -2.7115235328674316, "logps/chosen": -424.59375, "logps/rejected": -439.07501220703125, "loss": 0.0696, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.330737113952637, "rewards/margins": 7.566015720367432, "rewards/rejected": -11.892969131469727, "step": 5300 }, { "epoch": 1.9992469879518073, "grad_norm": 25.582031953755457, "learning_rate": 5.002823795180723e-07, "logits/chosen": -2.623242139816284, "logits/rejected": -2.757031202316284, "logps/chosen": -461.92498779296875, "logps/rejected": -466.6000061035156, "loss": 0.0458, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.99169921875, "rewards/margins": 7.989843845367432, "rewards/rejected": -11.975781440734863, "step": 5310 }, { "epoch": 2.003012048192771, "grad_norm": 1.3348044819940406, "learning_rate": 4.993411144578312e-07, "logits/chosen": -2.6734375953674316, "logits/rejected": -2.7855467796325684, "logps/chosen": -440.95001220703125, "logps/rejected": -474.67498779296875, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -4.142748832702637, "rewards/margins": 8.7421875, "rewards/rejected": -12.880468368530273, "step": 5320 }, { "epoch": 2.006777108433735, "grad_norm": 11.688094290880382, "learning_rate": 4.983998493975903e-07, "logits/chosen": -2.550585985183716, "logits/rejected": -2.6669921875, "logps/chosen": -531.2999877929688, "logps/rejected": -497.5, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": -4.019189357757568, "rewards/margins": 9.140625, "rewards/rejected": -13.157812118530273, "step": 5330 }, { "epoch": 2.0105421686746987, "grad_norm": 5.134784110241516, "learning_rate": 4.974585843373494e-07, "logits/chosen": -2.6792969703674316, "logits/rejected": -2.753710985183716, "logps/chosen": -452.6499938964844, "logps/rejected": -475.04998779296875, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -4.419629096984863, "rewards/margins": 8.964062690734863, "rewards/rejected": -13.384374618530273, "step": 5340 }, { "epoch": 2.0143072289156625, "grad_norm": 5.325827623136848, "learning_rate": 4.965173192771085e-07, "logits/chosen": -2.6693358421325684, "logits/rejected": -2.815234422683716, "logps/chosen": -426.875, "logps/rejected": -472.1499938964844, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -4.871484279632568, "rewards/margins": 9.995312690734863, "rewards/rejected": -14.860937118530273, "step": 5350 }, { "epoch": 2.0180722891566263, "grad_norm": 1.2166862060852872, "learning_rate": 4.955760542168675e-07, "logits/chosen": -2.6607422828674316, "logits/rejected": -2.7769532203674316, "logps/chosen": -493.3999938964844, "logps/rejected": -486.6499938964844, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -4.145092964172363, "rewards/margins": 9.534375190734863, "rewards/rejected": -13.675000190734863, "step": 5360 }, { "epoch": 2.0218373493975905, "grad_norm": 10.376627550376273, "learning_rate": 4.946347891566264e-07, "logits/chosen": -2.768359422683716, "logits/rejected": -2.951367139816284, "logps/chosen": -412.45001220703125, "logps/rejected": -440.17498779296875, "loss": 0.0219, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.294921875, "rewards/margins": 9.385156631469727, "rewards/rejected": -13.678906440734863, "step": 5370 }, { "epoch": 2.0256024096385543, "grad_norm": 4.786447926382152, "learning_rate": 4.936935240963855e-07, "logits/chosen": -2.6626954078674316, "logits/rejected": -2.766796827316284, "logps/chosen": -478.67498779296875, "logps/rejected": -494.0, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": -4.388769626617432, "rewards/margins": 10.392969131469727, "rewards/rejected": -14.774999618530273, "step": 5380 }, { "epoch": 2.029367469879518, "grad_norm": 0.5366021335447003, "learning_rate": 4.927522590361445e-07, "logits/chosen": -2.7699217796325684, "logits/rejected": -2.9703125953674316, "logps/chosen": -455.29998779296875, "logps/rejected": -465.6499938964844, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -4.881933689117432, "rewards/margins": 9.33984375, "rewards/rejected": -14.215624809265137, "step": 5390 }, { "epoch": 2.033132530120482, "grad_norm": 154.97946841285622, "learning_rate": 4.918109939759036e-07, "logits/chosen": -2.841796875, "logits/rejected": -2.9828124046325684, "logps/chosen": -488.3999938964844, "logps/rejected": -516.9500122070312, "loss": 0.0501, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.708984375, "rewards/margins": 10.335156440734863, "rewards/rejected": -16.046875, "step": 5400 }, { "epoch": 2.0368975903614457, "grad_norm": 12.239985214155594, "learning_rate": 4.908697289156626e-07, "logits/chosen": -2.7494139671325684, "logits/rejected": -2.9769530296325684, "logps/chosen": -488.7250061035156, "logps/rejected": -500.25, "loss": 0.0266, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.602734565734863, "rewards/margins": 9.828516006469727, "rewards/rejected": -15.432812690734863, "step": 5410 }, { "epoch": 2.0406626506024095, "grad_norm": 2.4854469240299655, "learning_rate": 4.899284638554217e-07, "logits/chosen": -2.65625, "logits/rejected": -2.8785157203674316, "logps/chosen": -468.2749938964844, "logps/rejected": -461.57501220703125, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -5.2763671875, "rewards/margins": 9.653905868530273, "rewards/rejected": -14.931249618530273, "step": 5420 }, { "epoch": 2.0444277108433733, "grad_norm": 10.5034141809338, "learning_rate": 4.889871987951807e-07, "logits/chosen": -2.869140625, "logits/rejected": -2.919726610183716, "logps/chosen": -434.95001220703125, "logps/rejected": -491.04998779296875, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -5.562890529632568, "rewards/margins": 9.51953125, "rewards/rejected": -15.084375381469727, "step": 5430 }, { "epoch": 2.0481927710843375, "grad_norm": 150.9750844589674, "learning_rate": 4.880459337349398e-07, "logits/chosen": -2.8519530296325684, "logits/rejected": -3.010937452316284, "logps/chosen": -516.9625244140625, "logps/rejected": -517.0250244140625, "loss": 0.0347, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.9384765625, "rewards/margins": 9.978124618530273, "rewards/rejected": -14.917187690734863, "step": 5440 }, { "epoch": 2.0519578313253013, "grad_norm": 0.5261544091978303, "learning_rate": 4.871046686746988e-07, "logits/chosen": -2.974609375, "logits/rejected": -3.096874952316284, "logps/chosen": -441.2124938964844, "logps/rejected": -469.42498779296875, "loss": 0.0214, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.326952934265137, "rewards/margins": 9.911718368530273, "rewards/rejected": -15.240625381469727, "step": 5450 }, { "epoch": 2.055722891566265, "grad_norm": 0.5050419500535277, "learning_rate": 4.861634036144578e-07, "logits/chosen": -2.829296827316284, "logits/rejected": -2.986328125, "logps/chosen": -479.20001220703125, "logps/rejected": -488.6000061035156, "loss": 0.0207, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.879296779632568, "rewards/margins": 9.391406059265137, "rewards/rejected": -15.264062881469727, "step": 5460 }, { "epoch": 2.059487951807229, "grad_norm": 3.25700317603491, "learning_rate": 4.852221385542168e-07, "logits/chosen": -2.7720704078674316, "logits/rejected": -2.966015577316284, "logps/chosen": -473.0249938964844, "logps/rejected": -483.95001220703125, "loss": 0.0191, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.914648532867432, "rewards/margins": 9.417187690734863, "rewards/rejected": -14.3359375, "step": 5470 }, { "epoch": 2.0632530120481927, "grad_norm": 1.1315136009600686, "learning_rate": 4.842808734939759e-07, "logits/chosen": -2.7574219703674316, "logits/rejected": -2.8896484375, "logps/chosen": -450.0, "logps/rejected": -504.8999938964844, "loss": 0.0188, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.140820503234863, "rewards/margins": 10.082812309265137, "rewards/rejected": -15.225000381469727, "step": 5480 }, { "epoch": 2.0670180722891565, "grad_norm": 3.705162391307205, "learning_rate": 4.833396084337349e-07, "logits/chosen": -2.7513670921325684, "logits/rejected": -2.8623046875, "logps/chosen": -481.25, "logps/rejected": -477.0, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -5.507616996765137, "rewards/margins": 10.103906631469727, "rewards/rejected": -15.604687690734863, "step": 5490 }, { "epoch": 2.0707831325301207, "grad_norm": 0.9383226889553943, "learning_rate": 4.823983433734939e-07, "logits/chosen": -2.6220703125, "logits/rejected": -2.815624952316284, "logps/chosen": -477.9750061035156, "logps/rejected": -504.95001220703125, "loss": 0.0194, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.896874904632568, "rewards/margins": 10.72265625, "rewards/rejected": -15.621874809265137, "step": 5500 }, { "epoch": 2.0745481927710845, "grad_norm": 23.20829697589279, "learning_rate": 4.81457078313253e-07, "logits/chosen": -2.8257813453674316, "logits/rejected": -2.879687547683716, "logps/chosen": -451.4750061035156, "logps/rejected": -548.9500122070312, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -6.055956840515137, "rewards/margins": 10.478906631469727, "rewards/rejected": -16.526561737060547, "step": 5510 }, { "epoch": 2.0783132530120483, "grad_norm": 2.2767456370948307, "learning_rate": 4.805158132530121e-07, "logits/chosen": -2.651562452316284, "logits/rejected": -2.764453172683716, "logps/chosen": -465.25, "logps/rejected": -514.1500244140625, "loss": 0.0165, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.774609565734863, "rewards/margins": 10.583593368530273, "rewards/rejected": -16.353124618530273, "step": 5520 }, { "epoch": 2.082078313253012, "grad_norm": 8.996249791886456, "learning_rate": 4.795745481927711e-07, "logits/chosen": -2.7718749046325684, "logits/rejected": -2.993359327316284, "logps/chosen": -504.92498779296875, "logps/rejected": -496.79998779296875, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -5.9677734375, "rewards/margins": 10.236719131469727, "rewards/rejected": -16.203125, "step": 5530 }, { "epoch": 2.085843373493976, "grad_norm": 27.872751674355996, "learning_rate": 4.7863328313253e-07, "logits/chosen": -2.7535157203674316, "logits/rejected": -2.994140625, "logps/chosen": -459.625, "logps/rejected": -442.95001220703125, "loss": 0.0293, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.785595893859863, "rewards/margins": 9.443359375, "rewards/rejected": -14.229687690734863, "step": 5540 }, { "epoch": 2.0896084337349397, "grad_norm": 1.7524289596856566, "learning_rate": 4.776920180722891e-07, "logits/chosen": -2.7476563453674316, "logits/rejected": -2.9312500953674316, "logps/chosen": -453.875, "logps/rejected": -444.6499938964844, "loss": 0.0456, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.854687690734863, "rewards/margins": 9.325780868530273, "rewards/rejected": -14.174219131469727, "step": 5550 }, { "epoch": 2.0933734939759034, "grad_norm": 1.415918935689111, "learning_rate": 4.7675075301204815e-07, "logits/chosen": -2.6851563453674316, "logits/rejected": -2.8511719703674316, "logps/chosen": -483.04998779296875, "logps/rejected": -491.25, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -4.822265625, "rewards/margins": 9.879687309265137, "rewards/rejected": -14.692187309265137, "step": 5560 }, { "epoch": 2.0971385542168677, "grad_norm": 9.848011642189565, "learning_rate": 4.758094879518072e-07, "logits/chosen": -2.7232422828674316, "logits/rejected": -2.8248047828674316, "logps/chosen": -462.45001220703125, "logps/rejected": -501.2749938964844, "loss": 0.0116, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.47607421875, "rewards/margins": 10.264843940734863, "rewards/rejected": -15.75, "step": 5570 }, { "epoch": 2.1009036144578315, "grad_norm": 71.53767683285497, "learning_rate": 4.7486822289156626e-07, "logits/chosen": -2.733593702316284, "logits/rejected": -2.8648438453674316, "logps/chosen": -440.4750061035156, "logps/rejected": -471.17498779296875, "loss": 0.0148, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.8330078125, "rewards/margins": 9.756250381469727, "rewards/rejected": -14.5859375, "step": 5580 }, { "epoch": 2.1046686746987953, "grad_norm": 2.1388744516767777, "learning_rate": 4.739269578313253e-07, "logits/chosen": -2.908203125, "logits/rejected": -3.0123047828674316, "logps/chosen": -451.82501220703125, "logps/rejected": -471.875, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -5.788427829742432, "rewards/margins": 10.178125381469727, "rewards/rejected": -15.978124618530273, "step": 5590 }, { "epoch": 2.108433734939759, "grad_norm": 3.1995933504234064, "learning_rate": 4.729856927710843e-07, "logits/chosen": -2.809375047683716, "logits/rejected": -2.918750047683716, "logps/chosen": -431.7250061035156, "logps/rejected": -490.25, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -5.159863471984863, "rewards/margins": 10.37109375, "rewards/rejected": -15.53125, "step": 5600 }, { "epoch": 2.112198795180723, "grad_norm": 25.769434197233316, "learning_rate": 4.7204442771084334e-07, "logits/chosen": -2.772265672683716, "logits/rejected": -2.905078172683716, "logps/chosen": -468.13751220703125, "logps/rejected": -493.6499938964844, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -6.096386909484863, "rewards/margins": 9.708593368530273, "rewards/rejected": -15.801562309265137, "step": 5610 }, { "epoch": 2.1159638554216866, "grad_norm": 6.416906609932266, "learning_rate": 4.711031626506024e-07, "logits/chosen": -2.771484375, "logits/rejected": -3.0279297828674316, "logps/chosen": -480.6875, "logps/rejected": -469.42498779296875, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -4.965624809265137, "rewards/margins": 11.049219131469727, "rewards/rejected": -16.01953125, "step": 5620 }, { "epoch": 2.1197289156626504, "grad_norm": 38.518838405849976, "learning_rate": 4.701618975903614e-07, "logits/chosen": -2.745898485183716, "logits/rejected": -2.946484327316284, "logps/chosen": -475.20001220703125, "logps/rejected": -477.5249938964844, "loss": 0.0343, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.755663871765137, "rewards/margins": 9.832812309265137, "rewards/rejected": -15.59375, "step": 5630 }, { "epoch": 2.1234939759036147, "grad_norm": 16.42625481006789, "learning_rate": 4.6922063253012047e-07, "logits/chosen": -2.865429639816284, "logits/rejected": -2.9671874046325684, "logps/chosen": -491.6000061035156, "logps/rejected": -527.5499877929688, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -6.072070121765137, "rewards/margins": 10.115625381469727, "rewards/rejected": -16.1953125, "step": 5640 }, { "epoch": 2.1272590361445785, "grad_norm": 1.8045150105022187, "learning_rate": 4.682793674698795e-07, "logits/chosen": -2.7730469703674316, "logits/rejected": -2.919921875, "logps/chosen": -502.25, "logps/rejected": -536.7999877929688, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -5.706250190734863, "rewards/margins": 10.708593368530273, "rewards/rejected": -16.412500381469727, "step": 5650 }, { "epoch": 2.1310240963855422, "grad_norm": 18.00929380144962, "learning_rate": 4.673381024096386e-07, "logits/chosen": -2.7802734375, "logits/rejected": -2.91796875, "logps/chosen": -459.75, "logps/rejected": -477.3500061035156, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -5.1474609375, "rewards/margins": 10.264843940734863, "rewards/rejected": -15.413281440734863, "step": 5660 }, { "epoch": 2.134789156626506, "grad_norm": 8.449947388047551, "learning_rate": 4.6639683734939755e-07, "logits/chosen": -2.8003907203674316, "logits/rejected": -2.954296827316284, "logps/chosen": -449.8999938964844, "logps/rejected": -521.125, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -4.902929782867432, "rewards/margins": 10.379687309265137, "rewards/rejected": -15.2890625, "step": 5670 }, { "epoch": 2.13855421686747, "grad_norm": 58.0528082236089, "learning_rate": 4.6545557228915663e-07, "logits/chosen": -2.835156202316284, "logits/rejected": -3.0132813453674316, "logps/chosen": -520.8499755859375, "logps/rejected": -549.9500122070312, "loss": 0.0168, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.2840576171875, "rewards/margins": 10.413281440734863, "rewards/rejected": -15.685937881469727, "step": 5680 }, { "epoch": 2.1423192771084336, "grad_norm": 5.12085450546133, "learning_rate": 4.6451430722891566e-07, "logits/chosen": -2.7601561546325684, "logits/rejected": -2.97265625, "logps/chosen": -457.3500061035156, "logps/rejected": -476.8500061035156, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -4.518359184265137, "rewards/margins": 9.990625381469727, "rewards/rejected": -14.503125190734863, "step": 5690 }, { "epoch": 2.1460843373493974, "grad_norm": 4.1037803927218155, "learning_rate": 4.6357304216867463e-07, "logits/chosen": -2.8548827171325684, "logits/rejected": -2.8929686546325684, "logps/chosen": -491.7749938964844, "logps/rejected": -504.25, "loss": 0.016, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.845898628234863, "rewards/margins": 10.411718368530273, "rewards/rejected": -16.248437881469727, "step": 5700 }, { "epoch": 2.1498493975903616, "grad_norm": 1.6306235705398122, "learning_rate": 4.626317771084337e-07, "logits/chosen": -2.7623047828674316, "logits/rejected": -2.905078172683716, "logps/chosen": -473.42498779296875, "logps/rejected": -512.6500244140625, "loss": 0.0121, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.323828220367432, "rewards/margins": 10.086718559265137, "rewards/rejected": -16.409374237060547, "step": 5710 }, { "epoch": 2.1536144578313254, "grad_norm": 11.74513259717657, "learning_rate": 4.6169051204819274e-07, "logits/chosen": -2.6494140625, "logits/rejected": -2.876171827316284, "logps/chosen": -479.3999938964844, "logps/rejected": -509.6499938964844, "loss": 0.017, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.852343559265137, "rewards/margins": 10.339062690734863, "rewards/rejected": -15.192968368530273, "step": 5720 }, { "epoch": 2.1573795180722892, "grad_norm": 2.002677502258854, "learning_rate": 4.607492469879518e-07, "logits/chosen": -2.806640625, "logits/rejected": -3.0234375, "logps/chosen": -500.70001220703125, "logps/rejected": -502.5, "loss": 0.015, "rewards/accuracies": 1.0, "rewards/chosen": -4.983593940734863, "rewards/margins": 10.232812881469727, "rewards/rejected": -15.223437309265137, "step": 5730 }, { "epoch": 2.161144578313253, "grad_norm": 1.1311190759568497, "learning_rate": 4.598079819277108e-07, "logits/chosen": -2.8042969703674316, "logits/rejected": -3.0179686546325684, "logps/chosen": -451.0, "logps/rejected": -483.2250061035156, "loss": 0.0133, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.822656154632568, "rewards/margins": 10.805468559265137, "rewards/rejected": -15.619531631469727, "step": 5740 }, { "epoch": 2.164909638554217, "grad_norm": 3.6898930723306265, "learning_rate": 4.5886671686746987e-07, "logits/chosen": -2.7220702171325684, "logits/rejected": -2.829296827316284, "logps/chosen": -434.3999938964844, "logps/rejected": -504.3500061035156, "loss": 0.0186, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.173144340515137, "rewards/margins": 10.975000381469727, "rewards/rejected": -17.157812118530273, "step": 5750 }, { "epoch": 2.1686746987951806, "grad_norm": 0.6426512511372586, "learning_rate": 4.579254518072289e-07, "logits/chosen": -2.8472657203674316, "logits/rejected": -2.986328125, "logps/chosen": -506.95001220703125, "logps/rejected": -518.75, "loss": 0.0109, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.650000095367432, "rewards/margins": 10.409375190734863, "rewards/rejected": -17.0625, "step": 5760 }, { "epoch": 2.1724397590361444, "grad_norm": 6.615026380075798, "learning_rate": 4.569841867469879e-07, "logits/chosen": -2.840039014816284, "logits/rejected": -2.9384765625, "logps/chosen": -503.7250061035156, "logps/rejected": -528.7999877929688, "loss": 0.0192, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.675000190734863, "rewards/margins": 10.6875, "rewards/rejected": -17.365625381469727, "step": 5770 }, { "epoch": 2.1762048192771086, "grad_norm": 51.3413755071572, "learning_rate": 4.5604292168674695e-07, "logits/chosen": -2.8695311546325684, "logits/rejected": -2.917187452316284, "logps/chosen": -442.0, "logps/rejected": -506.3999938964844, "loss": 0.013, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.964648246765137, "rewards/margins": 10.670312881469727, "rewards/rejected": -15.641406059265137, "step": 5780 }, { "epoch": 2.1799698795180724, "grad_norm": 4.4919823637532135, "learning_rate": 4.5510165662650603e-07, "logits/chosen": -2.900390625, "logits/rejected": -2.9867186546325684, "logps/chosen": -444.04998779296875, "logps/rejected": -474.1000061035156, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -5.253710746765137, "rewards/margins": 10.56640625, "rewards/rejected": -15.821874618530273, "step": 5790 }, { "epoch": 2.183734939759036, "grad_norm": 1.1894640737634419, "learning_rate": 4.5416039156626506e-07, "logits/chosen": -2.7671875953674316, "logits/rejected": -2.887500047683716, "logps/chosen": -530.9000244140625, "logps/rejected": -576.3499755859375, "loss": 0.0121, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.943554878234863, "rewards/margins": 11.721875190734863, "rewards/rejected": -17.674999237060547, "step": 5800 }, { "epoch": 2.1875, "grad_norm": 7.725677173780749, "learning_rate": 4.532191265060241e-07, "logits/chosen": -2.7705078125, "logits/rejected": -2.9273438453674316, "logps/chosen": -514.5250244140625, "logps/rejected": -527.5499877929688, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -6.114843845367432, "rewards/margins": 10.72265625, "rewards/rejected": -16.827342987060547, "step": 5810 }, { "epoch": 2.191265060240964, "grad_norm": 12.043936327351245, "learning_rate": 4.522778614457831e-07, "logits/chosen": -2.8121094703674316, "logits/rejected": -2.967578172683716, "logps/chosen": -470.32501220703125, "logps/rejected": -495.45001220703125, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -5.720703125, "rewards/margins": 11.1953125, "rewards/rejected": -16.921875, "step": 5820 }, { "epoch": 2.1950301204819276, "grad_norm": 1.169349099759805, "learning_rate": 4.513365963855422e-07, "logits/chosen": -2.835156202316284, "logits/rejected": -3.0078125, "logps/chosen": -509.375, "logps/rejected": -526.25, "loss": 0.0185, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.527734279632568, "rewards/margins": 11.306249618530273, "rewards/rejected": -16.826562881469727, "step": 5830 }, { "epoch": 2.1987951807228914, "grad_norm": 17.06784695552393, "learning_rate": 4.5039533132530116e-07, "logits/chosen": -2.973437547683716, "logits/rejected": -3.146484375, "logps/chosen": -452.9750061035156, "logps/rejected": -481.125, "loss": 0.0167, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.052343845367432, "rewards/margins": 10.734375, "rewards/rejected": -16.785938262939453, "step": 5840 }, { "epoch": 2.2025602409638556, "grad_norm": 2.8427766571850985, "learning_rate": 4.494540662650602e-07, "logits/chosen": -2.8033204078674316, "logits/rejected": -2.9312500953674316, "logps/chosen": -476.04998779296875, "logps/rejected": -529.2999877929688, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -7.049609184265137, "rewards/margins": 10.33984375, "rewards/rejected": -17.376562118530273, "step": 5850 }, { "epoch": 2.2063253012048194, "grad_norm": 16.3596491608203, "learning_rate": 4.4851280120481927e-07, "logits/chosen": -2.8121094703674316, "logits/rejected": -2.927734375, "logps/chosen": -484.5, "logps/rejected": -510.4750061035156, "loss": 0.0243, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.465234279632568, "rewards/margins": 10.409375190734863, "rewards/rejected": -16.8671875, "step": 5860 }, { "epoch": 2.210090361445783, "grad_norm": 3.9141502201051326, "learning_rate": 4.4757153614457824e-07, "logits/chosen": -2.88671875, "logits/rejected": -3.0472655296325684, "logps/chosen": -460.3999938964844, "logps/rejected": -485.29998779296875, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -5.742578029632568, "rewards/margins": 9.642969131469727, "rewards/rejected": -15.385937690734863, "step": 5870 }, { "epoch": 2.213855421686747, "grad_norm": 4.297250504487651, "learning_rate": 4.466302710843373e-07, "logits/chosen": -2.919140577316284, "logits/rejected": -2.9957032203674316, "logps/chosen": -466.125, "logps/rejected": -509.2749938964844, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -5.646679878234863, "rewards/margins": 10.30859375, "rewards/rejected": -15.940625190734863, "step": 5880 }, { "epoch": 2.2176204819277108, "grad_norm": 7.331358073703476, "learning_rate": 4.4568900602409635e-07, "logits/chosen": -2.7583985328674316, "logits/rejected": -3.022656202316284, "logps/chosen": -515.9500122070312, "logps/rejected": -495.8500061035156, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -6.111132621765137, "rewards/margins": 11.260156631469727, "rewards/rejected": -17.379688262939453, "step": 5890 }, { "epoch": 2.2213855421686746, "grad_norm": 10.793446741243457, "learning_rate": 4.4474774096385543e-07, "logits/chosen": -2.9058594703674316, "logits/rejected": -3.1156249046325684, "logps/chosen": -475.625, "logps/rejected": -507.75, "loss": 0.0163, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.890722751617432, "rewards/margins": 10.684374809265137, "rewards/rejected": -17.590625762939453, "step": 5900 }, { "epoch": 2.2251506024096384, "grad_norm": 52.57593986250344, "learning_rate": 4.438064759036144e-07, "logits/chosen": -2.939648389816284, "logits/rejected": -3.089062452316284, "logps/chosen": -494.04998779296875, "logps/rejected": -506.3999938964844, "loss": 0.0153, "rewards/accuracies": 1.0, "rewards/chosen": -6.248266696929932, "rewards/margins": 10.173437118530273, "rewards/rejected": -16.4296875, "step": 5910 }, { "epoch": 2.2289156626506026, "grad_norm": 47.903009338218325, "learning_rate": 4.428652108433735e-07, "logits/chosen": -2.835156202316284, "logits/rejected": -2.9867186546325684, "logps/chosen": -490.45001220703125, "logps/rejected": -505.0249938964844, "loss": 0.0352, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.811767578125, "rewards/margins": 10.307812690734863, "rewards/rejected": -16.123437881469727, "step": 5920 }, { "epoch": 2.2326807228915664, "grad_norm": 0.31714453263687487, "learning_rate": 4.419239457831325e-07, "logits/chosen": -2.83837890625, "logits/rejected": -2.9478516578674316, "logps/chosen": -468.1000061035156, "logps/rejected": -517.7750244140625, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -5.467968940734863, "rewards/margins": 10.763280868530273, "rewards/rejected": -16.221874237060547, "step": 5930 }, { "epoch": 2.23644578313253, "grad_norm": 0.46462861499548413, "learning_rate": 4.4098268072289154e-07, "logits/chosen": -2.8060545921325684, "logits/rejected": -2.997265577316284, "logps/chosen": -503.2250061035156, "logps/rejected": -484.25, "loss": 0.0191, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.483593940734863, "rewards/margins": 10.403124809265137, "rewards/rejected": -15.885937690734863, "step": 5940 }, { "epoch": 2.240210843373494, "grad_norm": 1.3218420102079314, "learning_rate": 4.4004141566265056e-07, "logits/chosen": -2.791210889816284, "logits/rejected": -2.9535155296325684, "logps/chosen": -500.1000061035156, "logps/rejected": -522.4500122070312, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -6.464453220367432, "rewards/margins": 10.536718368530273, "rewards/rejected": -17.004688262939453, "step": 5950 }, { "epoch": 2.2439759036144578, "grad_norm": 3.9312610616161305, "learning_rate": 4.3910015060240964e-07, "logits/chosen": -2.907031297683716, "logits/rejected": -3.060546875, "logps/chosen": -489.7749938964844, "logps/rejected": -500.5, "loss": 0.018, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.955078125, "rewards/margins": 10.57421875, "rewards/rejected": -17.528125762939453, "step": 5960 }, { "epoch": 2.2477409638554215, "grad_norm": 2.6964668961429656, "learning_rate": 4.3815888554216867e-07, "logits/chosen": -2.924609422683716, "logits/rejected": -3.0052733421325684, "logps/chosen": -440.5, "logps/rejected": -494.07501220703125, "loss": 0.0146, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.678124904632568, "rewards/margins": 11.293749809265137, "rewards/rejected": -16.967967987060547, "step": 5970 }, { "epoch": 2.2515060240963853, "grad_norm": 13.186028972532885, "learning_rate": 4.372176204819277e-07, "logits/chosen": -2.7548828125, "logits/rejected": -2.906445264816284, "logps/chosen": -490.2124938964844, "logps/rejected": -510.8500061035156, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -5.587500095367432, "rewards/margins": 10.524999618530273, "rewards/rejected": -16.106250762939453, "step": 5980 }, { "epoch": 2.2552710843373496, "grad_norm": 0.7295420822210599, "learning_rate": 4.362763554216867e-07, "logits/chosen": -2.885546922683716, "logits/rejected": -3.0464844703674316, "logps/chosen": -480.17498779296875, "logps/rejected": -515.8499755859375, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -5.449609279632568, "rewards/margins": 10.215624809265137, "rewards/rejected": -15.6640625, "step": 5990 }, { "epoch": 2.2590361445783134, "grad_norm": 2.587569681008467, "learning_rate": 4.353350903614458e-07, "logits/chosen": -2.765429735183716, "logits/rejected": -2.896679639816284, "logps/chosen": -469.6000061035156, "logps/rejected": -504.20001220703125, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -6.247265815734863, "rewards/margins": 10.094531059265137, "rewards/rejected": -16.332813262939453, "step": 6000 }, { "epoch": 2.262801204819277, "grad_norm": 4.976484040790034, "learning_rate": 4.343938253012048e-07, "logits/chosen": -2.702343702316284, "logits/rejected": -2.8408203125, "logps/chosen": -453.3500061035156, "logps/rejected": -528.9500122070312, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -5.543359279632568, "rewards/margins": 10.862500190734863, "rewards/rejected": -16.401561737060547, "step": 6010 }, { "epoch": 2.266566265060241, "grad_norm": 19.526670306286153, "learning_rate": 4.334525602409638e-07, "logits/chosen": -2.8857421875, "logits/rejected": -3.0083985328674316, "logps/chosen": -436.3999938964844, "logps/rejected": -497.3999938964844, "loss": 0.0285, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.041211128234863, "rewards/margins": 11.06640625, "rewards/rejected": -17.092187881469727, "step": 6020 }, { "epoch": 2.2703313253012047, "grad_norm": 1.318347639105201, "learning_rate": 4.325112951807229e-07, "logits/chosen": -2.9417967796325684, "logits/rejected": -3.081249952316284, "logps/chosen": -445.42498779296875, "logps/rejected": -492.70001220703125, "loss": 0.0115, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.069140434265137, "rewards/margins": 10.57421875, "rewards/rejected": -17.645313262939453, "step": 6030 }, { "epoch": 2.2740963855421685, "grad_norm": 2.4740119518386168, "learning_rate": 4.315700301204819e-07, "logits/chosen": -2.786914110183716, "logits/rejected": -3.0101561546325684, "logps/chosen": -458.57501220703125, "logps/rejected": -509.25, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -5.80078125, "rewards/margins": 10.65234375, "rewards/rejected": -16.448436737060547, "step": 6040 }, { "epoch": 2.2778614457831328, "grad_norm": 0.7526272986487645, "learning_rate": 4.3062876506024094e-07, "logits/chosen": -2.839648485183716, "logits/rejected": -3.006054639816284, "logps/chosen": -480.9375, "logps/rejected": -518.0, "loss": 0.0109, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.142773628234863, "rewards/margins": 11.142187118530273, "rewards/rejected": -17.284374237060547, "step": 6050 }, { "epoch": 2.2816265060240966, "grad_norm": 4.876454282402792, "learning_rate": 4.2968749999999996e-07, "logits/chosen": -2.836718797683716, "logits/rejected": -2.880859375, "logps/chosen": -448.9624938964844, "logps/rejected": -523.4000244140625, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -5.93359375, "rewards/margins": 11.442968368530273, "rewards/rejected": -17.375, "step": 6060 }, { "epoch": 2.2853915662650603, "grad_norm": 1.491503824434827, "learning_rate": 4.2874623493975904e-07, "logits/chosen": -2.8626952171325684, "logits/rejected": -3.025585889816284, "logps/chosen": -482.70001220703125, "logps/rejected": -512.4500122070312, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -6.0908203125, "rewards/margins": 11.802343368530273, "rewards/rejected": -17.873437881469727, "step": 6070 }, { "epoch": 2.289156626506024, "grad_norm": 8.051542622856662, "learning_rate": 4.27804969879518e-07, "logits/chosen": -2.7662110328674316, "logits/rejected": -2.888671875, "logps/chosen": -455.75, "logps/rejected": -504.25, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -6.009765625, "rewards/margins": 10.717187881469727, "rewards/rejected": -16.728124618530273, "step": 6080 }, { "epoch": 2.292921686746988, "grad_norm": 3.207207896591977, "learning_rate": 4.268637048192771e-07, "logits/chosen": -2.8521485328674316, "logits/rejected": -2.993359327316284, "logps/chosen": -453.67498779296875, "logps/rejected": -503.3999938964844, "loss": 0.0276, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -6.451952934265137, "rewards/margins": 11.334375381469727, "rewards/rejected": -17.787500381469727, "step": 6090 }, { "epoch": 2.2966867469879517, "grad_norm": 100.44382367843995, "learning_rate": 4.259224397590361e-07, "logits/chosen": -2.9300780296325684, "logits/rejected": -3.09375, "logps/chosen": -446.6000061035156, "logps/rejected": -491.7250061035156, "loss": 0.0159, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.199999809265137, "rewards/margins": 10.309374809265137, "rewards/rejected": -17.504688262939453, "step": 6100 }, { "epoch": 2.3004518072289155, "grad_norm": 22.80337421799205, "learning_rate": 4.249811746987952e-07, "logits/chosen": -2.8355469703674316, "logits/rejected": -3.009765625, "logps/chosen": -485.5249938964844, "logps/rejected": -527.4500122070312, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -5.06494140625, "rewards/margins": 11.000781059265137, "rewards/rejected": -16.060937881469727, "step": 6110 }, { "epoch": 2.3042168674698793, "grad_norm": 1.8048126279705725, "learning_rate": 4.240399096385542e-07, "logits/chosen": -2.7275390625, "logits/rejected": -2.860546827316284, "logps/chosen": -554.3875122070312, "logps/rejected": -549.2000122070312, "loss": 0.0149, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.67919921875, "rewards/margins": 11.13671875, "rewards/rejected": -16.8046875, "step": 6120 }, { "epoch": 2.3079819277108435, "grad_norm": 1.333133090745289, "learning_rate": 4.2309864457831325e-07, "logits/chosen": -2.7958984375, "logits/rejected": -3.0234375, "logps/chosen": -478.875, "logps/rejected": -505.45001220703125, "loss": 0.0167, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.372509956359863, "rewards/margins": 10.925000190734863, "rewards/rejected": -16.307811737060547, "step": 6130 }, { "epoch": 2.3117469879518073, "grad_norm": 2.4807549139137572, "learning_rate": 4.221573795180723e-07, "logits/chosen": -2.9175782203674316, "logits/rejected": -3.055859327316284, "logps/chosen": -480.07501220703125, "logps/rejected": -527.7999877929688, "loss": 0.0146, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.167773246765137, "rewards/margins": 10.997655868530273, "rewards/rejected": -17.170312881469727, "step": 6140 }, { "epoch": 2.315512048192771, "grad_norm": 7.6692112897363245, "learning_rate": 4.212161144578313e-07, "logits/chosen": -2.841601610183716, "logits/rejected": -2.9794921875, "logps/chosen": -477.4750061035156, "logps/rejected": -501.79998779296875, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -6.201171875, "rewards/margins": 11.485937118530273, "rewards/rejected": -17.6796875, "step": 6150 }, { "epoch": 2.319277108433735, "grad_norm": 1.0079206025919496, "learning_rate": 4.2027484939759033e-07, "logits/chosen": -2.9291014671325684, "logits/rejected": -3.107421875, "logps/chosen": -461.875, "logps/rejected": -503.54998779296875, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": -7.156640529632568, "rewards/margins": 10.764843940734863, "rewards/rejected": -17.924999237060547, "step": 6160 }, { "epoch": 2.3230421686746987, "grad_norm": 1.6468789362366443, "learning_rate": 4.193335843373494e-07, "logits/chosen": -2.815234422683716, "logits/rejected": -2.8980469703674316, "logps/chosen": -474.7749938964844, "logps/rejected": -522.2000122070312, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -6.529492378234863, "rewards/margins": 11.021093368530273, "rewards/rejected": -17.553125381469727, "step": 6170 }, { "epoch": 2.3268072289156625, "grad_norm": 0.8059182238493301, "learning_rate": 4.1839231927710844e-07, "logits/chosen": -2.8648438453674316, "logits/rejected": -2.97265625, "logps/chosen": -446.45001220703125, "logps/rejected": -539.75, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -6.219140529632568, "rewards/margins": 11.805468559265137, "rewards/rejected": -18.028125762939453, "step": 6180 }, { "epoch": 2.3305722891566267, "grad_norm": 2.5753564680822114, "learning_rate": 4.174510542168674e-07, "logits/chosen": -2.7386717796325684, "logits/rejected": -2.852343797683716, "logps/chosen": -522.5999755859375, "logps/rejected": -538.5499877929688, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -5.805468559265137, "rewards/margins": 11.182031631469727, "rewards/rejected": -16.993749618530273, "step": 6190 }, { "epoch": 2.3343373493975905, "grad_norm": 23.36925213892038, "learning_rate": 4.165097891566265e-07, "logits/chosen": -2.7896485328674316, "logits/rejected": -3.049999952316284, "logps/chosen": -537.625, "logps/rejected": -519.5750122070312, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -4.890332221984863, "rewards/margins": 10.609375, "rewards/rejected": -15.490625381469727, "step": 6200 }, { "epoch": 2.3381024096385543, "grad_norm": 1.263402697661097, "learning_rate": 4.155685240963855e-07, "logits/chosen": -2.892578125, "logits/rejected": -3.0980467796325684, "logps/chosen": -487.7250061035156, "logps/rejected": -498.04998779296875, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -5.627734184265137, "rewards/margins": 11.259374618530273, "rewards/rejected": -16.887500762939453, "step": 6210 }, { "epoch": 2.341867469879518, "grad_norm": 4.758854848396466, "learning_rate": 4.1462725903614455e-07, "logits/chosen": -2.807421922683716, "logits/rejected": -3.051562547683716, "logps/chosen": -481.5249938964844, "logps/rejected": -498.32501220703125, "loss": 0.0212, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.603125095367432, "rewards/margins": 10.173437118530273, "rewards/rejected": -16.779687881469727, "step": 6220 }, { "epoch": 2.345632530120482, "grad_norm": 0.43188966185560446, "learning_rate": 4.136859939759036e-07, "logits/chosen": -2.862109422683716, "logits/rejected": -2.992968797683716, "logps/chosen": -441.79998779296875, "logps/rejected": -512.0999755859375, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -6.12109375, "rewards/margins": 11.743749618530273, "rewards/rejected": -17.862499237060547, "step": 6230 }, { "epoch": 2.3493975903614457, "grad_norm": 27.19657062813019, "learning_rate": 4.1274472891566265e-07, "logits/chosen": -2.9097657203674316, "logits/rejected": -3.1128907203674316, "logps/chosen": -468.8999938964844, "logps/rejected": -512.4749755859375, "loss": 0.025, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.275000095367432, "rewards/margins": 10.610937118530273, "rewards/rejected": -16.892187118530273, "step": 6240 }, { "epoch": 2.3531626506024095, "grad_norm": 0.6857879360468445, "learning_rate": 4.118034638554217e-07, "logits/chosen": -2.8773436546325684, "logits/rejected": -3.0121092796325684, "logps/chosen": -450.2250061035156, "logps/rejected": -472.79998779296875, "loss": 0.0161, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.699804782867432, "rewards/margins": 10.146875381469727, "rewards/rejected": -15.853124618530273, "step": 6250 }, { "epoch": 2.3569277108433733, "grad_norm": 78.71744665484313, "learning_rate": 4.108621987951807e-07, "logits/chosen": -2.895312547683716, "logits/rejected": -3.0660157203674316, "logps/chosen": -475.6499938964844, "logps/rejected": -557.9500122070312, "loss": 0.0087, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.372265815734863, "rewards/margins": 11.530468940734863, "rewards/rejected": -17.892187118530273, "step": 6260 }, { "epoch": 2.3606927710843375, "grad_norm": 6.07428859959085, "learning_rate": 4.0992093373493973e-07, "logits/chosen": -2.89453125, "logits/rejected": -2.8998045921325684, "logps/chosen": -492.625, "logps/rejected": -541.125, "loss": 0.0394, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -7.861328125, "rewards/margins": 10.473437309265137, "rewards/rejected": -18.339061737060547, "step": 6270 }, { "epoch": 2.3644578313253013, "grad_norm": 46.27879358562744, "learning_rate": 4.089796686746988e-07, "logits/chosen": -2.7705078125, "logits/rejected": -3.0113282203674316, "logps/chosen": -461.29998779296875, "logps/rejected": -529.5999755859375, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -5.912890434265137, "rewards/margins": 11.458593368530273, "rewards/rejected": -17.370311737060547, "step": 6280 }, { "epoch": 2.368222891566265, "grad_norm": 4.438988359607491, "learning_rate": 4.080384036144578e-07, "logits/chosen": -2.8033204078674316, "logits/rejected": -3.005859375, "logps/chosen": -429.4750061035156, "logps/rejected": -482.95001220703125, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": -6.526171684265137, "rewards/margins": 10.574999809265137, "rewards/rejected": -17.112499237060547, "step": 6290 }, { "epoch": 2.371987951807229, "grad_norm": 4.3890678421925085, "learning_rate": 4.0709713855421687e-07, "logits/chosen": -3.021289110183716, "logits/rejected": -3.1148438453674316, "logps/chosen": -449.8125, "logps/rejected": -479.7250061035156, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -5.825390815734863, "rewards/margins": 10.67578125, "rewards/rejected": -16.501562118530273, "step": 6300 }, { "epoch": 2.3757530120481927, "grad_norm": 0.9019763478038866, "learning_rate": 4.061558734939759e-07, "logits/chosen": -3.016796827316284, "logits/rejected": -3.178515672683716, "logps/chosen": -499.625, "logps/rejected": -544.9500122070312, "loss": 0.0224, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.479687690734863, "rewards/margins": 11.064062118530273, "rewards/rejected": -18.542186737060547, "step": 6310 }, { "epoch": 2.3795180722891565, "grad_norm": 8.620947746280711, "learning_rate": 4.052146084337349e-07, "logits/chosen": -2.916015625, "logits/rejected": -3.180468797683716, "logps/chosen": -486.20001220703125, "logps/rejected": -525.3499755859375, "loss": 0.0171, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.887890815734863, "rewards/margins": 11.707812309265137, "rewards/rejected": -18.589061737060547, "step": 6320 }, { "epoch": 2.3832831325301207, "grad_norm": 3.336204209734776, "learning_rate": 4.0427334337349395e-07, "logits/chosen": -3.141796827316284, "logits/rejected": -3.231250047683716, "logps/chosen": -457.0, "logps/rejected": -491.0, "loss": 0.0172, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -8.410547256469727, "rewards/margins": 10.123437881469727, "rewards/rejected": -18.546875, "step": 6330 }, { "epoch": 2.3870481927710845, "grad_norm": 13.269779055051371, "learning_rate": 4.0333207831325297e-07, "logits/chosen": -3.0201172828674316, "logits/rejected": -3.215625047683716, "logps/chosen": -462.82501220703125, "logps/rejected": -524.0999755859375, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": -6.488671779632568, "rewards/margins": 11.234375, "rewards/rejected": -17.731250762939453, "step": 6340 }, { "epoch": 2.3908132530120483, "grad_norm": 3.4304202033179503, "learning_rate": 4.0239081325301205e-07, "logits/chosen": -3.076953172683716, "logits/rejected": -3.219531297683716, "logps/chosen": -424.7250061035156, "logps/rejected": -493.6499938964844, "loss": 0.0261, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.013085842132568, "rewards/margins": 11.763280868530273, "rewards/rejected": -17.784374237060547, "step": 6350 }, { "epoch": 2.394578313253012, "grad_norm": 2.537067987063551, "learning_rate": 4.0144954819277103e-07, "logits/chosen": -2.910351514816284, "logits/rejected": -3.0425782203674316, "logps/chosen": -422.5249938964844, "logps/rejected": -508.29998779296875, "loss": 0.0421, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.752539157867432, "rewards/margins": 10.793749809265137, "rewards/rejected": -16.540624618530273, "step": 6360 }, { "epoch": 2.398343373493976, "grad_norm": 6.211315158903426, "learning_rate": 4.005082831325301e-07, "logits/chosen": -3.0035157203674316, "logits/rejected": -3.1812500953674316, "logps/chosen": -447.20001220703125, "logps/rejected": -498.75, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -5.077685356140137, "rewards/margins": 11.169530868530273, "rewards/rejected": -16.237499237060547, "step": 6370 }, { "epoch": 2.4021084337349397, "grad_norm": 10.541989289421972, "learning_rate": 3.9956701807228913e-07, "logits/chosen": -2.9906249046325684, "logits/rejected": -3.098437547683716, "logps/chosen": -471.92498779296875, "logps/rejected": -508.0, "loss": 0.0122, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.87060546875, "rewards/margins": 11.484375, "rewards/rejected": -17.353124618530273, "step": 6380 }, { "epoch": 2.4058734939759034, "grad_norm": 1.0521169169818405, "learning_rate": 3.9862575301204816e-07, "logits/chosen": -2.893749952316284, "logits/rejected": -2.989453077316284, "logps/chosen": -496.20001220703125, "logps/rejected": -519.2000122070312, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -6.224413871765137, "rewards/margins": 10.827343940734863, "rewards/rejected": -17.057811737060547, "step": 6390 }, { "epoch": 2.4096385542168672, "grad_norm": 6.695232502450278, "learning_rate": 3.976844879518072e-07, "logits/chosen": -2.9175782203674316, "logits/rejected": -3.091015577316284, "logps/chosen": -511.0, "logps/rejected": -505.0, "loss": 0.0319, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.838574409484863, "rewards/margins": 10.14453125, "rewards/rejected": -15.973437309265137, "step": 6400 }, { "epoch": 2.4134036144578315, "grad_norm": 3.5392073117183687, "learning_rate": 3.9674322289156627e-07, "logits/chosen": -2.789843797683716, "logits/rejected": -2.9945311546325684, "logps/chosen": -590.4749755859375, "logps/rejected": -565.0999755859375, "loss": 0.0093, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.319140434265137, "rewards/margins": 11.1875, "rewards/rejected": -17.515625, "step": 6410 }, { "epoch": 2.4171686746987953, "grad_norm": 95.53068497807615, "learning_rate": 3.958019578313253e-07, "logits/chosen": -3.171875, "logits/rejected": -3.2437500953674316, "logps/chosen": -458.875, "logps/rejected": -534.3499755859375, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -6.9755859375, "rewards/margins": 10.866406440734863, "rewards/rejected": -17.84375, "step": 6420 }, { "epoch": 2.420933734939759, "grad_norm": 10.107358210481133, "learning_rate": 3.948606927710843e-07, "logits/chosen": -2.898632764816284, "logits/rejected": -2.9722657203674316, "logps/chosen": -483.79998779296875, "logps/rejected": -527.625, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": -6.787499904632568, "rewards/margins": 11.302343368530273, "rewards/rejected": -18.075000762939453, "step": 6430 }, { "epoch": 2.424698795180723, "grad_norm": 24.35672306590192, "learning_rate": 3.9391942771084335e-07, "logits/chosen": -2.87890625, "logits/rejected": -3.100781202316284, "logps/chosen": -492.79998779296875, "logps/rejected": -509.875, "loss": 0.0294, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.153515815734863, "rewards/margins": 11.193750381469727, "rewards/rejected": -18.359375, "step": 6440 }, { "epoch": 2.4284638554216866, "grad_norm": 2.683619818408214, "learning_rate": 3.929781626506024e-07, "logits/chosen": -2.8896484375, "logits/rejected": -3.084765672683716, "logps/chosen": -479.625, "logps/rejected": -515.9000244140625, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -6.810156345367432, "rewards/margins": 11.175000190734863, "rewards/rejected": -17.978124618530273, "step": 6450 }, { "epoch": 2.4322289156626504, "grad_norm": 12.549823040658282, "learning_rate": 3.920368975903614e-07, "logits/chosen": -2.9378905296325684, "logits/rejected": -3.048828125, "logps/chosen": -462.6499938964844, "logps/rejected": -528.0999755859375, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -6.2109375, "rewards/margins": 11.033594131469727, "rewards/rejected": -17.2421875, "step": 6460 }, { "epoch": 2.4359939759036147, "grad_norm": 13.886567736018277, "learning_rate": 3.910956325301205e-07, "logits/chosen": -2.9371094703674316, "logits/rejected": -3.03125, "logps/chosen": -436.07501220703125, "logps/rejected": -477.3999938964844, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -5.399804592132568, "rewards/margins": 11.209375381469727, "rewards/rejected": -16.610937118530273, "step": 6470 }, { "epoch": 2.4397590361445785, "grad_norm": 1.1667084047368879, "learning_rate": 3.901543674698795e-07, "logits/chosen": -3.077343702316284, "logits/rejected": -3.1617188453674316, "logps/chosen": -441.95001220703125, "logps/rejected": -505.25, "loss": 0.0181, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.757031440734863, "rewards/margins": 10.932031631469727, "rewards/rejected": -16.681249618530273, "step": 6480 }, { "epoch": 2.4435240963855422, "grad_norm": 5.766767166156217, "learning_rate": 3.892131024096386e-07, "logits/chosen": -2.892578125, "logits/rejected": -3.040234327316284, "logps/chosen": -456.9125061035156, "logps/rejected": -514.1500244140625, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -6.565625190734863, "rewards/margins": 10.992968559265137, "rewards/rejected": -17.568750381469727, "step": 6490 }, { "epoch": 2.447289156626506, "grad_norm": 1.327616106078676, "learning_rate": 3.8827183734939756e-07, "logits/chosen": -2.942578077316284, "logits/rejected": -3.1011719703674316, "logps/chosen": -523.3499755859375, "logps/rejected": -542.9500122070312, "loss": 0.021, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.581640720367432, "rewards/margins": 11.771093368530273, "rewards/rejected": -18.356250762939453, "step": 6500 }, { "epoch": 2.45105421686747, "grad_norm": 1.3603081959164443, "learning_rate": 3.873305722891566e-07, "logits/chosen": -3.0335936546325684, "logits/rejected": -3.221484422683716, "logps/chosen": -435.2749938964844, "logps/rejected": -480.42498779296875, "loss": 0.0095, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.786035060882568, "rewards/margins": 11.542187690734863, "rewards/rejected": -17.333593368530273, "step": 6510 }, { "epoch": 2.4548192771084336, "grad_norm": 114.802677490962, "learning_rate": 3.8638930722891567e-07, "logits/chosen": -2.8814454078674316, "logits/rejected": -3.063281297683716, "logps/chosen": -460.57501220703125, "logps/rejected": -511.29998779296875, "loss": 0.0578, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.26708984375, "rewards/margins": 10.981249809265137, "rewards/rejected": -17.248437881469727, "step": 6520 }, { "epoch": 2.4585843373493974, "grad_norm": 5.374026767965433, "learning_rate": 3.8544804216867464e-07, "logits/chosen": -2.9990234375, "logits/rejected": -3.1859374046325684, "logps/chosen": -481.82501220703125, "logps/rejected": -490.6499938964844, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -5.858788967132568, "rewards/margins": 11.40625, "rewards/rejected": -17.2734375, "step": 6530 }, { "epoch": 2.462349397590361, "grad_norm": 89.22074758012847, "learning_rate": 3.845067771084337e-07, "logits/chosen": -2.9722657203674316, "logits/rejected": -3.1748046875, "logps/chosen": -478.25, "logps/rejected": -526.25, "loss": 0.0415, "rewards/accuracies": 0.96875, "rewards/chosen": -6.055078029632568, "rewards/margins": 11.38671875, "rewards/rejected": -17.448436737060547, "step": 6540 }, { "epoch": 2.4661144578313254, "grad_norm": 1.4757767893089697, "learning_rate": 3.8356551204819275e-07, "logits/chosen": -2.890625, "logits/rejected": -3.0843749046325684, "logps/chosen": -504.73748779296875, "logps/rejected": -502.6000061035156, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -5.465234279632568, "rewards/margins": 11.904687881469727, "rewards/rejected": -17.381250381469727, "step": 6550 }, { "epoch": 2.4698795180722892, "grad_norm": 1.2932252697709141, "learning_rate": 3.826242469879518e-07, "logits/chosen": -2.8929686546325684, "logits/rejected": -3.05859375, "logps/chosen": -505.04998779296875, "logps/rejected": -545.25, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -7.063281059265137, "rewards/margins": 11.564062118530273, "rewards/rejected": -18.625, "step": 6560 }, { "epoch": 2.473644578313253, "grad_norm": 1.1870984235968334, "learning_rate": 3.816829819277108e-07, "logits/chosen": -3.026562452316284, "logits/rejected": -3.163281202316284, "logps/chosen": -453.2250061035156, "logps/rejected": -503.3500061035156, "loss": 0.0217, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.109765529632568, "rewards/margins": 11.279687881469727, "rewards/rejected": -17.395313262939453, "step": 6570 }, { "epoch": 2.477409638554217, "grad_norm": 92.87987461752424, "learning_rate": 3.807417168674699e-07, "logits/chosen": -2.951953172683716, "logits/rejected": -3.07421875, "logps/chosen": -465.92498779296875, "logps/rejected": -525.2000122070312, "loss": 0.0166, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.669531345367432, "rewards/margins": 10.948437690734863, "rewards/rejected": -17.612499237060547, "step": 6580 }, { "epoch": 2.4811746987951806, "grad_norm": 0.4264352740231347, "learning_rate": 3.798004518072289e-07, "logits/chosen": -2.859375, "logits/rejected": -3.06640625, "logps/chosen": -517.5, "logps/rejected": -547.1500244140625, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -6.672070503234863, "rewards/margins": 11.16796875, "rewards/rejected": -17.846874237060547, "step": 6590 }, { "epoch": 2.4849397590361444, "grad_norm": 2.941239500210369, "learning_rate": 3.7885918674698793e-07, "logits/chosen": -2.865429639816284, "logits/rejected": -3.065234422683716, "logps/chosen": -508.82501220703125, "logps/rejected": -506.8500061035156, "loss": 0.0239, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.484179496765137, "rewards/margins": 10.021875381469727, "rewards/rejected": -15.506250381469727, "step": 6600 }, { "epoch": 2.4887048192771086, "grad_norm": 0.21582404600933938, "learning_rate": 3.7791792168674696e-07, "logits/chosen": -2.901171922683716, "logits/rejected": -3.0140624046325684, "logps/chosen": -539.25, "logps/rejected": -539.0250244140625, "loss": 0.0225, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.522070407867432, "rewards/margins": 10.720312118530273, "rewards/rejected": -16.25, "step": 6610 }, { "epoch": 2.4924698795180724, "grad_norm": 5.186486160881516, "learning_rate": 3.7697665662650604e-07, "logits/chosen": -2.975781202316284, "logits/rejected": -3.1781249046325684, "logps/chosen": -450.0375061035156, "logps/rejected": -498.04998779296875, "loss": 0.0121, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.549023628234863, "rewards/margins": 11.137499809265137, "rewards/rejected": -16.6875, "step": 6620 }, { "epoch": 2.496234939759036, "grad_norm": 1.9376408063161548, "learning_rate": 3.7603539156626506e-07, "logits/chosen": -2.9345703125, "logits/rejected": -3.1070313453674316, "logps/chosen": -515.5499877929688, "logps/rejected": -513.3499755859375, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -5.650586128234863, "rewards/margins": 11.150781631469727, "rewards/rejected": -16.796875, "step": 6630 }, { "epoch": 2.5, "grad_norm": 43.85747992683338, "learning_rate": 3.750941265060241e-07, "logits/chosen": -2.958984375, "logits/rejected": -3.1578125953674316, "logps/chosen": -474.125, "logps/rejected": -502.70001220703125, "loss": 0.0136, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.447656154632568, "rewards/margins": 11.294530868530273, "rewards/rejected": -17.734375, "step": 6640 }, { "epoch": 2.503765060240964, "grad_norm": 6.065694349983064, "learning_rate": 3.741528614457831e-07, "logits/chosen": -2.948437452316284, "logits/rejected": -3.138476610183716, "logps/chosen": -517.8499755859375, "logps/rejected": -538.7249755859375, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -6.393359184265137, "rewards/margins": 11.803125381469727, "rewards/rejected": -18.189062118530273, "step": 6650 }, { "epoch": 2.5075301204819276, "grad_norm": 58.922617458700564, "learning_rate": 3.732115963855422e-07, "logits/chosen": -2.99609375, "logits/rejected": -3.2054686546325684, "logps/chosen": -451.20001220703125, "logps/rejected": -476.45001220703125, "loss": 0.0158, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.921093940734863, "rewards/margins": 10.740625381469727, "rewards/rejected": -16.6640625, "step": 6660 }, { "epoch": 2.5112951807228914, "grad_norm": 1.479565298055058, "learning_rate": 3.7227033132530117e-07, "logits/chosen": -3.024218797683716, "logits/rejected": -3.162109375, "logps/chosen": -450.5249938964844, "logps/rejected": -497.70001220703125, "loss": 0.0189, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.389452934265137, "rewards/margins": 11.267187118530273, "rewards/rejected": -17.654687881469727, "step": 6670 }, { "epoch": 2.515060240963855, "grad_norm": 19.204272318578397, "learning_rate": 3.713290662650602e-07, "logits/chosen": -3.03125, "logits/rejected": -3.1910157203674316, "logps/chosen": -500.0249938964844, "logps/rejected": -539.75, "loss": 0.0118, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.553515434265137, "rewards/margins": 10.587499618530273, "rewards/rejected": -17.142187118530273, "step": 6680 }, { "epoch": 2.5188253012048194, "grad_norm": 6.621788927815687, "learning_rate": 3.703878012048193e-07, "logits/chosen": -3.019726514816284, "logits/rejected": -3.2613282203674316, "logps/chosen": -442.1499938964844, "logps/rejected": -487.75, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": -6.622851371765137, "rewards/margins": 10.98046875, "rewards/rejected": -17.600000381469727, "step": 6690 }, { "epoch": 2.522590361445783, "grad_norm": 0.9633449213350064, "learning_rate": 3.6944653614457825e-07, "logits/chosen": -2.97265625, "logits/rejected": -3.134765625, "logps/chosen": -481.0, "logps/rejected": -510.7250061035156, "loss": 0.0096, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.306738376617432, "rewards/margins": 11.287500381469727, "rewards/rejected": -18.592187881469727, "step": 6700 }, { "epoch": 2.526355421686747, "grad_norm": 7.954304481238919, "learning_rate": 3.6850527108433733e-07, "logits/chosen": -3.044921875, "logits/rejected": -3.2054686546325684, "logps/chosen": -478.54998779296875, "logps/rejected": -516.0999755859375, "loss": 0.0119, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.483593940734863, "rewards/margins": 10.9921875, "rewards/rejected": -17.478124618530273, "step": 6710 }, { "epoch": 2.5301204819277108, "grad_norm": 1.7155575238095708, "learning_rate": 3.6756400602409636e-07, "logits/chosen": -3.1314454078674316, "logits/rejected": -3.2734375, "logps/chosen": -429.79998779296875, "logps/rejected": -511.54998779296875, "loss": 0.025, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.414843559265137, "rewards/margins": 11.379687309265137, "rewards/rejected": -17.78125, "step": 6720 }, { "epoch": 2.5338855421686746, "grad_norm": 18.4004862742856, "learning_rate": 3.6662274096385544e-07, "logits/chosen": -3.0648436546325684, "logits/rejected": -3.2925782203674316, "logps/chosen": -452.3999938964844, "logps/rejected": -477.8999938964844, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -6.314648628234863, "rewards/margins": 10.92578125, "rewards/rejected": -17.248437881469727, "step": 6730 }, { "epoch": 2.537650602409639, "grad_norm": 1.3233272116376549, "learning_rate": 3.656814759036144e-07, "logits/chosen": -3.055859327316284, "logits/rejected": -3.2349610328674316, "logps/chosen": -564.5499877929688, "logps/rejected": -548.2999877929688, "loss": 0.0108, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.951952934265137, "rewards/margins": 11.296875, "rewards/rejected": -18.255468368530273, "step": 6740 }, { "epoch": 2.5414156626506026, "grad_norm": 0.6632471002840651, "learning_rate": 3.647402108433735e-07, "logits/chosen": -2.9710936546325684, "logits/rejected": -3.2339844703674316, "logps/chosen": -492.3500061035156, "logps/rejected": -507.3999938964844, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -6.413866996765137, "rewards/margins": 11.577343940734863, "rewards/rejected": -18.0, "step": 6750 }, { "epoch": 2.5451807228915664, "grad_norm": 7.68772244969437, "learning_rate": 3.637989457831325e-07, "logits/chosen": -3.0625, "logits/rejected": -3.219921827316284, "logps/chosen": -545.0999755859375, "logps/rejected": -538.625, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -7.076952934265137, "rewards/margins": 10.670312881469727, "rewards/rejected": -17.743749618530273, "step": 6760 }, { "epoch": 2.54894578313253, "grad_norm": 1.1268368449217778, "learning_rate": 3.6285768072289154e-07, "logits/chosen": -3.1371092796325684, "logits/rejected": -3.272656202316284, "logps/chosen": -467.0, "logps/rejected": -505.6499938964844, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -6.629980564117432, "rewards/margins": 10.69140625, "rewards/rejected": -17.315624237060547, "step": 6770 }, { "epoch": 2.552710843373494, "grad_norm": 16.594433954106023, "learning_rate": 3.6191641566265057e-07, "logits/chosen": -3.0044922828674316, "logits/rejected": -3.2769532203674316, "logps/chosen": -513.6500244140625, "logps/rejected": -496.0, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -7.731640815734863, "rewards/margins": 11.743749618530273, "rewards/rejected": -19.479686737060547, "step": 6780 }, { "epoch": 2.5564759036144578, "grad_norm": 23.29076976229831, "learning_rate": 3.6097515060240965e-07, "logits/chosen": -2.8490233421325684, "logits/rejected": -3.0796875953674316, "logps/chosen": -537.7750244140625, "logps/rejected": -515.9500122070312, "loss": 0.0118, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.484375, "rewards/margins": 11.713281631469727, "rewards/rejected": -18.200000762939453, "step": 6790 }, { "epoch": 2.5602409638554215, "grad_norm": 3.2883227402148285, "learning_rate": 3.600338855421687e-07, "logits/chosen": -3.0445313453674316, "logits/rejected": -3.220703125, "logps/chosen": -474.45001220703125, "logps/rejected": -505.79998779296875, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -6.65234375, "rewards/margins": 10.567187309265137, "rewards/rejected": -17.214061737060547, "step": 6800 }, { "epoch": 2.5640060240963853, "grad_norm": 2.6976866844952814, "learning_rate": 3.590926204819277e-07, "logits/chosen": -3.12890625, "logits/rejected": -3.192187547683716, "logps/chosen": -441.7250061035156, "logps/rejected": -500.0, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -6.162109375, "rewards/margins": 11.10546875, "rewards/rejected": -17.270313262939453, "step": 6810 }, { "epoch": 2.567771084337349, "grad_norm": 77.25114829103886, "learning_rate": 3.5815135542168673e-07, "logits/chosen": -3.0478515625, "logits/rejected": -3.2193360328674316, "logps/chosen": -470.3125, "logps/rejected": -482.6000061035156, "loss": 0.0171, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.030468940734863, "rewards/margins": 11.858593940734863, "rewards/rejected": -17.887500762939453, "step": 6820 }, { "epoch": 2.5715361445783134, "grad_norm": 62.04991446532097, "learning_rate": 3.5721009036144576e-07, "logits/chosen": -2.8687500953674316, "logits/rejected": -3.21484375, "logps/chosen": -454.3500061035156, "logps/rejected": -520.3499755859375, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": -5.730078220367432, "rewards/margins": 11.4609375, "rewards/rejected": -17.190624237060547, "step": 6830 }, { "epoch": 2.575301204819277, "grad_norm": 13.875010650097007, "learning_rate": 3.562688253012048e-07, "logits/chosen": -3.0185546875, "logits/rejected": -3.1636719703674316, "logps/chosen": -470.625, "logps/rejected": -521.125, "loss": 0.014, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.669140815734863, "rewards/margins": 10.834375381469727, "rewards/rejected": -17.503124237060547, "step": 6840 }, { "epoch": 2.579066265060241, "grad_norm": 0.989986200914317, "learning_rate": 3.553275602409638e-07, "logits/chosen": -2.7972655296325684, "logits/rejected": -3.017578125, "logps/chosen": -510.45001220703125, "logps/rejected": -519.6749877929688, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -6.1220703125, "rewards/margins": 11.135156631469727, "rewards/rejected": -17.250782012939453, "step": 6850 }, { "epoch": 2.5828313253012047, "grad_norm": 5.0738922224215965, "learning_rate": 3.543862951807229e-07, "logits/chosen": -2.988085985183716, "logits/rejected": -3.065234422683716, "logps/chosen": -447.8999938964844, "logps/rejected": -501.25, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -6.155663967132568, "rewards/margins": 11.606249809265137, "rewards/rejected": -17.759374618530273, "step": 6860 }, { "epoch": 2.5865963855421685, "grad_norm": 0.9251599067044342, "learning_rate": 3.534450301204819e-07, "logits/chosen": -3.0785155296325684, "logits/rejected": -3.1351561546325684, "logps/chosen": -452.3500061035156, "logps/rejected": -509.29998779296875, "loss": 0.0163, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.096093654632568, "rewards/margins": 10.345312118530273, "rewards/rejected": -16.435937881469727, "step": 6870 }, { "epoch": 2.5903614457831328, "grad_norm": 102.96619515299746, "learning_rate": 3.5250376506024094e-07, "logits/chosen": -2.989453077316284, "logits/rejected": -3.1361327171325684, "logps/chosen": -445.57501220703125, "logps/rejected": -507.3999938964844, "loss": 0.016, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.114453315734863, "rewards/margins": 9.84765625, "rewards/rejected": -16.967187881469727, "step": 6880 }, { "epoch": 2.5941265060240966, "grad_norm": 0.9707709050453807, "learning_rate": 3.5156249999999997e-07, "logits/chosen": -2.943359375, "logits/rejected": -3.076953172683716, "logps/chosen": -494.0249938964844, "logps/rejected": -511.6499938964844, "loss": 0.0137, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.566015720367432, "rewards/margins": 10.586718559265137, "rewards/rejected": -17.15625, "step": 6890 }, { "epoch": 2.5978915662650603, "grad_norm": 77.96239367340783, "learning_rate": 3.5062123493975905e-07, "logits/chosen": -2.931640625, "logits/rejected": -3.1292967796325684, "logps/chosen": -460.6499938964844, "logps/rejected": -496.45001220703125, "loss": 0.0369, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.842577934265137, "rewards/margins": 10.940625190734863, "rewards/rejected": -17.774999618530273, "step": 6900 }, { "epoch": 2.601656626506024, "grad_norm": 2.983356522872865, "learning_rate": 3.49679969879518e-07, "logits/chosen": -2.985546827316284, "logits/rejected": -3.147656202316284, "logps/chosen": -481.79998779296875, "logps/rejected": -517.7000122070312, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -7.225390434265137, "rewards/margins": 10.774218559265137, "rewards/rejected": -18.004688262939453, "step": 6910 }, { "epoch": 2.605421686746988, "grad_norm": 7.163258286451071, "learning_rate": 3.487387048192771e-07, "logits/chosen": -2.8402342796325684, "logits/rejected": -3.0492186546325684, "logps/chosen": -504.5249938964844, "logps/rejected": -526.4000244140625, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -6.26171875, "rewards/margins": 11.975781440734863, "rewards/rejected": -18.253124237060547, "step": 6920 }, { "epoch": 2.6091867469879517, "grad_norm": 2.429326544357806, "learning_rate": 3.4779743975903613e-07, "logits/chosen": -3.0123047828674316, "logits/rejected": -3.050976514816284, "logps/chosen": -402.6000061035156, "logps/rejected": -516.8499755859375, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -7.583593845367432, "rewards/margins": 11.07421875, "rewards/rejected": -18.653125762939453, "step": 6930 }, { "epoch": 2.6129518072289155, "grad_norm": 126.60956582822377, "learning_rate": 3.468561746987952e-07, "logits/chosen": -3.09765625, "logits/rejected": -3.256054639816284, "logps/chosen": -451.3500061035156, "logps/rejected": -495.2749938964844, "loss": 0.0145, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.362109184265137, "rewards/margins": 11.449999809265137, "rewards/rejected": -18.8046875, "step": 6940 }, { "epoch": 2.6167168674698793, "grad_norm": 4.827844399255219, "learning_rate": 3.459149096385542e-07, "logits/chosen": -2.914257764816284, "logits/rejected": -3.087109327316284, "logps/chosen": -533.625, "logps/rejected": -502.9750061035156, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": -6.6220703125, "rewards/margins": 10.319531440734863, "rewards/rejected": -16.939062118530273, "step": 6950 }, { "epoch": 2.6204819277108435, "grad_norm": 14.19635703463623, "learning_rate": 3.4497364457831326e-07, "logits/chosen": -3.0126953125, "logits/rejected": -3.0640625953674316, "logps/chosen": -431.67498779296875, "logps/rejected": -517.1500244140625, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -6.36376953125, "rewards/margins": 12.220312118530273, "rewards/rejected": -18.5859375, "step": 6960 }, { "epoch": 2.6242469879518073, "grad_norm": 8.752099338950265, "learning_rate": 3.440323795180723e-07, "logits/chosen": -2.9996094703674316, "logits/rejected": -3.1195311546325684, "logps/chosen": -490.0249938964844, "logps/rejected": -514.2000122070312, "loss": 0.0227, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.657422065734863, "rewards/margins": 10.6484375, "rewards/rejected": -17.310937881469727, "step": 6970 }, { "epoch": 2.628012048192771, "grad_norm": 1.510064686256855, "learning_rate": 3.4309111445783126e-07, "logits/chosen": -2.944531202316284, "logits/rejected": -3.0492186546325684, "logps/chosen": -514.8499755859375, "logps/rejected": -519.7249755859375, "loss": 0.0141, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.528515815734863, "rewards/margins": 10.706250190734863, "rewards/rejected": -17.243749618530273, "step": 6980 }, { "epoch": 2.631777108433735, "grad_norm": 6.422231157752804, "learning_rate": 3.4214984939759034e-07, "logits/chosen": -2.968554735183716, "logits/rejected": -3.198046922683716, "logps/chosen": -506.67498779296875, "logps/rejected": -520.2000122070312, "loss": 0.0186, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.755859375, "rewards/margins": 10.322656631469727, "rewards/rejected": -17.073436737060547, "step": 6990 }, { "epoch": 2.6355421686746987, "grad_norm": 3.3782505029181653, "learning_rate": 3.4120858433734937e-07, "logits/chosen": -3.0552735328674316, "logits/rejected": -3.25390625, "logps/chosen": -482.73748779296875, "logps/rejected": -490.3999938964844, "loss": 0.0132, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.262499809265137, "rewards/margins": 10.76171875, "rewards/rejected": -18.0234375, "step": 7000 }, { "epoch": 2.6393072289156625, "grad_norm": 2.49312600285999, "learning_rate": 3.4026731927710845e-07, "logits/chosen": -2.95703125, "logits/rejected": -3.1624999046325684, "logps/chosen": -523.5999755859375, "logps/rejected": -534.625, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -7.465039253234863, "rewards/margins": 11.266406059265137, "rewards/rejected": -18.728124618530273, "step": 7010 }, { "epoch": 2.6430722891566267, "grad_norm": 6.172687365810999, "learning_rate": 3.393260542168674e-07, "logits/chosen": -2.96484375, "logits/rejected": -3.1011719703674316, "logps/chosen": -507.75, "logps/rejected": -519.25, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -7.319531440734863, "rewards/margins": 11.037500381469727, "rewards/rejected": -18.364063262939453, "step": 7020 }, { "epoch": 2.6468373493975905, "grad_norm": 14.63442907798637, "learning_rate": 3.383847891566265e-07, "logits/chosen": -2.8521485328674316, "logits/rejected": -3.042187452316284, "logps/chosen": -554.9000244140625, "logps/rejected": -578.8499755859375, "loss": 0.014, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.234765529632568, "rewards/margins": 11.753125190734863, "rewards/rejected": -18.973438262939453, "step": 7030 }, { "epoch": 2.6506024096385543, "grad_norm": 3.822778097592554, "learning_rate": 3.3744352409638553e-07, "logits/chosen": -3.059765577316284, "logits/rejected": -3.2337889671325684, "logps/chosen": -450.7250061035156, "logps/rejected": -499.32501220703125, "loss": 0.0178, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.893750190734863, "rewards/margins": 10.848437309265137, "rewards/rejected": -17.732812881469727, "step": 7040 }, { "epoch": 2.654367469879518, "grad_norm": 0.13148857860879506, "learning_rate": 3.3650225903614455e-07, "logits/chosen": -2.931640625, "logits/rejected": -3.1773438453674316, "logps/chosen": -476.5249938964844, "logps/rejected": -508.3500061035156, "loss": 0.0201, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.077734470367432, "rewards/margins": 9.826562881469727, "rewards/rejected": -16.903125762939453, "step": 7050 }, { "epoch": 2.658132530120482, "grad_norm": 8.846385917446876, "learning_rate": 3.355609939759036e-07, "logits/chosen": -3.025195360183716, "logits/rejected": -3.2203125953674316, "logps/chosen": -495.07501220703125, "logps/rejected": -501.5249938964844, "loss": 0.0157, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.070116996765137, "rewards/margins": 10.912500381469727, "rewards/rejected": -17.981250762939453, "step": 7060 }, { "epoch": 2.6618975903614457, "grad_norm": 3.364312565279638, "learning_rate": 3.3461972891566266e-07, "logits/chosen": -3.1605467796325684, "logits/rejected": -3.271484375, "logps/chosen": -465.9750061035156, "logps/rejected": -501.04998779296875, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -8.069531440734863, "rewards/margins": 10.529687881469727, "rewards/rejected": -18.59375, "step": 7070 }, { "epoch": 2.6656626506024095, "grad_norm": 0.5167666717049662, "learning_rate": 3.336784638554217e-07, "logits/chosen": -2.951953172683716, "logits/rejected": -3.0267577171325684, "logps/chosen": -506.2250061035156, "logps/rejected": -553.9000244140625, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -6.761132717132568, "rewards/margins": 11.701562881469727, "rewards/rejected": -18.462499618530273, "step": 7080 }, { "epoch": 2.6694277108433733, "grad_norm": 14.102989070496573, "learning_rate": 3.327371987951807e-07, "logits/chosen": -3.094921827316284, "logits/rejected": -3.2406249046325684, "logps/chosen": -480.1499938964844, "logps/rejected": -523.6500244140625, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -6.792773246765137, "rewards/margins": 10.686718940734863, "rewards/rejected": -17.479686737060547, "step": 7090 }, { "epoch": 2.6731927710843375, "grad_norm": 32.64817627663841, "learning_rate": 3.3179593373493974e-07, "logits/chosen": -3.079882860183716, "logits/rejected": -3.203906297683716, "logps/chosen": -434.17498779296875, "logps/rejected": -516.0, "loss": 0.017, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.416894435882568, "rewards/margins": 11.600781440734863, "rewards/rejected": -18.014062881469727, "step": 7100 }, { "epoch": 2.6769578313253013, "grad_norm": 41.63959677483844, "learning_rate": 3.308546686746988e-07, "logits/chosen": -3.0162110328674316, "logits/rejected": -3.237109422683716, "logps/chosen": -453.1000061035156, "logps/rejected": -520.5, "loss": 0.0109, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.586230278015137, "rewards/margins": 11.353906631469727, "rewards/rejected": -17.948436737060547, "step": 7110 }, { "epoch": 2.680722891566265, "grad_norm": 23.456723216889912, "learning_rate": 3.299134036144578e-07, "logits/chosen": -3.111523389816284, "logits/rejected": -3.221484422683716, "logps/chosen": -481.75, "logps/rejected": -502.1499938964844, "loss": 0.0256, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.330859184265137, "rewards/margins": 10.798437118530273, "rewards/rejected": -18.121875762939453, "step": 7120 }, { "epoch": 2.684487951807229, "grad_norm": 4.608233315445487, "learning_rate": 3.289721385542169e-07, "logits/chosen": -2.984375, "logits/rejected": -3.2308592796325684, "logps/chosen": -565.25, "logps/rejected": -541.5499877929688, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -7.228515625, "rewards/margins": 12.686718940734863, "rewards/rejected": -19.921875, "step": 7130 }, { "epoch": 2.6882530120481927, "grad_norm": 5.957708814096942, "learning_rate": 3.280308734939759e-07, "logits/chosen": -2.876757860183716, "logits/rejected": -3.117968797683716, "logps/chosen": -472.8500061035156, "logps/rejected": -536.0, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -6.229296684265137, "rewards/margins": 11.453125, "rewards/rejected": -17.676563262939453, "step": 7140 }, { "epoch": 2.6920180722891565, "grad_norm": 1.2956097965306919, "learning_rate": 3.270896084337349e-07, "logits/chosen": -3.0341796875, "logits/rejected": -3.2210936546325684, "logps/chosen": -504.17498779296875, "logps/rejected": -534.4500122070312, "loss": 0.0343, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.420312404632568, "rewards/margins": 11.132031440734863, "rewards/rejected": -17.5546875, "step": 7150 }, { "epoch": 2.6957831325301207, "grad_norm": 4.666174276898122, "learning_rate": 3.2614834337349395e-07, "logits/chosen": -2.94921875, "logits/rejected": -3.225781202316284, "logps/chosen": -540.5499877929688, "logps/rejected": -513.4500122070312, "loss": 0.0219, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.458593845367432, "rewards/margins": 11.444531440734863, "rewards/rejected": -18.918750762939453, "step": 7160 }, { "epoch": 2.6995481927710845, "grad_norm": 5.104822840817946, "learning_rate": 3.25207078313253e-07, "logits/chosen": -2.9574217796325684, "logits/rejected": -3.1292967796325684, "logps/chosen": -461.1499938964844, "logps/rejected": -505.6499938964844, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -6.296093940734863, "rewards/margins": 11.375, "rewards/rejected": -17.676563262939453, "step": 7170 }, { "epoch": 2.7033132530120483, "grad_norm": 2.706732903845263, "learning_rate": 3.2426581325301206e-07, "logits/chosen": -2.931835889816284, "logits/rejected": -3.1226563453674316, "logps/chosen": -509.45001220703125, "logps/rejected": -526.0499877929688, "loss": 0.0163, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.299609184265137, "rewards/margins": 10.249218940734863, "rewards/rejected": -17.557811737060547, "step": 7180 }, { "epoch": 2.707078313253012, "grad_norm": 2.2141349024726615, "learning_rate": 3.2332454819277103e-07, "logits/chosen": -3.080859422683716, "logits/rejected": -3.250781297683716, "logps/chosen": -414.20001220703125, "logps/rejected": -497.45001220703125, "loss": 0.0095, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.436718940734863, "rewards/margins": 12.116406440734863, "rewards/rejected": -18.545312881469727, "step": 7190 }, { "epoch": 2.710843373493976, "grad_norm": 17.832699885512824, "learning_rate": 3.223832831325301e-07, "logits/chosen": -3.116992235183716, "logits/rejected": -3.187695264816284, "logps/chosen": -495.3500061035156, "logps/rejected": -547.5, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -6.530077934265137, "rewards/margins": 11.874218940734863, "rewards/rejected": -18.407812118530273, "step": 7200 }, { "epoch": 2.7146084337349397, "grad_norm": 0.39401704158051737, "learning_rate": 3.2144201807228914e-07, "logits/chosen": -3.048828125, "logits/rejected": -3.1402344703674316, "logps/chosen": -453.57501220703125, "logps/rejected": -516.9749755859375, "loss": 0.0135, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.753125190734863, "rewards/margins": 11.598437309265137, "rewards/rejected": -17.345312118530273, "step": 7210 }, { "epoch": 2.7183734939759034, "grad_norm": 5.588648103730119, "learning_rate": 3.2050075301204817e-07, "logits/chosen": -2.9375, "logits/rejected": -3.2249999046325684, "logps/chosen": -499.3999938964844, "logps/rejected": -493.57501220703125, "loss": 0.0097, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.235156059265137, "rewards/margins": 11.3515625, "rewards/rejected": -16.575000762939453, "step": 7220 }, { "epoch": 2.7221385542168672, "grad_norm": 2.6444796249737967, "learning_rate": 3.195594879518072e-07, "logits/chosen": -3.0337891578674316, "logits/rejected": -3.2093749046325684, "logps/chosen": -433.32501220703125, "logps/rejected": -478.8999938964844, "loss": 0.034, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.599023342132568, "rewards/margins": 10.939844131469727, "rewards/rejected": -16.543750762939453, "step": 7230 }, { "epoch": 2.7259036144578315, "grad_norm": 21.753814112817913, "learning_rate": 3.1861822289156627e-07, "logits/chosen": -2.9115233421325684, "logits/rejected": -3.026562452316284, "logps/chosen": -479.3999938964844, "logps/rejected": -502.70001220703125, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -5.138525485992432, "rewards/margins": 11.891406059265137, "rewards/rejected": -17.017187118530273, "step": 7240 }, { "epoch": 2.7296686746987953, "grad_norm": 8.946608385826877, "learning_rate": 3.176769578313253e-07, "logits/chosen": -2.9507813453674316, "logits/rejected": -3.1656250953674316, "logps/chosen": -487.57501220703125, "logps/rejected": -510.1499938964844, "loss": 0.0266, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.363476753234863, "rewards/margins": 11.469531059265137, "rewards/rejected": -16.829687118530273, "step": 7250 }, { "epoch": 2.733433734939759, "grad_norm": 19.864552982283772, "learning_rate": 3.167356927710843e-07, "logits/chosen": -3.043750047683716, "logits/rejected": -3.198046922683716, "logps/chosen": -442.70001220703125, "logps/rejected": -492.70001220703125, "loss": 0.0106, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.358007907867432, "rewards/margins": 12.448437690734863, "rewards/rejected": -18.807811737060547, "step": 7260 }, { "epoch": 2.737198795180723, "grad_norm": 6.746759578702116, "learning_rate": 3.1579442771084335e-07, "logits/chosen": -2.7796874046325684, "logits/rejected": -2.9341797828674316, "logps/chosen": -510.79998779296875, "logps/rejected": -539.125, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -6.206445217132568, "rewards/margins": 11.827343940734863, "rewards/rejected": -18.03125, "step": 7270 }, { "epoch": 2.7409638554216866, "grad_norm": 9.900681665741622, "learning_rate": 3.1485316265060243e-07, "logits/chosen": -2.990234375, "logits/rejected": -3.2152342796325684, "logps/chosen": -487.17498779296875, "logps/rejected": -535.1500244140625, "loss": 0.0134, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.108691215515137, "rewards/margins": 11.247655868530273, "rewards/rejected": -17.357812881469727, "step": 7280 }, { "epoch": 2.744728915662651, "grad_norm": 2.443291900574204, "learning_rate": 3.139118975903614e-07, "logits/chosen": -3.0609374046325684, "logits/rejected": -3.1449217796325684, "logps/chosen": -480.1499938964844, "logps/rejected": -557.3499755859375, "loss": 0.0204, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.271484375, "rewards/margins": 11.760156631469727, "rewards/rejected": -18.043750762939453, "step": 7290 }, { "epoch": 2.7484939759036147, "grad_norm": 3.2476348709650003, "learning_rate": 3.129706325301205e-07, "logits/chosen": -3.09375, "logits/rejected": -3.23046875, "logps/chosen": -450.1000061035156, "logps/rejected": -515.8499755859375, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -6.771484375, "rewards/margins": 12.112500190734863, "rewards/rejected": -18.887500762939453, "step": 7300 }, { "epoch": 2.7522590361445785, "grad_norm": 9.383898935890796, "learning_rate": 3.120293674698795e-07, "logits/chosen": -3.056640625, "logits/rejected": -3.293750047683716, "logps/chosen": -476.25, "logps/rejected": -512.7999877929688, "loss": 0.038, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -7.754687309265137, "rewards/margins": 11.150781631469727, "rewards/rejected": -18.89453125, "step": 7310 }, { "epoch": 2.7560240963855422, "grad_norm": 22.006003272710213, "learning_rate": 3.1108810240963854e-07, "logits/chosen": -2.9761719703674316, "logits/rejected": -3.2046875953674316, "logps/chosen": -521.6500244140625, "logps/rejected": -525.5, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -7.257421970367432, "rewards/margins": 10.755468368530273, "rewards/rejected": -18.018749237060547, "step": 7320 }, { "epoch": 2.759789156626506, "grad_norm": 5.687544564995767, "learning_rate": 3.1014683734939757e-07, "logits/chosen": -2.988476514816284, "logits/rejected": -3.1722655296325684, "logps/chosen": -463.0, "logps/rejected": -524.75, "loss": 0.0102, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.764843940734863, "rewards/margins": 12.10546875, "rewards/rejected": -18.871875762939453, "step": 7330 }, { "epoch": 2.76355421686747, "grad_norm": 2.782239511681144, "learning_rate": 3.092055722891566e-07, "logits/chosen": -2.9828124046325684, "logits/rejected": -3.2562499046325684, "logps/chosen": -480.2749938964844, "logps/rejected": -478.70001220703125, "loss": 0.025, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.419921875, "rewards/margins": 11.142187118530273, "rewards/rejected": -16.5546875, "step": 7340 }, { "epoch": 2.7673192771084336, "grad_norm": 28.294353291132992, "learning_rate": 3.0826430722891567e-07, "logits/chosen": -2.9175782203674316, "logits/rejected": -3.0732421875, "logps/chosen": -431.3999938964844, "logps/rejected": -494.0249938964844, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -5.622265815734863, "rewards/margins": 11.285937309265137, "rewards/rejected": -16.912500381469727, "step": 7350 }, { "epoch": 2.7710843373493974, "grad_norm": 24.467132123185955, "learning_rate": 3.0732304216867465e-07, "logits/chosen": -2.9371094703674316, "logits/rejected": -3.1246094703674316, "logps/chosen": -457.04998779296875, "logps/rejected": -482.0, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -6.006249904632568, "rewards/margins": 10.969531059265137, "rewards/rejected": -16.9765625, "step": 7360 }, { "epoch": 2.774849397590361, "grad_norm": 0.9557649393905067, "learning_rate": 3.063817771084337e-07, "logits/chosen": -3.035937547683716, "logits/rejected": -3.149609327316284, "logps/chosen": -468.2749938964844, "logps/rejected": -526.9000244140625, "loss": 0.0167, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.030957221984863, "rewards/margins": 11.865625381469727, "rewards/rejected": -17.897655487060547, "step": 7370 }, { "epoch": 2.7786144578313254, "grad_norm": 100.04455989568582, "learning_rate": 3.0544051204819275e-07, "logits/chosen": -2.8863282203674316, "logits/rejected": -3.134765625, "logps/chosen": -494.29998779296875, "logps/rejected": -548.9000244140625, "loss": 0.022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.679491996765137, "rewards/margins": 11.669530868530273, "rewards/rejected": -18.337499618530273, "step": 7380 }, { "epoch": 2.7823795180722892, "grad_norm": 17.379046099515023, "learning_rate": 3.0449924698795183e-07, "logits/chosen": -3.0501952171325684, "logits/rejected": -3.248046875, "logps/chosen": -457.625, "logps/rejected": -504.3500061035156, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -6.308398246765137, "rewards/margins": 11.637499809265137, "rewards/rejected": -17.946874618530273, "step": 7390 }, { "epoch": 2.786144578313253, "grad_norm": 2.964874607070143, "learning_rate": 3.035579819277108e-07, "logits/chosen": -2.8892579078674316, "logits/rejected": -3.1167969703674316, "logps/chosen": -491.7250061035156, "logps/rejected": -527.375, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -6.461328029632568, "rewards/margins": 11.567187309265137, "rewards/rejected": -18.0234375, "step": 7400 }, { "epoch": 2.789909638554217, "grad_norm": 61.49607004341861, "learning_rate": 3.026167168674699e-07, "logits/chosen": -3.18359375, "logits/rejected": -3.276171922683716, "logps/chosen": -491.125, "logps/rejected": -532.9500122070312, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -7.554296970367432, "rewards/margins": 11.435937881469727, "rewards/rejected": -19.0, "step": 7410 }, { "epoch": 2.7936746987951806, "grad_norm": 8.244861158571, "learning_rate": 3.016754518072289e-07, "logits/chosen": -2.9781250953674316, "logits/rejected": -3.0326170921325684, "logps/chosen": -511.875, "logps/rejected": -581.1500244140625, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -6.818749904632568, "rewards/margins": 11.66015625, "rewards/rejected": -18.481250762939453, "step": 7420 }, { "epoch": 2.797439759036145, "grad_norm": 99.7385112451798, "learning_rate": 3.0073418674698794e-07, "logits/chosen": -3.145312547683716, "logits/rejected": -3.282031297683716, "logps/chosen": -489.67498779296875, "logps/rejected": -550.9500122070312, "loss": 0.0326, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.592187404632568, "rewards/margins": 12.1015625, "rewards/rejected": -18.701562881469727, "step": 7430 }, { "epoch": 2.8012048192771086, "grad_norm": 1.1176114974596412, "learning_rate": 2.9979292168674696e-07, "logits/chosen": -3.106640577316284, "logits/rejected": -3.25, "logps/chosen": -429.4750061035156, "logps/rejected": -495.6499938964844, "loss": 0.0242, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.776171684265137, "rewards/margins": 11.23046875, "rewards/rejected": -18.006250381469727, "step": 7440 }, { "epoch": 2.8049698795180724, "grad_norm": 6.234112023292483, "learning_rate": 2.9885165662650604e-07, "logits/chosen": -3.007617235183716, "logits/rejected": -3.222460985183716, "logps/chosen": -453.79998779296875, "logps/rejected": -513.125, "loss": 0.0125, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.281640529632568, "rewards/margins": 12.296093940734863, "rewards/rejected": -19.5703125, "step": 7450 }, { "epoch": 2.808734939759036, "grad_norm": 1.2547387204229072, "learning_rate": 2.9791039156626507e-07, "logits/chosen": -3.015625, "logits/rejected": -3.0816407203674316, "logps/chosen": -514.0, "logps/rejected": -581.0999755859375, "loss": 0.0072, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.485156059265137, "rewards/margins": 12.435155868530273, "rewards/rejected": -19.926563262939453, "step": 7460 }, { "epoch": 2.8125, "grad_norm": 6.279188396407058, "learning_rate": 2.9696912650602404e-07, "logits/chosen": -2.9124999046325684, "logits/rejected": -3.1031250953674316, "logps/chosen": -524.7000122070312, "logps/rejected": -548.9500122070312, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -7.655859470367432, "rewards/margins": 11.846875190734863, "rewards/rejected": -19.504688262939453, "step": 7470 }, { "epoch": 2.816265060240964, "grad_norm": 1.6839670330911385, "learning_rate": 2.960278614457831e-07, "logits/chosen": -3.008593797683716, "logits/rejected": -3.160937547683716, "logps/chosen": -427.75, "logps/rejected": -530.9500122070312, "loss": 0.0127, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.545702934265137, "rewards/margins": 11.40234375, "rewards/rejected": -18.954687118530273, "step": 7480 }, { "epoch": 2.8200301204819276, "grad_norm": 11.538944142532468, "learning_rate": 2.9508659638554215e-07, "logits/chosen": -3.039257764816284, "logits/rejected": -3.194531202316284, "logps/chosen": -451.875, "logps/rejected": -505.32501220703125, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -6.262890815734863, "rewards/margins": 12.240625381469727, "rewards/rejected": -18.501562118530273, "step": 7490 }, { "epoch": 2.8237951807228914, "grad_norm": 0.41260870824331397, "learning_rate": 2.941453313253012e-07, "logits/chosen": -3.0308594703674316, "logits/rejected": -3.206249952316284, "logps/chosen": -452.1000061035156, "logps/rejected": -519.0, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -6.881640434265137, "rewards/margins": 11.832812309265137, "rewards/rejected": -18.717187881469727, "step": 7500 }, { "epoch": 2.827560240963855, "grad_norm": 0.04601080934018315, "learning_rate": 2.932040662650602e-07, "logits/chosen": -2.9658203125, "logits/rejected": -3.1656250953674316, "logps/chosen": -476.95001220703125, "logps/rejected": -527.9500122070312, "loss": 0.0223, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -7.826367378234863, "rewards/margins": 11.965624809265137, "rewards/rejected": -19.782812118530273, "step": 7510 }, { "epoch": 2.8313253012048194, "grad_norm": 47.73091780096316, "learning_rate": 2.922628012048193e-07, "logits/chosen": -2.9691405296325684, "logits/rejected": -3.1953125, "logps/chosen": -500.7749938964844, "logps/rejected": -502.54998779296875, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -6.707812309265137, "rewards/margins": 11.392187118530273, "rewards/rejected": -18.107812881469727, "step": 7520 }, { "epoch": 2.835090361445783, "grad_norm": 12.814105400332476, "learning_rate": 2.9132153614457826e-07, "logits/chosen": -3.003124952316284, "logits/rejected": -3.1324219703674316, "logps/chosen": -441.1000061035156, "logps/rejected": -488.45001220703125, "loss": 0.0153, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.025390625, "rewards/margins": 10.923437118530273, "rewards/rejected": -16.953125, "step": 7530 }, { "epoch": 2.838855421686747, "grad_norm": 3.4503640771616815, "learning_rate": 2.9038027108433734e-07, "logits/chosen": -2.9234375953674316, "logits/rejected": -3.1410155296325684, "logps/chosen": -494.45001220703125, "logps/rejected": -531.25, "loss": 0.0099, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.26953125, "rewards/margins": 11.922656059265137, "rewards/rejected": -18.200000762939453, "step": 7540 }, { "epoch": 2.8426204819277108, "grad_norm": 1.392884270433832, "learning_rate": 2.8943900602409636e-07, "logits/chosen": -2.958203077316284, "logits/rejected": -3.1484375, "logps/chosen": -490.0249938964844, "logps/rejected": -502.45001220703125, "loss": 0.0396, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.974218845367432, "rewards/margins": 11.713281631469727, "rewards/rejected": -18.676563262939453, "step": 7550 }, { "epoch": 2.8463855421686746, "grad_norm": 1.9634502310012396, "learning_rate": 2.8849774096385544e-07, "logits/chosen": -3.1402344703674316, "logits/rejected": -3.2671875953674316, "logps/chosen": -457.1000061035156, "logps/rejected": -488.95001220703125, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": -7.424218654632568, "rewards/margins": 11.323437690734863, "rewards/rejected": -18.743749618530273, "step": 7560 }, { "epoch": 2.850150602409639, "grad_norm": 1.4710438195771365, "learning_rate": 2.875564759036144e-07, "logits/chosen": -2.936328172683716, "logits/rejected": -3.076953172683716, "logps/chosen": -508.57501220703125, "logps/rejected": -538.0, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -6.908789157867432, "rewards/margins": 11.263280868530273, "rewards/rejected": -18.162500381469727, "step": 7570 }, { "epoch": 2.8539156626506026, "grad_norm": 2.4189932716767673, "learning_rate": 2.866152108433735e-07, "logits/chosen": -2.955078125, "logits/rejected": -3.235546827316284, "logps/chosen": -494.42498779296875, "logps/rejected": -489.625, "loss": 0.0118, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.709179878234863, "rewards/margins": 11.030468940734863, "rewards/rejected": -17.754688262939453, "step": 7580 }, { "epoch": 2.8576807228915664, "grad_norm": 0.1821957997271709, "learning_rate": 2.856739457831325e-07, "logits/chosen": -2.987499952316284, "logits/rejected": -3.2925782203674316, "logps/chosen": -422.1499938964844, "logps/rejected": -484.54998779296875, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -6.020703315734863, "rewards/margins": 11.52734375, "rewards/rejected": -17.540624618530273, "step": 7590 }, { "epoch": 2.86144578313253, "grad_norm": 5.786140484205354, "learning_rate": 2.8473268072289155e-07, "logits/chosen": -3.018749952316284, "logits/rejected": -3.192187547683716, "logps/chosen": -475.57501220703125, "logps/rejected": -518.7000122070312, "loss": 0.0229, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.233788967132568, "rewards/margins": 11.747655868530273, "rewards/rejected": -17.996875762939453, "step": 7600 }, { "epoch": 2.865210843373494, "grad_norm": 1.5409051336265234, "learning_rate": 2.837914156626506e-07, "logits/chosen": -2.984375, "logits/rejected": -3.1929688453674316, "logps/chosen": -502.45001220703125, "logps/rejected": -520.2000122070312, "loss": 0.0246, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.433984279632568, "rewards/margins": 11.422656059265137, "rewards/rejected": -17.860937118530273, "step": 7610 }, { "epoch": 2.8689759036144578, "grad_norm": 0.19871929958945606, "learning_rate": 2.8285015060240966e-07, "logits/chosen": -2.9945311546325684, "logits/rejected": -3.1371092796325684, "logps/chosen": -506.5, "logps/rejected": -525.9000244140625, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -6.012499809265137, "rewards/margins": 11.228124618530273, "rewards/rejected": -17.229686737060547, "step": 7620 }, { "epoch": 2.8727409638554215, "grad_norm": 0.5596968778822106, "learning_rate": 2.819088855421687e-07, "logits/chosen": -2.951953172683716, "logits/rejected": -3.149218797683716, "logps/chosen": -494.20001220703125, "logps/rejected": -540.0499877929688, "loss": 0.0086, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.19921875, "rewards/margins": 11.067187309265137, "rewards/rejected": -17.264062881469727, "step": 7630 }, { "epoch": 2.8765060240963853, "grad_norm": 0.029552443903417328, "learning_rate": 2.8096762048192766e-07, "logits/chosen": -3.0982422828674316, "logits/rejected": -3.2085938453674316, "logps/chosen": -442.07501220703125, "logps/rejected": -499.70001220703125, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -6.521874904632568, "rewards/margins": 11.49609375, "rewards/rejected": -18.024999618530273, "step": 7640 }, { "epoch": 2.880271084337349, "grad_norm": 1.0867123529303342, "learning_rate": 2.8002635542168674e-07, "logits/chosen": -2.875, "logits/rejected": -3.188281297683716, "logps/chosen": -524.375, "logps/rejected": -523.6500244140625, "loss": 0.0438, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.567480564117432, "rewards/margins": 11.776562690734863, "rewards/rejected": -18.346874237060547, "step": 7650 }, { "epoch": 2.8840361445783134, "grad_norm": 3.7298271175336035, "learning_rate": 2.7908509036144576e-07, "logits/chosen": -3.0589842796325684, "logits/rejected": -3.1429686546325684, "logps/chosen": -511.125, "logps/rejected": -516.4500122070312, "loss": 0.0256, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.036328315734863, "rewards/margins": 11.407031059265137, "rewards/rejected": -18.451562881469727, "step": 7660 }, { "epoch": 2.887801204819277, "grad_norm": 1.247685984022712, "learning_rate": 2.781438253012048e-07, "logits/chosen": -3.0425782203674316, "logits/rejected": -3.325390577316284, "logps/chosen": -498.95001220703125, "logps/rejected": -507.3500061035156, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -7.215234279632568, "rewards/margins": 10.853124618530273, "rewards/rejected": -18.076562881469727, "step": 7670 }, { "epoch": 2.891566265060241, "grad_norm": 6.5628888866583415, "learning_rate": 2.772025602409638e-07, "logits/chosen": -3.034374952316284, "logits/rejected": -3.2544922828674316, "logps/chosen": -433.4125061035156, "logps/rejected": -488.3999938964844, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -7.168749809265137, "rewards/margins": 11.15234375, "rewards/rejected": -18.325000762939453, "step": 7680 }, { "epoch": 2.8953313253012047, "grad_norm": 0.5027456848441475, "learning_rate": 2.762612951807229e-07, "logits/chosen": -3.0921874046325684, "logits/rejected": -3.188671827316284, "logps/chosen": -451.95001220703125, "logps/rejected": -560.0750122070312, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -6.643750190734863, "rewards/margins": 11.721094131469727, "rewards/rejected": -18.362499237060547, "step": 7690 }, { "epoch": 2.8990963855421685, "grad_norm": 31.037795192207756, "learning_rate": 2.753200301204819e-07, "logits/chosen": -2.8804688453674316, "logits/rejected": -3.171093702316284, "logps/chosen": -515.0999755859375, "logps/rejected": -531.5999755859375, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": -6.801953315734863, "rewards/margins": 11.595312118530273, "rewards/rejected": -18.401561737060547, "step": 7700 }, { "epoch": 2.9028614457831328, "grad_norm": 12.025978268568984, "learning_rate": 2.7437876506024095e-07, "logits/chosen": -2.940234422683716, "logits/rejected": -3.0960936546325684, "logps/chosen": -490.5, "logps/rejected": -545.6500244140625, "loss": 0.0285, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.691796779632568, "rewards/margins": 10.930468559265137, "rewards/rejected": -18.623437881469727, "step": 7710 }, { "epoch": 2.9066265060240966, "grad_norm": 128.6430244501136, "learning_rate": 2.734375e-07, "logits/chosen": -3.042773485183716, "logits/rejected": -3.160351514816284, "logps/chosen": -494.57501220703125, "logps/rejected": -543.4749755859375, "loss": 0.0086, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.7548828125, "rewards/margins": 11.951562881469727, "rewards/rejected": -18.693750381469727, "step": 7720 }, { "epoch": 2.9103915662650603, "grad_norm": 4.992728854225822, "learning_rate": 2.7249623493975906e-07, "logits/chosen": -2.9478516578674316, "logits/rejected": -3.042773485183716, "logps/chosen": -506.6000061035156, "logps/rejected": -554.4749755859375, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -6.779296875, "rewards/margins": 11.551562309265137, "rewards/rejected": -18.345312118530273, "step": 7730 }, { "epoch": 2.914156626506024, "grad_norm": 17.728830685240855, "learning_rate": 2.7155496987951803e-07, "logits/chosen": -2.957812547683716, "logits/rejected": -3.1664061546325684, "logps/chosen": -442.88751220703125, "logps/rejected": -498.32501220703125, "loss": 0.0103, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.743359565734863, "rewards/margins": 11.844531059265137, "rewards/rejected": -18.587499618530273, "step": 7740 }, { "epoch": 2.917921686746988, "grad_norm": 4.461055817113122, "learning_rate": 2.706137048192771e-07, "logits/chosen": -2.9712891578674316, "logits/rejected": -3.2582030296325684, "logps/chosen": -485.2250061035156, "logps/rejected": -512.0499877929688, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -5.860937595367432, "rewards/margins": 12.503125190734863, "rewards/rejected": -18.3515625, "step": 7750 }, { "epoch": 2.9216867469879517, "grad_norm": 0.8044892950878754, "learning_rate": 2.6967243975903614e-07, "logits/chosen": -3.215625047683716, "logits/rejected": -3.4136719703674316, "logps/chosen": -411.3999938964844, "logps/rejected": -474.8999938964844, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -6.970898628234863, "rewards/margins": 11.485156059265137, "rewards/rejected": -18.448436737060547, "step": 7760 }, { "epoch": 2.9254518072289155, "grad_norm": 8.204482319797346, "learning_rate": 2.687311746987952e-07, "logits/chosen": -2.9189453125, "logits/rejected": -3.2621092796325684, "logps/chosen": -442.7749938964844, "logps/rejected": -489.5, "loss": 0.0113, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.272265434265137, "rewards/margins": 11.720312118530273, "rewards/rejected": -18.001562118530273, "step": 7770 }, { "epoch": 2.9292168674698793, "grad_norm": 0.5953397235574643, "learning_rate": 2.677899096385542e-07, "logits/chosen": -3.013671875, "logits/rejected": -3.2279295921325684, "logps/chosen": -444.36248779296875, "logps/rejected": -500.95001220703125, "loss": 0.018, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.21826171875, "rewards/margins": 11.95703125, "rewards/rejected": -18.1875, "step": 7780 }, { "epoch": 2.9329819277108435, "grad_norm": 7.659421804930133, "learning_rate": 2.6684864457831327e-07, "logits/chosen": -3.1324219703674316, "logits/rejected": -3.25390625, "logps/chosen": -442.9750061035156, "logps/rejected": -528.6500244140625, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -6.9248046875, "rewards/margins": 11.89453125, "rewards/rejected": -18.815624237060547, "step": 7790 }, { "epoch": 2.9367469879518073, "grad_norm": 0.43626567607239286, "learning_rate": 2.659073795180723e-07, "logits/chosen": -3.154296875, "logits/rejected": -3.251953125, "logps/chosen": -447.07501220703125, "logps/rejected": -533.0, "loss": 0.0339, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.106249809265137, "rewards/margins": 12.514062881469727, "rewards/rejected": -19.6171875, "step": 7800 }, { "epoch": 2.940512048192771, "grad_norm": 1.3283506302917112, "learning_rate": 2.6496611445783127e-07, "logits/chosen": -2.983203172683716, "logits/rejected": -3.1644530296325684, "logps/chosen": -492.32501220703125, "logps/rejected": -506.1000061035156, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -6.665625095367432, "rewards/margins": 11.399218559265137, "rewards/rejected": -18.065624237060547, "step": 7810 }, { "epoch": 2.944277108433735, "grad_norm": 21.295150150533157, "learning_rate": 2.6402484939759035e-07, "logits/chosen": -3.063281297683716, "logits/rejected": -3.1890625953674316, "logps/chosen": -444.375, "logps/rejected": -498.3999938964844, "loss": 0.0211, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.964453220367432, "rewards/margins": 11.759374618530273, "rewards/rejected": -18.731250762939453, "step": 7820 }, { "epoch": 2.9480421686746987, "grad_norm": 2.527106906493377, "learning_rate": 2.630835843373494e-07, "logits/chosen": -3.109179735183716, "logits/rejected": -3.119921922683716, "logps/chosen": -482.5, "logps/rejected": -563.5250244140625, "loss": 0.031, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -6.936596870422363, "rewards/margins": 12.193750381469727, "rewards/rejected": -19.1328125, "step": 7830 }, { "epoch": 2.9518072289156625, "grad_norm": 8.877450633833122, "learning_rate": 2.6214231927710845e-07, "logits/chosen": -2.8939452171325684, "logits/rejected": -3.1402344703674316, "logps/chosen": -496.2250061035156, "logps/rejected": -549.4500122070312, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -6.8115234375, "rewards/margins": 11.606249809265137, "rewards/rejected": -18.420312881469727, "step": 7840 }, { "epoch": 2.9555722891566267, "grad_norm": 11.312658090086346, "learning_rate": 2.6120105421686743e-07, "logits/chosen": -3.0210938453674316, "logits/rejected": -3.270703077316284, "logps/chosen": -466.8999938964844, "logps/rejected": -536.9500122070312, "loss": 0.0178, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -7.026659965515137, "rewards/margins": 12.638280868530273, "rewards/rejected": -19.654687881469727, "step": 7850 }, { "epoch": 2.9593373493975905, "grad_norm": 0.9297622742355143, "learning_rate": 2.602597891566265e-07, "logits/chosen": -2.951171875, "logits/rejected": -3.0748047828674316, "logps/chosen": -453.79998779296875, "logps/rejected": -513.0999755859375, "loss": 0.0148, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -7.198632717132568, "rewards/margins": 11.046875, "rewards/rejected": -18.25, "step": 7860 }, { "epoch": 2.9631024096385543, "grad_norm": 27.990403983659814, "learning_rate": 2.5931852409638553e-07, "logits/chosen": -3.020312547683716, "logits/rejected": -3.1468749046325684, "logps/chosen": -423.8500061035156, "logps/rejected": -488.42498779296875, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -7.532812595367432, "rewards/margins": 11.256250381469727, "rewards/rejected": -18.787500381469727, "step": 7870 }, { "epoch": 2.966867469879518, "grad_norm": 18.692669322407443, "learning_rate": 2.5837725903614456e-07, "logits/chosen": -3.028125047683716, "logits/rejected": -3.1683592796325684, "logps/chosen": -479.20001220703125, "logps/rejected": -525.9500122070312, "loss": 0.0127, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.797656059265137, "rewards/margins": 12.480859756469727, "rewards/rejected": -19.282812118530273, "step": 7880 }, { "epoch": 2.970632530120482, "grad_norm": 4.8114524677482455, "learning_rate": 2.574359939759036e-07, "logits/chosen": -3.0904297828674316, "logits/rejected": -3.257031202316284, "logps/chosen": -459.2749938964844, "logps/rejected": -501.29998779296875, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -7.231640815734863, "rewards/margins": 12.366406440734863, "rewards/rejected": -19.600000381469727, "step": 7890 }, { "epoch": 2.9743975903614457, "grad_norm": 7.90624113468138, "learning_rate": 2.5649472891566267e-07, "logits/chosen": -3.0101561546325684, "logits/rejected": -3.2412109375, "logps/chosen": -519.8499755859375, "logps/rejected": -555.6749877929688, "loss": 0.01, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.511132717132568, "rewards/margins": 12.141406059265137, "rewards/rejected": -18.657812118530273, "step": 7900 }, { "epoch": 2.9781626506024095, "grad_norm": 40.567209549198246, "learning_rate": 2.555534638554217e-07, "logits/chosen": -2.931835889816284, "logits/rejected": -3.122851610183716, "logps/chosen": -549.9249877929688, "logps/rejected": -541.4000244140625, "loss": 0.0111, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.259765625, "rewards/margins": 11.650781631469727, "rewards/rejected": -18.909374237060547, "step": 7910 }, { "epoch": 2.9819277108433733, "grad_norm": 11.734074991303837, "learning_rate": 2.546121987951807e-07, "logits/chosen": -3.2828125953674316, "logits/rejected": -3.293750047683716, "logps/chosen": -437.79998779296875, "logps/rejected": -519.0499877929688, "loss": 0.032, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -8.210156440734863, "rewards/margins": 11.76171875, "rewards/rejected": -19.964061737060547, "step": 7920 }, { "epoch": 2.9856927710843375, "grad_norm": 71.43531657726223, "learning_rate": 2.5367093373493975e-07, "logits/chosen": -3.0843749046325684, "logits/rejected": -3.339648485183716, "logps/chosen": -486.8500061035156, "logps/rejected": -527.25, "loss": 0.0415, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -7.958984375, "rewards/margins": 11.315625190734863, "rewards/rejected": -19.271875381469727, "step": 7930 }, { "epoch": 2.9894578313253013, "grad_norm": 5.173802685845021, "learning_rate": 2.5272966867469883e-07, "logits/chosen": -3.0765624046325684, "logits/rejected": -3.2339844703674316, "logps/chosen": -522.2999877929688, "logps/rejected": -529.2000122070312, "loss": 0.0256, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.352734565734863, "rewards/margins": 11.739843368530273, "rewards/rejected": -18.075000762939453, "step": 7940 }, { "epoch": 2.993222891566265, "grad_norm": 0.20264620652884935, "learning_rate": 2.517884036144578e-07, "logits/chosen": -3.061718702316284, "logits/rejected": -3.1796875, "logps/chosen": -470.25, "logps/rejected": -517.5999755859375, "loss": 0.0121, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.644824028015137, "rewards/margins": 11.090624809265137, "rewards/rejected": -18.739843368530273, "step": 7950 }, { "epoch": 2.996987951807229, "grad_norm": 3.947598727955517, "learning_rate": 2.5084713855421683e-07, "logits/chosen": -3.146484375, "logits/rejected": -3.32421875, "logps/chosen": -486.2749938964844, "logps/rejected": -528.9749755859375, "loss": 0.031, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -7.880468845367432, "rewards/margins": 11.060937881469727, "rewards/rejected": -18.9453125, "step": 7960 }, { "epoch": 3.0007530120481927, "grad_norm": 0.2966153940503003, "learning_rate": 2.499058734939759e-07, "logits/chosen": -3.0244140625, "logits/rejected": -3.2113280296325684, "logps/chosen": -514.8250122070312, "logps/rejected": -514.0499877929688, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -7.670702934265137, "rewards/margins": 12.106249809265137, "rewards/rejected": -19.778125762939453, "step": 7970 }, { "epoch": 3.0045180722891565, "grad_norm": 3.559199359626638, "learning_rate": 2.4896460843373493e-07, "logits/chosen": -2.9222655296325684, "logits/rejected": -3.229687452316284, "logps/chosen": -452.45001220703125, "logps/rejected": -484.20001220703125, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -6.053906440734863, "rewards/margins": 12.260937690734863, "rewards/rejected": -18.3046875, "step": 7980 }, { "epoch": 3.0082831325301207, "grad_norm": 0.4594288201898531, "learning_rate": 2.4802334337349396e-07, "logits/chosen": -3.126757860183716, "logits/rejected": -3.291015625, "logps/chosen": -537.3250122070312, "logps/rejected": -570.5999755859375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -7.094922065734863, "rewards/margins": 12.4375, "rewards/rejected": -19.520313262939453, "step": 7990 }, { "epoch": 3.0120481927710845, "grad_norm": 0.8958030125673064, "learning_rate": 2.47082078313253e-07, "logits/chosen": -3.102343797683716, "logits/rejected": -3.2109375, "logps/chosen": -506.875, "logps/rejected": -539.4500122070312, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.316992282867432, "rewards/margins": 12.5859375, "rewards/rejected": -19.895313262939453, "step": 8000 }, { "epoch": 3.0158132530120483, "grad_norm": 0.41127793787717637, "learning_rate": 2.46140813253012e-07, "logits/chosen": -2.932421922683716, "logits/rejected": -3.06640625, "logps/chosen": -512.8250122070312, "logps/rejected": -546.8499755859375, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -7.313281059265137, "rewards/margins": 12.227343559265137, "rewards/rejected": -19.540624618530273, "step": 8010 }, { "epoch": 3.019578313253012, "grad_norm": 0.02999173770700996, "learning_rate": 2.451995481927711e-07, "logits/chosen": -3.0160155296325684, "logits/rejected": -3.233203172683716, "logps/chosen": -476.92498779296875, "logps/rejected": -525.4500122070312, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.74609375, "rewards/margins": 12.800000190734863, "rewards/rejected": -19.548437118530273, "step": 8020 }, { "epoch": 3.023343373493976, "grad_norm": 4.9318057288885635, "learning_rate": 2.442582831325301e-07, "logits/chosen": -2.9037108421325684, "logits/rejected": -3.150390625, "logps/chosen": -520.25, "logps/rejected": -559.5999755859375, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -6.619921684265137, "rewards/margins": 13.215624809265137, "rewards/rejected": -19.839061737060547, "step": 8030 }, { "epoch": 3.0271084337349397, "grad_norm": 0.2109753420081425, "learning_rate": 2.4331701807228915e-07, "logits/chosen": -3.028125047683716, "logits/rejected": -3.173046827316284, "logps/chosen": -455.0, "logps/rejected": -527.0, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -7.227929592132568, "rewards/margins": 11.874218940734863, "rewards/rejected": -19.107812881469727, "step": 8040 }, { "epoch": 3.0308734939759034, "grad_norm": 0.22584205701640572, "learning_rate": 2.4237575301204817e-07, "logits/chosen": -3.0699219703674316, "logits/rejected": -3.317578077316284, "logps/chosen": -467.32501220703125, "logps/rejected": -523.9000244140625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -6.954297065734863, "rewards/margins": 12.529687881469727, "rewards/rejected": -19.485937118530273, "step": 8050 }, { "epoch": 3.0346385542168677, "grad_norm": 4.842606631103915, "learning_rate": 2.414344879518072e-07, "logits/chosen": -3.0531249046325684, "logits/rejected": -3.3148436546325684, "logps/chosen": -473.375, "logps/rejected": -504.0, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -7.303515434265137, "rewards/margins": 12.140625, "rewards/rejected": -19.448436737060547, "step": 8060 }, { "epoch": 3.0384036144578315, "grad_norm": 0.7784681633247329, "learning_rate": 2.404932228915663e-07, "logits/chosen": -3.03125, "logits/rejected": -3.2398438453674316, "logps/chosen": -491.6000061035156, "logps/rejected": -527.2000122070312, "loss": 0.0136, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.440820217132568, "rewards/margins": 12.8515625, "rewards/rejected": -19.282812118530273, "step": 8070 }, { "epoch": 3.0421686746987953, "grad_norm": 0.3280392895389633, "learning_rate": 2.395519578313253e-07, "logits/chosen": -3.08203125, "logits/rejected": -3.2142577171325684, "logps/chosen": -476.54998779296875, "logps/rejected": -545.9500122070312, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -6.496874809265137, "rewards/margins": 13.248437881469727, "rewards/rejected": -19.7421875, "step": 8080 }, { "epoch": 3.045933734939759, "grad_norm": 0.7151260368201864, "learning_rate": 2.3861069277108433e-07, "logits/chosen": -3.0634765625, "logits/rejected": -3.2046875953674316, "logps/chosen": -518.75, "logps/rejected": -550.5499877929688, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -6.616015434265137, "rewards/margins": 12.917187690734863, "rewards/rejected": -19.521875381469727, "step": 8090 }, { "epoch": 3.049698795180723, "grad_norm": 0.17859406158218574, "learning_rate": 2.3766942771084336e-07, "logits/chosen": -3.100781202316284, "logits/rejected": -3.298046827316284, "logps/chosen": -492.32501220703125, "logps/rejected": -500.5249938964844, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -6.019921779632568, "rewards/margins": 11.51953125, "rewards/rejected": -17.5390625, "step": 8100 }, { "epoch": 3.0534638554216866, "grad_norm": 0.11054644952912003, "learning_rate": 2.3672816265060239e-07, "logits/chosen": -3.1546874046325684, "logits/rejected": -3.4007811546325684, "logps/chosen": -487.67498779296875, "logps/rejected": -518.3499755859375, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -6.672265529632568, "rewards/margins": 12.557031631469727, "rewards/rejected": -19.214061737060547, "step": 8110 }, { "epoch": 3.0572289156626504, "grad_norm": 0.6765414213396294, "learning_rate": 2.3578689759036144e-07, "logits/chosen": -2.9300780296325684, "logits/rejected": -3.1527342796325684, "logps/chosen": -589.4500122070312, "logps/rejected": -550.0999755859375, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -7.002343654632568, "rewards/margins": 13.409375190734863, "rewards/rejected": -20.423437118530273, "step": 8120 }, { "epoch": 3.0609939759036147, "grad_norm": 1.8218432717436164, "learning_rate": 2.3484563253012047e-07, "logits/chosen": -2.9654297828674316, "logits/rejected": -3.1156249046325684, "logps/chosen": -537.25, "logps/rejected": -579.9500122070312, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -7.739843845367432, "rewards/margins": 13.405468940734863, "rewards/rejected": -21.137500762939453, "step": 8130 }, { "epoch": 3.0647590361445785, "grad_norm": 1.5675240166168585, "learning_rate": 2.3390436746987952e-07, "logits/chosen": -3.184765577316284, "logits/rejected": -3.3746094703674316, "logps/chosen": -457.82501220703125, "logps/rejected": -530.2000122070312, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -7.166796684265137, "rewards/margins": 12.328906059265137, "rewards/rejected": -19.490625381469727, "step": 8140 }, { "epoch": 3.0685240963855422, "grad_norm": 1.1174090837363189, "learning_rate": 2.3296310240963855e-07, "logits/chosen": -2.979687452316284, "logits/rejected": -3.2339844703674316, "logps/chosen": -469.45001220703125, "logps/rejected": -514.7000122070312, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -6.243945121765137, "rewards/margins": 13.0625, "rewards/rejected": -19.301563262939453, "step": 8150 }, { "epoch": 3.072289156626506, "grad_norm": 0.31087875943128007, "learning_rate": 2.320218373493976e-07, "logits/chosen": -3.189453125, "logits/rejected": -3.3667969703674316, "logps/chosen": -434.45001220703125, "logps/rejected": -526.5, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -6.453417778015137, "rewards/margins": 12.875781059265137, "rewards/rejected": -19.332813262939453, "step": 8160 }, { "epoch": 3.07605421686747, "grad_norm": 2.1916481404849386, "learning_rate": 2.3108057228915663e-07, "logits/chosen": -3.1519532203674316, "logits/rejected": -3.271484375, "logps/chosen": -508.95001220703125, "logps/rejected": -585.25, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -7.211328029632568, "rewards/margins": 12.8046875, "rewards/rejected": -20.018749237060547, "step": 8170 }, { "epoch": 3.0798192771084336, "grad_norm": 0.529152179269979, "learning_rate": 2.3013930722891565e-07, "logits/chosen": -3.119921922683716, "logits/rejected": -3.314453125, "logps/chosen": -467.125, "logps/rejected": -513.7000122070312, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -7.682421684265137, "rewards/margins": 12.829687118530273, "rewards/rejected": -20.518749237060547, "step": 8180 }, { "epoch": 3.0835843373493974, "grad_norm": 3.1540357334991223, "learning_rate": 2.291980421686747e-07, "logits/chosen": -3.2103514671325684, "logits/rejected": -3.391796827316284, "logps/chosen": -482.67498779296875, "logps/rejected": -542.4000244140625, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -8.571874618530273, "rewards/margins": 12.602343559265137, "rewards/rejected": -21.174999237060547, "step": 8190 }, { "epoch": 3.0873493975903616, "grad_norm": 5.257144104708099, "learning_rate": 2.282567771084337e-07, "logits/chosen": -3.192187547683716, "logits/rejected": -3.34375, "logps/chosen": -503.29998779296875, "logps/rejected": -543.0499877929688, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -6.970703125, "rewards/margins": 12.568750381469727, "rewards/rejected": -19.551563262939453, "step": 8200 }, { "epoch": 3.0911144578313254, "grad_norm": 18.33108484411021, "learning_rate": 2.2731551204819276e-07, "logits/chosen": -3.1031250953674316, "logits/rejected": -3.2515625953674316, "logps/chosen": -509.2749938964844, "logps/rejected": -552.0, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -8.225390434265137, "rewards/margins": 12.660937309265137, "rewards/rejected": -20.890625, "step": 8210 }, { "epoch": 3.0948795180722892, "grad_norm": 0.41245594402980257, "learning_rate": 2.2637424698795179e-07, "logits/chosen": -3.0589842796325684, "logits/rejected": -3.246289014816284, "logps/chosen": -507.125, "logps/rejected": -571.3499755859375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.735547065734863, "rewards/margins": 13.260156631469727, "rewards/rejected": -21.003124237060547, "step": 8220 }, { "epoch": 3.098644578313253, "grad_norm": 0.08956126017407537, "learning_rate": 2.2543298192771084e-07, "logits/chosen": -3.066210985183716, "logits/rejected": -3.368359327316284, "logps/chosen": -459.8500061035156, "logps/rejected": -499.29998779296875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -6.897656440734863, "rewards/margins": 12.762499809265137, "rewards/rejected": -19.659374237060547, "step": 8230 }, { "epoch": 3.102409638554217, "grad_norm": 0.2353399755636862, "learning_rate": 2.2449171686746987e-07, "logits/chosen": -3.087890625, "logits/rejected": -3.1968750953674316, "logps/chosen": -485.1499938964844, "logps/rejected": -562.2000122070312, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -7.193749904632568, "rewards/margins": 12.771875381469727, "rewards/rejected": -19.975000381469727, "step": 8240 }, { "epoch": 3.1061746987951806, "grad_norm": 0.15859591459077313, "learning_rate": 2.235504518072289e-07, "logits/chosen": -3.0316405296325684, "logits/rejected": -3.3109374046325684, "logps/chosen": -484.2749938964844, "logps/rejected": -543.8250122070312, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -6.677734375, "rewards/margins": 13.009374618530273, "rewards/rejected": -19.685937881469727, "step": 8250 }, { "epoch": 3.1099397590361444, "grad_norm": 0.21656503087295415, "learning_rate": 2.2260918674698795e-07, "logits/chosen": -3.1841797828674316, "logits/rejected": -3.234375, "logps/chosen": -498.1499938964844, "logps/rejected": -575.0, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -7.765625, "rewards/margins": 12.903124809265137, "rewards/rejected": -20.668750762939453, "step": 8260 }, { "epoch": 3.1137048192771086, "grad_norm": 0.8652590807567508, "learning_rate": 2.2166792168674697e-07, "logits/chosen": -3.0894532203674316, "logits/rejected": -3.3218750953674316, "logps/chosen": -570.5250244140625, "logps/rejected": -588.5999755859375, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -7.541406154632568, "rewards/margins": 12.914843559265137, "rewards/rejected": -20.454687118530273, "step": 8270 }, { "epoch": 3.1174698795180724, "grad_norm": 0.3967215292513464, "learning_rate": 2.2072665662650602e-07, "logits/chosen": -3.0269532203674316, "logits/rejected": -3.114453077316284, "logps/chosen": -471.45001220703125, "logps/rejected": -553.9000244140625, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -8.725781440734863, "rewards/margins": 12.576562881469727, "rewards/rejected": -21.2890625, "step": 8280 }, { "epoch": 3.121234939759036, "grad_norm": 1.0752535500862308, "learning_rate": 2.1978539156626505e-07, "logits/chosen": -3.062304735183716, "logits/rejected": -3.2080078125, "logps/chosen": -508.3999938964844, "logps/rejected": -585.6500244140625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -7.668749809265137, "rewards/margins": 12.909375190734863, "rewards/rejected": -20.579687118530273, "step": 8290 }, { "epoch": 3.125, "grad_norm": 1.0268811862108416, "learning_rate": 2.1884412650602408e-07, "logits/chosen": -3.078125, "logits/rejected": -3.296093702316284, "logps/chosen": -447.92498779296875, "logps/rejected": -538.1749877929688, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -8.042577743530273, "rewards/margins": 12.681249618530273, "rewards/rejected": -20.714061737060547, "step": 8300 }, { "epoch": 3.128765060240964, "grad_norm": 1.1351167031669898, "learning_rate": 2.1790286144578313e-07, "logits/chosen": -3.026562452316284, "logits/rejected": -3.247265577316284, "logps/chosen": -438.5249938964844, "logps/rejected": -526.4000244140625, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -7.970703125, "rewards/margins": 12.131250381469727, "rewards/rejected": -20.1015625, "step": 8310 }, { "epoch": 3.1325301204819276, "grad_norm": 0.42721813648776846, "learning_rate": 2.1696159638554216e-07, "logits/chosen": -3.075390577316284, "logits/rejected": -3.2972655296325684, "logps/chosen": -487.6499938964844, "logps/rejected": -562.9500122070312, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -6.322656154632568, "rewards/margins": 13.909375190734863, "rewards/rejected": -20.231250762939453, "step": 8320 }, { "epoch": 3.1362951807228914, "grad_norm": 0.20358635773670966, "learning_rate": 2.160203313253012e-07, "logits/chosen": -2.9453125, "logits/rejected": -3.182812452316284, "logps/chosen": -491.6499938964844, "logps/rejected": -539.2000122070312, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -7.474023342132568, "rewards/margins": 12.589062690734863, "rewards/rejected": -20.0703125, "step": 8330 }, { "epoch": 3.1400602409638556, "grad_norm": 0.3063800530951642, "learning_rate": 2.1507906626506024e-07, "logits/chosen": -3.0316405296325684, "logits/rejected": -3.289843797683716, "logps/chosen": -526.125, "logps/rejected": -592.75, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -7.46484375, "rewards/margins": 13.013280868530273, "rewards/rejected": -20.484375, "step": 8340 }, { "epoch": 3.1438253012048194, "grad_norm": 0.36231304584361973, "learning_rate": 2.141378012048193e-07, "logits/chosen": -3.091015577316284, "logits/rejected": -3.313671827316284, "logps/chosen": -467.17498779296875, "logps/rejected": -489.75, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -6.529492378234863, "rewards/margins": 12.395312309265137, "rewards/rejected": -18.924999237060547, "step": 8350 }, { "epoch": 3.147590361445783, "grad_norm": 0.12861660029608946, "learning_rate": 2.131965361445783e-07, "logits/chosen": -3.151562452316284, "logits/rejected": -3.303515672683716, "logps/chosen": -466.82501220703125, "logps/rejected": -536.4249877929688, "loss": 0.0051, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.177148342132568, "rewards/margins": 12.77734375, "rewards/rejected": -19.959375381469727, "step": 8360 }, { "epoch": 3.151355421686747, "grad_norm": 1.1089028061857362, "learning_rate": 2.1225527108433732e-07, "logits/chosen": -3.103515625, "logits/rejected": -3.264453172683716, "logps/chosen": -492.3999938964844, "logps/rejected": -540.2000122070312, "loss": 0.0251, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.809374809265137, "rewards/margins": 12.700780868530273, "rewards/rejected": -20.528125762939453, "step": 8370 }, { "epoch": 3.1551204819277108, "grad_norm": 5.785471216687418, "learning_rate": 2.1131400602409637e-07, "logits/chosen": -3.12109375, "logits/rejected": -3.4124999046325684, "logps/chosen": -524.0, "logps/rejected": -559.0999755859375, "loss": 0.0079, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.532422065734863, "rewards/margins": 12.529687881469727, "rewards/rejected": -20.073436737060547, "step": 8380 }, { "epoch": 3.1588855421686746, "grad_norm": 1.1156515008592016, "learning_rate": 2.103727409638554e-07, "logits/chosen": -2.981640577316284, "logits/rejected": -3.231250047683716, "logps/chosen": -511.8500061035156, "logps/rejected": -550.4500122070312, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -6.692187309265137, "rewards/margins": 12.985937118530273, "rewards/rejected": -19.674999237060547, "step": 8390 }, { "epoch": 3.1626506024096384, "grad_norm": 0.7050328552247555, "learning_rate": 2.0943147590361445e-07, "logits/chosen": -3.1158204078674316, "logits/rejected": -3.2455077171325684, "logps/chosen": -449.4750061035156, "logps/rejected": -551.7999877929688, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -7.099413871765137, "rewards/margins": 12.848437309265137, "rewards/rejected": -19.953125, "step": 8400 }, { "epoch": 3.1664156626506026, "grad_norm": 0.3129070971934255, "learning_rate": 2.0849021084337348e-07, "logits/chosen": -3.044921875, "logits/rejected": -3.2679686546325684, "logps/chosen": -509.95001220703125, "logps/rejected": -582.5499877929688, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -7.432031154632568, "rewards/margins": 12.638280868530273, "rewards/rejected": -20.071874618530273, "step": 8410 }, { "epoch": 3.1701807228915664, "grad_norm": 0.16284979073855763, "learning_rate": 2.0754894578313253e-07, "logits/chosen": -3.26171875, "logits/rejected": -3.3812499046325684, "logps/chosen": -441.45001220703125, "logps/rejected": -525.5499877929688, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -7.5234375, "rewards/margins": 12.7421875, "rewards/rejected": -20.2734375, "step": 8420 }, { "epoch": 3.17394578313253, "grad_norm": 0.2034360248220376, "learning_rate": 2.0660768072289156e-07, "logits/chosen": -3.0511717796325684, "logits/rejected": -3.2464842796325684, "logps/chosen": -507.70001220703125, "logps/rejected": -543.0999755859375, "loss": 0.0081, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.104077339172363, "rewards/margins": 12.482812881469727, "rewards/rejected": -19.595312118530273, "step": 8430 }, { "epoch": 3.177710843373494, "grad_norm": 0.5796379122030629, "learning_rate": 2.0566641566265058e-07, "logits/chosen": -3.0777344703674316, "logits/rejected": -3.349609375, "logps/chosen": -508.4750061035156, "logps/rejected": -533.0999755859375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.560937404632568, "rewards/margins": 13.6796875, "rewards/rejected": -21.234375, "step": 8440 }, { "epoch": 3.1814759036144578, "grad_norm": 0.48683741835848143, "learning_rate": 2.0472515060240964e-07, "logits/chosen": -3.1898436546325684, "logits/rejected": -3.383984327316284, "logps/chosen": -457.07501220703125, "logps/rejected": -524.0499877929688, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -7.224609375, "rewards/margins": 12.729687690734863, "rewards/rejected": -19.959375381469727, "step": 8450 }, { "epoch": 3.1852409638554215, "grad_norm": 0.1268695249568918, "learning_rate": 2.0378388554216866e-07, "logits/chosen": -3.1546874046325684, "logits/rejected": -3.3499999046325684, "logps/chosen": -554.4000244140625, "logps/rejected": -565.6500244140625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -7.786913871765137, "rewards/margins": 12.453906059265137, "rewards/rejected": -20.237499237060547, "step": 8460 }, { "epoch": 3.1890060240963853, "grad_norm": 1.8685828380862155, "learning_rate": 2.0284262048192772e-07, "logits/chosen": -3.1839842796325684, "logits/rejected": -3.3296875953674316, "logps/chosen": -463.88751220703125, "logps/rejected": -544.4500122070312, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -8.130859375, "rewards/margins": 13.042187690734863, "rewards/rejected": -21.171875, "step": 8470 }, { "epoch": 3.1927710843373496, "grad_norm": 3.0758884974457623, "learning_rate": 2.0190135542168674e-07, "logits/chosen": -3.186328172683716, "logits/rejected": -3.3687500953674316, "logps/chosen": -504.125, "logps/rejected": -537.3499755859375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -6.670312404632568, "rewards/margins": 13.517187118530273, "rewards/rejected": -20.185937881469727, "step": 8480 }, { "epoch": 3.1965361445783134, "grad_norm": 1.181637371268535, "learning_rate": 2.0096009036144577e-07, "logits/chosen": -3.181640625, "logits/rejected": -3.3511719703674316, "logps/chosen": -457.67498779296875, "logps/rejected": -535.5250244140625, "loss": 0.0058, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.514843940734863, "rewards/margins": 12.11328125, "rewards/rejected": -19.643749237060547, "step": 8490 }, { "epoch": 3.200301204819277, "grad_norm": 0.23194372722903703, "learning_rate": 2.0001882530120482e-07, "logits/chosen": -3.0855469703674316, "logits/rejected": -3.298828125, "logps/chosen": -476.42498779296875, "logps/rejected": -556.1500244140625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -6.794921875, "rewards/margins": 13.012499809265137, "rewards/rejected": -19.803125381469727, "step": 8500 }, { "epoch": 3.204066265060241, "grad_norm": 1.201185407202513, "learning_rate": 1.9907756024096385e-07, "logits/chosen": -3.153515577316284, "logits/rejected": -3.41796875, "logps/chosen": -429.75, "logps/rejected": -500.3500061035156, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -7.503125190734863, "rewards/margins": 12.598437309265137, "rewards/rejected": -20.095312118530273, "step": 8510 }, { "epoch": 3.2078313253012047, "grad_norm": 8.756333545642088, "learning_rate": 1.981362951807229e-07, "logits/chosen": -3.1181640625, "logits/rejected": -3.407421827316284, "logps/chosen": -478.1000061035156, "logps/rejected": -524.5499877929688, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -6.987109184265137, "rewards/margins": 12.639843940734863, "rewards/rejected": -19.615625381469727, "step": 8520 }, { "epoch": 3.2115963855421685, "grad_norm": 0.37373450299343397, "learning_rate": 1.971950301204819e-07, "logits/chosen": -3.2265625, "logits/rejected": -3.3882813453674316, "logps/chosen": -491.125, "logps/rejected": -542.4000244140625, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -7.155859470367432, "rewards/margins": 12.464062690734863, "rewards/rejected": -19.614063262939453, "step": 8530 }, { "epoch": 3.2153614457831328, "grad_norm": 3.8001625035542212, "learning_rate": 1.9625376506024096e-07, "logits/chosen": -3.132031202316284, "logits/rejected": -3.344531297683716, "logps/chosen": -454.2749938964844, "logps/rejected": -490.92498779296875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.261328220367432, "rewards/margins": 12.470312118530273, "rewards/rejected": -19.734375, "step": 8540 }, { "epoch": 3.2191265060240966, "grad_norm": 0.46397159640264785, "learning_rate": 1.9531249999999998e-07, "logits/chosen": -3.0531249046325684, "logits/rejected": -3.4267578125, "logps/chosen": -463.6000061035156, "logps/rejected": -502.79998779296875, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -7.491064548492432, "rewards/margins": 11.739062309265137, "rewards/rejected": -19.234375, "step": 8550 }, { "epoch": 3.2228915662650603, "grad_norm": 1.8424881145362333, "learning_rate": 1.94371234939759e-07, "logits/chosen": -3.0582032203674316, "logits/rejected": -3.153125047683716, "logps/chosen": -413.57501220703125, "logps/rejected": -504.54998779296875, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -6.237890720367432, "rewards/margins": 13.314062118530273, "rewards/rejected": -19.553125381469727, "step": 8560 }, { "epoch": 3.226656626506024, "grad_norm": 0.06857819237044539, "learning_rate": 1.9342996987951806e-07, "logits/chosen": -3.11328125, "logits/rejected": -3.256054639816284, "logps/chosen": -476.4375, "logps/rejected": -549.9749755859375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -6.494531154632568, "rewards/margins": 13.477343559265137, "rewards/rejected": -19.970312118530273, "step": 8570 }, { "epoch": 3.230421686746988, "grad_norm": 3.902436811337756, "learning_rate": 1.924887048192771e-07, "logits/chosen": -3.0374999046325684, "logits/rejected": -3.2874999046325684, "logps/chosen": -495.25, "logps/rejected": -506.95001220703125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -7.164843559265137, "rewards/margins": 12.142187118530273, "rewards/rejected": -19.306249618530273, "step": 8580 }, { "epoch": 3.2341867469879517, "grad_norm": 24.644249541235833, "learning_rate": 1.9154743975903614e-07, "logits/chosen": -3.1480469703674316, "logits/rejected": -3.37890625, "logps/chosen": -476.79998779296875, "logps/rejected": -531.7000122070312, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -7.826171875, "rewards/margins": 12.616406440734863, "rewards/rejected": -20.4453125, "step": 8590 }, { "epoch": 3.2379518072289155, "grad_norm": 0.7138074463657175, "learning_rate": 1.9060617469879517e-07, "logits/chosen": -3.1683592796325684, "logits/rejected": -3.285351514816284, "logps/chosen": -522.5999755859375, "logps/rejected": -575.7000122070312, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -7.8515625, "rewards/margins": 13.546875, "rewards/rejected": -21.3984375, "step": 8600 }, { "epoch": 3.2417168674698793, "grad_norm": 0.07214847115226396, "learning_rate": 1.8966490963855422e-07, "logits/chosen": -3.042187452316284, "logits/rejected": -3.250781297683716, "logps/chosen": -583.9249877929688, "logps/rejected": -582.8499755859375, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -8.160937309265137, "rewards/margins": 13.332812309265137, "rewards/rejected": -21.506250381469727, "step": 8610 }, { "epoch": 3.2454819277108435, "grad_norm": 0.19253409734194532, "learning_rate": 1.8872364457831325e-07, "logits/chosen": -3.094531297683716, "logits/rejected": -3.2249999046325684, "logps/chosen": -526.0, "logps/rejected": -559.5, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -7.476953029632568, "rewards/margins": 12.819531440734863, "rewards/rejected": -20.295312881469727, "step": 8620 }, { "epoch": 3.2492469879518073, "grad_norm": 0.421661141900361, "learning_rate": 1.8778237951807228e-07, "logits/chosen": -2.979296922683716, "logits/rejected": -3.3335938453674316, "logps/chosen": -483.2250061035156, "logps/rejected": -499.95001220703125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -6.683203220367432, "rewards/margins": 13.090624809265137, "rewards/rejected": -19.771875381469727, "step": 8630 }, { "epoch": 3.253012048192771, "grad_norm": 0.65164625225587, "learning_rate": 1.8684111445783133e-07, "logits/chosen": -3.008593797683716, "logits/rejected": -3.3082032203674316, "logps/chosen": -551.4625244140625, "logps/rejected": -560.5999755859375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -7.605273246765137, "rewards/margins": 12.657031059265137, "rewards/rejected": -20.2578125, "step": 8640 }, { "epoch": 3.256777108433735, "grad_norm": 0.5065445103733768, "learning_rate": 1.8589984939759036e-07, "logits/chosen": -3.141796827316284, "logits/rejected": -3.3628907203674316, "logps/chosen": -468.75, "logps/rejected": -526.5, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -8.551172256469727, "rewards/margins": 12.989062309265137, "rewards/rejected": -21.528125762939453, "step": 8650 }, { "epoch": 3.2605421686746987, "grad_norm": 0.3343677443535976, "learning_rate": 1.849585843373494e-07, "logits/chosen": -3.1640625, "logits/rejected": -3.3984375, "logps/chosen": -491.1000061035156, "logps/rejected": -572.3499755859375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -8.035937309265137, "rewards/margins": 13.198437690734863, "rewards/rejected": -21.232812881469727, "step": 8660 }, { "epoch": 3.2643072289156625, "grad_norm": 0.10729141220422019, "learning_rate": 1.8401731927710844e-07, "logits/chosen": -3.32421875, "logits/rejected": -3.451953172683716, "logps/chosen": -457.5, "logps/rejected": -527.8499755859375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -8.141016006469727, "rewards/margins": 12.875, "rewards/rejected": -21.028125762939453, "step": 8670 }, { "epoch": 3.2680722891566267, "grad_norm": 0.530280664329718, "learning_rate": 1.8307605421686744e-07, "logits/chosen": -3.055859327316284, "logits/rejected": -3.377734422683716, "logps/chosen": -458.875, "logps/rejected": -492.20001220703125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -6.565234184265137, "rewards/margins": 12.78125, "rewards/rejected": -19.350000381469727, "step": 8680 }, { "epoch": 3.2718373493975905, "grad_norm": 0.08779708566373187, "learning_rate": 1.821347891566265e-07, "logits/chosen": -3.013671875, "logits/rejected": -3.2269530296325684, "logps/chosen": -491.3500061035156, "logps/rejected": -533.5999755859375, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -6.828515529632568, "rewards/margins": 13.108593940734863, "rewards/rejected": -19.939062118530273, "step": 8690 }, { "epoch": 3.2756024096385543, "grad_norm": 0.8950104948462423, "learning_rate": 1.8119352409638552e-07, "logits/chosen": -3.0621094703674316, "logits/rejected": -3.285937547683716, "logps/chosen": -516.25, "logps/rejected": -551.4000244140625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -7.451562404632568, "rewards/margins": 12.471875190734863, "rewards/rejected": -19.932811737060547, "step": 8700 }, { "epoch": 3.279367469879518, "grad_norm": 1.8468458683353122, "learning_rate": 1.8025225903614457e-07, "logits/chosen": -3.096874952316284, "logits/rejected": -3.266406297683716, "logps/chosen": -495.95001220703125, "logps/rejected": -554.4500122070312, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -6.560546875, "rewards/margins": 13.385937690734863, "rewards/rejected": -19.951562881469727, "step": 8710 }, { "epoch": 3.283132530120482, "grad_norm": 6.025416202135182, "learning_rate": 1.793109939759036e-07, "logits/chosen": -3.1380858421325684, "logits/rejected": -3.508593797683716, "logps/chosen": -548.625, "logps/rejected": -560.2000122070312, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -8.116406440734863, "rewards/margins": 13.5, "rewards/rejected": -21.606250762939453, "step": 8720 }, { "epoch": 3.2868975903614457, "grad_norm": 0.5984917282269399, "learning_rate": 1.7836972891566265e-07, "logits/chosen": -3.096484422683716, "logits/rejected": -3.2724609375, "logps/chosen": -501.3999938964844, "logps/rejected": -541.25, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -8.023046493530273, "rewards/margins": 12.934374809265137, "rewards/rejected": -20.962499618530273, "step": 8730 }, { "epoch": 3.2906626506024095, "grad_norm": 0.22329530045461413, "learning_rate": 1.7742846385542167e-07, "logits/chosen": -3.203906297683716, "logits/rejected": -3.360546827316284, "logps/chosen": -473.9750061035156, "logps/rejected": -531.9500122070312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.14453125, "rewards/margins": 13.407812118530273, "rewards/rejected": -21.543750762939453, "step": 8740 }, { "epoch": 3.2944277108433733, "grad_norm": 0.2862591490346807, "learning_rate": 1.764871987951807e-07, "logits/chosen": -3.0699219703674316, "logits/rejected": -3.27734375, "logps/chosen": -515.2999877929688, "logps/rejected": -562.1500244140625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -7.569140434265137, "rewards/margins": 13.159375190734863, "rewards/rejected": -20.729686737060547, "step": 8750 }, { "epoch": 3.2981927710843375, "grad_norm": 1.4401793948720951, "learning_rate": 1.7554593373493975e-07, "logits/chosen": -3.194531202316284, "logits/rejected": -3.447265625, "logps/chosen": -554.75, "logps/rejected": -558.75, "loss": 0.0106, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.8046875, "rewards/margins": 13.114062309265137, "rewards/rejected": -20.909374237060547, "step": 8760 }, { "epoch": 3.3019578313253013, "grad_norm": 0.8607991758294465, "learning_rate": 1.7460466867469878e-07, "logits/chosen": -3.303515672683716, "logits/rejected": -3.387500047683716, "logps/chosen": -441.70001220703125, "logps/rejected": -568.7999877929688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -7.892968654632568, "rewards/margins": 13.375, "rewards/rejected": -21.270313262939453, "step": 8770 }, { "epoch": 3.305722891566265, "grad_norm": 1.9984254707173248, "learning_rate": 1.7366340361445783e-07, "logits/chosen": -2.9505858421325684, "logits/rejected": -3.267578125, "logps/chosen": -528.8375244140625, "logps/rejected": -562.0999755859375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -7.405468940734863, "rewards/margins": 13.2890625, "rewards/rejected": -20.690624237060547, "step": 8780 }, { "epoch": 3.309487951807229, "grad_norm": 0.742946703572234, "learning_rate": 1.7272213855421686e-07, "logits/chosen": -3.1781249046325684, "logits/rejected": -3.391796827316284, "logps/chosen": -477.9750061035156, "logps/rejected": -556.4000244140625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -7.674218654632568, "rewards/margins": 13.168749809265137, "rewards/rejected": -20.856250762939453, "step": 8790 }, { "epoch": 3.3132530120481927, "grad_norm": 0.5349408865354515, "learning_rate": 1.7178087349397591e-07, "logits/chosen": -3.3421874046325684, "logits/rejected": -3.356250047683716, "logps/chosen": -482.20001220703125, "logps/rejected": -587.5499877929688, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -8.469922065734863, "rewards/margins": 13.290624618530273, "rewards/rejected": -21.753124237060547, "step": 8800 }, { "epoch": 3.3170180722891565, "grad_norm": 2.2227696391772542, "learning_rate": 1.7083960843373494e-07, "logits/chosen": -3.1962890625, "logits/rejected": -3.260937452316284, "logps/chosen": -474.67498779296875, "logps/rejected": -579.7999877929688, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -8.053906440734863, "rewards/margins": 13.802343368530273, "rewards/rejected": -21.848438262939453, "step": 8810 }, { "epoch": 3.3207831325301207, "grad_norm": 0.3165031793653135, "learning_rate": 1.6989834337349397e-07, "logits/chosen": -3.064453125, "logits/rejected": -3.216796875, "logps/chosen": -513.1500244140625, "logps/rejected": -593.0, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -7.688281059265137, "rewards/margins": 13.16796875, "rewards/rejected": -20.862499237060547, "step": 8820 }, { "epoch": 3.3245481927710845, "grad_norm": 0.6198729947651188, "learning_rate": 1.6895707831325302e-07, "logits/chosen": -3.146289110183716, "logits/rejected": -3.412109375, "logps/chosen": -457.29998779296875, "logps/rejected": -533.9749755859375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -6.8203125, "rewards/margins": 14.5078125, "rewards/rejected": -21.332813262939453, "step": 8830 }, { "epoch": 3.3283132530120483, "grad_norm": 0.4256979127685352, "learning_rate": 1.6801581325301205e-07, "logits/chosen": -3.096874952316284, "logits/rejected": -3.2906250953674316, "logps/chosen": -482.125, "logps/rejected": -558.1749877929688, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -8.415868759155273, "rewards/margins": 12.70703125, "rewards/rejected": -21.120311737060547, "step": 8840 }, { "epoch": 3.332078313253012, "grad_norm": 0.15967672757290188, "learning_rate": 1.6707454819277107e-07, "logits/chosen": -3.127734422683716, "logits/rejected": -3.198437452316284, "logps/chosen": -540.1500244140625, "logps/rejected": -576.5, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -8.1298828125, "rewards/margins": 13.392187118530273, "rewards/rejected": -21.510936737060547, "step": 8850 }, { "epoch": 3.335843373493976, "grad_norm": 1.756880380243811, "learning_rate": 1.661332831325301e-07, "logits/chosen": -3.209765672683716, "logits/rejected": -3.454296827316284, "logps/chosen": -482.79998779296875, "logps/rejected": -541.0999755859375, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -8.535547256469727, "rewards/margins": 12.796875, "rewards/rejected": -21.317188262939453, "step": 8860 }, { "epoch": 3.3396084337349397, "grad_norm": 56.93217679490525, "learning_rate": 1.6519201807228913e-07, "logits/chosen": -3.244921922683716, "logits/rejected": -3.2222657203674316, "logps/chosen": -474.75, "logps/rejected": -588.25, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -8.633203506469727, "rewards/margins": 13.657812118530273, "rewards/rejected": -22.290624618530273, "step": 8870 }, { "epoch": 3.3433734939759034, "grad_norm": 0.25986275097968164, "learning_rate": 1.6425075301204818e-07, "logits/chosen": -3.1070313453674316, "logits/rejected": -3.3140625953674316, "logps/chosen": -551.2000122070312, "logps/rejected": -591.4500122070312, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -8.075390815734863, "rewards/margins": 14.202343940734863, "rewards/rejected": -22.267187118530273, "step": 8880 }, { "epoch": 3.3471385542168672, "grad_norm": 0.09365638469104744, "learning_rate": 1.633094879518072e-07, "logits/chosen": -3.3382811546325684, "logits/rejected": -3.5999999046325684, "logps/chosen": -466.75, "logps/rejected": -505.54998779296875, "loss": 0.0115, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -8.522265434265137, "rewards/margins": 12.15625, "rewards/rejected": -20.693750381469727, "step": 8890 }, { "epoch": 3.3509036144578315, "grad_norm": 0.049240451512430186, "learning_rate": 1.6236822289156626e-07, "logits/chosen": -3.1253905296325684, "logits/rejected": -3.36328125, "logps/chosen": -512.0499877929688, "logps/rejected": -548.75, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -7.714062690734863, "rewards/margins": 13.057031631469727, "rewards/rejected": -20.764062881469727, "step": 8900 }, { "epoch": 3.3546686746987953, "grad_norm": 0.3606812131016062, "learning_rate": 1.6142695783132529e-07, "logits/chosen": -3.061328172683716, "logits/rejected": -3.167187452316284, "logps/chosen": -497.79998779296875, "logps/rejected": -592.1500244140625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -8.067480087280273, "rewards/margins": 13.28125, "rewards/rejected": -21.356250762939453, "step": 8910 }, { "epoch": 3.358433734939759, "grad_norm": 0.11542199732723786, "learning_rate": 1.6048569277108434e-07, "logits/chosen": -3.0751953125, "logits/rejected": -3.2757811546325684, "logps/chosen": -503.375, "logps/rejected": -532.8499755859375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.721093654632568, "rewards/margins": 13.515625, "rewards/rejected": -21.2421875, "step": 8920 }, { "epoch": 3.362198795180723, "grad_norm": 1.0515864788730362, "learning_rate": 1.5954442771084337e-07, "logits/chosen": -3.1171875, "logits/rejected": -3.23046875, "logps/chosen": -496.8500061035156, "logps/rejected": -549.2999877929688, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -7.376562595367432, "rewards/margins": 12.965624809265137, "rewards/rejected": -20.340625762939453, "step": 8930 }, { "epoch": 3.3659638554216866, "grad_norm": 0.4862935857471619, "learning_rate": 1.586031626506024e-07, "logits/chosen": -3.121875047683716, "logits/rejected": -3.255078077316284, "logps/chosen": -548.625, "logps/rejected": -551.7999877929688, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -8.161328315734863, "rewards/margins": 13.559374809265137, "rewards/rejected": -21.732812881469727, "step": 8940 }, { "epoch": 3.3697289156626504, "grad_norm": 0.07875144432653729, "learning_rate": 1.5766189759036145e-07, "logits/chosen": -3.1753907203674316, "logits/rejected": -3.393359422683716, "logps/chosen": -458.1499938964844, "logps/rejected": -528.5499877929688, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -6.939062595367432, "rewards/margins": 13.379687309265137, "rewards/rejected": -20.306249618530273, "step": 8950 }, { "epoch": 3.3734939759036147, "grad_norm": 0.623818154751474, "learning_rate": 1.5672063253012047e-07, "logits/chosen": -3.1640625, "logits/rejected": -3.28125, "logps/chosen": -500.1000061035156, "logps/rejected": -548.5, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -7.34375, "rewards/margins": 13.206250190734863, "rewards/rejected": -20.560937881469727, "step": 8960 }, { "epoch": 3.3772590361445785, "grad_norm": 0.5634901749266856, "learning_rate": 1.5577936746987953e-07, "logits/chosen": -3.0386719703674316, "logits/rejected": -3.327343702316284, "logps/chosen": -541.6500244140625, "logps/rejected": -569.0, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -7.395312309265137, "rewards/margins": 13.244531631469727, "rewards/rejected": -20.651561737060547, "step": 8970 }, { "epoch": 3.3810240963855422, "grad_norm": 9.744640044833469, "learning_rate": 1.5483810240963855e-07, "logits/chosen": -3.075390577316284, "logits/rejected": -3.371875047683716, "logps/chosen": -483.5249938964844, "logps/rejected": -576.5499877929688, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -7.287304878234863, "rewards/margins": 14.274999618530273, "rewards/rejected": -21.571874618530273, "step": 8980 }, { "epoch": 3.384789156626506, "grad_norm": 0.2737077125127548, "learning_rate": 1.538968373493976e-07, "logits/chosen": -3.1812500953674316, "logits/rejected": -3.4351563453674316, "logps/chosen": -431.32501220703125, "logps/rejected": -494.7749938964844, "loss": 0.01, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.396484375, "rewards/margins": 12.475781440734863, "rewards/rejected": -19.871875762939453, "step": 8990 }, { "epoch": 3.38855421686747, "grad_norm": 0.9390086894966171, "learning_rate": 1.5295557228915663e-07, "logits/chosen": -3.209765672683716, "logits/rejected": -3.5113282203674316, "logps/chosen": -469.6000061035156, "logps/rejected": -504.125, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -7.645703315734863, "rewards/margins": 12.393750190734863, "rewards/rejected": -20.040624618530273, "step": 9000 }, { "epoch": 3.3923192771084336, "grad_norm": 0.07758822517239702, "learning_rate": 1.5201430722891563e-07, "logits/chosen": -3.186328172683716, "logits/rejected": -3.441601514816284, "logps/chosen": -473.2749938964844, "logps/rejected": -524.1500244140625, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -7.552343845367432, "rewards/margins": 13.342968940734863, "rewards/rejected": -20.895313262939453, "step": 9010 }, { "epoch": 3.3960843373493974, "grad_norm": 1.0378993292097731, "learning_rate": 1.5107304216867469e-07, "logits/chosen": -3.1664061546325684, "logits/rejected": -3.405078172683716, "logps/chosen": -457.625, "logps/rejected": -512.2249755859375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -7.479687690734863, "rewards/margins": 12.484375, "rewards/rejected": -19.970312118530273, "step": 9020 }, { "epoch": 3.399849397590361, "grad_norm": 1.1870438738998323, "learning_rate": 1.501317771084337e-07, "logits/chosen": -3.08203125, "logits/rejected": -3.352343797683716, "logps/chosen": -535.5250244140625, "logps/rejected": -568.7999877929688, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.224999904632568, "rewards/margins": 12.564062118530273, "rewards/rejected": -19.792186737060547, "step": 9030 }, { "epoch": 3.4036144578313254, "grad_norm": 0.05336392677033291, "learning_rate": 1.4919051204819277e-07, "logits/chosen": -3.15234375, "logits/rejected": -3.430468797683716, "logps/chosen": -496.1499938964844, "logps/rejected": -537.1500244140625, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -7.211328029632568, "rewards/margins": 12.524999618530273, "rewards/rejected": -19.740625381469727, "step": 9040 }, { "epoch": 3.4073795180722892, "grad_norm": 0.5583363801472438, "learning_rate": 1.482492469879518e-07, "logits/chosen": -3.138671875, "logits/rejected": -3.6390624046325684, "logps/chosen": -493.3500061035156, "logps/rejected": -531.5499877929688, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -8.025390625, "rewards/margins": 13.293749809265137, "rewards/rejected": -21.3125, "step": 9050 }, { "epoch": 3.411144578313253, "grad_norm": 1.0909055126503302, "learning_rate": 1.4730798192771085e-07, "logits/chosen": -3.1451172828674316, "logits/rejected": -3.4515624046325684, "logps/chosen": -509.79998779296875, "logps/rejected": -536.3499755859375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -7.836328029632568, "rewards/margins": 12.059374809265137, "rewards/rejected": -19.887500762939453, "step": 9060 }, { "epoch": 3.414909638554217, "grad_norm": 0.29792670954107825, "learning_rate": 1.4636671686746987e-07, "logits/chosen": -3.104687452316284, "logits/rejected": -3.4371094703674316, "logps/chosen": -579.8250122070312, "logps/rejected": -567.0, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -8.235156059265137, "rewards/margins": 13.057812690734863, "rewards/rejected": -21.296875, "step": 9070 }, { "epoch": 3.4186746987951806, "grad_norm": 2.0108876303720726, "learning_rate": 1.454254518072289e-07, "logits/chosen": -3.1171875, "logits/rejected": -3.343554735183716, "logps/chosen": -474.0249938964844, "logps/rejected": -545.9500122070312, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -7.75390625, "rewards/margins": 13.068750381469727, "rewards/rejected": -20.828125, "step": 9080 }, { "epoch": 3.4224397590361444, "grad_norm": 1.178044898631137, "learning_rate": 1.4448418674698795e-07, "logits/chosen": -3.283203125, "logits/rejected": -3.3863282203674316, "logps/chosen": -518.7750244140625, "logps/rejected": -561.9000244140625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -7.274609565734863, "rewards/margins": 13.783594131469727, "rewards/rejected": -21.059375762939453, "step": 9090 }, { "epoch": 3.4262048192771086, "grad_norm": 0.13067413935322106, "learning_rate": 1.4354292168674698e-07, "logits/chosen": -3.2132811546325684, "logits/rejected": -3.421093702316284, "logps/chosen": -488.9750061035156, "logps/rejected": -512.7000122070312, "loss": 0.0078, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.713476657867432, "rewards/margins": 12.868749618530273, "rewards/rejected": -20.598438262939453, "step": 9100 }, { "epoch": 3.4299698795180724, "grad_norm": 0.09474396858595369, "learning_rate": 1.4260165662650603e-07, "logits/chosen": -3.1791014671325684, "logits/rejected": -3.468945264816284, "logps/chosen": -530.4249877929688, "logps/rejected": -575.0499877929688, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -8.696093559265137, "rewards/margins": 12.948437690734863, "rewards/rejected": -21.645313262939453, "step": 9110 }, { "epoch": 3.433734939759036, "grad_norm": 0.18779209597682933, "learning_rate": 1.4166039156626506e-07, "logits/chosen": -3.1441407203674316, "logits/rejected": -3.371875047683716, "logps/chosen": -490.6000061035156, "logps/rejected": -547.0999755859375, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -7.967382907867432, "rewards/margins": 13.03125, "rewards/rejected": -20.998437881469727, "step": 9120 }, { "epoch": 3.4375, "grad_norm": 0.1574619326147452, "learning_rate": 1.4071912650602409e-07, "logits/chosen": -3.056835889816284, "logits/rejected": -3.2210936546325684, "logps/chosen": -537.5374755859375, "logps/rejected": -586.5499877929688, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -7.973046779632568, "rewards/margins": 13.703125, "rewards/rejected": -21.6796875, "step": 9130 }, { "epoch": 3.441265060240964, "grad_norm": 1.8931240906597806, "learning_rate": 1.3977786144578314e-07, "logits/chosen": -3.145312547683716, "logits/rejected": -3.3626952171325684, "logps/chosen": -510.6499938964844, "logps/rejected": -524.0499877929688, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -7.698046684265137, "rewards/margins": 12.720312118530273, "rewards/rejected": -20.421875, "step": 9140 }, { "epoch": 3.4450301204819276, "grad_norm": 0.7802228018368255, "learning_rate": 1.3883659638554216e-07, "logits/chosen": -3.075390577316284, "logits/rejected": -3.3343749046325684, "logps/chosen": -501.1499938964844, "logps/rejected": -511.79998779296875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -7.043554782867432, "rewards/margins": 13.860156059265137, "rewards/rejected": -20.90625, "step": 9150 }, { "epoch": 3.4487951807228914, "grad_norm": 0.7340825084288259, "learning_rate": 1.3789533132530122e-07, "logits/chosen": -3.1078124046325684, "logits/rejected": -3.268749952316284, "logps/chosen": -556.0250244140625, "logps/rejected": -556.8499755859375, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -8.244140625, "rewards/margins": 12.756250381469727, "rewards/rejected": -20.995311737060547, "step": 9160 }, { "epoch": 3.4525602409638556, "grad_norm": 0.18595300592804087, "learning_rate": 1.3695406626506022e-07, "logits/chosen": -3.227734327316284, "logits/rejected": -3.452343702316284, "logps/chosen": -419.70001220703125, "logps/rejected": -514.8499755859375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -8.203125, "rewards/margins": 13.702343940734863, "rewards/rejected": -21.90625, "step": 9170 }, { "epoch": 3.4563253012048194, "grad_norm": 0.5701691398309551, "learning_rate": 1.3601280120481927e-07, "logits/chosen": -2.9964842796325684, "logits/rejected": -3.318164110183716, "logps/chosen": -482.92498779296875, "logps/rejected": -516.2249755859375, "loss": 0.0058, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.506249904632568, "rewards/margins": 13.265625, "rewards/rejected": -20.765625, "step": 9180 }, { "epoch": 3.460090361445783, "grad_norm": 0.2084718384379429, "learning_rate": 1.350715361445783e-07, "logits/chosen": -3.206249952316284, "logits/rejected": -3.474609375, "logps/chosen": -473.42498779296875, "logps/rejected": -543.9000244140625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -7.178906440734863, "rewards/margins": 14.071874618530273, "rewards/rejected": -21.265625, "step": 9190 }, { "epoch": 3.463855421686747, "grad_norm": 0.457364118880633, "learning_rate": 1.3413027108433732e-07, "logits/chosen": -3.1585936546325684, "logits/rejected": -3.420703172683716, "logps/chosen": -468.7749938964844, "logps/rejected": -528.75, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -7.801562309265137, "rewards/margins": 12.608593940734863, "rewards/rejected": -20.3984375, "step": 9200 }, { "epoch": 3.4676204819277108, "grad_norm": 1.014756178246062, "learning_rate": 1.3318900602409638e-07, "logits/chosen": -3.1382813453674316, "logits/rejected": -3.4267578125, "logps/chosen": -470.4750061035156, "logps/rejected": -525.75, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -7.79296875, "rewards/margins": 13.0859375, "rewards/rejected": -20.878124237060547, "step": 9210 }, { "epoch": 3.4713855421686746, "grad_norm": 0.14855765771828683, "learning_rate": 1.322477409638554e-07, "logits/chosen": -3.078906297683716, "logits/rejected": -3.286328077316284, "logps/chosen": -522.0250244140625, "logps/rejected": -556.75, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.626953125, "rewards/margins": 12.989843368530273, "rewards/rejected": -20.621875762939453, "step": 9220 }, { "epoch": 3.475150602409639, "grad_norm": 0.0693758670281561, "learning_rate": 1.3130647590361446e-07, "logits/chosen": -3.123046875, "logits/rejected": -3.393749952316284, "logps/chosen": -480.0375061035156, "logps/rejected": -515.5, "loss": 0.0065, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.735156059265137, "rewards/margins": 13.503125190734863, "rewards/rejected": -20.234375, "step": 9230 }, { "epoch": 3.4789156626506026, "grad_norm": 3.14705976586923, "learning_rate": 1.3036521084337348e-07, "logits/chosen": -3.219921827316284, "logits/rejected": -3.448437452316284, "logps/chosen": -455.2250061035156, "logps/rejected": -506.0, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -7.112500190734863, "rewards/margins": 12.506250381469727, "rewards/rejected": -19.621875762939453, "step": 9240 }, { "epoch": 3.4826807228915664, "grad_norm": 0.03547660413286716, "learning_rate": 1.2942394578313254e-07, "logits/chosen": -3.1078124046325684, "logits/rejected": -3.33203125, "logps/chosen": -543.125, "logps/rejected": -588.0499877929688, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -7.200390815734863, "rewards/margins": 13.340624809265137, "rewards/rejected": -20.548437118530273, "step": 9250 }, { "epoch": 3.48644578313253, "grad_norm": 18.14578021085255, "learning_rate": 1.2848268072289156e-07, "logits/chosen": -3.090625047683716, "logits/rejected": -3.3296875953674316, "logps/chosen": -471.3999938964844, "logps/rejected": -544.0499877929688, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -7.744531154632568, "rewards/margins": 12.345312118530273, "rewards/rejected": -20.084375381469727, "step": 9260 }, { "epoch": 3.490210843373494, "grad_norm": 0.4552623557770859, "learning_rate": 1.275414156626506e-07, "logits/chosen": -3.1039061546325684, "logits/rejected": -3.2691407203674316, "logps/chosen": -508.5, "logps/rejected": -580.1500244140625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -8.617578506469727, "rewards/margins": 13.659375190734863, "rewards/rejected": -22.278125762939453, "step": 9270 }, { "epoch": 3.4939759036144578, "grad_norm": 1.9954003530484754, "learning_rate": 1.2660015060240964e-07, "logits/chosen": -3.205859422683716, "logits/rejected": -3.463671922683716, "logps/chosen": -462.54998779296875, "logps/rejected": -522.1500244140625, "loss": 0.0073, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -8.758203506469727, "rewards/margins": 12.611719131469727, "rewards/rejected": -21.360937118530273, "step": 9280 }, { "epoch": 3.4977409638554215, "grad_norm": 0.2020747036628924, "learning_rate": 1.2565888554216867e-07, "logits/chosen": -3.128124952316284, "logits/rejected": -3.267578125, "logps/chosen": -469.3500061035156, "logps/rejected": -536.4500122070312, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -7.087109565734863, "rewards/margins": 13.4453125, "rewards/rejected": -20.535938262939453, "step": 9290 }, { "epoch": 3.5015060240963853, "grad_norm": 0.438553304777474, "learning_rate": 1.247176204819277e-07, "logits/chosen": -3.0074219703674316, "logits/rejected": -3.3089842796325684, "logps/chosen": -471.6625061035156, "logps/rejected": -525.0750122070312, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -7.163671970367432, "rewards/margins": 13.428125381469727, "rewards/rejected": -20.592187881469727, "step": 9300 }, { "epoch": 3.505271084337349, "grad_norm": 0.5408583873721895, "learning_rate": 1.2377635542168675e-07, "logits/chosen": -3.1675782203674316, "logits/rejected": -3.380078077316284, "logps/chosen": -482.8374938964844, "logps/rejected": -543.7999877929688, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -6.513671875, "rewards/margins": 12.8671875, "rewards/rejected": -19.385936737060547, "step": 9310 }, { "epoch": 3.5090361445783134, "grad_norm": 0.5684972473695816, "learning_rate": 1.2283509036144578e-07, "logits/chosen": -3.0726561546325684, "logits/rejected": -3.28515625, "logps/chosen": -496.8999938964844, "logps/rejected": -515.7999877929688, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.567578315734863, "rewards/margins": 12.628125190734863, "rewards/rejected": -20.206249237060547, "step": 9320 }, { "epoch": 3.512801204819277, "grad_norm": 0.25055387169377863, "learning_rate": 1.2189382530120483e-07, "logits/chosen": -3.1343750953674316, "logits/rejected": -3.3375000953674316, "logps/chosen": -572.7999877929688, "logps/rejected": -587.9000244140625, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -7.493750095367432, "rewards/margins": 13.512499809265137, "rewards/rejected": -21.012500762939453, "step": 9330 }, { "epoch": 3.516566265060241, "grad_norm": 0.8755874281898781, "learning_rate": 1.2095256024096386e-07, "logits/chosen": -3.1474609375, "logits/rejected": -3.333984375, "logps/chosen": -496.6499938964844, "logps/rejected": -583.75, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -7.696533203125, "rewards/margins": 12.861719131469727, "rewards/rejected": -20.551563262939453, "step": 9340 }, { "epoch": 3.5203313253012047, "grad_norm": 0.5132468246622796, "learning_rate": 1.2001129518072288e-07, "logits/chosen": -3.014843702316284, "logits/rejected": -3.326171875, "logps/chosen": -460.2749938964844, "logps/rejected": -523.25, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -7.71875, "rewards/margins": 12.746874809265137, "rewards/rejected": -20.481250762939453, "step": 9350 }, { "epoch": 3.5240963855421685, "grad_norm": 1.5376751528382269, "learning_rate": 1.1907003012048192e-07, "logits/chosen": -3.126171827316284, "logits/rejected": -3.374218702316284, "logps/chosen": -476.3999938964844, "logps/rejected": -535.4000244140625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -8.119921684265137, "rewards/margins": 12.956250190734863, "rewards/rejected": -21.075000762939453, "step": 9360 }, { "epoch": 3.5278614457831328, "grad_norm": 0.9212172848895119, "learning_rate": 1.1812876506024095e-07, "logits/chosen": -3.283203125, "logits/rejected": -3.4398436546325684, "logps/chosen": -444.0, "logps/rejected": -538.5499877929688, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -7.121777534484863, "rewards/margins": 13.588281631469727, "rewards/rejected": -20.717187881469727, "step": 9370 }, { "epoch": 3.5316265060240966, "grad_norm": 2.6591330681164225, "learning_rate": 1.1718749999999999e-07, "logits/chosen": -3.1070313453674316, "logits/rejected": -3.291796922683716, "logps/chosen": -437.70001220703125, "logps/rejected": -551.5499877929688, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -7.100390434265137, "rewards/margins": 13.407812118530273, "rewards/rejected": -20.4921875, "step": 9380 }, { "epoch": 3.5353915662650603, "grad_norm": 1.8378685509920474, "learning_rate": 1.1624623493975903e-07, "logits/chosen": -3.122265577316284, "logits/rejected": -3.2496094703674316, "logps/chosen": -524.5750122070312, "logps/rejected": -563.0999755859375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -7.55859375, "rewards/margins": 12.990625381469727, "rewards/rejected": -20.5546875, "step": 9390 }, { "epoch": 3.539156626506024, "grad_norm": 5.285700881226986, "learning_rate": 1.1530496987951807e-07, "logits/chosen": -3.2437500953674316, "logits/rejected": -3.4097657203674316, "logps/chosen": -440.45001220703125, "logps/rejected": -517.7999877929688, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -7.400976657867432, "rewards/margins": 12.934374809265137, "rewards/rejected": -20.342187881469727, "step": 9400 }, { "epoch": 3.542921686746988, "grad_norm": 0.02227670253230779, "learning_rate": 1.1436370481927711e-07, "logits/chosen": -3.2347655296325684, "logits/rejected": -3.4300780296325684, "logps/chosen": -499.29998779296875, "logps/rejected": -544.75, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -7.874609470367432, "rewards/margins": 14.091405868530273, "rewards/rejected": -21.956249237060547, "step": 9410 }, { "epoch": 3.5466867469879517, "grad_norm": 0.47859349101291143, "learning_rate": 1.1342243975903614e-07, "logits/chosen": -3.216796875, "logits/rejected": -3.3695311546325684, "logps/chosen": -476.4750061035156, "logps/rejected": -553.3499755859375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -8.6328125, "rewards/margins": 12.496874809265137, "rewards/rejected": -21.126562118530273, "step": 9420 }, { "epoch": 3.5504518072289155, "grad_norm": 2.709272706847146, "learning_rate": 1.1248117469879518e-07, "logits/chosen": -3.241406202316284, "logits/rejected": -3.556640625, "logps/chosen": -565.7000122070312, "logps/rejected": -562.0499877929688, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.738671779632568, "rewards/margins": 13.703906059265137, "rewards/rejected": -21.443750381469727, "step": 9430 }, { "epoch": 3.5542168674698793, "grad_norm": 0.16997767336889155, "learning_rate": 1.1153990963855422e-07, "logits/chosen": -3.1328125, "logits/rejected": -3.311328172683716, "logps/chosen": -532.5, "logps/rejected": -577.1500244140625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -8.021875381469727, "rewards/margins": 13.779687881469727, "rewards/rejected": -21.823436737060547, "step": 9440 }, { "epoch": 3.5579819277108435, "grad_norm": 48.35788228305697, "learning_rate": 1.1059864457831326e-07, "logits/chosen": -3.286328077316284, "logits/rejected": -3.5601563453674316, "logps/chosen": -445.6000061035156, "logps/rejected": -509.8500061035156, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -7.416796684265137, "rewards/margins": 13.307812690734863, "rewards/rejected": -20.714061737060547, "step": 9450 }, { "epoch": 3.5617469879518073, "grad_norm": 0.48213977466817404, "learning_rate": 1.0965737951807228e-07, "logits/chosen": -3.3167967796325684, "logits/rejected": -3.4761719703674316, "logps/chosen": -467.0249938964844, "logps/rejected": -546.2999877929688, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -7.940625190734863, "rewards/margins": 13.857812881469727, "rewards/rejected": -21.810937881469727, "step": 9460 }, { "epoch": 3.565512048192771, "grad_norm": 0.07426472211667177, "learning_rate": 1.0871611445783132e-07, "logits/chosen": -3.158007860183716, "logits/rejected": -3.422656297683716, "logps/chosen": -481.6000061035156, "logps/rejected": -537.25, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -7.858984470367432, "rewards/margins": 13.546875, "rewards/rejected": -21.409374237060547, "step": 9470 }, { "epoch": 3.569277108433735, "grad_norm": 0.10410549032379784, "learning_rate": 1.0777484939759035e-07, "logits/chosen": -3.091015577316284, "logits/rejected": -3.2138671875, "logps/chosen": -525.375, "logps/rejected": -599.0999755859375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -8.426172256469727, "rewards/margins": 12.899999618530273, "rewards/rejected": -21.331249237060547, "step": 9480 }, { "epoch": 3.5730421686746987, "grad_norm": 17.520431432908765, "learning_rate": 1.0683358433734939e-07, "logits/chosen": -3.115234375, "logits/rejected": -3.3941407203674316, "logps/chosen": -549.7750244140625, "logps/rejected": -558.5499877929688, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -8.335546493530273, "rewards/margins": 12.100781440734863, "rewards/rejected": -20.435937881469727, "step": 9490 }, { "epoch": 3.5768072289156625, "grad_norm": 1.1725487218893218, "learning_rate": 1.0589231927710843e-07, "logits/chosen": -3.1332030296325684, "logits/rejected": -3.384765625, "logps/chosen": -479.54998779296875, "logps/rejected": -556.2999877929688, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -7.824804782867432, "rewards/margins": 13.550000190734863, "rewards/rejected": -21.390625, "step": 9500 }, { "epoch": 3.5805722891566267, "grad_norm": 7.363307350008423, "learning_rate": 1.0495105421686747e-07, "logits/chosen": -3.2158203125, "logits/rejected": -3.4847655296325684, "logps/chosen": -488.29998779296875, "logps/rejected": -523.0, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -7.189062595367432, "rewards/margins": 12.6953125, "rewards/rejected": -19.879688262939453, "step": 9510 }, { "epoch": 3.5843373493975905, "grad_norm": 5.487024011234019, "learning_rate": 1.0400978915662651e-07, "logits/chosen": -3.145312547683716, "logits/rejected": -3.423046827316284, "logps/chosen": -489.1000061035156, "logps/rejected": -551.7999877929688, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -7.181836128234863, "rewards/margins": 12.7421875, "rewards/rejected": -19.935937881469727, "step": 9520 }, { "epoch": 3.5881024096385543, "grad_norm": 1.1487474362817984, "learning_rate": 1.0306852409638555e-07, "logits/chosen": -3.1380858421325684, "logits/rejected": -3.3656249046325684, "logps/chosen": -525.3250122070312, "logps/rejected": -573.0999755859375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -8.077343940734863, "rewards/margins": 13.0078125, "rewards/rejected": -21.095312118530273, "step": 9530 }, { "epoch": 3.591867469879518, "grad_norm": 0.47798833934596335, "learning_rate": 1.0212725903614456e-07, "logits/chosen": -3.177734375, "logits/rejected": -3.479687452316284, "logps/chosen": -460.04998779296875, "logps/rejected": -519.4000244140625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -7.589062690734863, "rewards/margins": 12.657812118530273, "rewards/rejected": -20.254688262939453, "step": 9540 }, { "epoch": 3.595632530120482, "grad_norm": 0.10790027051854, "learning_rate": 1.011859939759036e-07, "logits/chosen": -3.1412110328674316, "logits/rejected": -3.3109374046325684, "logps/chosen": -486.625, "logps/rejected": -554.5999755859375, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -7.316796779632568, "rewards/margins": 13.223437309265137, "rewards/rejected": -20.543750762939453, "step": 9550 }, { "epoch": 3.5993975903614457, "grad_norm": 0.2631586478935155, "learning_rate": 1.0024472891566264e-07, "logits/chosen": -3.237109422683716, "logits/rejected": -3.392578125, "logps/chosen": -464.75, "logps/rejected": -553.0999755859375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -7.65234375, "rewards/margins": 13.003125190734863, "rewards/rejected": -20.653125762939453, "step": 9560 }, { "epoch": 3.6031626506024095, "grad_norm": 0.03707155987313302, "learning_rate": 9.930346385542168e-08, "logits/chosen": -3.2085938453674316, "logits/rejected": -3.4527344703674316, "logps/chosen": -468.79998779296875, "logps/rejected": -555.7999877929688, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.838086128234863, "rewards/margins": 14.090624809265137, "rewards/rejected": -21.9296875, "step": 9570 }, { "epoch": 3.6069277108433733, "grad_norm": 7.443905292867574, "learning_rate": 9.836219879518072e-08, "logits/chosen": -3.1585936546325684, "logits/rejected": -3.334765672683716, "logps/chosen": -473.61248779296875, "logps/rejected": -532.2999877929688, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -7.488671779632568, "rewards/margins": 12.717968940734863, "rewards/rejected": -20.215625762939453, "step": 9580 }, { "epoch": 3.6106927710843375, "grad_norm": 0.3203391728463291, "learning_rate": 9.742093373493976e-08, "logits/chosen": -3.163281202316284, "logits/rejected": -3.2964844703674316, "logps/chosen": -443.67498779296875, "logps/rejected": -530.9249877929688, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.237109184265137, "rewards/margins": 13.081250190734863, "rewards/rejected": -20.3125, "step": 9590 }, { "epoch": 3.6144578313253013, "grad_norm": 2.4723265257669893, "learning_rate": 9.64796686746988e-08, "logits/chosen": -3.0298829078674316, "logits/rejected": -3.409374952316284, "logps/chosen": -485.625, "logps/rejected": -529.4500122070312, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -7.96875, "rewards/margins": 13.167187690734863, "rewards/rejected": -21.137500762939453, "step": 9600 }, { "epoch": 3.618222891566265, "grad_norm": 0.8071008373992514, "learning_rate": 9.553840361445783e-08, "logits/chosen": -3.1869139671325684, "logits/rejected": -3.416015625, "logps/chosen": -476.8374938964844, "logps/rejected": -541.7999877929688, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -7.448046684265137, "rewards/margins": 13.071874618530273, "rewards/rejected": -20.528125762939453, "step": 9610 }, { "epoch": 3.621987951807229, "grad_norm": 0.30777905791152926, "learning_rate": 9.459713855421685e-08, "logits/chosen": -3.117968797683716, "logits/rejected": -3.2925782203674316, "logps/chosen": -513.75, "logps/rejected": -563.2999877929688, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -8.173047065734863, "rewards/margins": 12.90625, "rewards/rejected": -21.079687118530273, "step": 9620 }, { "epoch": 3.6257530120481927, "grad_norm": 0.7504768368655701, "learning_rate": 9.36558734939759e-08, "logits/chosen": -3.1722655296325684, "logits/rejected": -3.418750047683716, "logps/chosen": -483.57501220703125, "logps/rejected": -518.8499755859375, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -8.203516006469727, "rewards/margins": 13.565625190734863, "rewards/rejected": -21.768749237060547, "step": 9630 }, { "epoch": 3.6295180722891565, "grad_norm": 0.7371308219353869, "learning_rate": 9.271460843373493e-08, "logits/chosen": -2.981250047683716, "logits/rejected": -3.2152342796325684, "logps/chosen": -510.8500061035156, "logps/rejected": -582.4500122070312, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.510546684265137, "rewards/margins": 14.529687881469727, "rewards/rejected": -22.049999237060547, "step": 9640 }, { "epoch": 3.6332831325301207, "grad_norm": 0.1907789156836919, "learning_rate": 9.177334337349397e-08, "logits/chosen": -3.119140625, "logits/rejected": -3.283203125, "logps/chosen": -543.4249877929688, "logps/rejected": -581.8499755859375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -8.521875381469727, "rewards/margins": 13.654687881469727, "rewards/rejected": -22.168750762939453, "step": 9650 }, { "epoch": 3.6370481927710845, "grad_norm": 0.6206563157976901, "learning_rate": 9.083207831325301e-08, "logits/chosen": -3.0384764671325684, "logits/rejected": -3.358203172683716, "logps/chosen": -514.3499755859375, "logps/rejected": -567.8499755859375, "loss": 0.0048, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.232812404632568, "rewards/margins": 13.274999618530273, "rewards/rejected": -20.517187118530273, "step": 9660 }, { "epoch": 3.6408132530120483, "grad_norm": 0.3086601078935915, "learning_rate": 8.989081325301204e-08, "logits/chosen": -3.1597657203674316, "logits/rejected": -3.3812499046325684, "logps/chosen": -485.51251220703125, "logps/rejected": -544.75, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -8.337109565734863, "rewards/margins": 12.901562690734863, "rewards/rejected": -21.2421875, "step": 9670 }, { "epoch": 3.644578313253012, "grad_norm": 3.5067287000054894, "learning_rate": 8.894954819277108e-08, "logits/chosen": -3.1128907203674316, "logits/rejected": -3.1875, "logps/chosen": -489.625, "logps/rejected": -588.1500244140625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -7.514452934265137, "rewards/margins": 12.764062881469727, "rewards/rejected": -20.274999618530273, "step": 9680 }, { "epoch": 3.648343373493976, "grad_norm": 4.321771066554955, "learning_rate": 8.800828313253012e-08, "logits/chosen": -3.019726514816284, "logits/rejected": -3.1771483421325684, "logps/chosen": -471.92498779296875, "logps/rejected": -531.9500122070312, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -7.338671684265137, "rewards/margins": 12.585156440734863, "rewards/rejected": -19.9296875, "step": 9690 }, { "epoch": 3.6521084337349397, "grad_norm": 0.050567960104116905, "learning_rate": 8.706701807228915e-08, "logits/chosen": -3.013476610183716, "logits/rejected": -3.283984422683716, "logps/chosen": -515.9500122070312, "logps/rejected": -554.5499877929688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -7.030871391296387, "rewards/margins": 13.006250381469727, "rewards/rejected": -20.029687881469727, "step": 9700 }, { "epoch": 3.6558734939759034, "grad_norm": 33.00776652835899, "learning_rate": 8.612575301204819e-08, "logits/chosen": -3.072265625, "logits/rejected": -3.455859422683716, "logps/chosen": -522.7000122070312, "logps/rejected": -537.4500122070312, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -7.779687404632568, "rewards/margins": 13.768750190734863, "rewards/rejected": -21.542186737060547, "step": 9710 }, { "epoch": 3.6596385542168672, "grad_norm": 1.5425553162452552, "learning_rate": 8.518448795180723e-08, "logits/chosen": -3.1640625, "logits/rejected": -3.358593702316284, "logps/chosen": -453.20001220703125, "logps/rejected": -552.0499877929688, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -7.329882621765137, "rewards/margins": 13.3203125, "rewards/rejected": -20.631250381469727, "step": 9720 }, { "epoch": 3.6634036144578315, "grad_norm": 1.137042617247605, "learning_rate": 8.424322289156627e-08, "logits/chosen": -3.192187547683716, "logits/rejected": -3.5259766578674316, "logps/chosen": -494.1000061035156, "logps/rejected": -579.7999877929688, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -7.6845703125, "rewards/margins": 12.8828125, "rewards/rejected": -20.5703125, "step": 9730 }, { "epoch": 3.6671686746987953, "grad_norm": 0.08184439627835267, "learning_rate": 8.33019578313253e-08, "logits/chosen": -3.1552734375, "logits/rejected": -3.3695311546325684, "logps/chosen": -435.79998779296875, "logps/rejected": -540.2999877929688, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -8.041406631469727, "rewards/margins": 12.604687690734863, "rewards/rejected": -20.642187118530273, "step": 9740 }, { "epoch": 3.670933734939759, "grad_norm": 1.5824576414658693, "learning_rate": 8.236069277108433e-08, "logits/chosen": -3.096874952316284, "logits/rejected": -3.314648389816284, "logps/chosen": -504.79998779296875, "logps/rejected": -526.7999877929688, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -7.786718845367432, "rewards/margins": 13.318750381469727, "rewards/rejected": -21.090625762939453, "step": 9750 }, { "epoch": 3.674698795180723, "grad_norm": 0.28394742852866434, "learning_rate": 8.141942771084337e-08, "logits/chosen": -3.319140672683716, "logits/rejected": -3.4828124046325684, "logps/chosen": -486.75, "logps/rejected": -556.7000122070312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -7.894140720367432, "rewards/margins": 13.708593368530273, "rewards/rejected": -21.607812881469727, "step": 9760 }, { "epoch": 3.6784638554216866, "grad_norm": 2.9887204002415695, "learning_rate": 8.047816265060241e-08, "logits/chosen": -3.227734327316284, "logits/rejected": -3.5335936546325684, "logps/chosen": -471.625, "logps/rejected": -515.2000122070312, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -7.722265720367432, "rewards/margins": 13.025781631469727, "rewards/rejected": -20.75, "step": 9770 }, { "epoch": 3.682228915662651, "grad_norm": 26.271657665121715, "learning_rate": 7.953689759036144e-08, "logits/chosen": -3.1675782203674316, "logits/rejected": -3.33984375, "logps/chosen": -493.70001220703125, "logps/rejected": -560.75, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -7.571484565734863, "rewards/margins": 14.028124809265137, "rewards/rejected": -21.600000381469727, "step": 9780 }, { "epoch": 3.6859939759036147, "grad_norm": 0.24052137719771288, "learning_rate": 7.859563253012048e-08, "logits/chosen": -3.211718797683716, "logits/rejected": -3.4378905296325684, "logps/chosen": -478.6875, "logps/rejected": -526.75, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -7.726953029632568, "rewards/margins": 13.997655868530273, "rewards/rejected": -21.728124618530273, "step": 9790 }, { "epoch": 3.6897590361445785, "grad_norm": 0.8925404517601431, "learning_rate": 7.76543674698795e-08, "logits/chosen": -3.1871094703674316, "logits/rejected": -3.3267579078674316, "logps/chosen": -529.0999755859375, "logps/rejected": -601.8499755859375, "loss": 0.0071, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.730859279632568, "rewards/margins": 12.356249809265137, "rewards/rejected": -20.081249237060547, "step": 9800 }, { "epoch": 3.6935240963855422, "grad_norm": 2.707883107395165, "learning_rate": 7.671310240963855e-08, "logits/chosen": -3.224609375, "logits/rejected": -3.430859327316284, "logps/chosen": -503.82501220703125, "logps/rejected": -565.2999877929688, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -8.310155868530273, "rewards/margins": 13.074999809265137, "rewards/rejected": -21.396875381469727, "step": 9810 }, { "epoch": 3.697289156626506, "grad_norm": 0.8369625718492493, "learning_rate": 7.577183734939759e-08, "logits/chosen": -3.1761717796325684, "logits/rejected": -3.342578172683716, "logps/chosen": -529.9500122070312, "logps/rejected": -548.9000244140625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -7.7890625, "rewards/margins": 13.399999618530273, "rewards/rejected": -21.1875, "step": 9820 }, { "epoch": 3.70105421686747, "grad_norm": 0.5674256271688138, "learning_rate": 7.483057228915663e-08, "logits/chosen": -3.234375, "logits/rejected": -3.3472657203674316, "logps/chosen": -515.0999755859375, "logps/rejected": -568.0499877929688, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -8.373827934265137, "rewards/margins": 12.869531631469727, "rewards/rejected": -21.245311737060547, "step": 9830 }, { "epoch": 3.7048192771084336, "grad_norm": 0.9601227452319075, "learning_rate": 7.388930722891567e-08, "logits/chosen": -3.191601514816284, "logits/rejected": -3.5484375953674316, "logps/chosen": -536.25, "logps/rejected": -521.9500122070312, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -8.177734375, "rewards/margins": 12.703125, "rewards/rejected": -20.876562118530273, "step": 9840 }, { "epoch": 3.7085843373493974, "grad_norm": 0.46693242396681545, "learning_rate": 7.29480421686747e-08, "logits/chosen": -3.299999952316284, "logits/rejected": -3.481640577316284, "logps/chosen": -462.86248779296875, "logps/rejected": -501.1000061035156, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -7.983593940734863, "rewards/margins": 12.649218559265137, "rewards/rejected": -20.637500762939453, "step": 9850 }, { "epoch": 3.712349397590361, "grad_norm": 1.932932054558969, "learning_rate": 7.200677710843372e-08, "logits/chosen": -3.262890577316284, "logits/rejected": -3.491406202316284, "logps/chosen": -522.9625244140625, "logps/rejected": -575.5, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -7.800390720367432, "rewards/margins": 14.315625190734863, "rewards/rejected": -22.118749618530273, "step": 9860 }, { "epoch": 3.7161144578313254, "grad_norm": 0.6678931105722546, "learning_rate": 7.106551204819276e-08, "logits/chosen": -3.209765672683716, "logits/rejected": -3.443359375, "logps/chosen": -483.04998779296875, "logps/rejected": -540.2000122070312, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -8.133203506469727, "rewards/margins": 12.739062309265137, "rewards/rejected": -20.862499237060547, "step": 9870 }, { "epoch": 3.7198795180722892, "grad_norm": 0.38805343826714667, "learning_rate": 7.01242469879518e-08, "logits/chosen": -3.0687499046325684, "logits/rejected": -3.2933592796325684, "logps/chosen": -517.25, "logps/rejected": -562.0, "loss": 0.0098, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.867578029632568, "rewards/margins": 12.948437690734863, "rewards/rejected": -20.814062118530273, "step": 9880 }, { "epoch": 3.723644578313253, "grad_norm": 1.6379428521194452, "learning_rate": 6.918298192771084e-08, "logits/chosen": -3.139453172683716, "logits/rejected": -3.412109375, "logps/chosen": -451.8500061035156, "logps/rejected": -528.3499755859375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -7.462109565734863, "rewards/margins": 13.173437118530273, "rewards/rejected": -20.625, "step": 9890 }, { "epoch": 3.727409638554217, "grad_norm": 0.4383635367502594, "learning_rate": 6.824171686746988e-08, "logits/chosen": -3.1644530296325684, "logits/rejected": -3.302734375, "logps/chosen": -493.42498779296875, "logps/rejected": -570.5999755859375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -7.494531154632568, "rewards/margins": 13.739062309265137, "rewards/rejected": -21.237499237060547, "step": 9900 }, { "epoch": 3.7311746987951806, "grad_norm": 0.22725460635463196, "learning_rate": 6.730045180722892e-08, "logits/chosen": -2.995312452316284, "logits/rejected": -3.223437547683716, "logps/chosen": -508.9750061035156, "logps/rejected": -561.25, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -6.908593654632568, "rewards/margins": 13.609375, "rewards/rejected": -20.514062881469727, "step": 9910 }, { "epoch": 3.734939759036145, "grad_norm": 0.5071456169254213, "learning_rate": 6.635918674698796e-08, "logits/chosen": -3.083984375, "logits/rejected": -3.282031297683716, "logps/chosen": -572.4500122070312, "logps/rejected": -573.625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -6.979687690734863, "rewards/margins": 13.1328125, "rewards/rejected": -20.112499237060547, "step": 9920 }, { "epoch": 3.7387048192771086, "grad_norm": 8.288917421117258, "learning_rate": 6.541792168674699e-08, "logits/chosen": -3.1402344703674316, "logits/rejected": -3.4574217796325684, "logps/chosen": -460.17498779296875, "logps/rejected": -551.5, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -7.2109375, "rewards/margins": 12.6953125, "rewards/rejected": -19.890625, "step": 9930 }, { "epoch": 3.7424698795180724, "grad_norm": 0.02911023367474065, "learning_rate": 6.447665662650601e-08, "logits/chosen": -3.1089844703674316, "logits/rejected": -3.3734374046325684, "logps/chosen": -470.04998779296875, "logps/rejected": -548.8499755859375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.194921970367432, "rewards/margins": 12.908594131469727, "rewards/rejected": -20.098438262939453, "step": 9940 }, { "epoch": 3.746234939759036, "grad_norm": 0.31733680555933563, "learning_rate": 6.353539156626505e-08, "logits/chosen": -3.2828125953674316, "logits/rejected": -3.415820360183716, "logps/chosen": -462.625, "logps/rejected": -532.7000122070312, "loss": 0.0081, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -8.91015625, "rewards/margins": 12.306249618530273, "rewards/rejected": -21.207813262939453, "step": 9950 }, { "epoch": 3.75, "grad_norm": 0.04469585004308893, "learning_rate": 6.259412650602409e-08, "logits/chosen": -3.220703125, "logits/rejected": -3.528515577316284, "logps/chosen": -477.3999938964844, "logps/rejected": -519.8250122070312, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -7.573437690734863, "rewards/margins": 13.165624618530273, "rewards/rejected": -20.728124618530273, "step": 9960 }, { "epoch": 3.753765060240964, "grad_norm": 154.05027573349372, "learning_rate": 6.165286144578313e-08, "logits/chosen": -3.116992235183716, "logits/rejected": -3.340625047683716, "logps/chosen": -485.5, "logps/rejected": -554.3499755859375, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -6.940234184265137, "rewards/margins": 14.212499618530273, "rewards/rejected": -21.149999618530273, "step": 9970 }, { "epoch": 3.7575301204819276, "grad_norm": 0.24459689021743974, "learning_rate": 6.071159638554216e-08, "logits/chosen": -3.169726610183716, "logits/rejected": -3.4867186546325684, "logps/chosen": -439.6000061035156, "logps/rejected": -519.0499877929688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -7.413671970367432, "rewards/margins": 12.852343559265137, "rewards/rejected": -20.284374237060547, "step": 9980 }, { "epoch": 3.7612951807228914, "grad_norm": 66.79333377252759, "learning_rate": 5.97703313253012e-08, "logits/chosen": -3.2300782203674316, "logits/rejected": -3.448046922683716, "logps/chosen": -470.82501220703125, "logps/rejected": -505.79998779296875, "loss": 0.0105, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -8.038671493530273, "rewards/margins": 12.461718559265137, "rewards/rejected": -20.496875762939453, "step": 9990 }, { "epoch": 3.765060240963855, "grad_norm": 1.604566538762394, "learning_rate": 5.882906626506024e-08, "logits/chosen": -2.916015625, "logits/rejected": -3.2281250953674316, "logps/chosen": -540.9249877929688, "logps/rejected": -598.9500122070312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.469922065734863, "rewards/margins": 13.850000381469727, "rewards/rejected": -20.309375762939453, "step": 10000 }, { "epoch": 3.7688253012048194, "grad_norm": 6.490940617317951, "learning_rate": 5.788780120481927e-08, "logits/chosen": -3.044140577316284, "logits/rejected": -3.242968797683716, "logps/chosen": -456.17498779296875, "logps/rejected": -531.4500122070312, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -7.170312404632568, "rewards/margins": 12.873437881469727, "rewards/rejected": -20.049999237060547, "step": 10010 }, { "epoch": 3.772590361445783, "grad_norm": 0.7111910683503128, "learning_rate": 5.694653614457831e-08, "logits/chosen": -3.081835985183716, "logits/rejected": -3.369921922683716, "logps/chosen": -472.4750061035156, "logps/rejected": -536.7000122070312, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.264062404632568, "rewards/margins": 13.470312118530273, "rewards/rejected": -20.735937118530273, "step": 10020 }, { "epoch": 3.776355421686747, "grad_norm": 1.1710247684593167, "learning_rate": 5.600527108433735e-08, "logits/chosen": -3.0816407203674316, "logits/rejected": -3.4515624046325684, "logps/chosen": -487.75, "logps/rejected": -537.6500244140625, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -7.663281440734863, "rewards/margins": 12.9140625, "rewards/rejected": -20.581249237060547, "step": 10030 }, { "epoch": 3.7801204819277108, "grad_norm": 0.30166957228260366, "learning_rate": 5.506400602409638e-08, "logits/chosen": -3.2171874046325684, "logits/rejected": -3.4898438453674316, "logps/chosen": -440.0249938964844, "logps/rejected": -509.7250061035156, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -7.529687404632568, "rewards/margins": 13.160937309265137, "rewards/rejected": -20.6953125, "step": 10040 }, { "epoch": 3.7838855421686746, "grad_norm": 0.5500723573315597, "learning_rate": 5.412274096385542e-08, "logits/chosen": -3.271679639816284, "logits/rejected": -3.4097657203674316, "logps/chosen": -421.9750061035156, "logps/rejected": -530.7000122070312, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -8.037890434265137, "rewards/margins": 13.467187881469727, "rewards/rejected": -21.517187118530273, "step": 10050 }, { "epoch": 3.787650602409639, "grad_norm": 0.2976436134153088, "learning_rate": 5.318147590361446e-08, "logits/chosen": -2.9173827171325684, "logits/rejected": -3.1875, "logps/chosen": -512.1749877929688, "logps/rejected": -566.1500244140625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -8.285547256469727, "rewards/margins": 13.300000190734863, "rewards/rejected": -21.575000762939453, "step": 10060 }, { "epoch": 3.7914156626506026, "grad_norm": 0.48683080803346906, "learning_rate": 5.224021084337349e-08, "logits/chosen": -3.203125, "logits/rejected": -3.37109375, "logps/chosen": -474.2749938964844, "logps/rejected": -532.4500122070312, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -8.126172065734863, "rewards/margins": 13.842968940734863, "rewards/rejected": -21.967187881469727, "step": 10070 }, { "epoch": 3.7951807228915664, "grad_norm": 0.7412923453471185, "learning_rate": 5.1298945783132524e-08, "logits/chosen": -3.198437452316284, "logits/rejected": -3.448046922683716, "logps/chosen": -517.3250122070312, "logps/rejected": -584.7999877929688, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -8.123437881469727, "rewards/margins": 14.559374809265137, "rewards/rejected": -22.6875, "step": 10080 }, { "epoch": 3.79894578313253, "grad_norm": 0.21988001832611956, "learning_rate": 5.0357680722891564e-08, "logits/chosen": -3.1781249046325684, "logits/rejected": -3.2308592796325684, "logps/chosen": -543.8250122070312, "logps/rejected": -604.3499755859375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -8.903515815734863, "rewards/margins": 13.5546875, "rewards/rejected": -22.457813262939453, "step": 10090 }, { "epoch": 3.802710843373494, "grad_norm": 1.7945832746667099, "learning_rate": 4.94164156626506e-08, "logits/chosen": -3.124218702316284, "logits/rejected": -3.262500047683716, "logps/chosen": -467.0, "logps/rejected": -568.5999755859375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -7.939453125, "rewards/margins": 13.115625381469727, "rewards/rejected": -21.057811737060547, "step": 10100 }, { "epoch": 3.8064759036144578, "grad_norm": 2.2468336612072894, "learning_rate": 4.847515060240964e-08, "logits/chosen": -3.2203125953674316, "logits/rejected": -3.3499999046325684, "logps/chosen": -440.07501220703125, "logps/rejected": -538.0999755859375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -7.460156440734863, "rewards/margins": 13.154687881469727, "rewards/rejected": -20.620311737060547, "step": 10110 }, { "epoch": 3.8102409638554215, "grad_norm": 1.092287030787835, "learning_rate": 4.753388554216867e-08, "logits/chosen": -3.4183592796325684, "logits/rejected": -3.553906202316284, "logps/chosen": -412.4375, "logps/rejected": -517.0999755859375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -7.541406154632568, "rewards/margins": 13.956250190734863, "rewards/rejected": -21.510936737060547, "step": 10120 }, { "epoch": 3.8140060240963853, "grad_norm": 0.1737514145155962, "learning_rate": 4.659262048192771e-08, "logits/chosen": -3.186328172683716, "logits/rejected": -3.456249952316284, "logps/chosen": -499.45001220703125, "logps/rejected": -568.9749755859375, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -7.708593845367432, "rewards/margins": 13.896875381469727, "rewards/rejected": -21.603124618530273, "step": 10130 }, { "epoch": 3.817771084337349, "grad_norm": 0.7473125971363874, "learning_rate": 4.5651355421686744e-08, "logits/chosen": -3.176562547683716, "logits/rejected": -3.414843797683716, "logps/chosen": -461.0, "logps/rejected": -533.7999877929688, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -7.62109375, "rewards/margins": 12.538281440734863, "rewards/rejected": -20.162500381469727, "step": 10140 }, { "epoch": 3.8215361445783134, "grad_norm": 0.07766443249718237, "learning_rate": 4.4710090361445784e-08, "logits/chosen": -3.1128907203674316, "logits/rejected": -3.4253907203674316, "logps/chosen": -516.0, "logps/rejected": -546.7999877929688, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -7.769921779632568, "rewards/margins": 12.995312690734863, "rewards/rejected": -20.754688262939453, "step": 10150 }, { "epoch": 3.825301204819277, "grad_norm": 0.2889733031491674, "learning_rate": 4.3768825301204824e-08, "logits/chosen": -3.0687499046325684, "logits/rejected": -3.2164063453674316, "logps/chosen": -491.6000061035156, "logps/rejected": -586.1500244140625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.850390434265137, "rewards/margins": 13.9765625, "rewards/rejected": -21.828125, "step": 10160 }, { "epoch": 3.829066265060241, "grad_norm": 0.6027573710023351, "learning_rate": 4.282756024096385e-08, "logits/chosen": -3.006640672683716, "logits/rejected": -3.2103514671325684, "logps/chosen": -521.7000122070312, "logps/rejected": -565.9000244140625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -7.977148532867432, "rewards/margins": 12.267187118530273, "rewards/rejected": -20.237499237060547, "step": 10170 }, { "epoch": 3.8328313253012047, "grad_norm": 0.9484918077531389, "learning_rate": 4.188629518072289e-08, "logits/chosen": -3.217578172683716, "logits/rejected": -3.561328172683716, "logps/chosen": -440.54998779296875, "logps/rejected": -499.1000061035156, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -8.287890434265137, "rewards/margins": 11.496874809265137, "rewards/rejected": -19.782812118530273, "step": 10180 }, { "epoch": 3.8365963855421685, "grad_norm": 0.26050458914793684, "learning_rate": 4.094503012048193e-08, "logits/chosen": -3.2378907203674316, "logits/rejected": -3.4957032203674316, "logps/chosen": -460.8500061035156, "logps/rejected": -511.6000061035156, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.826952934265137, "rewards/margins": 12.964062690734863, "rewards/rejected": -20.782812118530273, "step": 10190 }, { "epoch": 3.8403614457831328, "grad_norm": 0.8266809283237395, "learning_rate": 4.0003765060240957e-08, "logits/chosen": -3.071484327316284, "logits/rejected": -3.251171827316284, "logps/chosen": -528.5750122070312, "logps/rejected": -574.25, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -7.701171875, "rewards/margins": 14.314062118530273, "rewards/rejected": -22.012500762939453, "step": 10200 }, { "epoch": 3.8441265060240966, "grad_norm": 0.25791684331990383, "learning_rate": 3.9062499999999997e-08, "logits/chosen": -3.186718702316284, "logits/rejected": -3.412890672683716, "logps/chosen": -495.42498779296875, "logps/rejected": -556.7000122070312, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -8.139843940734863, "rewards/margins": 12.922656059265137, "rewards/rejected": -21.0625, "step": 10210 }, { "epoch": 3.8478915662650603, "grad_norm": 0.42036243350923286, "learning_rate": 3.8121234939759036e-08, "logits/chosen": -3.26953125, "logits/rejected": -3.4457030296325684, "logps/chosen": -480.82501220703125, "logps/rejected": -543.1500244140625, "loss": 0.007, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -8.158984184265137, "rewards/margins": 12.926562309265137, "rewards/rejected": -21.0859375, "step": 10220 }, { "epoch": 3.851656626506024, "grad_norm": 4.9451882416786725, "learning_rate": 3.717996987951807e-08, "logits/chosen": -3.1734375953674316, "logits/rejected": -3.503124952316284, "logps/chosen": -472.0249938964844, "logps/rejected": -509.04998779296875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -7.505859375, "rewards/margins": 12.8828125, "rewards/rejected": -20.384374618530273, "step": 10230 }, { "epoch": 3.855421686746988, "grad_norm": 3.249574412090327, "learning_rate": 3.623870481927711e-08, "logits/chosen": -3.268359422683716, "logits/rejected": -3.3695311546325684, "logps/chosen": -476.0249938964844, "logps/rejected": -532.7999877929688, "loss": 0.0078, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.965624809265137, "rewards/margins": 13.31640625, "rewards/rejected": -21.285938262939453, "step": 10240 }, { "epoch": 3.8591867469879517, "grad_norm": 13.734873230865205, "learning_rate": 3.529743975903614e-08, "logits/chosen": -3.1949219703674316, "logits/rejected": -3.3121094703674316, "logps/chosen": -438.04998779296875, "logps/rejected": -525.7000122070312, "loss": 0.0054, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.239062309265137, "rewards/margins": 13.3359375, "rewards/rejected": -20.573436737060547, "step": 10250 }, { "epoch": 3.8629518072289155, "grad_norm": 0.9213294195400203, "learning_rate": 3.4356174698795176e-08, "logits/chosen": -3.202343702316284, "logits/rejected": -3.318359375, "logps/chosen": -529.7249755859375, "logps/rejected": -584.5499877929688, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -7.767578125, "rewards/margins": 13.0390625, "rewards/rejected": -20.809375762939453, "step": 10260 }, { "epoch": 3.8667168674698793, "grad_norm": 0.125803507019974, "learning_rate": 3.3414909638554216e-08, "logits/chosen": -3.1734375953674316, "logits/rejected": -3.3023438453674316, "logps/chosen": -528.8499755859375, "logps/rejected": -583.4000244140625, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -8.619140625, "rewards/margins": 12.964062690734863, "rewards/rejected": -21.573436737060547, "step": 10270 }, { "epoch": 3.8704819277108435, "grad_norm": 0.033326107361742054, "learning_rate": 3.2473644578313256e-08, "logits/chosen": -3.1734375953674316, "logits/rejected": -3.3984375, "logps/chosen": -499.6000061035156, "logps/rejected": -535.25, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -8.275781631469727, "rewards/margins": 12.965624809265137, "rewards/rejected": -21.251562118530273, "step": 10280 }, { "epoch": 3.8742469879518073, "grad_norm": 0.09621672895912865, "learning_rate": 3.153237951807228e-08, "logits/chosen": -3.024218797683716, "logits/rejected": -3.258984327316284, "logps/chosen": -508.75, "logps/rejected": -540.0250244140625, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -6.944531440734863, "rewards/margins": 13.326562881469727, "rewards/rejected": -20.259374618530273, "step": 10290 }, { "epoch": 3.878012048192771, "grad_norm": 0.6572616537214288, "learning_rate": 3.059111445783132e-08, "logits/chosen": -3.134960889816284, "logits/rejected": -3.375, "logps/chosen": -535.0, "logps/rejected": -587.3499755859375, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -8.279492378234863, "rewards/margins": 13.293749809265137, "rewards/rejected": -21.565624237060547, "step": 10300 }, { "epoch": 3.881777108433735, "grad_norm": 0.2116882278657283, "learning_rate": 2.9649849397590362e-08, "logits/chosen": -3.143749952316284, "logits/rejected": -3.331249952316284, "logps/chosen": -436.375, "logps/rejected": -535.4500122070312, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -8.074609756469727, "rewards/margins": 12.8046875, "rewards/rejected": -20.884374618530273, "step": 10310 }, { "epoch": 3.8855421686746987, "grad_norm": 0.40774786248469835, "learning_rate": 2.8708584337349396e-08, "logits/chosen": -3.134570360183716, "logits/rejected": -3.3218750953674316, "logps/chosen": -486.04998779296875, "logps/rejected": -554.5, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -8.220703125, "rewards/margins": 12.099218368530273, "rewards/rejected": -20.309375762939453, "step": 10320 }, { "epoch": 3.8893072289156625, "grad_norm": 0.4868005187112494, "learning_rate": 2.7767319277108432e-08, "logits/chosen": -3.276562452316284, "logits/rejected": -3.452343702316284, "logps/chosen": -459.3999938964844, "logps/rejected": -521.4500122070312, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -8.08984375, "rewards/margins": 12.983593940734863, "rewards/rejected": -21.081249237060547, "step": 10330 }, { "epoch": 3.8930722891566267, "grad_norm": 0.029248029055676136, "learning_rate": 2.682605421686747e-08, "logits/chosen": -3.180468797683716, "logits/rejected": -3.3275389671325684, "logps/chosen": -466.5, "logps/rejected": -566.5, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -8.054296493530273, "rewards/margins": 13.072656631469727, "rewards/rejected": -21.129688262939453, "step": 10340 }, { "epoch": 3.8968373493975905, "grad_norm": 0.0967887224093876, "learning_rate": 2.5884789156626505e-08, "logits/chosen": -3.166210889816284, "logits/rejected": -3.382617235183716, "logps/chosen": -442.3999938964844, "logps/rejected": -531.5499877929688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -7.4140625, "rewards/margins": 13.010937690734863, "rewards/rejected": -20.431249618530273, "step": 10350 }, { "epoch": 3.9006024096385543, "grad_norm": 0.40708338200656013, "learning_rate": 2.4943524096385542e-08, "logits/chosen": -3.130078077316284, "logits/rejected": -3.4742188453674316, "logps/chosen": -465.0874938964844, "logps/rejected": -522.0499877929688, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -7.901171684265137, "rewards/margins": 13.465624809265137, "rewards/rejected": -21.371875762939453, "step": 10360 }, { "epoch": 3.904367469879518, "grad_norm": 0.15968264984793779, "learning_rate": 2.400225903614458e-08, "logits/chosen": -3.1207032203674316, "logits/rejected": -3.3177733421325684, "logps/chosen": -491.3374938964844, "logps/rejected": -523.0499877929688, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -7.523828029632568, "rewards/margins": 12.720312118530273, "rewards/rejected": -20.234375, "step": 10370 }, { "epoch": 3.908132530120482, "grad_norm": 0.4630715584000286, "learning_rate": 2.3060993975903612e-08, "logits/chosen": -3.173828125, "logits/rejected": -3.4332032203674316, "logps/chosen": -461.6499938964844, "logps/rejected": -544.75, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -8.568750381469727, "rewards/margins": 13.052343368530273, "rewards/rejected": -21.6171875, "step": 10380 }, { "epoch": 3.9118975903614457, "grad_norm": 0.24065777023041734, "learning_rate": 2.2119728915662652e-08, "logits/chosen": -3.2828125953674316, "logits/rejected": -3.5511717796325684, "logps/chosen": -446.375, "logps/rejected": -484.70001220703125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -7.695703029632568, "rewards/margins": 12.671875, "rewards/rejected": -20.381250381469727, "step": 10390 }, { "epoch": 3.9156626506024095, "grad_norm": 0.22018539657406058, "learning_rate": 2.1178463855421685e-08, "logits/chosen": -3.0640625953674316, "logits/rejected": -3.224609375, "logps/chosen": -536.4249877929688, "logps/rejected": -558.9000244140625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -7.83984375, "rewards/margins": 13.987500190734863, "rewards/rejected": -21.821874618530273, "step": 10400 }, { "epoch": 3.9194277108433733, "grad_norm": 1.2788580487376686, "learning_rate": 2.0237198795180722e-08, "logits/chosen": -2.950390577316284, "logits/rejected": -3.302734375, "logps/chosen": -498.1499938964844, "logps/rejected": -546.4000244140625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -7.922265529632568, "rewards/margins": 13.596875190734863, "rewards/rejected": -21.518749237060547, "step": 10410 }, { "epoch": 3.9231927710843375, "grad_norm": 0.1571472257095392, "learning_rate": 1.9295933734939758e-08, "logits/chosen": -3.1187500953674316, "logits/rejected": -3.382031202316284, "logps/chosen": -472.0, "logps/rejected": -546.5499877929688, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -7.053124904632568, "rewards/margins": 12.473437309265137, "rewards/rejected": -19.532812118530273, "step": 10420 }, { "epoch": 3.9269578313253013, "grad_norm": 0.2694204775964876, "learning_rate": 1.8354668674698795e-08, "logits/chosen": -2.986523389816284, "logits/rejected": -3.270703077316284, "logps/chosen": -506.07501220703125, "logps/rejected": -529.75, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -8.080370903015137, "rewards/margins": 13.2578125, "rewards/rejected": -21.350000381469727, "step": 10430 }, { "epoch": 3.930722891566265, "grad_norm": 0.264928349855542, "learning_rate": 1.741340361445783e-08, "logits/chosen": -3.274218797683716, "logits/rejected": -3.4761719703674316, "logps/chosen": -456.6000061035156, "logps/rejected": -546.2999877929688, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -7.641015529632568, "rewards/margins": 13.208593368530273, "rewards/rejected": -20.831249237060547, "step": 10440 }, { "epoch": 3.934487951807229, "grad_norm": 0.504148654932743, "learning_rate": 1.6472138554216868e-08, "logits/chosen": -3.064453125, "logits/rejected": -3.43359375, "logps/chosen": -478.42498779296875, "logps/rejected": -524.2999877929688, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -7.391015529632568, "rewards/margins": 13.348437309265137, "rewards/rejected": -20.748437881469727, "step": 10450 }, { "epoch": 3.9382530120481927, "grad_norm": 5.284742981862749, "learning_rate": 1.5530873493975905e-08, "logits/chosen": -3.2515625953674316, "logits/rejected": -3.4007811546325684, "logps/chosen": -468.75, "logps/rejected": -569.1500244140625, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -7.90625, "rewards/margins": 14.705469131469727, "rewards/rejected": -22.621875762939453, "step": 10460 }, { "epoch": 3.9420180722891565, "grad_norm": 12.118183339002215, "learning_rate": 1.4589608433734938e-08, "logits/chosen": -3.197460889816284, "logits/rejected": -3.354296922683716, "logps/chosen": -506.17498779296875, "logps/rejected": -573.75, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -8.371874809265137, "rewards/margins": 13.207812309265137, "rewards/rejected": -21.579687118530273, "step": 10470 }, { "epoch": 3.9457831325301207, "grad_norm": 0.21151603855330753, "learning_rate": 1.3648343373493974e-08, "logits/chosen": -3.2457032203674316, "logits/rejected": -3.483203172683716, "logps/chosen": -441.7250061035156, "logps/rejected": -516.75, "loss": 0.0096, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.706640720367432, "rewards/margins": 13.142187118530273, "rewards/rejected": -20.84375, "step": 10480 }, { "epoch": 3.9495481927710845, "grad_norm": 23.672528317191194, "learning_rate": 1.2707078313253011e-08, "logits/chosen": -3.16796875, "logits/rejected": -3.305468797683716, "logps/chosen": -545.6749877929688, "logps/rejected": -593.3499755859375, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -8.276952743530273, "rewards/margins": 13.509374618530273, "rewards/rejected": -21.795312881469727, "step": 10490 }, { "epoch": 3.9533132530120483, "grad_norm": 2.0118463299347233, "learning_rate": 1.1765813253012048e-08, "logits/chosen": -3.279296875, "logits/rejected": -3.359375, "logps/chosen": -491.5249938964844, "logps/rejected": -562.5, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -8.266797065734863, "rewards/margins": 13.765625, "rewards/rejected": -22.029687881469727, "step": 10500 }, { "epoch": 3.957078313253012, "grad_norm": 0.1416801085480074, "learning_rate": 1.0824548192771083e-08, "logits/chosen": -3.1546874046325684, "logits/rejected": -3.447265625, "logps/chosen": -484.57501220703125, "logps/rejected": -526.2999877929688, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -7.553124904632568, "rewards/margins": 12.504687309265137, "rewards/rejected": -20.049999237060547, "step": 10510 }, { "epoch": 3.960843373493976, "grad_norm": 3.8731134359838593, "learning_rate": 9.883283132530119e-09, "logits/chosen": -3.3617186546325684, "logits/rejected": -3.580078125, "logps/chosen": -459.17498779296875, "logps/rejected": -534.4500122070312, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -8.46484375, "rewards/margins": 13.625781059265137, "rewards/rejected": -22.09375, "step": 10520 }, { "epoch": 3.9646084337349397, "grad_norm": 0.4342282993345563, "learning_rate": 8.942018072289156e-09, "logits/chosen": -3.189453125, "logits/rejected": -3.4046874046325684, "logps/chosen": -473.92498779296875, "logps/rejected": -565.25, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -8.126172065734863, "rewards/margins": 13.799219131469727, "rewards/rejected": -21.928125381469727, "step": 10530 }, { "epoch": 3.9683734939759034, "grad_norm": 0.14368172923130088, "learning_rate": 8.000753012048192e-09, "logits/chosen": -3.23046875, "logits/rejected": -3.3628907203674316, "logps/chosen": -536.4000244140625, "logps/rejected": -592.7000122070312, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -8.01171875, "rewards/margins": 13.1640625, "rewards/rejected": -21.168750762939453, "step": 10540 }, { "epoch": 3.9721385542168672, "grad_norm": 0.09993421856942902, "learning_rate": 7.059487951807229e-09, "logits/chosen": -3.223437547683716, "logits/rejected": -3.345703125, "logps/chosen": -484.95001220703125, "logps/rejected": -566.7999877929688, "loss": 0.0078, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -8.398828506469727, "rewards/margins": 13.28515625, "rewards/rejected": -21.690624237060547, "step": 10550 }, { "epoch": 3.9759036144578315, "grad_norm": 3.0417676029216896, "learning_rate": 6.118222891566265e-09, "logits/chosen": -3.0439453125, "logits/rejected": -3.360156297683716, "logps/chosen": -507.54998779296875, "logps/rejected": -557.4000244140625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.210156440734863, "rewards/margins": 13.678125381469727, "rewards/rejected": -20.887500762939453, "step": 10560 }, { "epoch": 3.9796686746987953, "grad_norm": 0.8662040692381247, "learning_rate": 5.176957831325301e-09, "logits/chosen": -3.2710938453674316, "logits/rejected": -3.456249952316284, "logps/chosen": -466.95001220703125, "logps/rejected": -526.4000244140625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -8.540820121765137, "rewards/margins": 12.693750381469727, "rewards/rejected": -21.232812881469727, "step": 10570 }, { "epoch": 3.983433734939759, "grad_norm": 0.34707501555328996, "learning_rate": 4.235692771084337e-09, "logits/chosen": -3.317578077316284, "logits/rejected": -3.39453125, "logps/chosen": -451.1000061035156, "logps/rejected": -562.3499755859375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -8.630468368530273, "rewards/margins": 12.643750190734863, "rewards/rejected": -21.262500762939453, "step": 10580 }, { "epoch": 3.987198795180723, "grad_norm": 0.2138835087408416, "learning_rate": 3.2944277108433736e-09, "logits/chosen": -3.135937452316284, "logits/rejected": -3.325390577316284, "logps/chosen": -494.1000061035156, "logps/rejected": -555.0499877929688, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -8.311718940734863, "rewards/margins": 13.509374618530273, "rewards/rejected": -21.810937881469727, "step": 10590 }, { "epoch": 3.9909638554216866, "grad_norm": 0.31916940166268276, "learning_rate": 2.3531626506024098e-09, "logits/chosen": -3.2621092796325684, "logits/rejected": -3.393359422683716, "logps/chosen": -484.375, "logps/rejected": -535.3499755859375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.768750190734863, "rewards/margins": 13.590624809265137, "rewards/rejected": -21.357812881469727, "step": 10600 }, { "epoch": 3.994728915662651, "grad_norm": 1.1414418752848086, "learning_rate": 1.4118975903614457e-09, "logits/chosen": -3.262890577316284, "logits/rejected": -3.4898438453674316, "logps/chosen": -495.04998779296875, "logps/rejected": -536.4000244140625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -7.374218940734863, "rewards/margins": 13.1640625, "rewards/rejected": -20.535938262939453, "step": 10610 }, { "epoch": 3.9984939759036147, "grad_norm": 0.7373645597098939, "learning_rate": 4.706325301204819e-10, "logits/chosen": -2.977343797683716, "logits/rejected": -3.2367186546325684, "logps/chosen": -576.9500122070312, "logps/rejected": -576.5499877929688, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -8.208593368530273, "rewards/margins": 12.853124618530273, "rewards/rejected": -21.075000762939453, "step": 10620 } ], "logging_steps": 10, "max_steps": 10624, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }