{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.6153846153846154, "eval_steps": 20, "global_step": 700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004615384615384616, "grad_norm": 77.62861420400046, "learning_rate": 1.1494252873563218e-08, "logits/chosen": -0.39378097653388977, "logits/rejected": -0.392289400100708, "logps/chosen": -26.18341064453125, "logps/rejected": -36.94904708862305, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.009230769230769232, "grad_norm": 80.08246728297878, "learning_rate": 2.2988505747126436e-08, "logits/chosen": -0.44716474413871765, "logits/rejected": -0.44228029251098633, "logps/chosen": -17.735511779785156, "logps/rejected": -21.76218605041504, "loss": 0.6997, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.007351329084485769, "rewards/margins": -0.02768969163298607, "rewards/rejected": 0.02033836394548416, "step": 4 }, { "epoch": 0.013846153846153847, "grad_norm": 76.89861808590157, "learning_rate": 3.448275862068965e-08, "logits/chosen": -0.3927414119243622, "logits/rejected": -0.3852512836456299, "logps/chosen": -26.153886795043945, "logps/rejected": -35.22654342651367, "loss": 0.694, "rewards/accuracies": 0.3611111044883728, "rewards/chosen": 0.010106331668794155, "rewards/margins": -0.00652913236990571, "rewards/rejected": 0.016635464504361153, "step": 6 }, { "epoch": 0.018461538461538463, "grad_norm": 78.45131228879585, "learning_rate": 4.597701149425287e-08, "logits/chosen": -0.3985685110092163, "logits/rejected": -0.39009273052215576, "logps/chosen": -31.09583854675293, "logps/rejected": -49.36472702026367, "loss": 0.6879, "rewards/accuracies": 0.5277777910232544, "rewards/chosen": 0.015792137011885643, "rewards/margins": 0.02373766154050827, "rewards/rejected": -0.007945524528622627, "step": 8 }, { "epoch": 0.023076923076923078, "grad_norm": 82.89860130430746, "learning_rate": 5.747126436781609e-08, "logits/chosen": -0.39535653591156006, "logits/rejected": -0.39429333806037903, "logps/chosen": -22.599395751953125, "logps/rejected": -23.222135543823242, "loss": 0.6866, "rewards/accuracies": 0.6388888955116272, "rewards/chosen": 0.007150494493544102, "rewards/margins": 0.02798917144536972, "rewards/rejected": -0.020838677883148193, "step": 10 }, { "epoch": 0.027692307692307693, "grad_norm": 117.39494231799054, "learning_rate": 6.89655172413793e-08, "logits/chosen": -0.41870343685150146, "logits/rejected": -0.40543413162231445, "logps/chosen": -24.215105056762695, "logps/rejected": -51.66640853881836, "loss": 0.6942, "rewards/accuracies": 0.5, "rewards/chosen": -0.002258555730804801, "rewards/margins": -0.0022148755379021168, "rewards/rejected": -4.368026930023916e-05, "step": 12 }, { "epoch": 0.03230769230769231, "grad_norm": 119.11292176389695, "learning_rate": 8.045977011494252e-08, "logits/chosen": -0.39709413051605225, "logits/rejected": -0.3866692781448364, "logps/chosen": -40.26705551147461, "logps/rejected": -49.558101654052734, "loss": 0.7002, "rewards/accuracies": 0.4166666567325592, "rewards/chosen": -0.0014084478607401252, "rewards/margins": -0.0035214456729590893, "rewards/rejected": 0.002112997928634286, "step": 14 }, { "epoch": 0.036923076923076927, "grad_norm": 79.4074561971729, "learning_rate": 9.195402298850574e-08, "logits/chosen": -0.4073406755924225, "logits/rejected": -0.40320590138435364, "logps/chosen": -14.51986026763916, "logps/rejected": -33.133087158203125, "loss": 0.6954, "rewards/accuracies": 0.5, "rewards/chosen": -0.02937469817698002, "rewards/margins": 0.01247786171734333, "rewards/rejected": -0.04185255989432335, "step": 16 }, { "epoch": 0.04153846153846154, "grad_norm": 77.82159916717212, "learning_rate": 1.0344827586206897e-07, "logits/chosen": -0.3760643005371094, "logits/rejected": -0.372781902551651, "logps/chosen": -33.00544357299805, "logps/rejected": -26.901779174804688, "loss": 0.6849, "rewards/accuracies": 0.3611111044883728, "rewards/chosen": 0.0013620555400848389, "rewards/margins": -0.003130674362182617, "rewards/rejected": 0.004492729902267456, "step": 18 }, { "epoch": 0.046153846153846156, "grad_norm": 108.8932368559959, "learning_rate": 1.1494252873563217e-07, "logits/chosen": -0.4303821325302124, "logits/rejected": -0.424153596162796, "logps/chosen": -19.584293365478516, "logps/rejected": -45.71126174926758, "loss": 0.7056, "rewards/accuracies": 0.3888888955116272, "rewards/chosen": -0.03170390427112579, "rewards/margins": -0.04419155791401863, "rewards/rejected": 0.012487656436860561, "step": 20 }, { "epoch": 0.046153846153846156, "eval_logits/chosen": -0.4027663469314575, "eval_logits/rejected": -0.39729759097099304, "eval_logps/chosen": -28.84078598022461, "eval_logps/rejected": -34.18613052368164, "eval_loss": 0.6907632350921631, "eval_rewards/accuracies": 0.46889400482177734, "eval_rewards/chosen": 0.008829275146126747, "eval_rewards/margins": 0.009880865924060345, "eval_rewards/rejected": -0.0010515897301957011, "eval_runtime": 507.109, "eval_samples_per_second": 3.419, "eval_steps_per_second": 0.428, "step": 20 }, { "epoch": 0.05076923076923077, "grad_norm": 101.61777237089647, "learning_rate": 1.2643678160919542e-07, "logits/chosen": -0.384340763092041, "logits/rejected": -0.37305957078933716, "logps/chosen": -30.11992645263672, "logps/rejected": -47.615745544433594, "loss": 0.6883, "rewards/accuracies": 0.5, "rewards/chosen": -0.004662156105041504, "rewards/margins": 0.019971728324890137, "rewards/rejected": -0.02463388442993164, "step": 22 }, { "epoch": 0.055384615384615386, "grad_norm": 81.66770019708967, "learning_rate": 1.379310344827586e-07, "logits/chosen": -0.3925296664237976, "logits/rejected": -0.38723400235176086, "logps/chosen": -39.30214309692383, "logps/rejected": -34.80133819580078, "loss": 0.6872, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": -0.005930910352617502, "rewards/margins": 0.017527200281620026, "rewards/rejected": -0.02345811016857624, "step": 24 }, { "epoch": 0.06, "grad_norm": 70.20934502195146, "learning_rate": 1.4942528735632184e-07, "logits/chosen": -0.37832385301589966, "logits/rejected": -0.37566468119621277, "logps/chosen": -37.249229431152344, "logps/rejected": -24.97306251525879, "loss": 0.704, "rewards/accuracies": 0.5, "rewards/chosen": -0.02592300996184349, "rewards/margins": -0.010374132543802261, "rewards/rejected": -0.015548878349363804, "step": 26 }, { "epoch": 0.06461538461538462, "grad_norm": 99.3771053784467, "learning_rate": 1.6091954022988505e-07, "logits/chosen": -0.4069358706474304, "logits/rejected": -0.39999136328697205, "logps/chosen": -30.038801193237305, "logps/rejected": -63.18869400024414, "loss": 0.67, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.01920890063047409, "rewards/margins": 0.0764741525053978, "rewards/rejected": -0.05726524814963341, "step": 28 }, { "epoch": 0.06923076923076923, "grad_norm": 64.88721078392425, "learning_rate": 1.7241379310344828e-07, "logits/chosen": -0.4044470191001892, "logits/rejected": -0.3935093879699707, "logps/chosen": -28.796165466308594, "logps/rejected": -35.16952896118164, "loss": 0.6859, "rewards/accuracies": 0.5833333134651184, "rewards/chosen": 0.013178395107388496, "rewards/margins": 0.027274958789348602, "rewards/rejected": -0.01409656461328268, "step": 30 }, { "epoch": 0.07384615384615385, "grad_norm": 74.32683930411818, "learning_rate": 1.839080459770115e-07, "logits/chosen": -0.42920663952827454, "logits/rejected": -0.4239201247692108, "logps/chosen": -31.740190505981445, "logps/rejected": -28.374773025512695, "loss": 0.6812, "rewards/accuracies": 0.5277777910232544, "rewards/chosen": -0.00046405859757214785, "rewards/margins": 0.02151392214000225, "rewards/rejected": -0.02197798155248165, "step": 32 }, { "epoch": 0.07846153846153846, "grad_norm": 72.79202223720442, "learning_rate": 1.9540229885057472e-07, "logits/chosen": -0.4149067997932434, "logits/rejected": -0.4115830659866333, "logps/chosen": -29.186758041381836, "logps/rejected": -37.32665252685547, "loss": 0.6764, "rewards/accuracies": 0.5555555820465088, "rewards/chosen": -0.02662133239209652, "rewards/margins": 0.03733319416642189, "rewards/rejected": -0.06395452469587326, "step": 34 }, { "epoch": 0.08307692307692308, "grad_norm": 77.0819305761821, "learning_rate": 2.0689655172413793e-07, "logits/chosen": -0.3898068964481354, "logits/rejected": -0.38638022541999817, "logps/chosen": -30.309282302856445, "logps/rejected": -24.573936462402344, "loss": 0.6791, "rewards/accuracies": 0.5555555820465088, "rewards/chosen": 0.0286308191716671, "rewards/margins": 0.04911021888256073, "rewards/rejected": -0.020479395985603333, "step": 36 }, { "epoch": 0.0876923076923077, "grad_norm": 76.51367020428489, "learning_rate": 2.1839080459770114e-07, "logits/chosen": -0.4303935468196869, "logits/rejected": -0.4284617006778717, "logps/chosen": -29.51013946533203, "logps/rejected": -41.60896682739258, "loss": 0.674, "rewards/accuracies": 0.6111111044883728, "rewards/chosen": -0.019182804971933365, "rewards/margins": 0.12665918469429016, "rewards/rejected": -0.14584198594093323, "step": 38 }, { "epoch": 0.09230769230769231, "grad_norm": 63.3916809364922, "learning_rate": 2.2988505747126435e-07, "logits/chosen": -0.37574881315231323, "logits/rejected": -0.3659113943576813, "logps/chosen": -36.16166687011719, "logps/rejected": -42.74641418457031, "loss": 0.6584, "rewards/accuracies": 0.6944444179534912, "rewards/chosen": 0.022213276475667953, "rewards/margins": 0.16043633222579956, "rewards/rejected": -0.1382230669260025, "step": 40 }, { "epoch": 0.09230769230769231, "eval_logits/chosen": -0.4059803783893585, "eval_logits/rejected": -0.40035995841026306, "eval_logps/chosen": -28.868152618408203, "eval_logps/rejected": -34.372467041015625, "eval_loss": 0.6648333072662354, "eval_rewards/accuracies": 0.5967742204666138, "eval_rewards/chosen": -0.004853636492043734, "eval_rewards/margins": 0.08936604112386703, "eval_rewards/rejected": -0.09421967715024948, "eval_runtime": 507.4088, "eval_samples_per_second": 3.417, "eval_steps_per_second": 0.428, "step": 40 }, { "epoch": 0.09692307692307692, "grad_norm": 67.22743409960357, "learning_rate": 2.413793103448276e-07, "logits/chosen": -0.37783488631248474, "logits/rejected": -0.3711128532886505, "logps/chosen": -24.303695678710938, "logps/rejected": -39.026466369628906, "loss": 0.6771, "rewards/accuracies": 0.6111111044883728, "rewards/chosen": -0.03717127442359924, "rewards/margins": 0.09637530148029327, "rewards/rejected": -0.13354657590389252, "step": 42 }, { "epoch": 0.10153846153846154, "grad_norm": 75.76051856227585, "learning_rate": 2.5287356321839084e-07, "logits/chosen": -0.40161463618278503, "logits/rejected": -0.39407879114151, "logps/chosen": -27.27153205871582, "logps/rejected": -59.36478805541992, "loss": 0.6396, "rewards/accuracies": 0.5833333134651184, "rewards/chosen": -0.038735900074243546, "rewards/margins": 0.16949793696403503, "rewards/rejected": -0.20823383331298828, "step": 44 }, { "epoch": 0.10615384615384615, "grad_norm": 70.36879349750134, "learning_rate": 2.64367816091954e-07, "logits/chosen": -0.40112772583961487, "logits/rejected": -0.3897283375263214, "logps/chosen": -37.23591232299805, "logps/rejected": -31.54336166381836, "loss": 0.6628, "rewards/accuracies": 0.5277777910232544, "rewards/chosen": -0.026717450469732285, "rewards/margins": 0.08677893131971359, "rewards/rejected": -0.11349637061357498, "step": 46 }, { "epoch": 0.11076923076923077, "grad_norm": 65.06224759698821, "learning_rate": 2.758620689655172e-07, "logits/chosen": -0.42118698358535767, "logits/rejected": -0.41589176654815674, "logps/chosen": -28.167463302612305, "logps/rejected": -40.67127227783203, "loss": 0.6652, "rewards/accuracies": 0.6388888955116272, "rewards/chosen": -0.06725379824638367, "rewards/margins": 0.17473283410072327, "rewards/rejected": -0.24198664724826813, "step": 48 }, { "epoch": 0.11538461538461539, "grad_norm": 62.17884271669337, "learning_rate": 2.873563218390804e-07, "logits/chosen": -0.37570783495903015, "logits/rejected": -0.3694818317890167, "logps/chosen": -33.85892105102539, "logps/rejected": -34.08824920654297, "loss": 0.6478, "rewards/accuracies": 0.6111111044883728, "rewards/chosen": -0.0616380050778389, "rewards/margins": 0.15994474291801453, "rewards/rejected": -0.22158274054527283, "step": 50 }, { "epoch": 0.12, "grad_norm": 66.02767666235415, "learning_rate": 2.988505747126437e-07, "logits/chosen": -0.4029790461063385, "logits/rejected": -0.39577487111091614, "logps/chosen": -33.29621505737305, "logps/rejected": -43.378292083740234, "loss": 0.6306, "rewards/accuracies": 0.6388888955116272, "rewards/chosen": -0.06801849603652954, "rewards/margins": 0.24606987833976746, "rewards/rejected": -0.3140883445739746, "step": 52 }, { "epoch": 0.12461538461538461, "grad_norm": 59.67862301913293, "learning_rate": 3.103448275862069e-07, "logits/chosen": -0.3944869637489319, "logits/rejected": -0.3836838901042938, "logps/chosen": -23.152067184448242, "logps/rejected": -61.55735397338867, "loss": 0.6302, "rewards/accuracies": 0.75, "rewards/chosen": -0.11231276392936707, "rewards/margins": 0.5306358337402344, "rewards/rejected": -0.642948567867279, "step": 54 }, { "epoch": 0.12923076923076923, "grad_norm": 61.05637248259778, "learning_rate": 3.218390804597701e-07, "logits/chosen": -0.4085512161254883, "logits/rejected": -0.40952953696250916, "logps/chosen": -28.892059326171875, "logps/rejected": -32.07326889038086, "loss": 0.6393, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.040772695094347, "rewards/margins": 0.13633577525615692, "rewards/rejected": -0.17710846662521362, "step": 56 }, { "epoch": 0.13384615384615384, "grad_norm": 69.18026553461712, "learning_rate": 3.333333333333333e-07, "logits/chosen": -0.42128846049308777, "logits/rejected": -0.4181976020336151, "logps/chosen": -31.82688331604004, "logps/rejected": -25.970035552978516, "loss": 0.607, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.0521199107170105, "rewards/margins": 0.2045416533946991, "rewards/rejected": -0.2566615641117096, "step": 58 }, { "epoch": 0.13846153846153847, "grad_norm": 60.695212475757, "learning_rate": 3.4482758620689656e-07, "logits/chosen": -0.4786983132362366, "logits/rejected": -0.47286850214004517, "logps/chosen": -24.57256317138672, "logps/rejected": -31.28465461730957, "loss": 0.6053, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.12995412945747375, "rewards/margins": 0.30061429738998413, "rewards/rejected": -0.4305684268474579, "step": 60 }, { "epoch": 0.13846153846153847, "eval_logits/chosen": -0.4070414900779724, "eval_logits/rejected": -0.40151268243789673, "eval_logps/chosen": -29.108257293701172, "eval_logps/rejected": -35.0603141784668, "eval_loss": 0.6001957654953003, "eval_rewards/accuracies": 0.7569124698638916, "eval_rewards/chosen": -0.12490677088499069, "eval_rewards/margins": 0.3132374584674835, "eval_rewards/rejected": -0.4381442368030548, "eval_runtime": 502.9842, "eval_samples_per_second": 3.447, "eval_steps_per_second": 0.431, "step": 60 }, { "epoch": 0.14307692307692307, "grad_norm": 56.630746732460224, "learning_rate": 3.5632183908045977e-07, "logits/chosen": -0.38231950998306274, "logits/rejected": -0.37658825516700745, "logps/chosen": -31.32072639465332, "logps/rejected": -38.20649337768555, "loss": 0.6028, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": -0.18813973665237427, "rewards/margins": 0.3638599216938019, "rewards/rejected": -0.5519996881484985, "step": 62 }, { "epoch": 0.1476923076923077, "grad_norm": 66.55026033516815, "learning_rate": 3.67816091954023e-07, "logits/chosen": -0.3958435654640198, "logits/rejected": -0.39101698994636536, "logps/chosen": -35.14918518066406, "logps/rejected": -27.609230041503906, "loss": 0.5751, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -0.14902949333190918, "rewards/margins": 0.24196968972682953, "rewards/rejected": -0.3909991681575775, "step": 64 }, { "epoch": 0.1523076923076923, "grad_norm": 57.72064529374417, "learning_rate": 3.793103448275862e-07, "logits/chosen": -0.45348331332206726, "logits/rejected": -0.44826263189315796, "logps/chosen": -21.034719467163086, "logps/rejected": -37.23699188232422, "loss": 0.5605, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.17452064156532288, "rewards/margins": 0.41191089153289795, "rewards/rejected": -0.586431622505188, "step": 66 }, { "epoch": 0.15692307692307692, "grad_norm": 61.678241231096024, "learning_rate": 3.9080459770114945e-07, "logits/chosen": -0.396350234746933, "logits/rejected": -0.38391950726509094, "logps/chosen": -23.88064956665039, "logps/rejected": -44.434303283691406, "loss": 0.5514, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -0.2237495332956314, "rewards/margins": 0.720125138759613, "rewards/rejected": -0.9438745975494385, "step": 68 }, { "epoch": 0.16153846153846155, "grad_norm": 56.53414926436091, "learning_rate": 4.0229885057471266e-07, "logits/chosen": -0.4000495672225952, "logits/rejected": -0.3912370204925537, "logps/chosen": -27.373992919921875, "logps/rejected": -30.162841796875, "loss": 0.5373, "rewards/accuracies": 0.75, "rewards/chosen": -0.24068985879421234, "rewards/margins": 0.37481439113616943, "rewards/rejected": -0.6155042052268982, "step": 70 }, { "epoch": 0.16615384615384615, "grad_norm": 58.402369553225924, "learning_rate": 4.1379310344827586e-07, "logits/chosen": -0.36843451857566833, "logits/rejected": -0.3666560649871826, "logps/chosen": -22.388916015625, "logps/rejected": -31.36478042602539, "loss": 0.5261, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.26846611499786377, "rewards/margins": 0.35641592741012573, "rewards/rejected": -0.6248820424079895, "step": 72 }, { "epoch": 0.17076923076923076, "grad_norm": 47.84253412577215, "learning_rate": 4.25287356321839e-07, "logits/chosen": -0.44409671425819397, "logits/rejected": -0.4389974772930145, "logps/chosen": -36.514095306396484, "logps/rejected": -20.807750701904297, "loss": 0.531, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": -0.3515840470790863, "rewards/margins": 0.3204071819782257, "rewards/rejected": -0.671991229057312, "step": 74 }, { "epoch": 0.1753846153846154, "grad_norm": 56.7179390915731, "learning_rate": 4.367816091954023e-07, "logits/chosen": -0.4147721230983734, "logits/rejected": -0.4097798764705658, "logps/chosen": -22.868803024291992, "logps/rejected": -34.208255767822266, "loss": 0.5108, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -0.33877652883529663, "rewards/margins": 0.41241833567619324, "rewards/rejected": -0.7511948347091675, "step": 76 }, { "epoch": 0.18, "grad_norm": 47.3547653876473, "learning_rate": 4.482758620689655e-07, "logits/chosen": -0.45158883929252625, "logits/rejected": -0.445837140083313, "logps/chosen": -22.202173233032227, "logps/rejected": -38.98337936401367, "loss": 0.5301, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4212031364440918, "rewards/margins": 0.5504710674285889, "rewards/rejected": -0.9716740846633911, "step": 78 }, { "epoch": 0.18461538461538463, "grad_norm": 52.40158205205406, "learning_rate": 4.597701149425287e-07, "logits/chosen": -0.41750791668891907, "logits/rejected": -0.40789926052093506, "logps/chosen": -20.486360549926758, "logps/rejected": -25.128028869628906, "loss": 0.4791, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -0.2454337775707245, "rewards/margins": 0.6140785813331604, "rewards/rejected": -0.8595123291015625, "step": 80 }, { "epoch": 0.18461538461538463, "eval_logits/chosen": -0.4095688462257385, "eval_logits/rejected": -0.4042481482028961, "eval_logps/chosen": -29.752004623413086, "eval_logps/rejected": -36.58811950683594, "eval_loss": 0.4853779375553131, "eval_rewards/accuracies": 0.7845622301101685, "eval_rewards/chosen": -0.4467814266681671, "eval_rewards/margins": 0.7552650570869446, "eval_rewards/rejected": -1.2020463943481445, "eval_runtime": 506.2237, "eval_samples_per_second": 3.425, "eval_steps_per_second": 0.429, "step": 80 }, { "epoch": 0.18923076923076923, "grad_norm": 43.72387138390406, "learning_rate": 4.712643678160919e-07, "logits/chosen": -0.39603105187416077, "logits/rejected": -0.3899454176425934, "logps/chosen": -23.087444305419922, "logps/rejected": -35.859256744384766, "loss": 0.446, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -0.458638072013855, "rewards/margins": 0.8271440863609314, "rewards/rejected": -1.2857822179794312, "step": 82 }, { "epoch": 0.19384615384615383, "grad_norm": 44.83975322257744, "learning_rate": 4.827586206896552e-07, "logits/chosen": -0.4487308859825134, "logits/rejected": -0.43710604310035706, "logps/chosen": -25.820518493652344, "logps/rejected": -68.19234466552734, "loss": 0.4273, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.5129004120826721, "rewards/margins": 1.662545919418335, "rewards/rejected": -2.1754465103149414, "step": 84 }, { "epoch": 0.19846153846153847, "grad_norm": 54.93176049768621, "learning_rate": 4.942528735632184e-07, "logits/chosen": -0.45068052411079407, "logits/rejected": -0.4449460804462433, "logps/chosen": -27.34906578063965, "logps/rejected": -41.636783599853516, "loss": 0.4362, "rewards/accuracies": 0.75, "rewards/chosen": -0.7617433071136475, "rewards/margins": 0.9456864595413208, "rewards/rejected": -1.7074297666549683, "step": 86 }, { "epoch": 0.20307692307692307, "grad_norm": 45.8440019669453, "learning_rate": 4.999979670146248e-07, "logits/chosen": -0.3581588566303253, "logits/rejected": -0.35617658495903015, "logps/chosen": -32.53798294067383, "logps/rejected": -25.812297821044922, "loss": 0.4268, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.518390417098999, "rewards/margins": 0.6697157621383667, "rewards/rejected": -1.1881061792373657, "step": 88 }, { "epoch": 0.2076923076923077, "grad_norm": 72.89823473398597, "learning_rate": 4.99981703330008e-07, "logits/chosen": -0.4502779245376587, "logits/rejected": -0.4415137767791748, "logps/chosen": -27.567726135253906, "logps/rejected": -42.95169448852539, "loss": 0.4505, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.8387865424156189, "rewards/margins": 1.254158854484558, "rewards/rejected": -2.092945098876953, "step": 90 }, { "epoch": 0.2123076923076923, "grad_norm": 31.30173564407195, "learning_rate": 4.99949177018813e-07, "logits/chosen": -0.37603163719177246, "logits/rejected": -0.3692868947982788, "logps/chosen": -24.33488655090332, "logps/rejected": -35.71257400512695, "loss": 0.3758, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -0.6546725630760193, "rewards/margins": 1.322723388671875, "rewards/rejected": -1.977396011352539, "step": 92 }, { "epoch": 0.2169230769230769, "grad_norm": 56.63562589010852, "learning_rate": 4.999003901970474e-07, "logits/chosen": -0.42034757137298584, "logits/rejected": -0.4143223166465759, "logps/chosen": -43.816368103027344, "logps/rejected": -30.78759765625, "loss": 0.4948, "rewards/accuracies": 0.6944444179534912, "rewards/chosen": -1.2731213569641113, "rewards/margins": 0.4397517442703247, "rewards/rejected": -1.7128729820251465, "step": 94 }, { "epoch": 0.22153846153846155, "grad_norm": 36.62178180991854, "learning_rate": 4.998353460385512e-07, "logits/chosen": -0.4228645861148834, "logits/rejected": -0.4190582036972046, "logps/chosen": -26.416019439697266, "logps/rejected": -41.38105773925781, "loss": 0.3797, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.8715569972991943, "rewards/margins": 1.3316190242767334, "rewards/rejected": -2.2031757831573486, "step": 96 }, { "epoch": 0.22615384615384615, "grad_norm": 45.578861983373265, "learning_rate": 4.997540487747892e-07, "logits/chosen": -0.4397938847541809, "logits/rejected": -0.4387800395488739, "logps/chosen": -31.172990798950195, "logps/rejected": -36.46592712402344, "loss": 0.3925, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": -1.025800108909607, "rewards/margins": 1.368214726448059, "rewards/rejected": -2.394014835357666, "step": 98 }, { "epoch": 0.23076923076923078, "grad_norm": 42.102342673823976, "learning_rate": 4.996565036945769e-07, "logits/chosen": -0.4058091640472412, "logits/rejected": -0.3902934789657593, "logps/chosen": -39.06974792480469, "logps/rejected": -31.91332244873047, "loss": 0.4001, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.8707465529441833, "rewards/margins": 1.09341299533844, "rewards/rejected": -1.9641594886779785, "step": 100 }, { "epoch": 0.23076923076923078, "eval_logits/chosen": -0.4174375534057617, "eval_logits/rejected": -0.41175583004951477, "eval_logps/chosen": -30.664905548095703, "eval_logps/rejected": -38.85734176635742, "eval_loss": 0.38428741693496704, "eval_rewards/accuracies": 0.7857142686843872, "eval_rewards/chosen": -0.9032310247421265, "eval_rewards/margins": 1.4334266185760498, "eval_rewards/rejected": -2.336657762527466, "eval_runtime": 507.513, "eval_samples_per_second": 3.417, "eval_steps_per_second": 0.428, "step": 100 }, { "epoch": 0.2353846153846154, "grad_norm": 30.53424675224716, "learning_rate": 4.995427171437356e-07, "logits/chosen": -0.4080338478088379, "logits/rejected": -0.4020484387874603, "logps/chosen": -23.007694244384766, "logps/rejected": -45.17842483520508, "loss": 0.3195, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -1.3807095289230347, "rewards/margins": 1.465849757194519, "rewards/rejected": -2.8465592861175537, "step": 102 }, { "epoch": 0.24, "grad_norm": 40.234017347381936, "learning_rate": 4.994126965246796e-07, "logits/chosen": -0.44685712456703186, "logits/rejected": -0.4413563311100006, "logps/chosen": -26.35468292236328, "logps/rejected": -36.53682327270508, "loss": 0.3306, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.9342181086540222, "rewards/margins": 1.5329720973968506, "rewards/rejected": -2.4671900272369385, "step": 104 }, { "epoch": 0.24461538461538462, "grad_norm": 41.88988266165195, "learning_rate": 4.992664502959351e-07, "logits/chosen": -0.42318201065063477, "logits/rejected": -0.4150063693523407, "logps/chosen": -27.9631404876709, "logps/rejected": -62.13208770751953, "loss": 0.2862, "rewards/accuracies": 0.9722222089767456, "rewards/chosen": -1.0194923877716064, "rewards/margins": 2.330876111984253, "rewards/rejected": -3.3503682613372803, "step": 106 }, { "epoch": 0.24923076923076923, "grad_norm": 39.10202150780922, "learning_rate": 4.991039879715898e-07, "logits/chosen": -0.3958354592323303, "logits/rejected": -0.3896031677722931, "logps/chosen": -27.523771286010742, "logps/rejected": -40.87248229980469, "loss": 0.3262, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -1.521608829498291, "rewards/margins": 1.5079147815704346, "rewards/rejected": -3.0295236110687256, "step": 108 }, { "epoch": 0.25384615384615383, "grad_norm": 36.63078833568431, "learning_rate": 4.989253201206736e-07, "logits/chosen": -0.420526921749115, "logits/rejected": -0.421051025390625, "logps/chosen": -25.432598114013672, "logps/rejected": -19.715707778930664, "loss": 0.3771, "rewards/accuracies": 0.7222222089767456, "rewards/chosen": -1.0420479774475098, "rewards/margins": 0.8879384994506836, "rewards/rejected": -1.9299864768981934, "step": 110 }, { "epoch": 0.25846153846153846, "grad_norm": 53.00134374506722, "learning_rate": 4.987304583664712e-07, "logits/chosen": -0.3946026563644409, "logits/rejected": -0.3954353928565979, "logps/chosen": -44.86084747314453, "logps/rejected": -54.95661926269531, "loss": 0.384, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": -1.299651861190796, "rewards/margins": 1.7314188480377197, "rewards/rejected": -3.0310707092285156, "step": 112 }, { "epoch": 0.2630769230769231, "grad_norm": 39.90720939417114, "learning_rate": 4.985194153857662e-07, "logits/chosen": -0.38169896602630615, "logits/rejected": -0.37973132729530334, "logps/chosen": -31.202735900878906, "logps/rejected": -27.499685287475586, "loss": 0.4059, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -1.3250929117202759, "rewards/margins": 1.301426649093628, "rewards/rejected": -2.6265194416046143, "step": 114 }, { "epoch": 0.2676923076923077, "grad_norm": 29.63846905129276, "learning_rate": 4.982922049080163e-07, "logits/chosen": -0.4111602008342743, "logits/rejected": -0.40882954001426697, "logps/chosen": -26.8565731048584, "logps/rejected": -37.42183303833008, "loss": 0.3351, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -1.1551212072372437, "rewards/margins": 1.4092646837234497, "rewards/rejected": -2.564385414123535, "step": 116 }, { "epoch": 0.2723076923076923, "grad_norm": 37.890391248858556, "learning_rate": 4.980488417144599e-07, "logits/chosen": -0.3708282709121704, "logits/rejected": -0.35484203696250916, "logps/chosen": -25.0501708984375, "logps/rejected": -81.45349884033203, "loss": 0.3826, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -1.1718049049377441, "rewards/margins": 3.5377848148345947, "rewards/rejected": -4.70958948135376, "step": 118 }, { "epoch": 0.27692307692307694, "grad_norm": 30.394654171618882, "learning_rate": 4.977893416371544e-07, "logits/chosen": -0.38731849193573, "logits/rejected": -0.3795092701911926, "logps/chosen": -26.774648666381836, "logps/rejected": -35.751922607421875, "loss": 0.3203, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -1.150585651397705, "rewards/margins": 2.026820659637451, "rewards/rejected": -3.177406072616577, "step": 120 }, { "epoch": 0.27692307692307694, "eval_logits/chosen": -0.41345661878585815, "eval_logits/rejected": -0.408238023519516, "eval_logps/chosen": -30.98581886291504, "eval_logps/rejected": -40.20292282104492, "eval_loss": 0.3370819687843323, "eval_rewards/accuracies": 0.7960829734802246, "eval_rewards/chosen": -1.0636885166168213, "eval_rewards/margins": 1.9457571506500244, "eval_rewards/rejected": -3.0094454288482666, "eval_runtime": 507.6848, "eval_samples_per_second": 3.416, "eval_steps_per_second": 0.427, "step": 120 }, { "epoch": 0.2815384615384615, "grad_norm": 31.08330328074679, "learning_rate": 4.975137215579469e-07, "logits/chosen": -0.3912862539291382, "logits/rejected": -0.3866044282913208, "logps/chosen": -41.20629119873047, "logps/rejected": -29.166568756103516, "loss": 0.3172, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -1.0720988512039185, "rewards/margins": 1.7669881582260132, "rewards/rejected": -2.8390870094299316, "step": 122 }, { "epoch": 0.28615384615384615, "grad_norm": 36.03577411995754, "learning_rate": 4.972219994073755e-07, "logits/chosen": -0.4430157244205475, "logits/rejected": -0.43155643343925476, "logps/chosen": -31.21184539794922, "logps/rejected": -75.72866821289062, "loss": 0.3293, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -1.414463996887207, "rewards/margins": 3.396198272705078, "rewards/rejected": -4.810662269592285, "step": 124 }, { "epoch": 0.2907692307692308, "grad_norm": 26.818821992064425, "learning_rate": 4.969141941635025e-07, "logits/chosen": -0.46176183223724365, "logits/rejected": -0.45913922786712646, "logps/chosen": -34.07700729370117, "logps/rejected": -46.17119216918945, "loss": 0.3067, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -1.42012357711792, "rewards/margins": 2.144068717956543, "rewards/rejected": -3.564192533493042, "step": 126 }, { "epoch": 0.2953846153846154, "grad_norm": 41.84026187312963, "learning_rate": 4.965903258506806e-07, "logits/chosen": -0.3863135278224945, "logits/rejected": -0.38551628589630127, "logps/chosen": -32.156890869140625, "logps/rejected": -36.172698974609375, "loss": 0.2817, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -1.1282252073287964, "rewards/margins": 1.8321247100830078, "rewards/rejected": -2.9603495597839355, "step": 128 }, { "epoch": 0.3, "grad_norm": 33.47106216198385, "learning_rate": 4.962504155382493e-07, "logits/chosen": -0.4146730601787567, "logits/rejected": -0.4086844027042389, "logps/chosen": -27.40284538269043, "logps/rejected": -31.536325454711914, "loss": 0.304, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.7551792860031128, "rewards/margins": 1.8165247440338135, "rewards/rejected": -2.5717039108276367, "step": 130 }, { "epoch": 0.3046153846153846, "grad_norm": 41.63671364987837, "learning_rate": 4.958944853391652e-07, "logits/chosen": -0.39497292041778564, "logits/rejected": -0.39087140560150146, "logps/chosen": -28.94989013671875, "logps/rejected": -35.44032287597656, "loss": 0.3446, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -1.1855361461639404, "rewards/margins": 1.9116501808166504, "rewards/rejected": -3.0971860885620117, "step": 132 }, { "epoch": 0.30923076923076925, "grad_norm": 36.036961293000005, "learning_rate": 4.955225584085624e-07, "logits/chosen": -0.4517758786678314, "logits/rejected": -0.4465089440345764, "logps/chosen": -23.50043487548828, "logps/rejected": -38.347137451171875, "loss": 0.3309, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.7946065068244934, "rewards/margins": 2.4905617237091064, "rewards/rejected": -3.285168409347534, "step": 134 }, { "epoch": 0.31384615384615383, "grad_norm": 26.618040874020934, "learning_rate": 4.951346589422467e-07, "logits/chosen": -0.41716066002845764, "logits/rejected": -0.41079726815223694, "logps/chosen": -32.53648376464844, "logps/rejected": -62.08100128173828, "loss": 0.3244, "rewards/accuracies": 0.75, "rewards/chosen": -1.0272445678710938, "rewards/margins": 2.8604350090026855, "rewards/rejected": -3.8876795768737793, "step": 136 }, { "epoch": 0.31846153846153846, "grad_norm": 46.97692992148951, "learning_rate": 4.94730812175122e-07, "logits/chosen": -0.4412165582180023, "logits/rejected": -0.4350200295448303, "logps/chosen": -33.020076751708984, "logps/rejected": -41.40916442871094, "loss": 0.3548, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": -1.1360547542572021, "rewards/margins": 1.9712878465652466, "rewards/rejected": -3.1073427200317383, "step": 138 }, { "epoch": 0.3230769230769231, "grad_norm": 31.685112161918536, "learning_rate": 4.943110443795476e-07, "logits/chosen": -0.44387710094451904, "logits/rejected": -0.4434346854686737, "logps/chosen": -36.96713638305664, "logps/rejected": -34.54549026489258, "loss": 0.314, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -1.3938254117965698, "rewards/margins": 1.7875707149505615, "rewards/rejected": -3.181396007537842, "step": 140 }, { "epoch": 0.3230769230769231, "eval_logits/chosen": -0.4154263436794281, "eval_logits/rejected": -0.41024988889694214, "eval_logps/chosen": -31.096752166748047, "eval_logps/rejected": -41.11034393310547, "eval_loss": 0.30621179938316345, "eval_rewards/accuracies": 0.8006912469863892, "eval_rewards/chosen": -1.119154930114746, "eval_rewards/margins": 2.344003438949585, "eval_rewards/rejected": -3.463158130645752, "eval_runtime": 507.9564, "eval_samples_per_second": 3.414, "eval_steps_per_second": 0.427, "step": 140 }, { "epoch": 0.32769230769230767, "grad_norm": 37.763110596552, "learning_rate": 4.938753828636297e-07, "logits/chosen": -0.40920180082321167, "logits/rejected": -0.406100869178772, "logps/chosen": -31.89576530456543, "logps/rejected": -25.759078979492188, "loss": 0.3698, "rewards/accuracies": 0.7222222089767456, "rewards/chosen": -1.025688648223877, "rewards/margins": 1.3222806453704834, "rewards/rejected": -2.3479690551757812, "step": 142 }, { "epoch": 0.3323076923076923, "grad_norm": 31.761758820762065, "learning_rate": 4.934238559694447e-07, "logits/chosen": -0.410123348236084, "logits/rejected": -0.3894162178039551, "logps/chosen": -28.004728317260742, "logps/rejected": -52.7032585144043, "loss": 0.2524, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -1.1515227556228638, "rewards/margins": 3.0199975967407227, "rewards/rejected": -4.171520233154297, "step": 144 }, { "epoch": 0.33692307692307694, "grad_norm": 35.03182406562912, "learning_rate": 4.929564930711957e-07, "logits/chosen": -0.3986736238002777, "logits/rejected": -0.38986000418663025, "logps/chosen": -23.621021270751953, "logps/rejected": -33.703369140625, "loss": 0.2815, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -1.2688480615615845, "rewards/margins": 2.2587625980377197, "rewards/rejected": -3.5276107788085938, "step": 146 }, { "epoch": 0.3415384615384615, "grad_norm": 28.27179448892304, "learning_rate": 4.924733245733008e-07, "logits/chosen": -0.45226117968559265, "logits/rejected": -0.45015546679496765, "logps/chosen": -35.56471633911133, "logps/rejected": -30.972333908081055, "loss": 0.2878, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -1.620578408241272, "rewards/margins": 1.8046082258224487, "rewards/rejected": -3.425187110900879, "step": 148 }, { "epoch": 0.34615384615384615, "grad_norm": 26.196797087704713, "learning_rate": 4.91974381908416e-07, "logits/chosen": -0.39081981778144836, "logits/rejected": -0.37716490030288696, "logps/chosen": -23.622695922851562, "logps/rejected": -66.68536376953125, "loss": 0.2559, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -1.1397902965545654, "rewards/margins": 4.303215980529785, "rewards/rejected": -5.4430060386657715, "step": 150 }, { "epoch": 0.3507692307692308, "grad_norm": 30.650177263316735, "learning_rate": 4.914596975353898e-07, "logits/chosen": -0.3720633387565613, "logits/rejected": -0.3665693998336792, "logps/chosen": -21.698226928710938, "logps/rejected": -27.71235466003418, "loss": 0.2953, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.6390548348426819, "rewards/margins": 1.9336141347885132, "rewards/rejected": -2.5726687908172607, "step": 152 }, { "epoch": 0.3553846153846154, "grad_norm": 51.98551487041421, "learning_rate": 4.909293049371519e-07, "logits/chosen": -0.3571963608264923, "logits/rejected": -0.3496527373790741, "logps/chosen": -43.302207946777344, "logps/rejected": -41.63944625854492, "loss": 0.2966, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -1.2618622779846191, "rewards/margins": 2.554023265838623, "rewards/rejected": -3.8158857822418213, "step": 154 }, { "epoch": 0.36, "grad_norm": 34.70591783391935, "learning_rate": 4.903832386185343e-07, "logits/chosen": -0.4354073405265808, "logits/rejected": -0.43241533637046814, "logps/chosen": -32.38044357299805, "logps/rejected": -34.786006927490234, "loss": 0.3071, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -1.2626214027404785, "rewards/margins": 1.8995192050933838, "rewards/rejected": -3.1621406078338623, "step": 156 }, { "epoch": 0.3646153846153846, "grad_norm": 31.13575589482068, "learning_rate": 4.89821534104028e-07, "logits/chosen": -0.3570865988731384, "logits/rejected": -0.35065561532974243, "logps/chosen": -26.85542869567871, "logps/rejected": -44.916175842285156, "loss": 0.2522, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": -1.25833261013031, "rewards/margins": 2.684157371520996, "rewards/rejected": -3.9424901008605957, "step": 158 }, { "epoch": 0.36923076923076925, "grad_norm": 36.957558600327985, "learning_rate": 4.892442279354698e-07, "logits/chosen": -0.39342963695526123, "logits/rejected": -0.38523659110069275, "logps/chosen": -37.74699401855469, "logps/rejected": -51.69345474243164, "loss": 0.3077, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -1.3077665567398071, "rewards/margins": 2.8542850017547607, "rewards/rejected": -4.162051677703857, "step": 160 }, { "epoch": 0.36923076923076925, "eval_logits/chosen": -0.4094816744327545, "eval_logits/rejected": -0.4042726457118988, "eval_logps/chosen": -31.450454711914062, "eval_logps/rejected": -41.97096252441406, "eval_loss": 0.2800266444683075, "eval_rewards/accuracies": 0.8145161271095276, "eval_rewards/chosen": -1.2960065603256226, "eval_rewards/margins": 2.5974619388580322, "eval_rewards/rejected": -3.893468141555786, "eval_runtime": 507.4751, "eval_samples_per_second": 3.417, "eval_steps_per_second": 0.428, "step": 160 }, { "epoch": 0.37384615384615383, "grad_norm": 44.84501904315108, "learning_rate": 4.886513576696673e-07, "logits/chosen": -0.45205196738243103, "logits/rejected": -0.4440386891365051, "logps/chosen": -35.41313171386719, "logps/rejected": -33.47145080566406, "loss": 0.2746, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -1.5193215608596802, "rewards/margins": 2.079702854156494, "rewards/rejected": -3.599024534225464, "step": 162 }, { "epoch": 0.37846153846153846, "grad_norm": 31.521724469250227, "learning_rate": 4.880429618759543e-07, "logits/chosen": -0.38396933674812317, "logits/rejected": -0.3828057050704956, "logps/chosen": -26.2564640045166, "logps/rejected": -31.42851448059082, "loss": 0.2991, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.8485214710235596, "rewards/margins": 2.313399076461792, "rewards/rejected": -3.1619203090667725, "step": 164 }, { "epoch": 0.3830769230769231, "grad_norm": 26.49383324524101, "learning_rate": 4.874190801336817e-07, "logits/chosen": -0.4060750901699066, "logits/rejected": -0.40515270829200745, "logps/chosen": -32.15226745605469, "logps/rejected": -42.27301025390625, "loss": 0.2492, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -1.1808879375457764, "rewards/margins": 2.775684118270874, "rewards/rejected": -3.9565718173980713, "step": 166 }, { "epoch": 0.38769230769230767, "grad_norm": 21.446603096164726, "learning_rate": 4.867797530296431e-07, "logits/chosen": -0.48834460973739624, "logits/rejected": -0.47467708587646484, "logps/chosen": -40.508033752441406, "logps/rejected": -61.70073699951172, "loss": 0.19, "rewards/accuracies": 0.9722222089767456, "rewards/chosen": -1.6477137804031372, "rewards/margins": 3.7894787788391113, "rewards/rejected": -5.437192916870117, "step": 168 }, { "epoch": 0.3923076923076923, "grad_norm": 25.454588319834855, "learning_rate": 4.861250221554343e-07, "logits/chosen": -0.4046197235584259, "logits/rejected": -0.39474785327911377, "logps/chosen": -25.933032989501953, "logps/rejected": -43.12753677368164, "loss": 0.2781, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -1.188935399055481, "rewards/margins": 3.230454683303833, "rewards/rejected": -4.419390678405762, "step": 170 }, { "epoch": 0.39692307692307693, "grad_norm": 21.570810462356484, "learning_rate": 4.854549301047476e-07, "logits/chosen": -0.4159725308418274, "logits/rejected": -0.412297785282135, "logps/chosen": -29.65894889831543, "logps/rejected": -29.721923828125, "loss": 0.2943, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -1.3663020133972168, "rewards/margins": 2.203003406524658, "rewards/rejected": -3.569305419921875, "step": 172 }, { "epoch": 0.4015384615384615, "grad_norm": 34.498122716048385, "learning_rate": 4.847695204706005e-07, "logits/chosen": -0.42067256569862366, "logits/rejected": -0.4190225601196289, "logps/chosen": -28.59151268005371, "logps/rejected": -32.56970977783203, "loss": 0.3004, "rewards/accuracies": 0.75, "rewards/chosen": -1.398697018623352, "rewards/margins": 1.9461326599121094, "rewards/rejected": -3.344829559326172, "step": 174 }, { "epoch": 0.40615384615384614, "grad_norm": 33.265958384248954, "learning_rate": 4.840688378425e-07, "logits/chosen": -0.4313834011554718, "logits/rejected": -0.42209184169769287, "logps/chosen": -32.20521545410156, "logps/rejected": -38.21830749511719, "loss": 0.2271, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -1.2593474388122559, "rewards/margins": 2.9152119159698486, "rewards/rejected": -4.174560070037842, "step": 176 }, { "epoch": 0.4107692307692308, "grad_norm": 32.01001371216788, "learning_rate": 4.833529278035422e-07, "logits/chosen": -0.4627840220928192, "logits/rejected": -0.4408462941646576, "logps/chosen": -23.77450180053711, "logps/rejected": -79.35948181152344, "loss": 0.2278, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -1.3465075492858887, "rewards/margins": 5.6684393882751465, "rewards/rejected": -7.014946937561035, "step": 178 }, { "epoch": 0.4153846153846154, "grad_norm": 34.473550063979566, "learning_rate": 4.826218369274459e-07, "logits/chosen": -0.42453229427337646, "logits/rejected": -0.41142189502716064, "logps/chosen": -31.429290771484375, "logps/rejected": -66.93350219726562, "loss": 0.2566, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.882953405380249, "rewards/margins": 4.219231128692627, "rewards/rejected": -5.102184295654297, "step": 180 }, { "epoch": 0.4153846153846154, "eval_logits/chosen": -0.4025198221206665, "eval_logits/rejected": -0.39768728613853455, "eval_logps/chosen": -31.50428009033203, "eval_logps/rejected": -42.43844985961914, "eval_loss": 0.2629016935825348, "eval_rewards/accuracies": 0.8260368704795837, "eval_rewards/chosen": -1.32291841506958, "eval_rewards/margins": 2.8042914867401123, "eval_rewards/rejected": -4.127209186553955, "eval_runtime": 506.5854, "eval_samples_per_second": 3.423, "eval_steps_per_second": 0.428, "step": 180 }, { "epoch": 0.42, "grad_norm": 20.985743890963306, "learning_rate": 4.818756127755237e-07, "logits/chosen": -0.41872772574424744, "logits/rejected": -0.4168924391269684, "logps/chosen": -29.70162582397461, "logps/rejected": -34.24154281616211, "loss": 0.2212, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -1.3913389444351196, "rewards/margins": 2.59101939201355, "rewards/rejected": -3.98235821723938, "step": 182 }, { "epoch": 0.4246153846153846, "grad_norm": 30.21853081235467, "learning_rate": 4.811143038935873e-07, "logits/chosen": -0.4257875978946686, "logits/rejected": -0.4192647635936737, "logps/chosen": -29.643821716308594, "logps/rejected": -39.97819137573242, "loss": 0.2765, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": -1.2075949907302856, "rewards/margins": 2.8008697032928467, "rewards/rejected": -4.008464336395264, "step": 184 }, { "epoch": 0.42923076923076925, "grad_norm": 23.967391070949134, "learning_rate": 4.803379598087899e-07, "logits/chosen": -0.4222433865070343, "logits/rejected": -0.41896089911460876, "logps/chosen": -34.925689697265625, "logps/rejected": -29.42085075378418, "loss": 0.2426, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.9498284459114075, "rewards/margins": 2.2179977893829346, "rewards/rejected": -3.1678264141082764, "step": 186 }, { "epoch": 0.4338461538461538, "grad_norm": 42.32195394973849, "learning_rate": 4.795466310264034e-07, "logits/chosen": -0.38863155245780945, "logits/rejected": -0.3801649808883667, "logps/chosen": -28.73379135131836, "logps/rejected": -65.8857650756836, "loss": 0.3138, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -1.5714842081069946, "rewards/margins": 4.0624680519104, "rewards/rejected": -5.633952617645264, "step": 188 }, { "epoch": 0.43846153846153846, "grad_norm": 35.500283075205516, "learning_rate": 4.787403690265335e-07, "logits/chosen": -0.4426600933074951, "logits/rejected": -0.4381566643714905, "logps/chosen": -29.472740173339844, "logps/rejected": -45.07543182373047, "loss": 0.2575, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -1.4053057432174683, "rewards/margins": 2.799929618835449, "rewards/rejected": -4.205235958099365, "step": 190 }, { "epoch": 0.4430769230769231, "grad_norm": 40.770232211446604, "learning_rate": 4.779192262607702e-07, "logits/chosen": -0.4231228232383728, "logits/rejected": -0.41750800609588623, "logps/chosen": -36.65898513793945, "logps/rejected": -35.16616439819336, "loss": 0.2484, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -1.057456135749817, "rewards/margins": 2.6991770267486572, "rewards/rejected": -3.7566332817077637, "step": 192 }, { "epoch": 0.44769230769230767, "grad_norm": 30.71947398758085, "learning_rate": 4.770832561487758e-07, "logits/chosen": -0.3831440508365631, "logits/rejected": -0.3782423734664917, "logps/chosen": -37.04620361328125, "logps/rejected": -37.253990173339844, "loss": 0.1846, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -1.3419545888900757, "rewards/margins": 3.054530620574951, "rewards/rejected": -4.396485328674316, "step": 194 }, { "epoch": 0.4523076923076923, "grad_norm": 21.272516019706714, "learning_rate": 4.762325130748097e-07, "logits/chosen": -0.4278413653373718, "logits/rejected": -0.4242137372493744, "logps/chosen": -44.19279861450195, "logps/rejected": -32.90611267089844, "loss": 0.2497, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -1.4761689901351929, "rewards/margins": 2.20241641998291, "rewards/rejected": -3.6785850524902344, "step": 196 }, { "epoch": 0.45692307692307693, "grad_norm": 21.425954883285144, "learning_rate": 4.7536705238418995e-07, "logits/chosen": -0.3664802610874176, "logits/rejected": -0.35965999960899353, "logps/chosen": -31.303218841552734, "logps/rejected": -34.49939727783203, "loss": 0.2811, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -1.3029550313949585, "rewards/margins": 2.5633440017700195, "rewards/rejected": -3.8662986755371094, "step": 198 }, { "epoch": 0.46153846153846156, "grad_norm": 29.393204469647774, "learning_rate": 4.7448693037969336e-07, "logits/chosen": -0.3988102674484253, "logits/rejected": -0.40094831585884094, "logps/chosen": -36.04091262817383, "logps/rejected": -37.4947624206543, "loss": 0.2104, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -1.1071678400039673, "rewards/margins": 2.673301935195923, "rewards/rejected": -3.7804696559906006, "step": 200 }, { "epoch": 0.46153846153846156, "eval_logits/chosen": -0.40656572580337524, "eval_logits/rejected": -0.4014260768890381, "eval_logps/chosen": -31.341594696044922, "eval_logps/rejected": -42.71249008178711, "eval_loss": 0.2473345398902893, "eval_rewards/accuracies": 0.828341007232666, "eval_rewards/chosen": -1.2415763139724731, "eval_rewards/margins": 3.0226547718048096, "eval_rewards/rejected": -4.2642316818237305, "eval_runtime": 503.2499, "eval_samples_per_second": 3.446, "eval_steps_per_second": 0.431, "step": 200 }, { "epoch": 0.46615384615384614, "grad_norm": 29.71415358124677, "learning_rate": 4.735922043178923e-07, "logits/chosen": -0.42111077904701233, "logits/rejected": -0.41629061102867126, "logps/chosen": -31.40985870361328, "logps/rejected": -35.38710021972656, "loss": 0.2122, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -1.4481463432312012, "rewards/margins": 2.6133227348327637, "rewards/rejected": -4.061469078063965, "step": 202 }, { "epoch": 0.4707692307692308, "grad_norm": 29.03584736368545, "learning_rate": 4.7268293240543017e-07, "logits/chosen": -0.3920665979385376, "logits/rejected": -0.38839492201805115, "logps/chosen": -32.668731689453125, "logps/rejected": -53.08720397949219, "loss": 0.2789, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -1.486663818359375, "rewards/margins": 3.260793924331665, "rewards/rejected": -4.747457504272461, "step": 204 }, { "epoch": 0.4753846153846154, "grad_norm": 33.98721704408111, "learning_rate": 4.717591737952344e-07, "logits/chosen": -0.42823246121406555, "logits/rejected": -0.4223634898662567, "logps/chosen": -28.75157356262207, "logps/rejected": -48.49557113647461, "loss": 0.2342, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -1.4805346727371216, "rewards/margins": 2.9494967460632324, "rewards/rejected": -4.430031776428223, "step": 206 }, { "epoch": 0.48, "grad_norm": 26.383575142546274, "learning_rate": 4.7082098858266837e-07, "logits/chosen": -0.4134213626384735, "logits/rejected": -0.4001366198062897, "logps/chosen": -19.751869201660156, "logps/rejected": -71.13929748535156, "loss": 0.305, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -1.3059040307998657, "rewards/margins": 4.722702503204346, "rewards/rejected": -6.028607368469238, "step": 208 }, { "epoch": 0.4846153846153846, "grad_norm": 15.081846916930868, "learning_rate": 4.698684378016222e-07, "logits/chosen": -0.40595147013664246, "logits/rejected": -0.3996593952178955, "logps/chosen": -38.71549987792969, "logps/rejected": -55.73625564575195, "loss": 0.2136, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -1.5198744535446167, "rewards/margins": 3.521613121032715, "rewards/rejected": -5.041487693786621, "step": 210 }, { "epoch": 0.48923076923076925, "grad_norm": 42.43120032107903, "learning_rate": 4.6890158342054174e-07, "logits/chosen": -0.40591755509376526, "logits/rejected": -0.39412975311279297, "logps/chosen": -30.40068244934082, "logps/rejected": -47.40129470825195, "loss": 0.2528, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -1.3047279119491577, "rewards/margins": 3.392256736755371, "rewards/rejected": -4.696984767913818, "step": 212 }, { "epoch": 0.4938461538461538, "grad_norm": 33.508731289267686, "learning_rate": 4.679204883383973e-07, "logits/chosen": -0.4079332947731018, "logits/rejected": -0.4049822986125946, "logps/chosen": -26.06205940246582, "logps/rejected": -42.55049514770508, "loss": 0.2425, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -1.4307770729064941, "rewards/margins": 3.4067466259002686, "rewards/rejected": -4.837523937225342, "step": 214 }, { "epoch": 0.49846153846153846, "grad_norm": 29.498712056364305, "learning_rate": 4.669252163805919e-07, "logits/chosen": -0.39454489946365356, "logits/rejected": -0.3856600224971771, "logps/chosen": -35.02345657348633, "logps/rejected": -47.77273178100586, "loss": 0.2396, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -1.402431845664978, "rewards/margins": 3.170076370239258, "rewards/rejected": -4.572508335113525, "step": 216 }, { "epoch": 0.5030769230769231, "grad_norm": 32.015222516830526, "learning_rate": 4.65915832294809e-07, "logits/chosen": -0.43728315830230713, "logits/rejected": -0.4331842064857483, "logps/chosen": -20.569881439208984, "logps/rejected": -44.81495666503906, "loss": 0.3207, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -1.6754631996154785, "rewards/margins": 3.1900627613067627, "rewards/rejected": -4.86552619934082, "step": 218 }, { "epoch": 0.5076923076923077, "grad_norm": 23.874271586258814, "learning_rate": 4.6489240174680026e-07, "logits/chosen": -0.3791297674179077, "logits/rejected": -0.37405622005462646, "logps/chosen": -26.703208923339844, "logps/rejected": -29.02743148803711, "loss": 0.3039, "rewards/accuracies": 0.75, "rewards/chosen": -1.5167583227157593, "rewards/margins": 1.895190715789795, "rewards/rejected": -3.4119489192962646, "step": 220 }, { "epoch": 0.5076923076923077, "eval_logits/chosen": -0.40251195430755615, "eval_logits/rejected": -0.3974216878414154, "eval_logps/chosen": -31.499027252197266, "eval_logps/rejected": -43.283504486083984, "eval_loss": 0.23615001142024994, "eval_rewards/accuracies": 0.8306451439857483, "eval_rewards/chosen": -1.3202918767929077, "eval_rewards/margins": 3.2294461727142334, "eval_rewards/rejected": -4.549737453460693, "eval_runtime": 507.3634, "eval_samples_per_second": 3.418, "eval_steps_per_second": 0.428, "step": 220 }, { "epoch": 0.5123076923076924, "grad_norm": 14.098783343782902, "learning_rate": 4.638549913161138e-07, "logits/chosen": -0.4351051151752472, "logits/rejected": -0.42956972122192383, "logps/chosen": -34.70240783691406, "logps/rejected": -38.96794509887695, "loss": 0.1967, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -1.350088119506836, "rewards/margins": 3.141369342803955, "rewards/rejected": -4.491456985473633, "step": 222 }, { "epoch": 0.5169230769230769, "grad_norm": 20.47877406948092, "learning_rate": 4.6280366849176267e-07, "logits/chosen": -0.44398435950279236, "logits/rejected": -0.4355509579181671, "logps/chosen": -30.79046630859375, "logps/rejected": -38.120731353759766, "loss": 0.184, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -1.3002269268035889, "rewards/margins": 2.935243844985962, "rewards/rejected": -4.235470771789551, "step": 224 }, { "epoch": 0.5215384615384615, "grad_norm": 20.106391291915543, "learning_rate": 4.6173850166783446e-07, "logits/chosen": -0.45229944586753845, "logits/rejected": -0.452068954706192, "logps/chosen": -32.20356750488281, "logps/rejected": -32.88765335083008, "loss": 0.2129, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -1.5311998128890991, "rewards/margins": 2.4090046882629395, "rewards/rejected": -3.940204620361328, "step": 226 }, { "epoch": 0.5261538461538462, "grad_norm": 30.109857390873604, "learning_rate": 4.606595601390417e-07, "logits/chosen": -0.41887974739074707, "logits/rejected": -0.40805181860923767, "logps/chosen": -31.261016845703125, "logps/rejected": -56.04242706298828, "loss": 0.2011, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -1.6612772941589355, "rewards/margins": 3.778890371322632, "rewards/rejected": -5.4401679039001465, "step": 228 }, { "epoch": 0.5307692307692308, "grad_norm": 35.792806728940704, "learning_rate": 4.595669140962143e-07, "logits/chosen": -0.38995400071144104, "logits/rejected": -0.3782784044742584, "logps/chosen": -25.376953125, "logps/rejected": -81.69935607910156, "loss": 0.3022, "rewards/accuracies": 0.75, "rewards/chosen": -1.7850780487060547, "rewards/margins": 5.242076396942139, "rewards/rejected": -7.027154922485352, "step": 230 }, { "epoch": 0.5353846153846153, "grad_norm": 19.621976359044655, "learning_rate": 4.5846063462173284e-07, "logits/chosen": -0.42400074005126953, "logits/rejected": -0.4208061993122101, "logps/chosen": -25.61681365966797, "logps/rejected": -51.50150680541992, "loss": 0.2398, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -1.3391889333724976, "rewards/margins": 3.420403003692627, "rewards/rejected": -4.759592056274414, "step": 232 }, { "epoch": 0.54, "grad_norm": 34.64535802460503, "learning_rate": 4.573407936849044e-07, "logits/chosen": -0.4029981195926666, "logits/rejected": -0.39651092886924744, "logps/chosen": -36.64603805541992, "logps/rejected": -46.107872009277344, "loss": 0.2682, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -1.1697323322296143, "rewards/margins": 3.484927177429199, "rewards/rejected": -4.654658794403076, "step": 234 }, { "epoch": 0.5446153846153846, "grad_norm": 32.21714603686374, "learning_rate": 4.5620746413728063e-07, "logits/chosen": -0.456825852394104, "logits/rejected": -0.451083779335022, "logps/chosen": -48.70059585571289, "logps/rejected": -39.24748992919922, "loss": 0.1833, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -1.573285460472107, "rewards/margins": 3.0970544815063477, "rewards/rejected": -4.670339584350586, "step": 236 }, { "epoch": 0.5492307692307692, "grad_norm": 19.890036205802677, "learning_rate": 4.550607197079185e-07, "logits/chosen": -0.45712196826934814, "logits/rejected": -0.45414966344833374, "logps/chosen": -25.207489013671875, "logps/rejected": -30.65032958984375, "loss": 0.2327, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.9366336464881897, "rewards/margins": 2.6559956073760986, "rewards/rejected": -3.5926294326782227, "step": 238 }, { "epoch": 0.5538461538461539, "grad_norm": 16.022227667064776, "learning_rate": 4.5390063499858353e-07, "logits/chosen": -0.4476924538612366, "logits/rejected": -0.4420890808105469, "logps/chosen": -33.19588088989258, "logps/rejected": -45.9852409362793, "loss": 0.1603, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -1.5992447137832642, "rewards/margins": 3.585657835006714, "rewards/rejected": -5.184902667999268, "step": 240 }, { "epoch": 0.5538461538461539, "eval_logits/chosen": -0.40862879157066345, "eval_logits/rejected": -0.4036082923412323, "eval_logps/chosen": -30.955970764160156, "eval_logps/rejected": -43.175209045410156, "eval_loss": 0.22531619668006897, "eval_rewards/accuracies": 0.8329492807388306, "eval_rewards/chosen": -1.0487645864486694, "eval_rewards/margins": 3.4468276500701904, "eval_rewards/rejected": -4.4955925941467285, "eval_runtime": 509.9594, "eval_samples_per_second": 3.4, "eval_steps_per_second": 0.426, "step": 240 }, { "epoch": 0.5584615384615385, "grad_norm": 18.836184473735898, "learning_rate": 4.5272728547889687e-07, "logits/chosen": -0.4127146601676941, "logits/rejected": -0.4089764952659607, "logps/chosen": -38.14063262939453, "logps/rejected": -42.034847259521484, "loss": 0.1809, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -1.1325756311416626, "rewards/margins": 4.140430927276611, "rewards/rejected": -5.273006916046143, "step": 242 }, { "epoch": 0.563076923076923, "grad_norm": 57.85094267690425, "learning_rate": 4.5154074748142535e-07, "logits/chosen": -0.36731666326522827, "logits/rejected": -0.36327943205833435, "logps/chosen": -31.921220779418945, "logps/rejected": -37.49308776855469, "loss": 0.2465, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.9683434963226318, "rewards/margins": 2.7872416973114014, "rewards/rejected": -3.755585193634033, "step": 244 }, { "epoch": 0.5676923076923077, "grad_norm": 40.08736941641743, "learning_rate": 4.503410981967158e-07, "logits/chosen": -0.3854740858078003, "logits/rejected": -0.3787837326526642, "logps/chosen": -24.31875991821289, "logps/rejected": -57.45841598510742, "loss": 0.2918, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.8772084712982178, "rewards/margins": 4.705995082855225, "rewards/rejected": -5.583203315734863, "step": 246 }, { "epoch": 0.5723076923076923, "grad_norm": 26.218142014024643, "learning_rate": 4.4912841566827333e-07, "logits/chosen": -0.455485463142395, "logits/rejected": -0.4470650851726532, "logps/chosen": -26.294097900390625, "logps/rejected": -49.45498275756836, "loss": 0.1772, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.8886358737945557, "rewards/margins": 3.8760058879852295, "rewards/rejected": -4.764641284942627, "step": 248 }, { "epoch": 0.5769230769230769, "grad_norm": 33.82360810528879, "learning_rate": 4.4790277878748415e-07, "logits/chosen": -0.42798396944999695, "logits/rejected": -0.4229770302772522, "logps/chosen": -29.242835998535156, "logps/rejected": -31.0858154296875, "loss": 0.1851, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.6897026896476746, "rewards/margins": 2.8393633365631104, "rewards/rejected": -3.529066324234009, "step": 250 }, { "epoch": 0.5815384615384616, "grad_norm": 30.285023133450697, "learning_rate": 4.466642672884835e-07, "logits/chosen": -0.3738807141780853, "logits/rejected": -0.3704882860183716, "logps/chosen": -20.846439361572266, "logps/rejected": -44.28904342651367, "loss": 0.1913, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -1.1368333101272583, "rewards/margins": 3.7949838638305664, "rewards/rejected": -4.931817054748535, "step": 252 }, { "epoch": 0.5861538461538461, "grad_norm": 21.699418362055532, "learning_rate": 4.454129617429682e-07, "logits/chosen": -0.4325045943260193, "logits/rejected": -0.43138301372528076, "logps/chosen": -35.05239486694336, "logps/rejected": -35.216007232666016, "loss": 0.2181, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.6422653794288635, "rewards/margins": 3.1057541370391846, "rewards/rejected": -3.7480196952819824, "step": 254 }, { "epoch": 0.5907692307692308, "grad_norm": 13.61574461077273, "learning_rate": 4.441489435549551e-07, "logits/chosen": -0.39467182755470276, "logits/rejected": -0.390009343624115, "logps/chosen": -29.853254318237305, "logps/rejected": -47.853858947753906, "loss": 0.2009, "rewards/accuracies": 0.75, "rewards/chosen": -1.163262963294983, "rewards/margins": 3.5671658515930176, "rewards/rejected": -4.730428695678711, "step": 256 }, { "epoch": 0.5953846153846154, "grad_norm": 29.432419859674056, "learning_rate": 4.4287229495548573e-07, "logits/chosen": -0.4452096223831177, "logits/rejected": -0.4424053430557251, "logps/chosen": -36.030330657958984, "logps/rejected": -34.27677536010742, "loss": 0.1692, "rewards/accuracies": 0.9722222089767456, "rewards/chosen": -0.6483229398727417, "rewards/margins": 3.4191946983337402, "rewards/rejected": -4.0675177574157715, "step": 258 }, { "epoch": 0.6, "grad_norm": 31.269021609003353, "learning_rate": 4.415830989972761e-07, "logits/chosen": -0.4377964735031128, "logits/rejected": -0.43135735392570496, "logps/chosen": -26.93490982055664, "logps/rejected": -33.63737869262695, "loss": 0.2017, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.8301222324371338, "rewards/margins": 3.1450002193450928, "rewards/rejected": -3.9751226902008057, "step": 260 }, { "epoch": 0.6, "eval_logits/chosen": -0.41452378034591675, "eval_logits/rejected": -0.4095514416694641, "eval_logps/chosen": -30.60530662536621, "eval_logps/rejected": -43.20734786987305, "eval_loss": 0.2185508906841278, "eval_rewards/accuracies": 0.8375576138496399, "eval_rewards/chosen": -0.8734327554702759, "eval_rewards/margins": 3.6382253170013428, "eval_rewards/rejected": -4.511658191680908, "eval_runtime": 503.8644, "eval_samples_per_second": 3.441, "eval_steps_per_second": 0.431, "step": 260 }, { "epoch": 0.6046153846153847, "grad_norm": 24.485138572162725, "learning_rate": 4.402814395493142e-07, "logits/chosen": -0.4203398823738098, "logits/rejected": -0.41677144169807434, "logps/chosen": -31.907039642333984, "logps/rejected": -24.965557098388672, "loss": 0.232, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.27512991428375244, "rewards/margins": 2.8020739555358887, "rewards/rejected": -3.0772039890289307, "step": 262 }, { "epoch": 0.6092307692307692, "grad_norm": 21.1456204923027, "learning_rate": 4.3896740129140354e-07, "logits/chosen": -0.41654694080352783, "logits/rejected": -0.402098149061203, "logps/chosen": -28.926246643066406, "logps/rejected": -29.76662826538086, "loss": 0.1926, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -1.17801034450531, "rewards/margins": 3.207771062850952, "rewards/rejected": -4.385781764984131, "step": 264 }, { "epoch": 0.6138461538461538, "grad_norm": 42.651303921098446, "learning_rate": 4.3764106970865456e-07, "logits/chosen": -0.4485589563846588, "logits/rejected": -0.43967097997665405, "logps/chosen": -29.77332305908203, "logps/rejected": -51.997276306152344, "loss": 0.2309, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -1.3469719886779785, "rewards/margins": 3.8554887771606445, "rewards/rejected": -5.202460765838623, "step": 266 }, { "epoch": 0.6184615384615385, "grad_norm": 28.262165957337512, "learning_rate": 4.3630253108592305e-07, "logits/chosen": -0.42454972863197327, "logits/rejected": -0.4207165837287903, "logps/chosen": -39.332069396972656, "logps/rejected": -38.881370544433594, "loss": 0.1751, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -1.2131102085113525, "rewards/margins": 3.619950294494629, "rewards/rejected": -4.8330607414245605, "step": 268 }, { "epoch": 0.6230769230769231, "grad_norm": 28.997711823063405, "learning_rate": 4.3495187250219723e-07, "logits/chosen": -0.45721203088760376, "logits/rejected": -0.45834487676620483, "logps/chosen": -26.57935905456543, "logps/rejected": -32.53868103027344, "loss": 0.2315, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -1.3750054836273193, "rewards/margins": 2.813932418823242, "rewards/rejected": -4.188937664031982, "step": 270 }, { "epoch": 0.6276923076923077, "grad_norm": 18.029821243951154, "learning_rate": 4.3358918182493253e-07, "logits/chosen": -0.46232733130455017, "logits/rejected": -0.46510839462280273, "logps/chosen": -25.596466064453125, "logps/rejected": -43.442752838134766, "loss": 0.1735, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -1.2356699705123901, "rewards/margins": 3.3778905868530273, "rewards/rejected": -4.613561153411865, "step": 272 }, { "epoch": 0.6323076923076923, "grad_norm": 26.854495269462472, "learning_rate": 4.3221454770433554e-07, "logits/chosen": -0.42746999859809875, "logits/rejected": -0.4174771308898926, "logps/chosen": -38.78968811035156, "logps/rejected": -48.67451858520508, "loss": 0.1717, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -1.0048799514770508, "rewards/margins": 4.1948676109313965, "rewards/rejected": -5.199747085571289, "step": 274 }, { "epoch": 0.6369230769230769, "grad_norm": 30.559498134471106, "learning_rate": 4.308280595675966e-07, "logits/chosen": -0.3896549940109253, "logits/rejected": -0.3890588879585266, "logps/chosen": -37.46322250366211, "logps/rejected": -35.077457427978516, "loss": 0.2855, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -1.1684030294418335, "rewards/margins": 2.635406732559204, "rewards/rejected": -3.803809642791748, "step": 276 }, { "epoch": 0.6415384615384615, "grad_norm": 34.086546047195235, "learning_rate": 4.2942980761307227e-07, "logits/chosen": -0.4673234224319458, "logits/rejected": -0.4619278311729431, "logps/chosen": -23.853961944580078, "logps/rejected": -40.64710998535156, "loss": 0.2089, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -1.6247446537017822, "rewards/margins": 3.278303384780884, "rewards/rejected": -4.903048038482666, "step": 278 }, { "epoch": 0.6461538461538462, "grad_norm": 13.328287218706498, "learning_rate": 4.2801988280441765e-07, "logits/chosen": -0.45690426230430603, "logits/rejected": -0.45414966344833374, "logps/chosen": -30.93409538269043, "logps/rejected": -35.455078125, "loss": 0.148, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.8127410411834717, "rewards/margins": 3.260218858718872, "rewards/rejected": -4.072959899902344, "step": 280 }, { "epoch": 0.6461538461538462, "eval_logits/chosen": -0.4170861542224884, "eval_logits/rejected": -0.4120103716850281, "eval_logps/chosen": -30.824983596801758, "eval_logps/rejected": -43.666385650634766, "eval_loss": 0.21110185980796814, "eval_rewards/accuracies": 0.8444700241088867, "eval_rewards/chosen": -0.9832708239555359, "eval_rewards/margins": 3.757906675338745, "eval_rewards/rejected": -4.741177082061768, "eval_runtime": 508.8349, "eval_samples_per_second": 3.408, "eval_steps_per_second": 0.426, "step": 280 }, { "epoch": 0.6507692307692308, "grad_norm": 23.51514616755879, "learning_rate": 4.2659837686466813e-07, "logits/chosen": -0.3964659869670868, "logits/rejected": -0.3960285186767578, "logps/chosen": -32.214149475097656, "logps/rejected": -40.32206344604492, "loss": 0.2606, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.5793440937995911, "rewards/margins": 3.9720256328582764, "rewards/rejected": -4.551369667053223, "step": 282 }, { "epoch": 0.6553846153846153, "grad_norm": 44.87105211142761, "learning_rate": 4.25165382270273e-07, "logits/chosen": -0.4471447169780731, "logits/rejected": -0.4434424340724945, "logps/chosen": -22.62065887451172, "logps/rejected": -31.19291877746582, "loss": 0.2507, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": -0.8934513926506042, "rewards/margins": 2.8396835327148438, "rewards/rejected": -3.7331347465515137, "step": 284 }, { "epoch": 0.66, "grad_norm": 21.904239676444085, "learning_rate": 4.2372099224507875e-07, "logits/chosen": -0.4560166597366333, "logits/rejected": -0.4430373013019562, "logps/chosen": -30.52613067626953, "logps/rejected": -43.70925521850586, "loss": 0.1835, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -1.0851600170135498, "rewards/margins": 3.905780792236328, "rewards/rejected": -4.990941047668457, "step": 286 }, { "epoch": 0.6646153846153846, "grad_norm": 38.91215620797999, "learning_rate": 4.2226530075426503e-07, "logits/chosen": -0.46308934688568115, "logits/rejected": -0.4644758403301239, "logps/chosen": -49.016292572021484, "logps/rejected": -32.58894729614258, "loss": 0.2054, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.9090245366096497, "rewards/margins": 3.222705364227295, "rewards/rejected": -4.131730556488037, "step": 288 }, { "epoch": 0.6692307692307692, "grad_norm": 29.687860938033246, "learning_rate": 4.2079840249823106e-07, "logits/chosen": -0.4102253019809723, "logits/rejected": -0.4064684510231018, "logps/chosen": -34.0399169921875, "logps/rejected": -63.65049743652344, "loss": 0.2376, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": -1.3158775568008423, "rewards/margins": 4.716902732849121, "rewards/rejected": -6.032780170440674, "step": 290 }, { "epoch": 0.6738461538461539, "grad_norm": 24.425898306665736, "learning_rate": 4.193203929064353e-07, "logits/chosen": -0.3812604546546936, "logits/rejected": -0.3740667700767517, "logps/chosen": -33.9622802734375, "logps/rejected": -40.855690002441406, "loss": 0.2094, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.36320310831069946, "rewards/margins": 4.014423370361328, "rewards/rejected": -4.377626419067383, "step": 292 }, { "epoch": 0.6784615384615384, "grad_norm": 35.472818425490054, "learning_rate": 4.1783136813118705e-07, "logits/chosen": -0.4340391457080841, "logits/rejected": -0.4350210428237915, "logps/chosen": -30.103715896606445, "logps/rejected": -28.662017822265625, "loss": 0.224, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -0.9305572509765625, "rewards/margins": 2.55446457862854, "rewards/rejected": -3.4850215911865234, "step": 294 }, { "epoch": 0.683076923076923, "grad_norm": 25.755633726397985, "learning_rate": 4.163314250413913e-07, "logits/chosen": -0.3662879765033722, "logits/rejected": -0.3629528880119324, "logps/chosen": -30.26826286315918, "logps/rejected": -32.83157730102539, "loss": 0.1396, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.10325232148170471, "rewards/margins": 3.6300764083862305, "rewards/rejected": -3.7333288192749023, "step": 296 }, { "epoch": 0.6876923076923077, "grad_norm": 21.98139687061052, "learning_rate": 4.1482066121624716e-07, "logits/chosen": -0.4054791033267975, "logits/rejected": -0.40484312176704407, "logps/chosen": -26.702163696289062, "logps/rejected": -29.949188232421875, "loss": 0.2334, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.9674122333526611, "rewards/margins": 3.2847795486450195, "rewards/rejected": -4.252191543579102, "step": 298 }, { "epoch": 0.6923076923076923, "grad_norm": 32.79875475268989, "learning_rate": 4.1329917493889933e-07, "logits/chosen": -0.4318622350692749, "logits/rejected": -0.4295927882194519, "logps/chosen": -25.54435920715332, "logps/rejected": -38.12559127807617, "loss": 0.2106, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.8421418070793152, "rewards/margins": 2.966458320617676, "rewards/rejected": -3.808600425720215, "step": 300 }, { "epoch": 0.6923076923076923, "eval_logits/chosen": -0.4134284555912018, "eval_logits/rejected": -0.40848222374916077, "eval_logps/chosen": -30.257047653198242, "eval_logps/rejected": -43.326053619384766, "eval_loss": 0.20553433895111084, "eval_rewards/accuracies": 0.8479262590408325, "eval_rewards/chosen": -0.6993027329444885, "eval_rewards/margins": 3.871710777282715, "eval_rewards/rejected": -4.571013450622559, "eval_runtime": 508.2329, "eval_samples_per_second": 3.412, "eval_steps_per_second": 0.427, "step": 300 }, { "epoch": 0.696923076923077, "grad_norm": 21.265971183383012, "learning_rate": 4.117670651900446e-07, "logits/chosen": -0.4743236303329468, "logits/rejected": -0.4737284481525421, "logps/chosen": -33.09712219238281, "logps/rejected": -32.526432037353516, "loss": 0.2656, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -0.6792583465576172, "rewards/margins": 2.8626317977905273, "rewards/rejected": -3.5418901443481445, "step": 302 }, { "epoch": 0.7015384615384616, "grad_norm": 33.45234477936918, "learning_rate": 4.1022443164149237e-07, "logits/chosen": -0.4031560719013214, "logits/rejected": -0.40104442834854126, "logps/chosen": -32.153839111328125, "logps/rejected": -41.3182258605957, "loss": 0.2019, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -1.2330535650253296, "rewards/margins": 3.241495370864868, "rewards/rejected": -4.47454833984375, "step": 304 }, { "epoch": 0.7061538461538461, "grad_norm": 26.006527463479824, "learning_rate": 4.086713746496808e-07, "logits/chosen": -0.43675369024276733, "logits/rejected": -0.43135344982147217, "logps/chosen": -29.610496520996094, "logps/rejected": -44.182029724121094, "loss": 0.1596, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.26641184091567993, "rewards/margins": 4.72556734085083, "rewards/rejected": -4.991979598999023, "step": 306 }, { "epoch": 0.7107692307692308, "grad_norm": 16.2323534744151, "learning_rate": 4.0710799524914805e-07, "logits/chosen": -0.4202316999435425, "logits/rejected": -0.42065203189849854, "logps/chosen": -41.52705383300781, "logps/rejected": -34.62565994262695, "loss": 0.1449, "rewards/accuracies": 0.9722222089767456, "rewards/chosen": -0.9847439527511597, "rewards/margins": 3.193284034729004, "rewards/rejected": -4.178027629852295, "step": 308 }, { "epoch": 0.7153846153846154, "grad_norm": 19.449774860072015, "learning_rate": 4.055343951459592e-07, "logits/chosen": -0.43803703784942627, "logits/rejected": -0.4306415319442749, "logps/chosen": -22.680768966674805, "logps/rejected": -43.69520950317383, "loss": 0.1629, "rewards/accuracies": 0.9722222089767456, "rewards/chosen": -0.28689083456993103, "rewards/margins": 4.347331523895264, "rewards/rejected": -4.634222030639648, "step": 310 }, { "epoch": 0.72, "grad_norm": 17.10809005018044, "learning_rate": 4.0395067671108985e-07, "logits/chosen": -0.4748198688030243, "logits/rejected": -0.46837103366851807, "logps/chosen": -23.172752380371094, "logps/rejected": -28.870391845703125, "loss": 0.2036, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.1968996822834015, "rewards/margins": 3.125986099243164, "rewards/rejected": -3.3228859901428223, "step": 312 }, { "epoch": 0.7246153846153847, "grad_norm": 31.3724900765819, "learning_rate": 4.0235694297376637e-07, "logits/chosen": -0.42866167426109314, "logits/rejected": -0.4237157106399536, "logps/chosen": -29.394073486328125, "logps/rejected": -47.55759811401367, "loss": 0.2134, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.30406326055526733, "rewards/margins": 3.6837353706359863, "rewards/rejected": -3.9877989292144775, "step": 314 }, { "epoch": 0.7292307692307692, "grad_norm": 23.705563401878223, "learning_rate": 4.0075329761476347e-07, "logits/chosen": -0.4352235198020935, "logits/rejected": -0.43573349714279175, "logps/chosen": -35.37888717651367, "logps/rejected": -37.18360900878906, "loss": 0.165, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.9772714972496033, "rewards/margins": 3.1175262928009033, "rewards/rejected": -4.0947980880737305, "step": 316 }, { "epoch": 0.7338461538461538, "grad_norm": 12.575172339221478, "learning_rate": 3.991398449596588e-07, "logits/chosen": -0.3766963481903076, "logits/rejected": -0.370633602142334, "logps/chosen": -29.137102127075195, "logps/rejected": -45.29551315307617, "loss": 0.1544, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.4704323410987854, "rewards/margins": 4.004377365112305, "rewards/rejected": -4.474809646606445, "step": 318 }, { "epoch": 0.7384615384615385, "grad_norm": 13.090554115672795, "learning_rate": 3.9751668997204647e-07, "logits/chosen": -0.42111262679100037, "logits/rejected": -0.4189242422580719, "logps/chosen": -39.390987396240234, "logps/rejected": -37.506431579589844, "loss": 0.1523, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.4286647439002991, "rewards/margins": 3.3345425128936768, "rewards/rejected": -3.76320743560791, "step": 320 }, { "epoch": 0.7384615384615385, "eval_logits/chosen": -0.4214804172515869, "eval_logits/rejected": -0.4163365066051483, "eval_logps/chosen": -29.679540634155273, "eval_logps/rejected": -43.06067657470703, "eval_loss": 0.2023773342370987, "eval_rewards/accuracies": 0.8467742204666138, "eval_rewards/chosen": -0.41054773330688477, "eval_rewards/margins": 4.027776718139648, "eval_rewards/rejected": -4.438324451446533, "eval_runtime": 502.9654, "eval_samples_per_second": 3.448, "eval_steps_per_second": 0.431, "step": 320 }, { "epoch": 0.7430769230769231, "grad_norm": 32.306345055876804, "learning_rate": 3.958839382467084e-07, "logits/chosen": -0.4031261205673218, "logits/rejected": -0.3948514759540558, "logps/chosen": -20.96992301940918, "logps/rejected": -37.17094039916992, "loss": 0.2435, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -0.07512470334768295, "rewards/margins": 3.314379930496216, "rewards/rejected": -3.389504909515381, "step": 322 }, { "epoch": 0.7476923076923077, "grad_norm": 25.553065871979427, "learning_rate": 3.9424169600274494e-07, "logits/chosen": -0.444087415933609, "logits/rejected": -0.4429420530796051, "logps/chosen": -23.210834503173828, "logps/rejected": -34.436309814453125, "loss": 0.2163, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.6498683094978333, "rewards/margins": 2.9317610263824463, "rewards/rejected": -3.581629514694214, "step": 324 }, { "epoch": 0.7523076923076923, "grad_norm": 17.07414056072156, "learning_rate": 3.9259007007666436e-07, "logits/chosen": -0.3948967456817627, "logits/rejected": -0.39322227239608765, "logps/chosen": -35.03683853149414, "logps/rejected": -37.341915130615234, "loss": 0.1817, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.27339833974838257, "rewards/margins": 3.6408724784851074, "rewards/rejected": -3.914271116256714, "step": 326 }, { "epoch": 0.7569230769230769, "grad_norm": 28.45028616201227, "learning_rate": 3.909291679154332e-07, "logits/chosen": -0.4438144862651825, "logits/rejected": -0.4391555190086365, "logps/chosen": -26.795574188232422, "logps/rejected": -35.73371887207031, "loss": 0.2031, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.5279018878936768, "rewards/margins": 3.3851261138916016, "rewards/rejected": -3.913027763366699, "step": 328 }, { "epoch": 0.7615384615384615, "grad_norm": 18.545290736700906, "learning_rate": 3.892590975694858e-07, "logits/chosen": -0.44910138845443726, "logits/rejected": -0.45103660225868225, "logps/chosen": -28.11017608642578, "logps/rejected": -25.93398094177246, "loss": 0.1591, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.07017502188682556, "rewards/margins": 3.3744397163391113, "rewards/rejected": -3.4446146488189697, "step": 330 }, { "epoch": 0.7661538461538462, "grad_norm": 20.06963860923726, "learning_rate": 3.875799676856952e-07, "logits/chosen": -0.48793941736221313, "logits/rejected": -0.4828093945980072, "logps/chosen": -26.22766876220703, "logps/rejected": -53.15538787841797, "loss": 0.169, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.2522682547569275, "rewards/margins": 5.069521427154541, "rewards/rejected": -5.321789264678955, "step": 332 }, { "epoch": 0.7707692307692308, "grad_norm": 41.757269842833246, "learning_rate": 3.858918875003053e-07, "logits/chosen": -0.4556080400943756, "logits/rejected": -0.45018041133880615, "logps/chosen": -29.08209991455078, "logps/rejected": -40.44395446777344, "loss": 0.2021, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.2314501404762268, "rewards/margins": 4.296255111694336, "rewards/rejected": -4.52770471572876, "step": 334 }, { "epoch": 0.7753846153846153, "grad_norm": 16.08933882113157, "learning_rate": 3.8419496683182396e-07, "logits/chosen": -0.44065940380096436, "logits/rejected": -0.4261338710784912, "logps/chosen": -28.18364715576172, "logps/rejected": -50.94450378417969, "loss": 0.1599, "rewards/accuracies": 0.9722222089767456, "rewards/chosen": -0.3340262174606323, "rewards/margins": 4.681273937225342, "rewards/rejected": -5.015300750732422, "step": 336 }, { "epoch": 0.78, "grad_norm": 29.889533234627475, "learning_rate": 3.824893160738792e-07, "logits/chosen": -0.4374091327190399, "logits/rejected": -0.4296647608280182, "logps/chosen": -37.9844856262207, "logps/rejected": -41.31779479980469, "loss": 0.2096, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.35117557644844055, "rewards/margins": 4.604701042175293, "rewards/rejected": -4.253525733947754, "step": 338 }, { "epoch": 0.7846153846153846, "grad_norm": 19.17718579000364, "learning_rate": 3.8077504618803737e-07, "logits/chosen": -0.3993787467479706, "logits/rejected": -0.3983612358570099, "logps/chosen": -27.12627410888672, "logps/rejected": -28.516714096069336, "loss": 0.1606, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.11244955658912659, "rewards/margins": 2.78592586517334, "rewards/rejected": -2.6734766960144043, "step": 340 }, { "epoch": 0.7846153846153846, "eval_logits/chosen": -0.42125818133354187, "eval_logits/rejected": -0.4164798855781555, "eval_logps/chosen": -29.02248191833496, "eval_logps/rejected": -42.798072814941406, "eval_loss": 0.19942985475063324, "eval_rewards/accuracies": 0.8467742204666138, "eval_rewards/chosen": -0.08201881498098373, "eval_rewards/margins": 4.225000381469727, "eval_rewards/rejected": -4.307018756866455, "eval_runtime": 503.4566, "eval_samples_per_second": 3.444, "eval_steps_per_second": 0.431, "step": 340 }, { "epoch": 0.7892307692307692, "grad_norm": 34.38608306125369, "learning_rate": 3.7905226869658446e-07, "logits/chosen": -0.4014025330543518, "logits/rejected": -0.3914772570133209, "logps/chosen": -27.84336280822754, "logps/rejected": -36.56361770629883, "loss": 0.2656, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": -0.08617917448282242, "rewards/margins": 3.3050012588500977, "rewards/rejected": -3.3911805152893066, "step": 342 }, { "epoch": 0.7938461538461539, "grad_norm": 23.19353728595283, "learning_rate": 3.773210956752709e-07, "logits/chosen": -0.4105421304702759, "logits/rejected": -0.40849363803863525, "logps/chosen": -26.71829605102539, "logps/rejected": -30.233633041381836, "loss": 0.212, "rewards/accuracies": 0.7222222089767456, "rewards/chosen": -0.3683412969112396, "rewards/margins": 3.1401641368865967, "rewards/rejected": -3.5085055828094482, "step": 344 }, { "epoch": 0.7984615384615384, "grad_norm": 10.251881413431331, "learning_rate": 3.7558163974602093e-07, "logits/chosen": -0.43146970868110657, "logits/rejected": -0.41738688945770264, "logps/chosen": -25.759613037109375, "logps/rejected": -39.87126159667969, "loss": 0.1796, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.28593504428863525, "rewards/margins": 4.14475154876709, "rewards/rejected": -3.8588168621063232, "step": 346 }, { "epoch": 0.803076923076923, "grad_norm": 30.47250744578108, "learning_rate": 3.73834014069605e-07, "logits/chosen": -0.43334850668907166, "logits/rejected": -0.42349883913993835, "logps/chosen": -37.59834289550781, "logps/rejected": -51.57698059082031, "loss": 0.1978, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.29734522104263306, "rewards/margins": 5.007655143737793, "rewards/rejected": -5.305000305175781, "step": 348 }, { "epoch": 0.8076923076923077, "grad_norm": 24.819212610649004, "learning_rate": 3.7207833233827914e-07, "logits/chosen": -0.4134765863418579, "logits/rejected": -0.4089513421058655, "logps/chosen": -26.07123565673828, "logps/rejected": -38.38258743286133, "loss": 0.198, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.22059744596481323, "rewards/margins": 3.9326553344726562, "rewards/rejected": -4.153252124786377, "step": 350 }, { "epoch": 0.8123076923076923, "grad_norm": 23.706047568108115, "learning_rate": 3.7031470876838786e-07, "logits/chosen": -0.4194492697715759, "logits/rejected": -0.40730518102645874, "logps/chosen": -28.82728385925293, "logps/rejected": -57.56040573120117, "loss": 0.179, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.9220519661903381, "rewards/margins": 4.264041900634766, "rewards/rejected": -5.186093330383301, "step": 352 }, { "epoch": 0.816923076923077, "grad_norm": 18.043781215169886, "learning_rate": 3.6854325809293455e-07, "logits/chosen": -0.3817636966705322, "logits/rejected": -0.3748534917831421, "logps/chosen": -20.240158081054688, "logps/rejected": -58.97285842895508, "loss": 0.1789, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -0.35545703768730164, "rewards/margins": 5.164008140563965, "rewards/rejected": -5.51946496963501, "step": 354 }, { "epoch": 0.8215384615384616, "grad_norm": 25.87285253533825, "learning_rate": 3.6676409555411653e-07, "logits/chosen": -0.41905197501182556, "logits/rejected": -0.4115257263183594, "logps/chosen": -27.22644805908203, "logps/rejected": -62.18017578125, "loss": 0.1854, "rewards/accuracies": 0.9722222089767456, "rewards/chosen": -0.48464149236679077, "rewards/margins": 6.608328819274902, "rewards/rejected": -7.09296989440918, "step": 356 }, { "epoch": 0.8261538461538461, "grad_norm": 14.119308899853314, "learning_rate": 3.6497733689582866e-07, "logits/chosen": -0.41661736369132996, "logits/rejected": -0.41109681129455566, "logps/chosen": -24.489599227905273, "logps/rejected": -35.24691390991211, "loss": 0.1535, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.4926097095012665, "rewards/margins": 3.530369758605957, "rewards/rejected": -4.022979259490967, "step": 358 }, { "epoch": 0.8307692307692308, "grad_norm": 38.344673058047896, "learning_rate": 3.631830983561335e-07, "logits/chosen": -0.45140349864959717, "logits/rejected": -0.45079073309898376, "logps/chosen": -23.43295669555664, "logps/rejected": -27.048038482666016, "loss": 0.1777, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.6527023315429688, "rewards/margins": 2.8010711669921875, "rewards/rejected": -3.453773021697998, "step": 360 }, { "epoch": 0.8307692307692308, "eval_logits/chosen": -0.4186362326145172, "eval_logits/rejected": -0.4136468470096588, "eval_logps/chosen": -29.61492919921875, "eval_logps/rejected": -43.617103576660156, "eval_loss": 0.1945820450782776, "eval_rewards/accuracies": 0.8513824939727783, "eval_rewards/chosen": -0.3782421052455902, "eval_rewards/margins": 4.338295936584473, "eval_rewards/rejected": -4.716537952423096, "eval_runtime": 506.5284, "eval_samples_per_second": 3.423, "eval_steps_per_second": 0.428, "step": 360 }, { "epoch": 0.8353846153846154, "grad_norm": 16.55857832540749, "learning_rate": 3.613814966596991e-07, "logits/chosen": -0.46038734912872314, "logits/rejected": -0.4492572546005249, "logps/chosen": -25.93819236755371, "logps/rejected": -50.828739166259766, "loss": 0.2136, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.8569762706756592, "rewards/margins": 4.576515197753906, "rewards/rejected": -5.433491230010986, "step": 362 }, { "epoch": 0.84, "grad_norm": 11.721886926873214, "learning_rate": 3.595726490102059e-07, "logits/chosen": -0.4281451106071472, "logits/rejected": -0.42401790618896484, "logps/chosen": -23.987388610839844, "logps/rejected": -36.76484298706055, "loss": 0.09, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.017054447904229164, "rewards/margins": 4.532209396362305, "rewards/rejected": -4.549264907836914, "step": 364 }, { "epoch": 0.8446153846153847, "grad_norm": 16.23696727613243, "learning_rate": 3.577566730827214e-07, "logits/chosen": -0.44551965594291687, "logits/rejected": -0.43860509991645813, "logps/chosen": -30.526634216308594, "logps/rejected": -45.12278747558594, "loss": 0.2314, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.21121951937675476, "rewards/margins": 4.676613807678223, "rewards/rejected": -4.887833595275879, "step": 366 }, { "epoch": 0.8492307692307692, "grad_norm": 29.921844427102474, "learning_rate": 3.559336870160453e-07, "logits/chosen": -0.41445478796958923, "logits/rejected": -0.40936967730522156, "logps/chosen": -21.065773010253906, "logps/rejected": -47.20044708251953, "loss": 0.2006, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.2605896294116974, "rewards/margins": 4.393010139465332, "rewards/rejected": -4.653599262237549, "step": 368 }, { "epoch": 0.8538461538461538, "grad_norm": 27.661765772812224, "learning_rate": 3.541038094050241e-07, "logits/chosen": -0.414185106754303, "logits/rejected": -0.40629175305366516, "logps/chosen": -27.4779052734375, "logps/rejected": -62.92180633544922, "loss": 0.2134, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -1.1156302690505981, "rewards/margins": 5.80250358581543, "rewards/rejected": -6.918133735656738, "step": 370 }, { "epoch": 0.8584615384615385, "grad_norm": 27.80789926275254, "learning_rate": 3.52267159292835e-07, "logits/chosen": -0.4086429476737976, "logits/rejected": -0.40484270453453064, "logps/chosen": -30.99095916748047, "logps/rejected": -53.7381591796875, "loss": 0.1496, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.42847952246665955, "rewards/margins": 5.564027786254883, "rewards/rejected": -5.992506980895996, "step": 372 }, { "epoch": 0.8630769230769231, "grad_norm": 15.518050865973265, "learning_rate": 3.5042385616324236e-07, "logits/chosen": -0.3771006166934967, "logits/rejected": -0.3636988401412964, "logps/chosen": -19.442859649658203, "logps/rejected": -57.44426727294922, "loss": 0.1562, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.27324867248535156, "rewards/margins": 5.852780342102051, "rewards/rejected": -6.126028537750244, "step": 374 }, { "epoch": 0.8676923076923077, "grad_norm": 17.6707452061646, "learning_rate": 3.485740199328244e-07, "logits/chosen": -0.40194880962371826, "logits/rejected": -0.3989940881729126, "logps/chosen": -38.314414978027344, "logps/rejected": -37.43749237060547, "loss": 0.1453, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.2914290130138397, "rewards/margins": 4.130940914154053, "rewards/rejected": -4.422369480133057, "step": 376 }, { "epoch": 0.8723076923076923, "grad_norm": 12.317586105643123, "learning_rate": 3.4671777094317196e-07, "logits/chosen": -0.40350526571273804, "logits/rejected": -0.39631906151771545, "logps/chosen": -33.33686828613281, "logps/rejected": -45.19147491455078, "loss": 0.1402, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -1.2224059104919434, "rewards/margins": 4.0000457763671875, "rewards/rejected": -5.222451686859131, "step": 378 }, { "epoch": 0.8769230769230769, "grad_norm": 21.79906998164569, "learning_rate": 3.448552299530595e-07, "logits/chosen": -0.4637221693992615, "logits/rejected": -0.4593777656555176, "logps/chosen": -27.355539321899414, "logps/rejected": -42.75727844238281, "loss": 0.1684, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -1.1025246381759644, "rewards/margins": 4.087032794952393, "rewards/rejected": -5.189557075500488, "step": 380 }, { "epoch": 0.8769230769230769, "eval_logits/chosen": -0.4187977910041809, "eval_logits/rejected": -0.4135534465312958, "eval_logps/chosen": -30.3321533203125, "eval_logps/rejected": -44.58085632324219, "eval_loss": 0.19118133187294006, "eval_rewards/accuracies": 0.8513824939727783, "eval_rewards/chosen": -0.7368546724319458, "eval_rewards/margins": 4.461559295654297, "eval_rewards/rejected": -5.198413848876953, "eval_runtime": 507.2831, "eval_samples_per_second": 3.418, "eval_steps_per_second": 0.428, "step": 380 }, { "epoch": 0.8815384615384615, "grad_norm": 30.355329796288427, "learning_rate": 3.429865181305894e-07, "logits/chosen": -0.4384227991104126, "logits/rejected": -0.43502819538116455, "logps/chosen": -32.88569259643555, "logps/rejected": -48.558433532714844, "loss": 0.1983, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.7865729928016663, "rewards/margins": 4.326223850250244, "rewards/rejected": -5.112797260284424, "step": 382 }, { "epoch": 0.8861538461538462, "grad_norm": 28.87717615974696, "learning_rate": 3.411117570453091e-07, "logits/chosen": -0.4351966977119446, "logits/rejected": -0.42611631751060486, "logps/chosen": -30.210994720458984, "logps/rejected": -47.79801940917969, "loss": 0.1854, "rewards/accuracies": 0.9722222089767456, "rewards/chosen": -0.96357661485672, "rewards/margins": 4.84887170791626, "rewards/rejected": -5.812448501586914, "step": 384 }, { "epoch": 0.8907692307692308, "grad_norm": 19.86896438382728, "learning_rate": 3.392310686603025e-07, "logits/chosen": -0.40435516834259033, "logits/rejected": -0.40078163146972656, "logps/chosen": -33.531829833984375, "logps/rejected": -45.7469482421875, "loss": 0.1934, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -1.0277481079101562, "rewards/margins": 3.6171319484710693, "rewards/rejected": -4.6448798179626465, "step": 386 }, { "epoch": 0.8953846153846153, "grad_norm": 12.657150462750414, "learning_rate": 3.3734457532425554e-07, "logits/chosen": -0.44205257296562195, "logits/rejected": -0.43028175830841064, "logps/chosen": -27.385295867919922, "logps/rejected": -58.39698028564453, "loss": 0.1758, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.5980458855628967, "rewards/margins": 6.282012939453125, "rewards/rejected": -6.8800578117370605, "step": 388 }, { "epoch": 0.9, "grad_norm": 19.7738166942249, "learning_rate": 3.354523997634969e-07, "logits/chosen": -0.38840749859809875, "logits/rejected": -0.3808763325214386, "logps/chosen": -26.59889030456543, "logps/rejected": -60.10518264770508, "loss": 0.1959, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -1.0057358741760254, "rewards/margins": 5.927921295166016, "rewards/rejected": -6.933656692504883, "step": 390 }, { "epoch": 0.9046153846153846, "grad_norm": 36.26703530825595, "learning_rate": 3.3355466507401374e-07, "logits/chosen": -0.43338432908058167, "logits/rejected": -0.4314180612564087, "logps/chosen": -35.501949310302734, "logps/rejected": -28.30125617980957, "loss": 0.2227, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -0.47697877883911133, "rewards/margins": 3.62644624710083, "rewards/rejected": -4.103425025939941, "step": 392 }, { "epoch": 0.9092307692307692, "grad_norm": 23.419371218956712, "learning_rate": 3.3165149471344394e-07, "logits/chosen": -0.438678503036499, "logits/rejected": -0.436648964881897, "logps/chosen": -28.57487678527832, "logps/rejected": -39.63655090332031, "loss": 0.2015, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -1.0588184595108032, "rewards/margins": 4.30199670791626, "rewards/rejected": -5.360815525054932, "step": 394 }, { "epoch": 0.9138461538461539, "grad_norm": 21.805997093011054, "learning_rate": 3.297430124930444e-07, "logits/chosen": -0.4441942572593689, "logits/rejected": -0.4420095980167389, "logps/chosen": -35.50737380981445, "logps/rejected": -47.934085845947266, "loss": 0.2443, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -1.081557273864746, "rewards/margins": 3.6952524185180664, "rewards/rejected": -4.776809215545654, "step": 396 }, { "epoch": 0.9184615384615384, "grad_norm": 7.97818333672435, "learning_rate": 3.2782934256963647e-07, "logits/chosen": -0.4364481568336487, "logits/rejected": -0.4294630289077759, "logps/chosen": -33.90742492675781, "logps/rejected": -67.54840850830078, "loss": 0.1389, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.8434889316558838, "rewards/margins": 6.298786640167236, "rewards/rejected": -7.142275333404541, "step": 398 }, { "epoch": 0.9230769230769231, "grad_norm": 14.880140389824264, "learning_rate": 3.259106094375289e-07, "logits/chosen": -0.4415682256221771, "logits/rejected": -0.4384162425994873, "logps/chosen": -28.99742317199707, "logps/rejected": -46.49116516113281, "loss": 0.1601, "rewards/accuracies": 0.9722222089767456, "rewards/chosen": -0.6465389728546143, "rewards/margins": 5.060551643371582, "rewards/rejected": -5.707090377807617, "step": 400 }, { "epoch": 0.9230769230769231, "eval_logits/chosen": -0.41892069578170776, "eval_logits/rejected": -0.41379600763320923, "eval_logps/chosen": -30.745765686035156, "eval_logps/rejected": -45.12676239013672, "eval_loss": 0.18738648295402527, "eval_rewards/accuracies": 0.8536866307258606, "eval_rewards/chosen": -0.9436614513397217, "eval_rewards/margins": 4.527709484100342, "eval_rewards/rejected": -5.471370697021484, "eval_runtime": 504.3625, "eval_samples_per_second": 3.438, "eval_steps_per_second": 0.43, "step": 400 }, { "epoch": 0.9276923076923077, "grad_norm": 31.148994983665478, "learning_rate": 3.239869379204189e-07, "logits/chosen": -0.43767163157463074, "logits/rejected": -0.4331532418727875, "logps/chosen": -32.910770416259766, "logps/rejected": -43.170127868652344, "loss": 0.1459, "rewards/accuracies": 0.9722222089767456, "rewards/chosen": -1.2280699014663696, "rewards/margins": 3.950821876525879, "rewards/rejected": -5.178892135620117, "step": 402 }, { "epoch": 0.9323076923076923, "grad_norm": 39.78584916809417, "learning_rate": 3.2205845316327144e-07, "logits/chosen": -0.425650954246521, "logits/rejected": -0.4268127977848053, "logps/chosen": -23.308561325073242, "logps/rejected": -28.359622955322266, "loss": 0.2998, "rewards/accuracies": 0.75, "rewards/chosen": -1.2385773658752441, "rewards/margins": 2.610461950302124, "rewards/rejected": -3.849039316177368, "step": 404 }, { "epoch": 0.936923076923077, "grad_norm": 25.750238525166676, "learning_rate": 3.2012528062417845e-07, "logits/chosen": -0.42968493700027466, "logits/rejected": -0.42483997344970703, "logps/chosen": -28.544315338134766, "logps/rejected": -33.00044250488281, "loss": 0.1541, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -1.0130796432495117, "rewards/margins": 3.6104660034179688, "rewards/rejected": -4.623545169830322, "step": 406 }, { "epoch": 0.9415384615384615, "grad_norm": 17.4537879823929, "learning_rate": 3.1818754606619643e-07, "logits/chosen": -0.40116989612579346, "logits/rejected": -0.3927208185195923, "logps/chosen": -14.82399845123291, "logps/rejected": -42.62455749511719, "loss": 0.2865, "rewards/accuracies": 0.75, "rewards/chosen": -0.4872245192527771, "rewards/margins": 4.121938228607178, "rewards/rejected": -4.6091628074646, "step": 408 }, { "epoch": 0.9461538461538461, "grad_norm": 19.6294139089247, "learning_rate": 3.162453755491655e-07, "logits/chosen": -0.44356614351272583, "logits/rejected": -0.44109952449798584, "logps/chosen": -21.10145378112793, "logps/rejected": -48.5328254699707, "loss": 0.1388, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.7301766872406006, "rewards/margins": 5.231949329376221, "rewards/rejected": -5.962125778198242, "step": 410 }, { "epoch": 0.9507692307692308, "grad_norm": 17.818394576762543, "learning_rate": 3.142988954215079e-07, "logits/chosen": -0.40068677067756653, "logits/rejected": -0.3975728452205658, "logps/chosen": -27.05259132385254, "logps/rejected": -50.10287857055664, "loss": 0.1716, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.9632964730262756, "rewards/margins": 4.797210693359375, "rewards/rejected": -5.760507583618164, "step": 412 }, { "epoch": 0.9553846153846154, "grad_norm": 18.94989487963629, "learning_rate": 3.1234823231200925e-07, "logits/chosen": -0.4617886543273926, "logits/rejected": -0.4470604360103607, "logps/chosen": -24.05147933959961, "logps/rejected": -51.06510543823242, "loss": 0.1524, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.6726754903793335, "rewards/margins": 5.38551139831543, "rewards/rejected": -6.0581865310668945, "step": 414 }, { "epoch": 0.96, "grad_norm": 21.359564369883266, "learning_rate": 3.1039351312157993e-07, "logits/chosen": -0.3944624960422516, "logits/rejected": -0.38715553283691406, "logps/chosen": -28.74252700805664, "logps/rejected": -49.56966781616211, "loss": 0.172, "rewards/accuracies": 0.9722222089767456, "rewards/chosen": -0.9229162931442261, "rewards/margins": 5.164016246795654, "rewards/rejected": -6.08693265914917, "step": 416 }, { "epoch": 0.9646153846153847, "grad_norm": 39.2638634508674, "learning_rate": 3.0843486501499967e-07, "logits/chosen": -0.45682817697525024, "logits/rejected": -0.444223552942276, "logps/chosen": -30.7056884765625, "logps/rejected": -42.09891128540039, "loss": 0.1998, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.8110179901123047, "rewards/margins": 4.732272148132324, "rewards/rejected": -5.543290615081787, "step": 418 }, { "epoch": 0.9692307692307692, "grad_norm": 26.69831878482791, "learning_rate": 3.064724154126449e-07, "logits/chosen": -0.37316325306892395, "logits/rejected": -0.3706890940666199, "logps/chosen": -33.49404525756836, "logps/rejected": -42.253334045410156, "loss": 0.1926, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.9432663321495056, "rewards/margins": 4.801055908203125, "rewards/rejected": -5.744322299957275, "step": 420 }, { "epoch": 0.9692307692307692, "eval_logits/chosen": -0.42235231399536133, "eval_logits/rejected": -0.41760390996932983, "eval_logps/chosen": -30.618261337280273, "eval_logps/rejected": -45.24935531616211, "eval_loss": 0.18495789170265198, "eval_rewards/accuracies": 0.8548387289047241, "eval_rewards/chosen": -0.8799088001251221, "eval_rewards/margins": 4.652754783630371, "eval_rewards/rejected": -5.532663822174072, "eval_runtime": 506.4345, "eval_samples_per_second": 3.424, "eval_steps_per_second": 0.428, "step": 420 }, { "epoch": 0.9738461538461538, "grad_norm": 26.429309245448025, "learning_rate": 3.045062919821995e-07, "logits/chosen": -0.4473799467086792, "logits/rejected": -0.4387081265449524, "logps/chosen": -29.986135482788086, "logps/rejected": -48.23318099975586, "loss": 0.2208, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -0.9130682349205017, "rewards/margins": 4.49908447265625, "rewards/rejected": -5.4121527671813965, "step": 422 }, { "epoch": 0.9784615384615385, "grad_norm": 22.56111748373276, "learning_rate": 3.0253662263034925e-07, "logits/chosen": -0.41079118847846985, "logits/rejected": -0.4088312089443207, "logps/chosen": -33.51829528808594, "logps/rejected": -41.554603576660156, "loss": 0.1672, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.8566321134567261, "rewards/margins": 4.636307716369629, "rewards/rejected": -5.4929399490356445, "step": 424 }, { "epoch": 0.9830769230769231, "grad_norm": 20.23179280158031, "learning_rate": 3.005635354944606e-07, "logits/chosen": -0.428958535194397, "logits/rejected": -0.42861926555633545, "logps/chosen": -34.26374816894531, "logps/rejected": -31.41779899597168, "loss": 0.1709, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -1.4080199003219604, "rewards/margins": 3.372980833053589, "rewards/rejected": -4.78100061416626, "step": 426 }, { "epoch": 0.9876923076923076, "grad_norm": 20.568879603268485, "learning_rate": 2.9858715893424504e-07, "logits/chosen": -0.42916062474250793, "logits/rejected": -0.4187799096107483, "logps/chosen": -23.50701332092285, "logps/rejected": -69.53659057617188, "loss": 0.1165, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.7494051456451416, "rewards/margins": 6.971426486968994, "rewards/rejected": -7.720830917358398, "step": 428 }, { "epoch": 0.9923076923076923, "grad_norm": 14.585516147453433, "learning_rate": 2.966076215234082e-07, "logits/chosen": -0.4945632219314575, "logits/rejected": -0.49019941687583923, "logps/chosen": -21.51183319091797, "logps/rejected": -59.04719924926758, "loss": 0.134, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -1.095296859741211, "rewards/margins": 5.902851581573486, "rewards/rejected": -6.998149394989014, "step": 430 }, { "epoch": 0.9969230769230769, "grad_norm": 22.17191357344303, "learning_rate": 2.94625052041286e-07, "logits/chosen": -0.4531210660934448, "logits/rejected": -0.4497845470905304, "logps/chosen": -29.616464614868164, "logps/rejected": -51.388221740722656, "loss": 0.2234, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.6977284550666809, "rewards/margins": 5.5895586013793945, "rewards/rejected": -6.287287712097168, "step": 432 }, { "epoch": 1.0015384615384615, "grad_norm": 17.608616745764742, "learning_rate": 2.926395794644665e-07, "logits/chosen": -0.3989698588848114, "logits/rejected": -0.3986192047595978, "logps/chosen": -27.283063888549805, "logps/rejected": -35.429542541503906, "loss": 0.1572, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -1.196885347366333, "rewards/margins": 3.4778902530670166, "rewards/rejected": -4.674776077270508, "step": 434 }, { "epoch": 1.0061538461538462, "grad_norm": 13.609500399236131, "learning_rate": 2.906513329583991e-07, "logits/chosen": -0.4256007969379425, "logits/rejected": -0.4220072627067566, "logps/chosen": -25.50336456298828, "logps/rejected": -46.2448616027832, "loss": 0.1607, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.9498477578163147, "rewards/margins": 5.3759613037109375, "rewards/rejected": -6.325809001922607, "step": 436 }, { "epoch": 1.0107692307692309, "grad_norm": 14.35461535388244, "learning_rate": 2.886604418689921e-07, "logits/chosen": -0.41400301456451416, "logits/rejected": -0.3970973491668701, "logps/chosen": -23.69174575805664, "logps/rejected": -63.7423095703125, "loss": 0.1931, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.8411877155303955, "rewards/margins": 6.537564754486084, "rewards/rejected": -7.378751754760742, "step": 438 }, { "epoch": 1.0153846153846153, "grad_norm": 6.080790310742912, "learning_rate": 2.866670357141979e-07, "logits/chosen": -0.4005855917930603, "logits/rejected": -0.3919644057750702, "logps/chosen": -28.573617935180664, "logps/rejected": -47.51435852050781, "loss": 0.1496, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.609701931476593, "rewards/margins": 5.819619655609131, "rewards/rejected": -6.429322242736816, "step": 440 }, { "epoch": 1.0153846153846153, "eval_logits/chosen": -0.4261355400085449, "eval_logits/rejected": -0.4213837683200836, "eval_logps/chosen": -30.299606323242188, "eval_logps/rejected": -45.094974517822266, "eval_loss": 0.18333862721920013, "eval_rewards/accuracies": 0.8525345325469971, "eval_rewards/chosen": -0.7205813527107239, "eval_rewards/margins": 4.734891414642334, "eval_rewards/rejected": -5.455472946166992, "eval_runtime": 504.2212, "eval_samples_per_second": 3.439, "eval_steps_per_second": 0.43, "step": 440 }, { "epoch": 1.02, "grad_norm": 18.54763999700523, "learning_rate": 2.8467124417558737e-07, "logits/chosen": -0.43253931403160095, "logits/rejected": -0.43203675746917725, "logps/chosen": -32.16371154785156, "logps/rejected": -36.97785186767578, "loss": 0.1419, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.694267213344574, "rewards/margins": 4.412461280822754, "rewards/rejected": -5.106728553771973, "step": 442 }, { "epoch": 1.0246153846153847, "grad_norm": 11.624548142854733, "learning_rate": 2.8267319708991253e-07, "logits/chosen": -0.42751240730285645, "logits/rejected": -0.42402034997940063, "logps/chosen": -37.04402542114258, "logps/rejected": -36.21348571777344, "loss": 0.1659, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.44168946146965027, "rewards/margins": 4.51059627532959, "rewards/rejected": -4.9522857666015625, "step": 444 }, { "epoch": 1.0292307692307692, "grad_norm": 15.107744257410477, "learning_rate": 2.806730244406612e-07, "logits/chosen": -0.48664745688438416, "logits/rejected": -0.4811630845069885, "logps/chosen": -30.887107849121094, "logps/rejected": -52.15692138671875, "loss": 0.1411, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.7369892597198486, "rewards/margins": 6.05087947845459, "rewards/rejected": -6.787868976593018, "step": 446 }, { "epoch": 1.0338461538461539, "grad_norm": 11.698550171300518, "learning_rate": 2.786708563496001e-07, "logits/chosen": -0.4510349631309509, "logits/rejected": -0.44699403643608093, "logps/chosen": -35.29894256591797, "logps/rejected": -40.9118537902832, "loss": 0.1057, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.3786778450012207, "rewards/margins": 4.980160713195801, "rewards/rejected": -5.3588385581970215, "step": 448 }, { "epoch": 1.0384615384615385, "grad_norm": 15.94482902920788, "learning_rate": 2.7666682306830994e-07, "logits/chosen": -0.45571020245552063, "logits/rejected": -0.45306020975112915, "logps/chosen": -29.1939697265625, "logps/rejected": -29.48745346069336, "loss": 0.1438, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.4602106511592865, "rewards/margins": 4.023096084594727, "rewards/rejected": -4.483306884765625, "step": 450 }, { "epoch": 1.043076923076923, "grad_norm": 15.189711799928178, "learning_rate": 2.746610549697119e-07, "logits/chosen": -0.4809485077857971, "logits/rejected": -0.4782992899417877, "logps/chosen": -29.195072174072266, "logps/rejected": -39.80377197265625, "loss": 0.1365, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.11167682707309723, "rewards/margins": 4.948356628417969, "rewards/rejected": -4.836679458618164, "step": 452 }, { "epoch": 1.0476923076923077, "grad_norm": 14.46918791976569, "learning_rate": 2.7265368253958615e-07, "logits/chosen": -0.42312443256378174, "logits/rejected": -0.4153350591659546, "logps/chosen": -23.33478546142578, "logps/rejected": -42.642574310302734, "loss": 0.1291, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.3456007242202759, "rewards/margins": 4.775884628295898, "rewards/rejected": -5.121485233306885, "step": 454 }, { "epoch": 1.0523076923076924, "grad_norm": 7.6608330031374035, "learning_rate": 2.706448363680831e-07, "logits/chosen": -0.4585114121437073, "logits/rejected": -0.45315349102020264, "logps/chosen": -27.26013946533203, "logps/rejected": -54.716033935546875, "loss": 0.0577, "rewards/accuracies": 0.9722222089767456, "rewards/chosen": -0.4251694977283478, "rewards/margins": 5.789327144622803, "rewards/rejected": -6.214497089385986, "step": 456 }, { "epoch": 1.0569230769230769, "grad_norm": 7.0897688772043574, "learning_rate": 2.686346471412277e-07, "logits/chosen": -0.4366828203201294, "logits/rejected": -0.42981481552124023, "logps/chosen": -20.309011459350586, "logps/rejected": -46.80522155761719, "loss": 0.0877, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.13008694350719452, "rewards/margins": 5.475755214691162, "rewards/rejected": -5.605840682983398, "step": 458 }, { "epoch": 1.0615384615384615, "grad_norm": 17.303889361232052, "learning_rate": 2.6662324563241805e-07, "logits/chosen": -0.4282498061656952, "logits/rejected": -0.42142820358276367, "logps/chosen": -28.0141658782959, "logps/rejected": -48.67052459716797, "loss": 0.1422, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.3313429057598114, "rewards/margins": 5.242645740509033, "rewards/rejected": -5.573988914489746, "step": 460 }, { "epoch": 1.0615384615384615, "eval_logits/chosen": -0.4306621849536896, "eval_logits/rejected": -0.42571890354156494, "eval_logps/chosen": -29.58946990966797, "eval_logps/rejected": -44.63399887084961, "eval_loss": 0.18279969692230225, "eval_rewards/accuracies": 0.8525345325469971, "eval_rewards/chosen": -0.36551275849342346, "eval_rewards/margins": 4.85947322845459, "eval_rewards/rejected": -5.2249860763549805, "eval_runtime": 539.2481, "eval_samples_per_second": 3.216, "eval_steps_per_second": 0.402, "step": 460 }, { "epoch": 1.0661538461538462, "grad_norm": 6.803154365699166, "learning_rate": 2.6461076269391713e-07, "logits/chosen": -0.40221667289733887, "logits/rejected": -0.39481696486473083, "logps/chosen": -34.92960739135742, "logps/rejected": -50.57633590698242, "loss": 0.0862, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.914921224117279, "rewards/margins": 5.707345962524414, "rewards/rejected": -6.62226676940918, "step": 462 }, { "epoch": 1.0707692307692307, "grad_norm": 34.36171997675168, "learning_rate": 2.625973292483409e-07, "logits/chosen": -0.4790021479129791, "logits/rejected": -0.46986639499664307, "logps/chosen": -39.99270248413086, "logps/rejected": -48.73933029174805, "loss": 0.1497, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.1914079189300537, "rewards/margins": 5.431238174438477, "rewards/rejected": -5.622646331787109, "step": 464 }, { "epoch": 1.0753846153846154, "grad_norm": 22.90677988420913, "learning_rate": 2.6058307628014065e-07, "logits/chosen": -0.4648301899433136, "logits/rejected": -0.45539039373397827, "logps/chosen": -29.436492919921875, "logps/rejected": -36.52954864501953, "loss": 0.0937, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": 0.13956691324710846, "rewards/margins": 4.977226734161377, "rewards/rejected": -4.83765983581543, "step": 466 }, { "epoch": 1.08, "grad_norm": 11.948410817522065, "learning_rate": 2.5856813482708217e-07, "logits/chosen": -0.38539624214172363, "logits/rejected": -0.3782796263694763, "logps/chosen": -31.06607437133789, "logps/rejected": -37.78165817260742, "loss": 0.1272, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": 0.4081376791000366, "rewards/margins": 4.731931686401367, "rewards/rejected": -4.323793888092041, "step": 468 }, { "epoch": 1.0846153846153845, "grad_norm": 15.403601313739959, "learning_rate": 2.565526359717206e-07, "logits/chosen": -0.40367501974105835, "logits/rejected": -0.3996252715587616, "logps/chosen": -30.526506423950195, "logps/rejected": -46.99552917480469, "loss": 0.1757, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.6265747547149658, "rewards/margins": 5.05012321472168, "rewards/rejected": -5.676698684692383, "step": 470 }, { "epoch": 1.0892307692307692, "grad_norm": 9.7719232718892, "learning_rate": 2.545367108328731e-07, "logits/chosen": -0.4403303861618042, "logits/rejected": -0.4350850284099579, "logps/chosen": -38.43405532836914, "logps/rejected": -36.40499496459961, "loss": 0.1381, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.3411208391189575, "rewards/margins": 4.477426528930664, "rewards/rejected": -4.81854772567749, "step": 472 }, { "epoch": 1.093846153846154, "grad_norm": 13.916006221104306, "learning_rate": 2.525204905570889e-07, "logits/chosen": -0.454936146736145, "logits/rejected": -0.44734886288642883, "logps/chosen": -36.918949127197266, "logps/rejected": -57.40822982788086, "loss": 0.1091, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": 0.12125682830810547, "rewards/margins": 6.582164764404297, "rewards/rejected": -6.460907459259033, "step": 474 }, { "epoch": 1.0984615384615384, "grad_norm": 5.508289669315093, "learning_rate": 2.505041063101171e-07, "logits/chosen": -0.46973931789398193, "logits/rejected": -0.4579940438270569, "logps/chosen": -31.71024513244629, "logps/rejected": -48.37898254394531, "loss": 0.1312, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.06245845556259155, "rewards/margins": 5.579024314880371, "rewards/rejected": -5.641482830047607, "step": 476 }, { "epoch": 1.103076923076923, "grad_norm": 8.237217897421921, "learning_rate": 2.4848768926837466e-07, "logits/chosen": -0.4385973811149597, "logits/rejected": -0.42893168330192566, "logps/chosen": -27.372316360473633, "logps/rejected": -65.15902709960938, "loss": 0.1236, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.8572553992271423, "rewards/margins": 6.578119277954102, "rewards/rejected": -7.435373783111572, "step": 478 }, { "epoch": 1.1076923076923078, "grad_norm": 10.248326533412026, "learning_rate": 2.464713706104113e-07, "logits/chosen": -0.45089420676231384, "logits/rejected": -0.4460269510746002, "logps/chosen": -30.84375762939453, "logps/rejected": -47.04542541503906, "loss": 0.1242, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.33427292108535767, "rewards/margins": 5.303442478179932, "rewards/rejected": -5.637715816497803, "step": 480 }, { "epoch": 1.1076923076923078, "eval_logits/chosen": -0.43585485219955444, "eval_logits/rejected": -0.43083125352859497, "eval_logps/chosen": -29.441936492919922, "eval_logps/rejected": -44.649879455566406, "eval_loss": 0.18123859167099, "eval_rewards/accuracies": 0.8513824939727783, "eval_rewards/chosen": -0.2917479872703552, "eval_rewards/margins": 4.9411749839782715, "eval_rewards/rejected": -5.23292350769043, "eval_runtime": 576.1489, "eval_samples_per_second": 3.01, "eval_steps_per_second": 0.377, "step": 480 }, { "epoch": 1.1123076923076922, "grad_norm": 14.30436745905321, "learning_rate": 2.444552815083767e-07, "logits/chosen": -0.3899229168891907, "logits/rejected": -0.388058066368103, "logps/chosen": -27.3692626953125, "logps/rejected": -33.886592864990234, "loss": 0.1522, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.14449940621852875, "rewards/margins": 4.185901165008545, "rewards/rejected": -4.330400466918945, "step": 482 }, { "epoch": 1.116923076923077, "grad_norm": 13.829655930752596, "learning_rate": 2.4243955311948693e-07, "logits/chosen": -0.3911668658256531, "logits/rejected": -0.37596791982650757, "logps/chosen": -21.08230972290039, "logps/rejected": -52.7446403503418, "loss": 0.1409, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.24917255342006683, "rewards/margins": 6.088818550109863, "rewards/rejected": -6.337990760803223, "step": 484 }, { "epoch": 1.1215384615384616, "grad_norm": 8.129554422889694, "learning_rate": 2.4042431657749115e-07, "logits/chosen": -0.4462336003780365, "logits/rejected": -0.43516308069229126, "logps/chosen": -30.197654724121094, "logps/rejected": -73.82144927978516, "loss": 0.1087, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.12235673516988754, "rewards/margins": 8.374778747558594, "rewards/rejected": -8.497135162353516, "step": 486 }, { "epoch": 1.126153846153846, "grad_norm": 9.775158225779034, "learning_rate": 2.384097029841419e-07, "logits/chosen": -0.4285200834274292, "logits/rejected": -0.41790592670440674, "logps/chosen": -30.823646545410156, "logps/rejected": -43.81809997558594, "loss": 0.1018, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": 0.4461793899536133, "rewards/margins": 5.640202522277832, "rewards/rejected": -5.194023132324219, "step": 488 }, { "epoch": 1.1307692307692307, "grad_norm": 10.213931426481256, "learning_rate": 2.3639584340066544e-07, "logits/chosen": -0.4395972490310669, "logits/rejected": -0.43329057097435, "logps/chosen": -22.31542205810547, "logps/rejected": -51.99763107299805, "loss": 0.1601, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -0.2867180407047272, "rewards/margins": 5.327702522277832, "rewards/rejected": -5.614420413970947, "step": 490 }, { "epoch": 1.1353846153846154, "grad_norm": 11.837139694007076, "learning_rate": 2.3438286883923539e-07, "logits/chosen": -0.46829140186309814, "logits/rejected": -0.4594018757343292, "logps/chosen": -33.9882698059082, "logps/rejected": -44.247772216796875, "loss": 0.1408, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.38947921991348267, "rewards/margins": 5.819252967834473, "rewards/rejected": -5.429773330688477, "step": 492 }, { "epoch": 1.1400000000000001, "grad_norm": 12.086720030696792, "learning_rate": 2.323709102544506e-07, "logits/chosen": -0.442399263381958, "logits/rejected": -0.4426099359989166, "logps/chosen": -29.08201789855957, "logps/rejected": -22.29375457763672, "loss": 0.1581, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 1.0119986534118652, "rewards/margins": 3.4419338703155518, "rewards/rejected": -2.4299352169036865, "step": 494 }, { "epoch": 1.1446153846153846, "grad_norm": 23.761778184813924, "learning_rate": 2.3036009853481474e-07, "logits/chosen": -0.46128442883491516, "logits/rejected": -0.4482616186141968, "logps/chosen": -25.646957397460938, "logps/rejected": -66.59310150146484, "loss": 0.1582, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.34237024188041687, "rewards/margins": 7.462684631347656, "rewards/rejected": -7.8050537109375, "step": 496 }, { "epoch": 1.1492307692307693, "grad_norm": 14.275517293356359, "learning_rate": 2.283505644942223e-07, "logits/chosen": -0.38553929328918457, "logits/rejected": -0.3888304531574249, "logps/chosen": -20.24867057800293, "logps/rejected": -35.16265106201172, "loss": 0.1635, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.14710961282253265, "rewards/margins": 4.560574531555176, "rewards/rejected": -4.41346549987793, "step": 498 }, { "epoch": 1.1538461538461537, "grad_norm": 10.89483094880467, "learning_rate": 2.2634243886344781e-07, "logits/chosen": -0.41619181632995605, "logits/rejected": -0.4102987051010132, "logps/chosen": -25.553611755371094, "logps/rejected": -45.57319259643555, "loss": 0.1348, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.6297872066497803, "rewards/margins": 6.220609664916992, "rewards/rejected": -6.85039758682251, "step": 500 }, { "epoch": 1.1538461538461537, "eval_logits/chosen": -0.4352647662162781, "eval_logits/rejected": -0.4305744171142578, "eval_logps/chosen": -29.774337768554688, "eval_logps/rejected": -45.14006805419922, "eval_loss": 0.1789507418870926, "eval_rewards/accuracies": 0.8513824939727783, "eval_rewards/chosen": -0.45794734358787537, "eval_rewards/margins": 5.0200724601745605, "eval_rewards/rejected": -5.4780192375183105, "eval_runtime": 560.5025, "eval_samples_per_second": 3.094, "eval_steps_per_second": 0.387, "step": 500 }, { "epoch": 1.1584615384615384, "grad_norm": 17.07914047402375, "learning_rate": 2.2433585228164115e-07, "logits/chosen": -0.476253867149353, "logits/rejected": -0.4664095640182495, "logps/chosen": -35.093570709228516, "logps/rejected": -48.185882568359375, "loss": 0.1547, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.3796059191226959, "rewards/margins": 5.511943340301514, "rewards/rejected": -5.891549587249756, "step": 502 }, { "epoch": 1.1630769230769231, "grad_norm": 14.032383137471154, "learning_rate": 2.2233093528782938e-07, "logits/chosen": -0.40187719464302063, "logits/rejected": -0.39918211102485657, "logps/chosen": -34.03489685058594, "logps/rejected": -47.03575134277344, "loss": 0.1083, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.1774817854166031, "rewards/margins": 5.615666389465332, "rewards/rejected": -5.793148040771484, "step": 504 }, { "epoch": 1.1676923076923078, "grad_norm": 8.15808394680485, "learning_rate": 2.2032781831242367e-07, "logits/chosen": -0.4417678415775299, "logits/rejected": -0.43445712327957153, "logps/chosen": -15.875346183776855, "logps/rejected": -39.37602233886719, "loss": 0.1393, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.05537441745400429, "rewards/margins": 5.128325462341309, "rewards/rejected": -5.072950839996338, "step": 506 }, { "epoch": 1.1723076923076923, "grad_norm": 23.058443462217795, "learning_rate": 2.183266316687347e-07, "logits/chosen": -0.47501933574676514, "logits/rejected": -0.47421959042549133, "logps/chosen": -27.21249771118164, "logps/rejected": -26.752079010009766, "loss": 0.1963, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.015099167823791504, "rewards/margins": 3.6199541091918945, "rewards/rejected": -3.6048550605773926, "step": 508 }, { "epoch": 1.176923076923077, "grad_norm": 7.474201792038496, "learning_rate": 2.16327505544495e-07, "logits/chosen": -0.4653564989566803, "logits/rejected": -0.4561309218406677, "logps/chosen": -27.21685028076172, "logps/rejected": -59.89400863647461, "loss": 0.101, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.24764370918273926, "rewards/margins": 7.359438419342041, "rewards/rejected": -7.607082843780518, "step": 510 }, { "epoch": 1.1815384615384614, "grad_norm": 14.977920616325804, "learning_rate": 2.143305699933892e-07, "logits/chosen": -0.4481993317604065, "logits/rejected": -0.4424552917480469, "logps/chosen": -25.842750549316406, "logps/rejected": -35.22297286987305, "loss": 0.1513, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.20215344429016113, "rewards/margins": 4.765042781829834, "rewards/rejected": -4.967195510864258, "step": 512 }, { "epoch": 1.1861538461538461, "grad_norm": 8.871925218116283, "learning_rate": 2.1233595492659382e-07, "logits/chosen": -0.4569311738014221, "logits/rejected": -0.4524325132369995, "logps/chosen": -36.34779739379883, "logps/rejected": -38.69430923461914, "loss": 0.0915, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.695059061050415, "rewards/margins": 4.799535751342773, "rewards/rejected": -5.494595050811768, "step": 514 }, { "epoch": 1.1907692307692308, "grad_norm": 8.587436296101457, "learning_rate": 2.1034379010432542e-07, "logits/chosen": -0.4389886260032654, "logits/rejected": -0.44006067514419556, "logps/chosen": -26.863874435424805, "logps/rejected": -32.41796875, "loss": 0.1397, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.27708467841148376, "rewards/margins": 4.265104293823242, "rewards/rejected": -4.542189121246338, "step": 516 }, { "epoch": 1.1953846153846155, "grad_norm": 15.420997688681869, "learning_rate": 2.0835420512739957e-07, "logits/chosen": -0.46533700823783875, "logits/rejected": -0.4506280720233917, "logps/chosen": -25.814783096313477, "logps/rejected": -70.91129302978516, "loss": 0.1224, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.07975815236568451, "rewards/margins": 8.272908210754395, "rewards/rejected": -8.352665901184082, "step": 518 }, { "epoch": 1.2, "grad_norm": 18.357638459147704, "learning_rate": 2.0636732942879917e-07, "logits/chosen": -0.42653000354766846, "logits/rejected": -0.4225360155105591, "logps/chosen": -29.01532745361328, "logps/rejected": -49.411109924316406, "loss": 0.1179, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.15314319729804993, "rewards/margins": 5.538592338562012, "rewards/rejected": -5.385449409484863, "step": 520 }, { "epoch": 1.2, "eval_logits/chosen": -0.4414067566394806, "eval_logits/rejected": -0.4365512728691101, "eval_logps/chosen": -29.9921875, "eval_logps/rejected": -45.457550048828125, "eval_loss": 0.17792561650276184, "eval_rewards/accuracies": 0.8548387289047241, "eval_rewards/chosen": -0.5668714642524719, "eval_rewards/margins": 5.069887638092041, "eval_rewards/rejected": -5.6367597579956055, "eval_runtime": 502.7708, "eval_samples_per_second": 3.449, "eval_steps_per_second": 0.432, "step": 520 }, { "epoch": 1.2046153846153846, "grad_norm": 6.246838517522184, "learning_rate": 2.0438329226525415e-07, "logits/chosen": -0.44413790106773376, "logits/rejected": -0.42687925696372986, "logps/chosen": -34.58333206176758, "logps/rejected": -33.14214324951172, "loss": 0.1034, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.2999877333641052, "rewards/margins": 4.772705554962158, "rewards/rejected": -4.472717761993408, "step": 522 }, { "epoch": 1.209230769230769, "grad_norm": 14.056782247704538, "learning_rate": 2.0240222270883288e-07, "logits/chosen": -0.431974321603775, "logits/rejected": -0.4254220724105835, "logps/chosen": -23.886573791503906, "logps/rejected": -55.92216110229492, "loss": 0.1599, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.13437525928020477, "rewards/margins": 6.35684061050415, "rewards/rejected": -6.2224650382995605, "step": 524 }, { "epoch": 1.2138461538461538, "grad_norm": 3.5861499566842516, "learning_rate": 2.0042424963854542e-07, "logits/chosen": -0.46107277274131775, "logits/rejected": -0.4549606144428253, "logps/chosen": -29.465402603149414, "logps/rejected": -65.7586669921875, "loss": 0.0735, "rewards/accuracies": 0.9722222089767456, "rewards/chosen": -0.5647941827774048, "rewards/margins": 6.686285018920898, "rewards/rejected": -7.25108003616333, "step": 526 }, { "epoch": 1.2184615384615385, "grad_norm": 8.511072994659669, "learning_rate": 1.9844950173195883e-07, "logits/chosen": -0.4436500072479248, "logits/rejected": -0.4398376941680908, "logps/chosen": -27.61098861694336, "logps/rejected": -40.1098518371582, "loss": 0.0992, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.4000561535358429, "rewards/margins": 4.830748558044434, "rewards/rejected": -5.230805397033691, "step": 528 }, { "epoch": 1.2230769230769232, "grad_norm": 13.317079117277856, "learning_rate": 1.964781074568265e-07, "logits/chosen": -0.42789122462272644, "logits/rejected": -0.42622682452201843, "logps/chosen": -28.192739486694336, "logps/rejected": -34.41347885131836, "loss": 0.1237, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.43256738781929016, "rewards/margins": 5.318607807159424, "rewards/rejected": -5.7511749267578125, "step": 530 }, { "epoch": 1.2276923076923076, "grad_norm": 7.273203543027733, "learning_rate": 1.9451019506273018e-07, "logits/chosen": -0.4606661796569824, "logits/rejected": -0.45366737246513367, "logps/chosen": -23.910785675048828, "logps/rejected": -45.56764221191406, "loss": 0.1522, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.0013485749950632453, "rewards/margins": 5.501904487609863, "rewards/rejected": -5.5032525062561035, "step": 532 }, { "epoch": 1.2323076923076923, "grad_norm": 8.484520032770613, "learning_rate": 1.9254589257273712e-07, "logits/chosen": -0.4633462727069855, "logits/rejected": -0.4581097364425659, "logps/chosen": -22.684478759765625, "logps/rejected": -43.12580490112305, "loss": 0.1041, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.39974701404571533, "rewards/margins": 6.006505489349365, "rewards/rejected": -6.406252384185791, "step": 534 }, { "epoch": 1.236923076923077, "grad_norm": 8.61504165644796, "learning_rate": 1.9058532777507141e-07, "logits/chosen": -0.4871542155742645, "logits/rejected": -0.47969764471054077, "logps/chosen": -28.18987274169922, "logps/rejected": -43.51951599121094, "loss": 0.0886, "rewards/accuracies": 0.9722222089767456, "rewards/chosen": 0.20687071979045868, "rewards/margins": 5.764058589935303, "rewards/rejected": -5.557188034057617, "step": 536 }, { "epoch": 1.2415384615384615, "grad_norm": 17.475122434333045, "learning_rate": 1.886286282148002e-07, "logits/chosen": -0.4087609648704529, "logits/rejected": -0.39553922414779663, "logps/chosen": -26.770692825317383, "logps/rejected": -54.18208312988281, "loss": 0.1552, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.6067285537719727, "rewards/margins": 5.93799352645874, "rewards/rejected": -6.544721603393555, "step": 538 }, { "epoch": 1.2461538461538462, "grad_norm": 5.957799871266853, "learning_rate": 1.8667592118553693e-07, "logits/chosen": -0.41777941584587097, "logits/rejected": -0.4128054082393646, "logps/chosen": -31.925973892211914, "logps/rejected": -49.55064010620117, "loss": 0.1094, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.34601137042045593, "rewards/margins": 6.164337158203125, "rewards/rejected": -6.510347843170166, "step": 540 }, { "epoch": 1.2461538461538462, "eval_logits/chosen": -0.43877169489860535, "eval_logits/rejected": -0.43405881524086, "eval_logps/chosen": -30.319114685058594, "eval_logps/rejected": -45.845218658447266, "eval_loss": 0.17548762261867523, "eval_rewards/accuracies": 0.8594470024108887, "eval_rewards/chosen": -0.7303365468978882, "eval_rewards/margins": 5.100256443023682, "eval_rewards/rejected": -5.830592632293701, "eval_runtime": 533.5477, "eval_samples_per_second": 3.25, "eval_steps_per_second": 0.407, "step": 540 }, { "epoch": 1.2507692307692309, "grad_norm": 23.33076194006214, "learning_rate": 1.8472733372115956e-07, "logits/chosen": -0.41587164998054504, "logits/rejected": -0.41326984763145447, "logps/chosen": -30.331064224243164, "logps/rejected": -50.16444396972656, "loss": 0.1582, "rewards/accuracies": 1.0, "rewards/chosen": -0.6089301109313965, "rewards/margins": 6.805273056030273, "rewards/rejected": -7.41420316696167, "step": 542 }, { "epoch": 1.2553846153846153, "grad_norm": 9.486708288913684, "learning_rate": 1.8278299258754692e-07, "logits/chosen": -0.45922935009002686, "logits/rejected": -0.44640716910362244, "logps/chosen": -29.84143829345703, "logps/rejected": -80.63742065429688, "loss": 0.1533, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -1.0425394773483276, "rewards/margins": 8.659127235412598, "rewards/rejected": -9.701665878295898, "step": 544 }, { "epoch": 1.26, "grad_norm": 8.415595976698144, "learning_rate": 1.808430242743316e-07, "logits/chosen": -0.391701340675354, "logits/rejected": -0.3901844918727875, "logps/chosen": -31.800350189208984, "logps/rejected": -41.65314483642578, "loss": 0.1559, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.42311179637908936, "rewards/margins": 5.024416446685791, "rewards/rejected": -5.44752836227417, "step": 546 }, { "epoch": 1.2646153846153847, "grad_norm": 13.520722677312147, "learning_rate": 1.7890755498667104e-07, "logits/chosen": -0.48314183950424194, "logits/rejected": -0.47610917687416077, "logps/chosen": -23.926097869873047, "logps/rejected": -34.52913284301758, "loss": 0.1144, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.49489152431488037, "rewards/margins": 4.849336624145508, "rewards/rejected": -5.344228267669678, "step": 548 }, { "epoch": 1.2692307692307692, "grad_norm": 15.795932895868892, "learning_rate": 1.7697671063703756e-07, "logits/chosen": -0.41217005252838135, "logits/rejected": -0.40901029109954834, "logps/chosen": -23.710527420043945, "logps/rejected": -28.55902862548828, "loss": 0.129, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.5014798641204834, "rewards/margins": 3.887040138244629, "rewards/rejected": -4.388520240783691, "step": 550 }, { "epoch": 1.2738461538461539, "grad_norm": 21.469856484577797, "learning_rate": 1.750506168370267e-07, "logits/chosen": -0.46302923560142517, "logits/rejected": -0.4612545967102051, "logps/chosen": -28.68931770324707, "logps/rejected": -34.03602981567383, "loss": 0.1737, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.16566115617752075, "rewards/margins": 4.427058219909668, "rewards/rejected": -4.592720031738281, "step": 552 }, { "epoch": 1.2784615384615385, "grad_norm": 14.161394479189466, "learning_rate": 1.7312939888918594e-07, "logits/chosen": -0.48352310061454773, "logits/rejected": -0.47621166706085205, "logps/chosen": -35.41975402832031, "logps/rejected": -61.59049606323242, "loss": 0.0835, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -1.2020230293273926, "rewards/margins": 7.048378944396973, "rewards/rejected": -8.250401496887207, "step": 554 }, { "epoch": 1.283076923076923, "grad_norm": 11.763961853590274, "learning_rate": 1.712131817788628e-07, "logits/chosen": -0.39804789423942566, "logits/rejected": -0.39240336418151855, "logps/chosen": -32.114383697509766, "logps/rejected": -43.75135040283203, "loss": 0.1478, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -1.227245807647705, "rewards/margins": 4.835330486297607, "rewards/rejected": -6.062576770782471, "step": 556 }, { "epoch": 1.2876923076923077, "grad_norm": 11.746882908808615, "learning_rate": 1.693020901660738e-07, "logits/chosen": -0.43442314863204956, "logits/rejected": -0.426601380109787, "logps/chosen": -29.56785011291504, "logps/rejected": -51.36748123168945, "loss": 0.069, "rewards/accuracies": 0.9722222089767456, "rewards/chosen": -0.3340129852294922, "rewards/margins": 6.56093692779541, "rewards/rejected": -6.894948959350586, "step": 558 }, { "epoch": 1.2923076923076924, "grad_norm": 17.957849070684656, "learning_rate": 1.6739624837739518e-07, "logits/chosen": -0.46732431650161743, "logits/rejected": -0.4640684723854065, "logps/chosen": -38.6497802734375, "logps/rejected": -40.31655502319336, "loss": 0.1883, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.5640177130699158, "rewards/margins": 5.062933444976807, "rewards/rejected": -5.626951217651367, "step": 560 }, { "epoch": 1.2923076923076924, "eval_logits/chosen": -0.44211652874946594, "eval_logits/rejected": -0.4374195635318756, "eval_logps/chosen": -30.299867630004883, "eval_logps/rejected": -46.00411605834961, "eval_loss": 0.17604655027389526, "eval_rewards/accuracies": 0.8548387289047241, "eval_rewards/chosen": -0.7207121849060059, "eval_rewards/margins": 5.189333438873291, "eval_rewards/rejected": -5.9100446701049805, "eval_runtime": 525.7629, "eval_samples_per_second": 3.298, "eval_steps_per_second": 0.413, "step": 560 }, { "epoch": 1.2969230769230768, "grad_norm": 6.3060465486651855, "learning_rate": 1.6549578039787434e-07, "logits/chosen": -0.43606239557266235, "logits/rejected": -0.4302014410495758, "logps/chosen": -32.69771957397461, "logps/rejected": -55.64405822753906, "loss": 0.1058, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.6670701503753662, "rewards/margins": 6.244661331176758, "rewards/rejected": -6.911731243133545, "step": 562 }, { "epoch": 1.3015384615384615, "grad_norm": 8.175626005583243, "learning_rate": 1.6360080986296384e-07, "logits/chosen": -0.48304304480552673, "logits/rejected": -0.47264793515205383, "logps/chosen": -25.004295349121094, "logps/rejected": -46.13277053833008, "loss": 0.142, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.5224790573120117, "rewards/margins": 5.995500564575195, "rewards/rejected": -6.517979145050049, "step": 564 }, { "epoch": 1.3061538461538462, "grad_norm": 7.182626017610704, "learning_rate": 1.6171146005047894e-07, "logits/chosen": -0.4484933316707611, "logits/rejected": -0.4458969533443451, "logps/chosen": -38.01683807373047, "logps/rejected": -55.08387756347656, "loss": 0.1369, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.9473094940185547, "rewards/margins": 6.618728160858154, "rewards/rejected": -7.566038131713867, "step": 566 }, { "epoch": 1.3107692307692307, "grad_norm": 23.65905318183108, "learning_rate": 1.5982785387257694e-07, "logits/chosen": -0.4703744351863861, "logits/rejected": -0.4685504138469696, "logps/chosen": -26.94400978088379, "logps/rejected": -41.18229675292969, "loss": 0.1537, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -1.0912222862243652, "rewards/margins": 4.332563877105713, "rewards/rejected": -5.42378568649292, "step": 568 }, { "epoch": 1.3153846153846154, "grad_norm": 19.208564099124775, "learning_rate": 1.5795011386776159e-07, "logits/chosen": -0.3927058279514313, "logits/rejected": -0.3895053565502167, "logps/chosen": -32.23387145996094, "logps/rejected": -41.6467170715332, "loss": 0.1437, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.8561339378356934, "rewards/margins": 4.904449939727783, "rewards/rejected": -5.760583877563477, "step": 570 }, { "epoch": 1.32, "grad_norm": 7.6455588339340075, "learning_rate": 1.560783621929113e-07, "logits/chosen": -0.4686591327190399, "logits/rejected": -0.4682163596153259, "logps/chosen": -33.203792572021484, "logps/rejected": -26.867284774780273, "loss": 0.1034, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.24691404402256012, "rewards/margins": 3.709285259246826, "rewards/rejected": -3.9561996459960938, "step": 572 }, { "epoch": 1.3246153846153845, "grad_norm": 38.63928409392581, "learning_rate": 1.5421272061533177e-07, "logits/chosen": -0.5254351496696472, "logits/rejected": -0.5237731337547302, "logps/chosen": -25.478132247924805, "logps/rejected": -37.34519958496094, "loss": 0.2187, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": -0.46039220690727234, "rewards/margins": 3.8166046142578125, "rewards/rejected": -4.276997089385986, "step": 574 }, { "epoch": 1.3292307692307692, "grad_norm": 18.54196074276664, "learning_rate": 1.5235331050483513e-07, "logits/chosen": -0.41437721252441406, "logits/rejected": -0.41407954692840576, "logps/chosen": -33.27976989746094, "logps/rejected": -36.62285614013672, "loss": 0.14, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -1.3867942094802856, "rewards/margins": 3.8547847270965576, "rewards/rejected": -5.241579055786133, "step": 576 }, { "epoch": 1.333846153846154, "grad_norm": 8.32902585608401, "learning_rate": 1.5050025282584327e-07, "logits/chosen": -0.44869646430015564, "logits/rejected": -0.44119614362716675, "logps/chosen": -33.84681701660156, "logps/rejected": -52.08148956298828, "loss": 0.0976, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.49174821376800537, "rewards/margins": 6.421357154846191, "rewards/rejected": -6.91310453414917, "step": 578 }, { "epoch": 1.3384615384615386, "grad_norm": 5.550203240932015, "learning_rate": 1.4865366812951921e-07, "logits/chosen": -0.4623647928237915, "logits/rejected": -0.45940276980400085, "logps/chosen": -26.25106430053711, "logps/rejected": -27.46146011352539, "loss": 0.13, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.1943831741809845, "rewards/margins": 4.196011066436768, "rewards/rejected": -4.39039421081543, "step": 580 }, { "epoch": 1.3384615384615386, "eval_logits/chosen": -0.4482934772968292, "eval_logits/rejected": -0.4432106912136078, "eval_logps/chosen": -30.489978790283203, "eval_logps/rejected": -46.32124328613281, "eval_loss": 0.17471282184123993, "eval_rewards/accuracies": 0.8582949042320251, "eval_rewards/chosen": -0.8157680630683899, "eval_rewards/margins": 5.252838134765625, "eval_rewards/rejected": -6.068605422973633, "eval_runtime": 515.6269, "eval_samples_per_second": 3.363, "eval_steps_per_second": 0.421, "step": 580 }, { "epoch": 1.343076923076923, "grad_norm": 8.043354124759, "learning_rate": 1.4681367654592446e-07, "logits/chosen": -0.47537389397621155, "logits/rejected": -0.47570711374282837, "logps/chosen": -32.6854248046875, "logps/rejected": -34.94247817993164, "loss": 0.0651, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.2593272030353546, "rewards/margins": 4.336564064025879, "rewards/rejected": -4.595890522003174, "step": 582 }, { "epoch": 1.3476923076923077, "grad_norm": 8.969876191247874, "learning_rate": 1.4498039777620353e-07, "logits/chosen": -0.47895950078964233, "logits/rejected": -0.476862370967865, "logps/chosen": -44.07612609863281, "logps/rejected": -53.13533020019531, "loss": 0.1219, "rewards/accuracies": 1.0, "rewards/chosen": -0.2392204850912094, "rewards/margins": 5.760405540466309, "rewards/rejected": -5.999626159667969, "step": 584 }, { "epoch": 1.3523076923076922, "grad_norm": 6.462268980996396, "learning_rate": 1.4315395108479728e-07, "logits/chosen": -0.4445556402206421, "logits/rejected": -0.4383234977722168, "logps/chosen": -26.974239349365234, "logps/rejected": -40.67680358886719, "loss": 0.1056, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.9164202809333801, "rewards/margins": 4.923435688018799, "rewards/rejected": -5.839855194091797, "step": 586 }, { "epoch": 1.356923076923077, "grad_norm": 18.623980288558833, "learning_rate": 1.4133445529168365e-07, "logits/chosen": -0.4315257668495178, "logits/rejected": -0.427613228559494, "logps/chosen": -35.48088836669922, "logps/rejected": -50.93061065673828, "loss": 0.0945, "rewards/accuracies": 1.0, "rewards/chosen": -0.884118378162384, "rewards/margins": 6.591081619262695, "rewards/rejected": -7.475199222564697, "step": 588 }, { "epoch": 1.3615384615384616, "grad_norm": 5.260142072193007, "learning_rate": 1.395220287646483e-07, "logits/chosen": -0.42319419980049133, "logits/rejected": -0.4161040186882019, "logps/chosen": -33.856258392333984, "logps/rejected": -49.7766227722168, "loss": 0.1056, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -1.002943992614746, "rewards/margins": 6.032007217407227, "rewards/rejected": -7.034951210021973, "step": 590 }, { "epoch": 1.3661538461538463, "grad_norm": 7.692538406087756, "learning_rate": 1.377167894115837e-07, "logits/chosen": -0.44187918305397034, "logits/rejected": -0.4436172842979431, "logps/chosen": -28.86629295349121, "logps/rejected": -40.77040100097656, "loss": 0.1213, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.7521371841430664, "rewards/margins": 4.674728870391846, "rewards/rejected": -5.426866054534912, "step": 592 }, { "epoch": 1.3707692307692307, "grad_norm": 11.792211927093387, "learning_rate": 1.3591885467281877e-07, "logits/chosen": -0.48456090688705444, "logits/rejected": -0.4828470051288605, "logps/chosen": -30.24077033996582, "logps/rejected": -37.01740264892578, "loss": 0.1067, "rewards/accuracies": 0.9722222089767456, "rewards/chosen": -0.5764757394790649, "rewards/margins": 4.6280517578125, "rewards/rejected": -5.204527378082275, "step": 594 }, { "epoch": 1.3753846153846154, "grad_norm": 10.297119411851707, "learning_rate": 1.3412834151347896e-07, "logits/chosen": -0.43680185079574585, "logits/rejected": -0.43017539381980896, "logps/chosen": -24.517793655395508, "logps/rejected": -44.11916732788086, "loss": 0.1195, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.24036963284015656, "rewards/margins": 5.397197723388672, "rewards/rejected": -5.63756799697876, "step": 596 }, { "epoch": 1.38, "grad_norm": 5.180795840407191, "learning_rate": 1.323453664158769e-07, "logits/chosen": -0.4807315468788147, "logits/rejected": -0.46822589635849, "logps/chosen": -22.968257904052734, "logps/rejected": -65.51927185058594, "loss": 0.1619, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -1.441799521446228, "rewards/margins": 6.657855033874512, "rewards/rejected": -8.099655151367188, "step": 598 }, { "epoch": 1.3846153846153846, "grad_norm": 8.869881389861584, "learning_rate": 1.3057004537193422e-07, "logits/chosen": -0.4717283844947815, "logits/rejected": -0.466756135225296, "logps/chosen": -29.466285705566406, "logps/rejected": -49.824188232421875, "loss": 0.1282, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.683549702167511, "rewards/margins": 6.033261775970459, "rewards/rejected": -6.716811656951904, "step": 600 }, { "epoch": 1.3846153846153846, "eval_logits/chosen": -0.44368964433670044, "eval_logits/rejected": -0.4389301538467407, "eval_logps/chosen": -30.7845458984375, "eval_logps/rejected": -46.73870086669922, "eval_loss": 0.17330653965473175, "eval_rewards/accuracies": 0.8605991005897522, "eval_rewards/chosen": -0.9630516767501831, "eval_rewards/margins": 5.314283847808838, "eval_rewards/rejected": -6.2773356437683105, "eval_runtime": 516.1841, "eval_samples_per_second": 3.359, "eval_steps_per_second": 0.42, "step": 600 }, { "epoch": 1.3892307692307693, "grad_norm": 18.38229782422149, "learning_rate": 1.2880249387563662e-07, "logits/chosen": -0.4752500355243683, "logits/rejected": -0.46704113483428955, "logps/chosen": -27.798635482788086, "logps/rejected": -55.05019760131836, "loss": 0.0812, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -1.4935674667358398, "rewards/margins": 5.991979598999023, "rewards/rejected": -7.485546588897705, "step": 602 }, { "epoch": 1.393846153846154, "grad_norm": 3.7694071772705176, "learning_rate": 1.2704282691551938e-07, "logits/chosen": -0.45651334524154663, "logits/rejected": -0.4481605887413025, "logps/chosen": -36.59762191772461, "logps/rejected": -66.54610443115234, "loss": 0.1242, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -1.0427519083023071, "rewards/margins": 6.9082159996032715, "rewards/rejected": -7.950966835021973, "step": 604 }, { "epoch": 1.3984615384615384, "grad_norm": 5.444143706459322, "learning_rate": 1.2529115896718714e-07, "logits/chosen": -0.4201243221759796, "logits/rejected": -0.41837078332901, "logps/chosen": -44.628116607666016, "logps/rejected": -46.915340423583984, "loss": 0.1135, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.9091212153434753, "rewards/margins": 5.618221282958984, "rewards/rejected": -6.527342796325684, "step": 606 }, { "epoch": 1.403076923076923, "grad_norm": 10.700509513891362, "learning_rate": 1.2354760398586708e-07, "logits/chosen": -0.46022334694862366, "logits/rejected": -0.4452159106731415, "logps/chosen": -40.61338806152344, "logps/rejected": -70.18826293945312, "loss": 0.0891, "rewards/accuracies": 0.9722222089767456, "rewards/chosen": -1.4041969776153564, "rewards/margins": 7.264396667480469, "rewards/rejected": -8.668594360351562, "step": 608 }, { "epoch": 1.4076923076923076, "grad_norm": 2.1177239867437, "learning_rate": 1.2181227539899468e-07, "logits/chosen": -0.5053317546844482, "logits/rejected": -0.5032351016998291, "logps/chosen": -31.564104080200195, "logps/rejected": -52.15016174316406, "loss": 0.1243, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.13930216431617737, "rewards/margins": 6.855900764465332, "rewards/rejected": -6.995202541351318, "step": 610 }, { "epoch": 1.4123076923076923, "grad_norm": 15.469500510450402, "learning_rate": 1.2008528609883557e-07, "logits/chosen": -0.4295639395713806, "logits/rejected": -0.4234282374382019, "logps/chosen": -32.4191780090332, "logps/rejected": -62.65770721435547, "loss": 0.1193, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -1.0219553709030151, "rewards/margins": 7.397497177124023, "rewards/rejected": -8.419452667236328, "step": 612 }, { "epoch": 1.416923076923077, "grad_norm": 3.178877332926101, "learning_rate": 1.1836674843514042e-07, "logits/chosen": -0.40342938899993896, "logits/rejected": -0.3988759219646454, "logps/chosen": -21.697105407714844, "logps/rejected": -43.94187545776367, "loss": 0.1003, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.8876277208328247, "rewards/margins": 5.724794864654541, "rewards/rejected": -6.612421989440918, "step": 614 }, { "epoch": 1.4215384615384616, "grad_norm": 3.6793184538422326, "learning_rate": 1.1665677420783671e-07, "logits/chosen": -0.48808521032333374, "logits/rejected": -0.4859094023704529, "logps/chosen": -34.99571228027344, "logps/rejected": -39.803985595703125, "loss": 0.0822, "rewards/accuracies": 1.0, "rewards/chosen": -1.0026580095291138, "rewards/margins": 5.45047664642334, "rewards/rejected": -6.453134536743164, "step": 616 }, { "epoch": 1.426153846153846, "grad_norm": 16.695577440547794, "learning_rate": 1.149554746597553e-07, "logits/chosen": -0.48872339725494385, "logits/rejected": -0.4808255732059479, "logps/chosen": -26.101930618286133, "logps/rejected": -44.3250846862793, "loss": 0.1596, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -1.004285216331482, "rewards/margins": 5.792173862457275, "rewards/rejected": -6.796459674835205, "step": 618 }, { "epoch": 1.4307692307692308, "grad_norm": 13.42168687729947, "learning_rate": 1.1326296046939333e-07, "logits/chosen": -0.49368032813072205, "logits/rejected": -0.4939752221107483, "logps/chosen": -34.26783752441406, "logps/rejected": -33.68894958496094, "loss": 0.1207, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.24355633556842804, "rewards/margins": 4.698565483093262, "rewards/rejected": -4.942121982574463, "step": 620 }, { "epoch": 1.4307692307692308, "eval_logits/chosen": -0.4475726783275604, "eval_logits/rejected": -0.4426879584789276, "eval_logps/chosen": -30.83469009399414, "eval_logps/rejected": -46.93183517456055, "eval_loss": 0.17354248464107513, "eval_rewards/accuracies": 0.8536866307258606, "eval_rewards/chosen": -0.988125205039978, "eval_rewards/margins": 5.385776996612549, "eval_rewards/rejected": -6.373903274536133, "eval_runtime": 508.6082, "eval_samples_per_second": 3.409, "eval_steps_per_second": 0.427, "step": 620 }, { "epoch": 1.4353846153846153, "grad_norm": 16.03721446625383, "learning_rate": 1.1157934174371413e-07, "logits/chosen": -0.40826666355133057, "logits/rejected": -0.40914714336395264, "logps/chosen": -37.774322509765625, "logps/rejected": -46.308685302734375, "loss": 0.1332, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.7187108993530273, "rewards/margins": 5.952962875366211, "rewards/rejected": -6.67167329788208, "step": 622 }, { "epoch": 1.44, "grad_norm": 9.989041737289213, "learning_rate": 1.0990472801098419e-07, "logits/chosen": -0.44723987579345703, "logits/rejected": -0.4433395564556122, "logps/chosen": -21.289173126220703, "logps/rejected": -40.282203674316406, "loss": 0.1023, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.37924104928970337, "rewards/margins": 5.2494964599609375, "rewards/rejected": -5.628737926483154, "step": 624 }, { "epoch": 1.4446153846153846, "grad_norm": 10.147212931816624, "learning_rate": 1.0823922821364795e-07, "logits/chosen": -0.43332383036613464, "logits/rejected": -0.42938029766082764, "logps/chosen": -32.57557678222656, "logps/rejected": -37.7658576965332, "loss": 0.1189, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.751186192035675, "rewards/margins": 5.093667984008789, "rewards/rejected": -5.844854354858398, "step": 626 }, { "epoch": 1.4492307692307693, "grad_norm": 6.709065459441931, "learning_rate": 1.0658295070124026e-07, "logits/chosen": -0.4798267185688019, "logits/rejected": -0.47460561990737915, "logps/chosen": -39.24932861328125, "logps/rejected": -50.80968475341797, "loss": 0.141, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.7187818884849548, "rewards/margins": 6.41911506652832, "rewards/rejected": -7.13789701461792, "step": 628 }, { "epoch": 1.4538461538461538, "grad_norm": 6.61693248992949, "learning_rate": 1.0493600322333762e-07, "logits/chosen": -0.46035754680633545, "logits/rejected": -0.44821494817733765, "logps/chosen": -34.0073127746582, "logps/rejected": -73.41968536376953, "loss": 0.0821, "rewards/accuracies": 0.9722222089767456, "rewards/chosen": -0.784911572933197, "rewards/margins": 8.927276611328125, "rewards/rejected": -9.712186813354492, "step": 630 }, { "epoch": 1.4584615384615385, "grad_norm": 10.495867841312156, "learning_rate": 1.0329849292254883e-07, "logits/chosen": -0.48313766717910767, "logits/rejected": -0.4751304090023041, "logps/chosen": -30.91708755493164, "logps/rejected": -63.11465072631836, "loss": 0.135, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.9800366163253784, "rewards/margins": 7.840139389038086, "rewards/rejected": -8.820176124572754, "step": 632 }, { "epoch": 1.463076923076923, "grad_norm": 7.122666564054563, "learning_rate": 1.0167052632754458e-07, "logits/chosen": -0.4631768763065338, "logits/rejected": -0.4663302004337311, "logps/chosen": -30.481611251831055, "logps/rejected": -40.83177947998047, "loss": 0.0963, "rewards/accuracies": 0.9722222089767456, "rewards/chosen": -1.3963757753372192, "rewards/margins": 4.901059150695801, "rewards/rejected": -6.297435283660889, "step": 634 }, { "epoch": 1.4676923076923076, "grad_norm": 7.6754610914046255, "learning_rate": 1.0005220934612713e-07, "logits/chosen": -0.48726287484169006, "logits/rejected": -0.4836198091506958, "logps/chosen": -33.868221282958984, "logps/rejected": -35.494903564453125, "loss": 0.1268, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.781172513961792, "rewards/margins": 4.606103420257568, "rewards/rejected": -5.3872761726379395, "step": 636 }, { "epoch": 1.4723076923076923, "grad_norm": 4.394579617768486, "learning_rate": 9.844364725834056e-08, "logits/chosen": -0.4536009430885315, "logits/rejected": -0.446668416261673, "logps/chosen": -35.52815246582031, "logps/rejected": -54.59818649291992, "loss": 0.0654, "rewards/accuracies": 0.9722222089767456, "rewards/chosen": -1.248176097869873, "rewards/margins": 6.95098876953125, "rewards/rejected": -8.199164390563965, "step": 638 }, { "epoch": 1.476923076923077, "grad_norm": 13.851192186996709, "learning_rate": 9.68449447096217e-08, "logits/chosen": -0.4486643970012665, "logits/rejected": -0.44762691855430603, "logps/chosen": -30.82935333251953, "logps/rejected": -40.4957160949707, "loss": 0.1906, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.9672742486000061, "rewards/margins": 5.168215751647949, "rewards/rejected": -6.135489463806152, "step": 640 }, { "epoch": 1.476923076923077, "eval_logits/chosen": -0.4496226906776428, "eval_logits/rejected": -0.44475340843200684, "eval_logps/chosen": -30.85221290588379, "eval_logps/rejected": -47.09688949584961, "eval_loss": 0.17315760254859924, "eval_rewards/accuracies": 0.8559907674789429, "eval_rewards/chosen": -0.9968856573104858, "eval_rewards/margins": 5.4595465660095215, "eval_rewards/rejected": -6.456432342529297, "eval_runtime": 505.3381, "eval_samples_per_second": 3.431, "eval_steps_per_second": 0.429, "step": 640 }, { "epoch": 1.4815384615384615, "grad_norm": 14.122051876190357, "learning_rate": 9.525620570399259e-08, "logits/chosen": -0.43323299288749695, "logits/rejected": -0.42855146527290344, "logps/chosen": -33.61738204956055, "logps/rejected": -52.76134490966797, "loss": 0.0718, "rewards/accuracies": 0.9722222089767456, "rewards/chosen": -1.0422890186309814, "rewards/margins": 6.377801895141602, "rewards/rejected": -7.420090675354004, "step": 642 }, { "epoch": 1.4861538461538462, "grad_norm": 9.451549384905373, "learning_rate": 9.36775335972943e-08, "logits/chosen": -0.43770718574523926, "logits/rejected": -0.4287017583847046, "logps/chosen": -31.985116958618164, "logps/rejected": -69.42501068115234, "loss": 0.1135, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.7848724126815796, "rewards/margins": 8.417101860046387, "rewards/rejected": -9.201974868774414, "step": 644 }, { "epoch": 1.4907692307692308, "grad_norm": 7.039881363814246, "learning_rate": 9.210903109046284e-08, "logits/chosen": -0.5208732485771179, "logits/rejected": -0.5058411955833435, "logps/chosen": -19.167829513549805, "logps/rejected": -47.509395599365234, "loss": 0.0693, "rewards/accuracies": 0.9722222089767456, "rewards/chosen": -0.495722234249115, "rewards/margins": 7.417731285095215, "rewards/rejected": -7.913454055786133, "step": 646 }, { "epoch": 1.4953846153846153, "grad_norm": 7.513358705098759, "learning_rate": 9.05508002228485e-08, "logits/chosen": -0.4772546589374542, "logits/rejected": -0.4674406051635742, "logps/chosen": -20.904613494873047, "logps/rejected": -45.344276428222656, "loss": 0.1018, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.5619287490844727, "rewards/margins": 5.992631912231445, "rewards/rejected": -6.55456018447876, "step": 648 }, { "epoch": 1.5, "grad_norm": 5.163262466936855, "learning_rate": 8.900294236557707e-08, "logits/chosen": -0.4919062852859497, "logits/rejected": -0.4884023666381836, "logps/chosen": -25.894731521606445, "logps/rejected": -33.06015396118164, "loss": 0.1531, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": -0.818977952003479, "rewards/margins": 3.840023994445801, "rewards/rejected": -4.65900182723999, "step": 650 }, { "epoch": 1.5046153846153847, "grad_norm": 7.935441978775117, "learning_rate": 8.746555821495561e-08, "logits/chosen": -0.3938957154750824, "logits/rejected": -0.3846832513809204, "logps/chosen": -29.969451904296875, "logps/rejected": -74.45105743408203, "loss": 0.0969, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -1.0499709844589233, "rewards/margins": 8.556549072265625, "rewards/rejected": -9.606517791748047, "step": 652 }, { "epoch": 1.5092307692307694, "grad_norm": 15.015619286831695, "learning_rate": 8.593874778592122e-08, "logits/chosen": -0.47225937247276306, "logits/rejected": -0.46751588582992554, "logps/chosen": -22.55743980407715, "logps/rejected": -37.160099029541016, "loss": 0.1349, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.20361822843551636, "rewards/margins": 5.03909158706665, "rewards/rejected": -5.242710113525391, "step": 654 }, { "epoch": 1.5138461538461538, "grad_norm": 16.984332287145218, "learning_rate": 8.442261040553472e-08, "logits/chosen": -0.42722606658935547, "logits/rejected": -0.4261399805545807, "logps/chosen": -39.27968215942383, "logps/rejected": -42.59625244140625, "loss": 0.1151, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -1.1750526428222656, "rewards/margins": 5.367068290710449, "rewards/rejected": -6.542121410369873, "step": 656 }, { "epoch": 1.5184615384615383, "grad_norm": 6.648137955433888, "learning_rate": 8.291724470651903e-08, "logits/chosen": -0.49384915828704834, "logits/rejected": -0.4838346838951111, "logps/chosen": -33.4910888671875, "logps/rejected": -45.4775505065918, "loss": 0.1163, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.9873102903366089, "rewards/margins": 5.118143081665039, "rewards/rejected": -6.1054534912109375, "step": 658 }, { "epoch": 1.523076923076923, "grad_norm": 4.2717995977336045, "learning_rate": 8.14227486208423e-08, "logits/chosen": -0.38486483693122864, "logits/rejected": -0.37810662388801575, "logps/chosen": -28.15804672241211, "logps/rejected": -48.63478088378906, "loss": 0.124, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.6729797720909119, "rewards/margins": 6.412167549133301, "rewards/rejected": -7.085147857666016, "step": 660 }, { "epoch": 1.523076923076923, "eval_logits/chosen": -0.4508785903453827, "eval_logits/rejected": -0.44609692692756653, "eval_logps/chosen": -30.673160552978516, "eval_logps/rejected": -46.99982452392578, "eval_loss": 0.17300111055374146, "eval_rewards/accuracies": 0.8536866307258606, "eval_rewards/chosen": -0.9073583483695984, "eval_rewards/margins": 5.500541687011719, "eval_rewards/rejected": -6.407899856567383, "eval_runtime": 511.2851, "eval_samples_per_second": 3.391, "eval_steps_per_second": 0.424, "step": 660 }, { "epoch": 1.5276923076923077, "grad_norm": 13.44752815643731, "learning_rate": 7.993921937334716e-08, "logits/chosen": -0.4475086033344269, "logits/rejected": -0.44877344369888306, "logps/chosen": -26.55576515197754, "logps/rejected": -35.18043518066406, "loss": 0.1439, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.48653051257133484, "rewards/margins": 5.022270202636719, "rewards/rejected": -5.508800506591797, "step": 662 }, { "epoch": 1.5323076923076924, "grad_norm": 5.466356752812198, "learning_rate": 7.846675347542578e-08, "logits/chosen": -0.4301062226295471, "logits/rejected": -0.4246373772621155, "logps/chosen": -32.72763442993164, "logps/rejected": -36.464515686035156, "loss": 0.0615, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.10833287239074707, "rewards/margins": 5.950449466705322, "rewards/rejected": -6.058782577514648, "step": 664 }, { "epoch": 1.536923076923077, "grad_norm": 19.34735548207154, "learning_rate": 7.700544671874079e-08, "logits/chosen": -0.5050385594367981, "logits/rejected": -0.5065574049949646, "logps/chosen": -43.71310043334961, "logps/rejected": -43.1418571472168, "loss": 0.1497, "rewards/accuracies": 1.0, "rewards/chosen": -0.6898919343948364, "rewards/margins": 5.588415145874023, "rewards/rejected": -6.2783074378967285, "step": 666 }, { "epoch": 1.5415384615384615, "grad_norm": 8.453944863452394, "learning_rate": 7.555539416899437e-08, "logits/chosen": -0.45561864972114563, "logits/rejected": -0.4405089318752289, "logps/chosen": -24.58259391784668, "logps/rejected": -52.34214782714844, "loss": 0.1603, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.9588913321495056, "rewards/margins": 6.157428741455078, "rewards/rejected": -7.1163201332092285, "step": 668 }, { "epoch": 1.546153846153846, "grad_norm": 4.371973354470921, "learning_rate": 7.41166901597429e-08, "logits/chosen": -0.4739670157432556, "logits/rejected": -0.46882593631744385, "logps/chosen": -28.16082000732422, "logps/rejected": -50.97901916503906, "loss": 0.0986, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.7207522988319397, "rewards/margins": 6.651329517364502, "rewards/rejected": -7.372081279754639, "step": 670 }, { "epoch": 1.5507692307692307, "grad_norm": 19.790331773905894, "learning_rate": 7.268942828626046e-08, "logits/chosen": -0.43110519647598267, "logits/rejected": -0.42571258544921875, "logps/chosen": -27.180479049682617, "logps/rejected": -43.61553955078125, "loss": 0.1136, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.7698748707771301, "rewards/margins": 6.077162265777588, "rewards/rejected": -6.8470377922058105, "step": 672 }, { "epoch": 1.5553846153846154, "grad_norm": 8.967577630520251, "learning_rate": 7.127370139945018e-08, "logits/chosen": -0.4663132131099701, "logits/rejected": -0.4576176404953003, "logps/chosen": -34.94415283203125, "logps/rejected": -54.50322723388672, "loss": 0.1183, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.7654924392700195, "rewards/margins": 6.722795009613037, "rewards/rejected": -7.488286972045898, "step": 674 }, { "epoch": 1.56, "grad_norm": 11.9882475900896, "learning_rate": 6.986960159980326e-08, "logits/chosen": -0.4708311855792999, "logits/rejected": -0.46971288323402405, "logps/chosen": -30.48172950744629, "logps/rejected": -47.16158676147461, "loss": 0.1076, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -1.132474660873413, "rewards/margins": 5.086870193481445, "rewards/rejected": -6.2193450927734375, "step": 676 }, { "epoch": 1.5646153846153847, "grad_norm": 18.20222451183193, "learning_rate": 6.847722023140776e-08, "logits/chosen": -0.449653685092926, "logits/rejected": -0.44860389828681946, "logps/chosen": -20.110612869262695, "logps/rejected": -37.876060485839844, "loss": 0.1308, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.7210627794265747, "rewards/margins": 5.16010046005249, "rewards/rejected": -5.881162643432617, "step": 678 }, { "epoch": 1.5692307692307692, "grad_norm": 12.652237641034137, "learning_rate": 6.709664787600616e-08, "logits/chosen": -0.46333450078964233, "logits/rejected": -0.4621225893497467, "logps/chosen": -29.29336166381836, "logps/rejected": -34.81156539916992, "loss": 0.1501, "rewards/accuracies": 0.9722222089767456, "rewards/chosen": -0.42165952920913696, "rewards/margins": 5.197292804718018, "rewards/rejected": -5.618952751159668, "step": 680 }, { "epoch": 1.5692307692307692, "eval_logits/chosen": -0.45489436388015747, "eval_logits/rejected": -0.4500066041946411, "eval_logps/chosen": -30.551109313964844, "eval_logps/rejected": -46.96470260620117, "eval_loss": 0.17306138575077057, "eval_rewards/accuracies": 0.8548387289047241, "eval_rewards/chosen": -0.8463338017463684, "eval_rewards/margins": 5.544005870819092, "eval_rewards/rejected": -6.3903398513793945, "eval_runtime": 587.6391, "eval_samples_per_second": 2.951, "eval_steps_per_second": 0.369, "step": 680 }, { "epoch": 1.573846153846154, "grad_norm": 18.229324350154982, "learning_rate": 6.572797434710219e-08, "logits/chosen": -0.5045414566993713, "logits/rejected": -0.5032651424407959, "logps/chosen": -31.237674713134766, "logps/rejected": -48.47135543823242, "loss": 0.142, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.9193555116653442, "rewards/margins": 5.215946674346924, "rewards/rejected": -6.13530158996582, "step": 682 }, { "epoch": 1.5784615384615384, "grad_norm": 3.947263193808749, "learning_rate": 6.437128868411856e-08, "logits/chosen": -0.46912819147109985, "logits/rejected": -0.4734024405479431, "logps/chosen": -27.25926971435547, "logps/rejected": -30.237674713134766, "loss": 0.1211, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.35293272137641907, "rewards/margins": 4.425691604614258, "rewards/rejected": -4.778625011444092, "step": 684 }, { "epoch": 1.583076923076923, "grad_norm": 6.478427622905253, "learning_rate": 6.302667914660384e-08, "logits/chosen": -0.44192954897880554, "logits/rejected": -0.4351538419723511, "logps/chosen": -25.08927345275879, "logps/rejected": -43.76457214355469, "loss": 0.1305, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.5325224995613098, "rewards/margins": 5.548949718475342, "rewards/rejected": -6.081472396850586, "step": 686 }, { "epoch": 1.5876923076923077, "grad_norm": 24.108290270350512, "learning_rate": 6.169423320849112e-08, "logits/chosen": -0.4146944284439087, "logits/rejected": -0.41156822443008423, "logps/chosen": -30.91227149963379, "logps/rejected": -35.69466781616211, "loss": 0.1385, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -1.0772337913513184, "rewards/margins": 4.478698253631592, "rewards/rejected": -5.55593204498291, "step": 688 }, { "epoch": 1.5923076923076924, "grad_norm": 11.995585070667776, "learning_rate": 6.037403755240748e-08, "logits/chosen": -0.4293345808982849, "logits/rejected": -0.4295496940612793, "logps/chosen": -32.228668212890625, "logps/rejected": -38.77988815307617, "loss": 0.0838, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.1786275953054428, "rewards/margins": 5.444575309753418, "rewards/rejected": -5.623202323913574, "step": 690 }, { "epoch": 1.596923076923077, "grad_norm": 4.047674436900951, "learning_rate": 5.9066178064034326e-08, "logits/chosen": -0.4635583162307739, "logits/rejected": -0.45304298400878906, "logps/chosen": -17.480928421020508, "logps/rejected": -74.0090103149414, "loss": 0.2029, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.17733684182167053, "rewards/margins": 9.169536590576172, "rewards/rejected": -9.34687328338623, "step": 692 }, { "epoch": 1.6015384615384616, "grad_norm": 13.367361995751285, "learning_rate": 5.777073982652064e-08, "logits/chosen": -0.44826704263687134, "logits/rejected": -0.44426432251930237, "logps/chosen": -18.58295440673828, "logps/rejected": -37.799747467041016, "loss": 0.1531, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.9348115921020508, "rewards/margins": 5.391566753387451, "rewards/rejected": -6.326377868652344, "step": 694 }, { "epoch": 1.606153846153846, "grad_norm": 8.701545792799616, "learning_rate": 5.6487807114947325e-08, "logits/chosen": -0.4739130735397339, "logits/rejected": -0.46692582964897156, "logps/chosen": -32.2366828918457, "logps/rejected": -66.81201934814453, "loss": 0.1034, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.9106719493865967, "rewards/margins": 7.628146648406982, "rewards/rejected": -8.538818359375, "step": 696 }, { "epoch": 1.6107692307692307, "grad_norm": 10.314716919638604, "learning_rate": 5.521746339084532e-08, "logits/chosen": -0.44541698694229126, "logits/rejected": -0.4445607364177704, "logps/chosen": -28.11867332458496, "logps/rejected": -52.054779052734375, "loss": 0.1536, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.7623778581619263, "rewards/margins": 6.327053070068359, "rewards/rejected": -7.089430332183838, "step": 698 }, { "epoch": 1.6153846153846154, "grad_norm": 8.330777397853717, "learning_rate": 5.39597912967652e-08, "logits/chosen": -0.44752609729766846, "logits/rejected": -0.4385342299938202, "logps/chosen": -31.53993034362793, "logps/rejected": -51.640995025634766, "loss": 0.1199, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -1.0500842332839966, "rewards/margins": 5.811091423034668, "rewards/rejected": -6.8611741065979, "step": 700 }, { "epoch": 1.6153846153846154, "eval_logits/chosen": -0.4506380558013916, "eval_logits/rejected": -0.44612976908683777, "eval_logps/chosen": -30.54046058654785, "eval_logps/rejected": -46.961219787597656, "eval_loss": 0.1732894778251648, "eval_rewards/accuracies": 0.8548387289047241, "eval_rewards/chosen": -0.8410088419914246, "eval_rewards/margins": 5.547587871551514, "eval_rewards/rejected": -6.388597011566162, "eval_runtime": 552.02, "eval_samples_per_second": 3.141, "eval_steps_per_second": 0.393, "step": 700 } ], "logging_steps": 2, "max_steps": 866, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }