{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.000355033405416, "eval_steps": 100, "global_step": 2905, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 3.436426116838488e-08, "logits/chosen": 2.1834254264831543, "logits/rejected": 1.839876413345337, "logps/chosen": -357.42919921875, "logps/rejected": -306.576904296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 3.4364261168384884e-07, "logits/chosen": 2.1468355655670166, "logits/rejected": 1.514005184173584, "logps/chosen": -411.6521911621094, "logps/rejected": -314.8995056152344, "loss": 0.694, "rewards/accuracies": 0.4548611044883728, "rewards/chosen": -0.002364569343626499, "rewards/margins": -0.0007941992953419685, "rewards/rejected": -0.0015703702811151743, "step": 10 }, { "epoch": 0.02, "learning_rate": 6.872852233676977e-07, "logits/chosen": 2.05692720413208, "logits/rejected": 1.566576361656189, "logps/chosen": -380.4183044433594, "logps/rejected": -310.2190246582031, "loss": 0.6949, "rewards/accuracies": 0.4703125059604645, "rewards/chosen": -0.0030785556882619858, "rewards/margins": -0.0024148449301719666, "rewards/rejected": -0.0006637109909206629, "step": 20 }, { "epoch": 0.03, "learning_rate": 1.0309278350515464e-06, "logits/chosen": 2.101928234100342, "logits/rejected": 1.6467519998550415, "logps/chosen": -395.2138671875, "logps/rejected": -315.73284912109375, "loss": 0.6969, "rewards/accuracies": 0.44843751192092896, "rewards/chosen": -0.0034078925382345915, "rewards/margins": -0.006428100168704987, "rewards/rejected": 0.003020207630470395, "step": 30 }, { "epoch": 0.04, "learning_rate": 1.3745704467353954e-06, "logits/chosen": 2.055060386657715, "logits/rejected": 1.6383787393569946, "logps/chosen": -364.0431823730469, "logps/rejected": -303.153564453125, "loss": 0.6918, "rewards/accuracies": 0.503125011920929, "rewards/chosen": 0.0021386637818068266, "rewards/margins": 0.0038275974802672863, "rewards/rejected": -0.0016889336984604597, "step": 40 }, { "epoch": 0.05, "learning_rate": 1.718213058419244e-06, "logits/chosen": 2.076786756515503, "logits/rejected": 1.5966410636901855, "logps/chosen": -386.95574951171875, "logps/rejected": -303.09893798828125, "loss": 0.6926, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -9.494787082076073e-05, "rewards/margins": 0.0021279542706906796, "rewards/rejected": -0.002222902374342084, "step": 50 }, { "epoch": 0.06, "learning_rate": 2.061855670103093e-06, "logits/chosen": 2.115320920944214, "logits/rejected": 1.5076439380645752, "logps/chosen": -385.6374206542969, "logps/rejected": -310.1552429199219, "loss": 0.6942, "rewards/accuracies": 0.48906248807907104, "rewards/chosen": 0.0017262250185012817, "rewards/margins": -0.0010376462014392018, "rewards/rejected": 0.002763871569186449, "step": 60 }, { "epoch": 0.07, "learning_rate": 2.405498281786942e-06, "logits/chosen": 2.066307783126831, "logits/rejected": 1.467484712600708, "logps/chosen": -392.7117614746094, "logps/rejected": -295.0877380371094, "loss": 0.6916, "rewards/accuracies": 0.5234375, "rewards/chosen": 0.004793156404048204, "rewards/margins": 0.004090175963938236, "rewards/rejected": 0.0007029807893559337, "step": 70 }, { "epoch": 0.08, "learning_rate": 2.7491408934707907e-06, "logits/chosen": 2.1403486728668213, "logits/rejected": 1.5628957748413086, "logps/chosen": -409.4387512207031, "logps/rejected": -308.16864013671875, "loss": 0.6914, "rewards/accuracies": 0.5390625, "rewards/chosen": 0.008215180598199368, "rewards/margins": 0.0046447524800896645, "rewards/rejected": 0.0035704285837709904, "step": 80 }, { "epoch": 0.09, "learning_rate": 3.0927835051546395e-06, "logits/chosen": 2.103337049484253, "logits/rejected": 1.5271997451782227, "logps/chosen": -392.5189514160156, "logps/rejected": -314.1755676269531, "loss": 0.693, "rewards/accuracies": 0.5171874761581421, "rewards/chosen": 0.007820327766239643, "rewards/margins": 0.0013317877892404795, "rewards/rejected": 0.006488540209829807, "step": 90 }, { "epoch": 0.1, "learning_rate": 3.436426116838488e-06, "logits/chosen": 2.17207932472229, "logits/rejected": 1.5873886346817017, "logps/chosen": -393.0949401855469, "logps/rejected": -300.1360168457031, "loss": 0.6903, "rewards/accuracies": 0.528124988079071, "rewards/chosen": 0.015811745077371597, "rewards/margins": 0.006908411625772715, "rewards/rejected": 0.00890333205461502, "step": 100 }, { "epoch": 0.1, "eval_logits/chosen": 2.143984794616699, "eval_logits/rejected": 1.7053232192993164, "eval_logps/chosen": -392.367431640625, "eval_logps/rejected": -305.6642150878906, "eval_loss": 0.6886197924613953, "eval_rewards/accuracies": 0.5550000071525574, "eval_rewards/chosen": 0.019879885017871857, "eval_rewards/margins": 0.010274061933159828, "eval_rewards/rejected": 0.009605822153389454, "eval_runtime": 790.9682, "eval_samples_per_second": 2.529, "eval_steps_per_second": 0.632, "step": 100 }, { "epoch": 0.11, "learning_rate": 3.780068728522337e-06, "logits/chosen": 2.155015468597412, "logits/rejected": 1.5513948202133179, "logps/chosen": -406.37725830078125, "logps/rejected": -291.5572204589844, "loss": 0.6883, "rewards/accuracies": 0.5625, "rewards/chosen": 0.02593260072171688, "rewards/margins": 0.010871843434870243, "rewards/rejected": 0.015060758218169212, "step": 110 }, { "epoch": 0.12, "learning_rate": 4.123711340206186e-06, "logits/chosen": 2.1130483150482178, "logits/rejected": 1.5732020139694214, "logps/chosen": -393.74053955078125, "logps/rejected": -313.2681884765625, "loss": 0.6879, "rewards/accuracies": 0.573437511920929, "rewards/chosen": 0.030793720856308937, "rewards/margins": 0.011967557482421398, "rewards/rejected": 0.018826160579919815, "step": 120 }, { "epoch": 0.13, "learning_rate": 4.467353951890035e-06, "logits/chosen": 2.1919827461242676, "logits/rejected": 1.5891977548599243, "logps/chosen": -388.03558349609375, "logps/rejected": -288.43853759765625, "loss": 0.6825, "rewards/accuracies": 0.614062488079071, "rewards/chosen": 0.04453224316239357, "rewards/margins": 0.0232230331748724, "rewards/rejected": 0.021309208124876022, "step": 130 }, { "epoch": 0.14, "learning_rate": 4.810996563573884e-06, "logits/chosen": 2.0544376373291016, "logits/rejected": 1.5181596279144287, "logps/chosen": -400.3055114746094, "logps/rejected": -295.1371154785156, "loss": 0.6826, "rewards/accuracies": 0.59375, "rewards/chosen": 0.055656980723142624, "rewards/margins": 0.023503083735704422, "rewards/rejected": 0.0321538969874382, "step": 140 }, { "epoch": 0.15, "learning_rate": 5.154639175257732e-06, "logits/chosen": 2.028446912765503, "logits/rejected": 1.5749940872192383, "logps/chosen": -369.68914794921875, "logps/rejected": -290.5052795410156, "loss": 0.6814, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": 0.0687035471200943, "rewards/margins": 0.026732921600341797, "rewards/rejected": 0.041970618069171906, "step": 150 }, { "epoch": 0.17, "learning_rate": 5.4982817869415815e-06, "logits/chosen": 2.1807594299316406, "logits/rejected": 1.546294093132019, "logps/chosen": -385.5616149902344, "logps/rejected": -299.00079345703125, "loss": 0.6751, "rewards/accuracies": 0.621874988079071, "rewards/chosen": 0.0830768570303917, "rewards/margins": 0.040458060801029205, "rewards/rejected": 0.04261879622936249, "step": 160 }, { "epoch": 0.18, "learning_rate": 5.84192439862543e-06, "logits/chosen": 2.1456942558288574, "logits/rejected": 1.395061731338501, "logps/chosen": -408.2718200683594, "logps/rejected": -304.8404846191406, "loss": 0.6705, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.10708057880401611, "rewards/margins": 0.05216604471206665, "rewards/rejected": 0.054914526641368866, "step": 170 }, { "epoch": 0.19, "learning_rate": 6.185567010309279e-06, "logits/chosen": 2.0543594360351562, "logits/rejected": 1.578578233718872, "logps/chosen": -402.57696533203125, "logps/rejected": -315.94403076171875, "loss": 0.6692, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.12524382770061493, "rewards/margins": 0.05606088042259216, "rewards/rejected": 0.06918295472860336, "step": 180 }, { "epoch": 0.2, "learning_rate": 6.529209621993128e-06, "logits/chosen": 2.2270493507385254, "logits/rejected": 1.5178806781768799, "logps/chosen": -404.85235595703125, "logps/rejected": -305.0975646972656, "loss": 0.6561, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": 0.1562216579914093, "rewards/margins": 0.08503619581460953, "rewards/rejected": 0.07118546217679977, "step": 190 }, { "epoch": 0.21, "learning_rate": 6.872852233676976e-06, "logits/chosen": 2.0733094215393066, "logits/rejected": 1.5157095193862915, "logps/chosen": -370.6416320800781, "logps/rejected": -291.9568786621094, "loss": 0.6517, "rewards/accuracies": 0.6484375, "rewards/chosen": 0.15907886624336243, "rewards/margins": 0.09750664979219437, "rewards/rejected": 0.06157219409942627, "step": 200 }, { "epoch": 0.21, "eval_logits/chosen": 2.1312851905822754, "eval_logits/rejected": 1.6923099756240845, "eval_logps/chosen": -390.76202392578125, "eval_logps/rejected": -304.94482421875, "eval_loss": 0.6528406739234924, "eval_rewards/accuracies": 0.6449999809265137, "eval_rewards/chosen": 0.18042300641536713, "eval_rewards/margins": 0.09887553006410599, "eval_rewards/rejected": 0.08154748380184174, "eval_runtime": 789.0602, "eval_samples_per_second": 2.535, "eval_steps_per_second": 0.634, "step": 200 }, { "epoch": 0.22, "learning_rate": 7.216494845360825e-06, "logits/chosen": 2.015683650970459, "logits/rejected": 1.610907793045044, "logps/chosen": -393.47491455078125, "logps/rejected": -322.23553466796875, "loss": 0.6569, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": 0.18590465188026428, "rewards/margins": 0.09214220941066742, "rewards/rejected": 0.09376244992017746, "step": 210 }, { "epoch": 0.23, "learning_rate": 7.560137457044674e-06, "logits/chosen": 2.0618667602539062, "logits/rejected": 1.6181659698486328, "logps/chosen": -385.1166687011719, "logps/rejected": -301.85369873046875, "loss": 0.6471, "rewards/accuracies": 0.6484375, "rewards/chosen": 0.19887574017047882, "rewards/margins": 0.11533377319574356, "rewards/rejected": 0.08354197442531586, "step": 220 }, { "epoch": 0.24, "learning_rate": 7.903780068728523e-06, "logits/chosen": 2.074047327041626, "logits/rejected": 1.4734312295913696, "logps/chosen": -398.4089660644531, "logps/rejected": -313.59368896484375, "loss": 0.6368, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.22408001124858856, "rewards/margins": 0.1446545422077179, "rewards/rejected": 0.07942546904087067, "step": 230 }, { "epoch": 0.25, "learning_rate": 8.247422680412371e-06, "logits/chosen": 2.11418080329895, "logits/rejected": 1.597436547279358, "logps/chosen": -395.60369873046875, "logps/rejected": -317.60833740234375, "loss": 0.6402, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.22398725152015686, "rewards/margins": 0.14209990203380585, "rewards/rejected": 0.08188734203577042, "step": 240 }, { "epoch": 0.26, "learning_rate": 8.591065292096221e-06, "logits/chosen": 2.019456148147583, "logits/rejected": 1.5297256708145142, "logps/chosen": -392.3404235839844, "logps/rejected": -309.720703125, "loss": 0.6399, "rewards/accuracies": 0.6734374761581421, "rewards/chosen": 0.22662346065044403, "rewards/margins": 0.14939570426940918, "rewards/rejected": 0.07722777128219604, "step": 250 }, { "epoch": 0.27, "learning_rate": 8.93470790378007e-06, "logits/chosen": 2.059398651123047, "logits/rejected": 1.3951969146728516, "logps/chosen": -363.30023193359375, "logps/rejected": -279.39215087890625, "loss": 0.6302, "rewards/accuracies": 0.676562488079071, "rewards/chosen": 0.19638481736183167, "rewards/margins": 0.16942109167575836, "rewards/rejected": 0.026963721960783005, "step": 260 }, { "epoch": 0.28, "learning_rate": 9.278350515463918e-06, "logits/chosen": 2.0663836002349854, "logits/rejected": 1.4294058084487915, "logps/chosen": -413.36029052734375, "logps/rejected": -310.18414306640625, "loss": 0.6031, "rewards/accuracies": 0.690625011920929, "rewards/chosen": 0.270337849855423, "rewards/margins": 0.2435520887374878, "rewards/rejected": 0.026785755529999733, "step": 270 }, { "epoch": 0.29, "learning_rate": 9.621993127147768e-06, "logits/chosen": 2.0455291271209717, "logits/rejected": 1.3879245519638062, "logps/chosen": -392.5285949707031, "logps/rejected": -297.49078369140625, "loss": 0.6071, "rewards/accuracies": 0.6640625, "rewards/chosen": 0.26367807388305664, "rewards/margins": 0.25060200691223145, "rewards/rejected": 0.013076068833470345, "step": 280 }, { "epoch": 0.3, "learning_rate": 9.965635738831616e-06, "logits/chosen": 2.0284011363983154, "logits/rejected": 1.4462382793426514, "logps/chosen": -367.0660705566406, "logps/rejected": -286.4015808105469, "loss": 0.6278, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": 0.22588007152080536, "rewards/margins": 0.2054826021194458, "rewards/rejected": 0.020397469401359558, "step": 290 }, { "epoch": 0.31, "learning_rate": 9.96557000765111e-06, "logits/chosen": 2.1280786991119385, "logits/rejected": 1.5619724988937378, "logps/chosen": -383.9112548828125, "logps/rejected": -300.21392822265625, "loss": 0.6324, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.2175460308790207, "rewards/margins": 0.1999557912349701, "rewards/rejected": 0.017590241506695747, "step": 300 }, { "epoch": 0.31, "eval_logits/chosen": 2.0913290977478027, "eval_logits/rejected": 1.6577811241149902, "eval_logps/chosen": -390.0550537109375, "eval_logps/rejected": -305.7412414550781, "eval_loss": 0.6121114492416382, "eval_rewards/accuracies": 0.6694999933242798, "eval_rewards/chosen": 0.25111907720565796, "eval_rewards/margins": 0.24921704828739166, "eval_rewards/rejected": 0.0019020471954718232, "eval_runtime": 789.4778, "eval_samples_per_second": 2.533, "eval_steps_per_second": 0.633, "step": 300 }, { "epoch": 0.32, "learning_rate": 9.927314460596788e-06, "logits/chosen": 2.0782768726348877, "logits/rejected": 1.5225298404693604, "logps/chosen": -390.0548095703125, "logps/rejected": -306.93157958984375, "loss": 0.6085, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": 0.2482437640428543, "rewards/margins": 0.263679563999176, "rewards/rejected": -0.015435831621289253, "step": 310 }, { "epoch": 0.33, "learning_rate": 9.889058913542464e-06, "logits/chosen": 2.023192882537842, "logits/rejected": 1.4350343942642212, "logps/chosen": -395.04168701171875, "logps/rejected": -292.7287292480469, "loss": 0.6123, "rewards/accuracies": 0.667187511920929, "rewards/chosen": 0.2264510691165924, "rewards/margins": 0.26108288764953613, "rewards/rejected": -0.03463183715939522, "step": 320 }, { "epoch": 0.34, "learning_rate": 9.850803366488141e-06, "logits/chosen": 2.0813698768615723, "logits/rejected": 1.5433803796768188, "logps/chosen": -392.0400695800781, "logps/rejected": -311.6988525390625, "loss": 0.6039, "rewards/accuracies": 0.6734374761581421, "rewards/chosen": 0.2503505349159241, "rewards/margins": 0.2822737991809845, "rewards/rejected": -0.031923286616802216, "step": 330 }, { "epoch": 0.35, "learning_rate": 9.812547819433819e-06, "logits/chosen": 2.0011212825775146, "logits/rejected": 1.510357141494751, "logps/chosen": -369.2845458984375, "logps/rejected": -299.0702209472656, "loss": 0.6011, "rewards/accuracies": 0.6578124761581421, "rewards/chosen": 0.235904261469841, "rewards/margins": 0.30216702818870544, "rewards/rejected": -0.06626276671886444, "step": 340 }, { "epoch": 0.36, "learning_rate": 9.774292272379496e-06, "logits/chosen": 2.0211989879608154, "logits/rejected": 1.5567843914031982, "logps/chosen": -383.47198486328125, "logps/rejected": -300.10260009765625, "loss": 0.6064, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.23364949226379395, "rewards/margins": 0.2908868193626404, "rewards/rejected": -0.05723733827471733, "step": 350 }, { "epoch": 0.37, "learning_rate": 9.736036725325173e-06, "logits/chosen": 2.148892879486084, "logits/rejected": 1.4798226356506348, "logps/chosen": -390.1018371582031, "logps/rejected": -303.35357666015625, "loss": 0.5942, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.2585497796535492, "rewards/margins": 0.3302469849586487, "rewards/rejected": -0.07169722020626068, "step": 360 }, { "epoch": 0.38, "learning_rate": 9.69778117827085e-06, "logits/chosen": 1.9939512014389038, "logits/rejected": 1.4861663579940796, "logps/chosen": -384.94287109375, "logps/rejected": -315.71331787109375, "loss": 0.6058, "rewards/accuracies": 0.676562488079071, "rewards/chosen": 0.21476662158966064, "rewards/margins": 0.30891257524490356, "rewards/rejected": -0.09414595365524292, "step": 370 }, { "epoch": 0.39, "learning_rate": 9.659525631216528e-06, "logits/chosen": 2.0157127380371094, "logits/rejected": 1.480704665184021, "logps/chosen": -380.3633117675781, "logps/rejected": -311.96514892578125, "loss": 0.5846, "rewards/accuracies": 0.703125, "rewards/chosen": 0.1938484162092209, "rewards/margins": 0.3648601472377777, "rewards/rejected": -0.1710117906332016, "step": 380 }, { "epoch": 0.4, "learning_rate": 9.621270084162203e-06, "logits/chosen": 1.9651451110839844, "logits/rejected": 1.348719835281372, "logps/chosen": -384.85528564453125, "logps/rejected": -296.9830627441406, "loss": 0.5876, "rewards/accuracies": 0.707812488079071, "rewards/chosen": 0.21678230166435242, "rewards/margins": 0.370342493057251, "rewards/rejected": -0.15356016159057617, "step": 390 }, { "epoch": 0.41, "learning_rate": 9.58301453710788e-06, "logits/chosen": 2.0349812507629395, "logits/rejected": 1.38664972782135, "logps/chosen": -398.1830749511719, "logps/rejected": -305.6664733886719, "loss": 0.5872, "rewards/accuracies": 0.684374988079071, "rewards/chosen": 0.21142737567424774, "rewards/margins": 0.39249366521835327, "rewards/rejected": -0.18106630444526672, "step": 400 }, { "epoch": 0.41, "eval_logits/chosen": 2.0757977962493896, "eval_logits/rejected": 1.6514394283294678, "eval_logps/chosen": -390.49053955078125, "eval_logps/rejected": -307.37255859375, "eval_loss": 0.5909814238548279, "eval_rewards/accuracies": 0.684499979019165, "eval_rewards/chosen": 0.2075754553079605, "eval_rewards/margins": 0.3688032925128937, "eval_rewards/rejected": -0.16122785210609436, "eval_runtime": 786.1714, "eval_samples_per_second": 2.544, "eval_steps_per_second": 0.636, "step": 400 }, { "epoch": 0.42, "learning_rate": 9.544758990053558e-06, "logits/chosen": 1.895699143409729, "logits/rejected": 1.5060499906539917, "logps/chosen": -385.333740234375, "logps/rejected": -315.8406982421875, "loss": 0.5928, "rewards/accuracies": 0.671875, "rewards/chosen": 0.20046250522136688, "rewards/margins": 0.395385205745697, "rewards/rejected": -0.19492270052433014, "step": 410 }, { "epoch": 0.43, "learning_rate": 9.506503442999235e-06, "logits/chosen": 2.077272653579712, "logits/rejected": 1.4193239212036133, "logps/chosen": -408.7973937988281, "logps/rejected": -313.09930419921875, "loss": 0.5695, "rewards/accuracies": 0.698437511920929, "rewards/chosen": 0.21614034473896027, "rewards/margins": 0.44615036249160767, "rewards/rejected": -0.230009987950325, "step": 420 }, { "epoch": 0.44, "learning_rate": 9.468247895944913e-06, "logits/chosen": 1.9182850122451782, "logits/rejected": 1.3885082006454468, "logps/chosen": -372.385986328125, "logps/rejected": -289.5968933105469, "loss": 0.5728, "rewards/accuracies": 0.703125, "rewards/chosen": 0.16733184456825256, "rewards/margins": 0.42595013976097107, "rewards/rejected": -0.2586182951927185, "step": 430 }, { "epoch": 0.45, "learning_rate": 9.42999234889059e-06, "logits/chosen": 2.0264792442321777, "logits/rejected": 1.424521803855896, "logps/chosen": -403.67645263671875, "logps/rejected": -308.82489013671875, "loss": 0.5901, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": 0.18142452836036682, "rewards/margins": 0.4204360544681549, "rewards/rejected": -0.23901152610778809, "step": 440 }, { "epoch": 0.46, "learning_rate": 9.391736801836267e-06, "logits/chosen": 1.9406378269195557, "logits/rejected": 1.5541361570358276, "logps/chosen": -386.9203186035156, "logps/rejected": -317.9880676269531, "loss": 0.608, "rewards/accuracies": 0.660937488079071, "rewards/chosen": 0.16303928196430206, "rewards/margins": 0.36464864015579224, "rewards/rejected": -0.20160937309265137, "step": 450 }, { "epoch": 0.48, "learning_rate": 9.353481254781945e-06, "logits/chosen": 1.965314269065857, "logits/rejected": 1.449096918106079, "logps/chosen": -380.2003479003906, "logps/rejected": -302.89398193359375, "loss": 0.6049, "rewards/accuracies": 0.6703125238418579, "rewards/chosen": 0.14995898306369781, "rewards/margins": 0.3634600043296814, "rewards/rejected": -0.21350105106830597, "step": 460 }, { "epoch": 0.49, "learning_rate": 9.31522570772762e-06, "logits/chosen": 2.014303207397461, "logits/rejected": 1.429276704788208, "logps/chosen": -403.7447509765625, "logps/rejected": -316.78729248046875, "loss": 0.5911, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.2223070114850998, "rewards/margins": 0.4202900528907776, "rewards/rejected": -0.197983056306839, "step": 470 }, { "epoch": 0.5, "learning_rate": 9.2769701606733e-06, "logits/chosen": 2.0352532863616943, "logits/rejected": 1.3191134929656982, "logps/chosen": -387.82379150390625, "logps/rejected": -309.2143859863281, "loss": 0.5539, "rewards/accuracies": 0.7109375, "rewards/chosen": 0.2285471260547638, "rewards/margins": 0.5033730864524841, "rewards/rejected": -0.27482596039772034, "step": 480 }, { "epoch": 0.51, "learning_rate": 9.238714613618975e-06, "logits/chosen": 1.9018347263336182, "logits/rejected": 1.5643819570541382, "logps/chosen": -385.23028564453125, "logps/rejected": -301.07757568359375, "loss": 0.5789, "rewards/accuracies": 0.682812511920929, "rewards/chosen": 0.18415650725364685, "rewards/margins": 0.451254278421402, "rewards/rejected": -0.26709774136543274, "step": 490 }, { "epoch": 0.52, "learning_rate": 9.200459066564654e-06, "logits/chosen": 2.0545425415039062, "logits/rejected": 1.4605293273925781, "logps/chosen": -402.0376892089844, "logps/rejected": -302.36749267578125, "loss": 0.5861, "rewards/accuracies": 0.653124988079071, "rewards/chosen": 0.1868111938238144, "rewards/margins": 0.43735194206237793, "rewards/rejected": -0.25054073333740234, "step": 500 }, { "epoch": 0.52, "eval_logits/chosen": 2.0720036029815674, "eval_logits/rejected": 1.6531344652175903, "eval_logps/chosen": -390.726318359375, "eval_logps/rejected": -308.2648620605469, "eval_loss": 0.5802921056747437, "eval_rewards/accuracies": 0.6924999952316284, "eval_rewards/chosen": 0.18399661779403687, "eval_rewards/margins": 0.4344543516635895, "eval_rewards/rejected": -0.2504577338695526, "eval_runtime": 787.0774, "eval_samples_per_second": 2.541, "eval_steps_per_second": 0.635, "step": 500 }, { "epoch": 0.53, "learning_rate": 9.16220351951033e-06, "logits/chosen": 2.03237247467041, "logits/rejected": 1.387549638748169, "logps/chosen": -395.69842529296875, "logps/rejected": -293.83978271484375, "loss": 0.5451, "rewards/accuracies": 0.723437488079071, "rewards/chosen": 0.20167216658592224, "rewards/margins": 0.5336199998855591, "rewards/rejected": -0.33194786310195923, "step": 510 }, { "epoch": 0.54, "learning_rate": 9.123947972456007e-06, "logits/chosen": 1.9885421991348267, "logits/rejected": 1.4406263828277588, "logps/chosen": -396.9605407714844, "logps/rejected": -313.31390380859375, "loss": 0.5961, "rewards/accuracies": 0.65625, "rewards/chosen": 0.18424133956432343, "rewards/margins": 0.4461286962032318, "rewards/rejected": -0.26188740134239197, "step": 520 }, { "epoch": 0.55, "learning_rate": 9.085692425401684e-06, "logits/chosen": 1.9618057012557983, "logits/rejected": 1.453439474105835, "logps/chosen": -400.43524169921875, "logps/rejected": -316.65472412109375, "loss": 0.561, "rewards/accuracies": 0.698437511920929, "rewards/chosen": 0.2231372594833374, "rewards/margins": 0.5305289030075073, "rewards/rejected": -0.3073917627334595, "step": 530 }, { "epoch": 0.56, "learning_rate": 9.047436878347362e-06, "logits/chosen": 1.9433200359344482, "logits/rejected": 1.6072521209716797, "logps/chosen": -381.23248291015625, "logps/rejected": -305.9108581542969, "loss": 0.589, "rewards/accuracies": 0.698437511920929, "rewards/chosen": 0.13119344413280487, "rewards/margins": 0.43791618943214417, "rewards/rejected": -0.3067227303981781, "step": 540 }, { "epoch": 0.57, "learning_rate": 9.009181331293039e-06, "logits/chosen": 2.0213942527770996, "logits/rejected": 1.4648019075393677, "logps/chosen": -387.95111083984375, "logps/rejected": -317.6094055175781, "loss": 0.6032, "rewards/accuracies": 0.667187511920929, "rewards/chosen": 0.143142968416214, "rewards/margins": 0.41468414664268494, "rewards/rejected": -0.27154120802879333, "step": 550 }, { "epoch": 0.58, "learning_rate": 8.970925784238716e-06, "logits/chosen": 1.9625047445297241, "logits/rejected": 1.3018137216567993, "logps/chosen": -398.4783630371094, "logps/rejected": -315.54693603515625, "loss": 0.5562, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": 0.1752193719148636, "rewards/margins": 0.5354190468788147, "rewards/rejected": -0.36019963026046753, "step": 560 }, { "epoch": 0.59, "learning_rate": 8.932670237184392e-06, "logits/chosen": 2.158475399017334, "logits/rejected": 1.3677072525024414, "logps/chosen": -393.03521728515625, "logps/rejected": -301.0921936035156, "loss": 0.5577, "rewards/accuracies": 0.7109375, "rewards/chosen": 0.21825072169303894, "rewards/margins": 0.5301347374916077, "rewards/rejected": -0.3118840157985687, "step": 570 }, { "epoch": 0.6, "learning_rate": 8.89441469013007e-06, "logits/chosen": 2.0326755046844482, "logits/rejected": 1.4735110998153687, "logps/chosen": -400.419921875, "logps/rejected": -317.2841796875, "loss": 0.5864, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": 0.20257218182086945, "rewards/margins": 0.4641403257846832, "rewards/rejected": -0.26156818866729736, "step": 580 }, { "epoch": 0.61, "learning_rate": 8.856159143075746e-06, "logits/chosen": 2.025437593460083, "logits/rejected": 1.4637222290039062, "logps/chosen": -395.8478088378906, "logps/rejected": -306.3287658691406, "loss": 0.5818, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": 0.17499013245105743, "rewards/margins": 0.47637930512428284, "rewards/rejected": -0.3013891577720642, "step": 590 }, { "epoch": 0.62, "learning_rate": 8.817903596021424e-06, "logits/chosen": 1.999328374862671, "logits/rejected": 1.4125392436981201, "logps/chosen": -392.04046630859375, "logps/rejected": -317.8755187988281, "loss": 0.5738, "rewards/accuracies": 0.6890624761581421, "rewards/chosen": 0.20593245327472687, "rewards/margins": 0.4844569265842438, "rewards/rejected": -0.2785245180130005, "step": 600 }, { "epoch": 0.62, "eval_logits/chosen": 2.0712270736694336, "eval_logits/rejected": 1.6547694206237793, "eval_logps/chosen": -390.5502014160156, "eval_logps/rejected": -308.4646301269531, "eval_loss": 0.5745378732681274, "eval_rewards/accuracies": 0.6890000104904175, "eval_rewards/chosen": 0.20160633325576782, "eval_rewards/margins": 0.4720412790775299, "eval_rewards/rejected": -0.27043500542640686, "eval_runtime": 783.909, "eval_samples_per_second": 2.551, "eval_steps_per_second": 0.638, "step": 600 }, { "epoch": 0.63, "learning_rate": 8.779648048967101e-06, "logits/chosen": 2.030611515045166, "logits/rejected": 1.513715386390686, "logps/chosen": -411.4903869628906, "logps/rejected": -325.0130310058594, "loss": 0.5653, "rewards/accuracies": 0.715624988079071, "rewards/chosen": 0.26050645112991333, "rewards/margins": 0.5234567523002625, "rewards/rejected": -0.2629503011703491, "step": 610 }, { "epoch": 0.64, "learning_rate": 8.741392501912778e-06, "logits/chosen": 2.0594470500946045, "logits/rejected": 1.4518964290618896, "logps/chosen": -382.01025390625, "logps/rejected": -293.17047119140625, "loss": 0.5678, "rewards/accuracies": 0.6890624761581421, "rewards/chosen": 0.2278972566127777, "rewards/margins": 0.5156549215316772, "rewards/rejected": -0.28775766491889954, "step": 620 }, { "epoch": 0.65, "learning_rate": 8.703136954858456e-06, "logits/chosen": 1.9925777912139893, "logits/rejected": 1.4612175226211548, "logps/chosen": -395.0455627441406, "logps/rejected": -304.58392333984375, "loss": 0.577, "rewards/accuracies": 0.7109375, "rewards/chosen": 0.16291160881519318, "rewards/margins": 0.4789190888404846, "rewards/rejected": -0.31600743532180786, "step": 630 }, { "epoch": 0.66, "learning_rate": 8.664881407804131e-06, "logits/chosen": 2.0611515045166016, "logits/rejected": 1.4914582967758179, "logps/chosen": -399.3297119140625, "logps/rejected": -323.3060302734375, "loss": 0.5542, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.21616140007972717, "rewards/margins": 0.547519326210022, "rewards/rejected": -0.3313579559326172, "step": 640 }, { "epoch": 0.67, "learning_rate": 8.62662586074981e-06, "logits/chosen": 2.04584002494812, "logits/rejected": 1.5650436878204346, "logps/chosen": -406.27471923828125, "logps/rejected": -312.99102783203125, "loss": 0.5917, "rewards/accuracies": 0.651562511920929, "rewards/chosen": 0.17155683040618896, "rewards/margins": 0.4932311177253723, "rewards/rejected": -0.32167428731918335, "step": 650 }, { "epoch": 0.68, "learning_rate": 8.588370313695486e-06, "logits/chosen": 2.0872044563293457, "logits/rejected": 1.576183557510376, "logps/chosen": -407.88165283203125, "logps/rejected": -321.4124450683594, "loss": 0.5963, "rewards/accuracies": 0.6734374761581421, "rewards/chosen": 0.163691446185112, "rewards/margins": 0.4260079860687256, "rewards/rejected": -0.2623165249824524, "step": 660 }, { "epoch": 0.69, "learning_rate": 8.550114766641163e-06, "logits/chosen": 2.022469997406006, "logits/rejected": 1.4793013334274292, "logps/chosen": -389.3971252441406, "logps/rejected": -305.6213684082031, "loss": 0.5469, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.199744313955307, "rewards/margins": 0.5485779047012329, "rewards/rejected": -0.3488336503505707, "step": 670 }, { "epoch": 0.7, "learning_rate": 8.51185921958684e-06, "logits/chosen": 2.036457061767578, "logits/rejected": 1.4570763111114502, "logps/chosen": -391.916259765625, "logps/rejected": -313.1102294921875, "loss": 0.5817, "rewards/accuracies": 0.682812511920929, "rewards/chosen": 0.13610802590847015, "rewards/margins": 0.4853372573852539, "rewards/rejected": -0.34922921657562256, "step": 680 }, { "epoch": 0.71, "learning_rate": 8.473603672532518e-06, "logits/chosen": 2.030815601348877, "logits/rejected": 1.4559288024902344, "logps/chosen": -390.91748046875, "logps/rejected": -310.41436767578125, "loss": 0.5709, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1338653415441513, "rewards/margins": 0.4942235052585602, "rewards/rejected": -0.36035817861557007, "step": 690 }, { "epoch": 0.72, "learning_rate": 8.435348125478195e-06, "logits/chosen": 2.074737310409546, "logits/rejected": 1.5686569213867188, "logps/chosen": -402.61907958984375, "logps/rejected": -308.25030517578125, "loss": 0.5638, "rewards/accuracies": 0.6953125, "rewards/chosen": 0.23072762787342072, "rewards/margins": 0.5431710481643677, "rewards/rejected": -0.3124435544013977, "step": 700 }, { "epoch": 0.72, "eval_logits/chosen": 2.0751755237579346, "eval_logits/rejected": 1.6514369249343872, "eval_logps/chosen": -390.6432800292969, "eval_logps/rejected": -308.89520263671875, "eval_loss": 0.5672246217727661, "eval_rewards/accuracies": 0.6974999904632568, "eval_rewards/chosen": 0.19229747354984283, "eval_rewards/margins": 0.5057877898216248, "eval_rewards/rejected": -0.3134903311729431, "eval_runtime": 783.5776, "eval_samples_per_second": 2.552, "eval_steps_per_second": 0.638, "step": 700 }, { "epoch": 0.73, "learning_rate": 8.397092578423873e-06, "logits/chosen": 2.054037094116211, "logits/rejected": 1.4720035791397095, "logps/chosen": -409.67724609375, "logps/rejected": -325.295654296875, "loss": 0.5773, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.21965818107128143, "rewards/margins": 0.5199156403541565, "rewards/rejected": -0.30025750398635864, "step": 710 }, { "epoch": 0.74, "learning_rate": 8.358837031369548e-06, "logits/chosen": 2.0749356746673584, "logits/rejected": 1.642960548400879, "logps/chosen": -365.96173095703125, "logps/rejected": -293.26214599609375, "loss": 0.5931, "rewards/accuracies": 0.671875, "rewards/chosen": 0.14973695576190948, "rewards/margins": 0.4421418309211731, "rewards/rejected": -0.29240483045578003, "step": 720 }, { "epoch": 0.75, "learning_rate": 8.320581484315227e-06, "logits/chosen": 1.9439254999160767, "logits/rejected": 1.4602925777435303, "logps/chosen": -366.2799377441406, "logps/rejected": -295.20587158203125, "loss": 0.5865, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.13285613059997559, "rewards/margins": 0.4571729600429535, "rewards/rejected": -0.3243167996406555, "step": 730 }, { "epoch": 0.76, "learning_rate": 8.282325937260903e-06, "logits/chosen": 2.010871171951294, "logits/rejected": 1.4174662828445435, "logps/chosen": -366.3360900878906, "logps/rejected": -282.5572509765625, "loss": 0.5526, "rewards/accuracies": 0.729687511920929, "rewards/chosen": 0.15969808399677277, "rewards/margins": 0.5379899144172668, "rewards/rejected": -0.3782918155193329, "step": 740 }, { "epoch": 0.77, "learning_rate": 8.24407039020658e-06, "logits/chosen": 2.060192108154297, "logits/rejected": 1.4041664600372314, "logps/chosen": -388.09368896484375, "logps/rejected": -304.89007568359375, "loss": 0.5723, "rewards/accuracies": 0.7046874761581421, "rewards/chosen": 0.170013427734375, "rewards/margins": 0.5185979604721069, "rewards/rejected": -0.34858450293540955, "step": 750 }, { "epoch": 0.78, "learning_rate": 8.205814843152258e-06, "logits/chosen": 1.9835844039916992, "logits/rejected": 1.5863354206085205, "logps/chosen": -394.2154541015625, "logps/rejected": -313.39459228515625, "loss": 0.5726, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.17952260375022888, "rewards/margins": 0.48593711853027344, "rewards/rejected": -0.3064144551753998, "step": 760 }, { "epoch": 0.8, "learning_rate": 8.167559296097935e-06, "logits/chosen": 2.054565668106079, "logits/rejected": 1.4364211559295654, "logps/chosen": -389.25738525390625, "logps/rejected": -302.16314697265625, "loss": 0.5727, "rewards/accuracies": 0.6640625, "rewards/chosen": 0.17098964750766754, "rewards/margins": 0.5419114232063293, "rewards/rejected": -0.3709217309951782, "step": 770 }, { "epoch": 0.81, "learning_rate": 8.129303749043612e-06, "logits/chosen": 2.0685715675354004, "logits/rejected": 1.4904701709747314, "logps/chosen": -389.28546142578125, "logps/rejected": -300.2917785644531, "loss": 0.581, "rewards/accuracies": 0.698437511920929, "rewards/chosen": 0.13563263416290283, "rewards/margins": 0.5030705332756042, "rewards/rejected": -0.3674378991127014, "step": 780 }, { "epoch": 0.82, "learning_rate": 8.09104820198929e-06, "logits/chosen": 1.9442039728164673, "logits/rejected": 1.5138075351715088, "logps/chosen": -401.49188232421875, "logps/rejected": -322.7591247558594, "loss": 0.5711, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1477270871400833, "rewards/margins": 0.519391655921936, "rewards/rejected": -0.37166452407836914, "step": 790 }, { "epoch": 0.83, "learning_rate": 8.052792654934967e-06, "logits/chosen": 2.012821674346924, "logits/rejected": 1.5833919048309326, "logps/chosen": -393.879150390625, "logps/rejected": -322.25872802734375, "loss": 0.6039, "rewards/accuracies": 0.682812511920929, "rewards/chosen": 0.111841581761837, "rewards/margins": 0.4324871897697449, "rewards/rejected": -0.32064563035964966, "step": 800 }, { "epoch": 0.83, "eval_logits/chosen": 2.0655617713928223, "eval_logits/rejected": 1.6458100080490112, "eval_logps/chosen": -390.8603820800781, "eval_logps/rejected": -309.1968688964844, "eval_loss": 0.56373530626297, "eval_rewards/accuracies": 0.6995000243186951, "eval_rewards/chosen": 0.1705896407365799, "eval_rewards/margins": 0.5142490863800049, "eval_rewards/rejected": -0.3436594605445862, "eval_runtime": 780.8428, "eval_samples_per_second": 2.561, "eval_steps_per_second": 0.64, "step": 800 }, { "epoch": 0.84, "learning_rate": 8.014537107880642e-06, "logits/chosen": 1.9995319843292236, "logits/rejected": 1.4297235012054443, "logps/chosen": -388.7563171386719, "logps/rejected": -306.8624572753906, "loss": 0.5817, "rewards/accuracies": 0.6859375238418579, "rewards/chosen": 0.17700453102588654, "rewards/margins": 0.5003529787063599, "rewards/rejected": -0.32334843277931213, "step": 810 }, { "epoch": 0.85, "learning_rate": 7.97628156082632e-06, "logits/chosen": 2.037069320678711, "logits/rejected": 1.4553593397140503, "logps/chosen": -394.61602783203125, "logps/rejected": -315.02606201171875, "loss": 0.5659, "rewards/accuracies": 0.723437488079071, "rewards/chosen": 0.14454925060272217, "rewards/margins": 0.5223037004470825, "rewards/rejected": -0.3777545094490051, "step": 820 }, { "epoch": 0.86, "learning_rate": 7.938026013771997e-06, "logits/chosen": 1.9464938640594482, "logits/rejected": 1.457650899887085, "logps/chosen": -386.8043212890625, "logps/rejected": -310.10015869140625, "loss": 0.5917, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.11999593675136566, "rewards/margins": 0.4883539080619812, "rewards/rejected": -0.3683580160140991, "step": 830 }, { "epoch": 0.87, "learning_rate": 7.899770466717674e-06, "logits/chosen": 1.9806597232818604, "logits/rejected": 1.5842230319976807, "logps/chosen": -372.48822021484375, "logps/rejected": -312.10333251953125, "loss": 0.6004, "rewards/accuracies": 0.6890624761581421, "rewards/chosen": 0.10952778905630112, "rewards/margins": 0.43161463737487793, "rewards/rejected": -0.3220868706703186, "step": 840 }, { "epoch": 0.88, "learning_rate": 7.861514919663352e-06, "logits/chosen": 2.0717015266418457, "logits/rejected": 1.355222463607788, "logps/chosen": -397.9261779785156, "logps/rejected": -315.6976013183594, "loss": 0.5692, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.17800065875053406, "rewards/margins": 0.5210934281349182, "rewards/rejected": -0.34309273958206177, "step": 850 }, { "epoch": 0.89, "learning_rate": 7.823259372609029e-06, "logits/chosen": 1.91803777217865, "logits/rejected": 1.6049435138702393, "logps/chosen": -380.4819030761719, "logps/rejected": -323.6252136230469, "loss": 0.5871, "rewards/accuracies": 0.6875, "rewards/chosen": 0.12971186637878418, "rewards/margins": 0.4712757468223572, "rewards/rejected": -0.341563880443573, "step": 860 }, { "epoch": 0.9, "learning_rate": 7.785003825554705e-06, "logits/chosen": 2.008575916290283, "logits/rejected": 1.4885327816009521, "logps/chosen": -390.3096618652344, "logps/rejected": -316.64752197265625, "loss": 0.5437, "rewards/accuracies": 0.7171875238418579, "rewards/chosen": 0.16415062546730042, "rewards/margins": 0.5805265307426453, "rewards/rejected": -0.41637593507766724, "step": 870 }, { "epoch": 0.91, "learning_rate": 7.746748278500384e-06, "logits/chosen": 1.9549353122711182, "logits/rejected": 1.4889514446258545, "logps/chosen": -374.76861572265625, "logps/rejected": -296.1573791503906, "loss": 0.5676, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.10873917490243912, "rewards/margins": 0.5081608891487122, "rewards/rejected": -0.39942169189453125, "step": 880 }, { "epoch": 0.92, "learning_rate": 7.70849273144606e-06, "logits/chosen": 2.122917413711548, "logits/rejected": 1.4685524702072144, "logps/chosen": -411.05401611328125, "logps/rejected": -313.86920166015625, "loss": 0.5499, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.18222926557064056, "rewards/margins": 0.5844322443008423, "rewards/rejected": -0.4022029936313629, "step": 890 }, { "epoch": 0.93, "learning_rate": 7.670237184391738e-06, "logits/chosen": 1.9092445373535156, "logits/rejected": 1.4852876663208008, "logps/chosen": -398.60284423828125, "logps/rejected": -323.40008544921875, "loss": 0.5981, "rewards/accuracies": 0.667187511920929, "rewards/chosen": 0.11897972971200943, "rewards/margins": 0.47275203466415405, "rewards/rejected": -0.3537723422050476, "step": 900 }, { "epoch": 0.93, "eval_logits/chosen": 2.0300445556640625, "eval_logits/rejected": 1.6137703657150269, "eval_logps/chosen": -390.7452392578125, "eval_logps/rejected": -309.2731628417969, "eval_loss": 0.5607659816741943, "eval_rewards/accuracies": 0.7080000042915344, "eval_rewards/chosen": 0.1821025013923645, "eval_rewards/margins": 0.5333853960037231, "eval_rewards/rejected": -0.35128283500671387, "eval_runtime": 783.5683, "eval_samples_per_second": 2.552, "eval_steps_per_second": 0.638, "step": 900 }, { "epoch": 0.94, "learning_rate": 7.631981637337414e-06, "logits/chosen": 1.9673570394515991, "logits/rejected": 1.509300947189331, "logps/chosen": -391.52349853515625, "logps/rejected": -313.527587890625, "loss": 0.5769, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": 0.16880027949810028, "rewards/margins": 0.5148627161979675, "rewards/rejected": -0.34606242179870605, "step": 910 }, { "epoch": 0.95, "learning_rate": 7.593726090283091e-06, "logits/chosen": 1.9883625507354736, "logits/rejected": 1.456743836402893, "logps/chosen": -402.96051025390625, "logps/rejected": -318.6654357910156, "loss": 0.5583, "rewards/accuracies": 0.698437511920929, "rewards/chosen": 0.20691514015197754, "rewards/margins": 0.5567981004714966, "rewards/rejected": -0.34988290071487427, "step": 920 }, { "epoch": 0.96, "learning_rate": 7.555470543228769e-06, "logits/chosen": 2.1020729541778564, "logits/rejected": 1.299307942390442, "logps/chosen": -404.10919189453125, "logps/rejected": -301.3094787597656, "loss": 0.5452, "rewards/accuracies": 0.7203124761581421, "rewards/chosen": 0.1887318342924118, "rewards/margins": 0.6003209352493286, "rewards/rejected": -0.4115891456604004, "step": 930 }, { "epoch": 0.97, "learning_rate": 7.517214996174446e-06, "logits/chosen": 1.8526630401611328, "logits/rejected": 1.5025310516357422, "logps/chosen": -398.599365234375, "logps/rejected": -322.88885498046875, "loss": 0.5546, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15927289426326752, "rewards/margins": 0.603129506111145, "rewards/rejected": -0.44385671615600586, "step": 940 }, { "epoch": 0.98, "learning_rate": 7.478959449120123e-06, "logits/chosen": 1.9074022769927979, "logits/rejected": 1.4816137552261353, "logps/chosen": -388.3846130371094, "logps/rejected": -316.42547607421875, "loss": 0.5927, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.11518700420856476, "rewards/margins": 0.4817870557308197, "rewards/rejected": -0.36660006642341614, "step": 950 }, { "epoch": 0.99, "learning_rate": 7.4407039020658e-06, "logits/chosen": 2.0646157264709473, "logits/rejected": 1.502964735031128, "logps/chosen": -412.48193359375, "logps/rejected": -324.00494384765625, "loss": 0.5614, "rewards/accuracies": 0.698437511920929, "rewards/chosen": 0.20487312972545624, "rewards/margins": 0.5513113141059875, "rewards/rejected": -0.3464382290840149, "step": 960 }, { "epoch": 1.0, "learning_rate": 7.402448355011477e-06, "logits/chosen": 2.051802635192871, "logits/rejected": 1.4896515607833862, "logps/chosen": -391.8228759765625, "logps/rejected": -308.06463623046875, "loss": 0.5443, "rewards/accuracies": 0.739062488079071, "rewards/chosen": 0.15718279778957367, "rewards/margins": 0.5793766379356384, "rewards/rejected": -0.42219385504722595, "step": 970 }, { "epoch": 1.01, "learning_rate": 7.3641928079571544e-06, "logits/chosen": 2.022383213043213, "logits/rejected": 1.4268032312393188, "logps/chosen": -392.06280517578125, "logps/rejected": -313.8526611328125, "loss": 0.5563, "rewards/accuracies": 0.707812488079071, "rewards/chosen": 0.18751327693462372, "rewards/margins": 0.5673260688781738, "rewards/rejected": -0.3798127770423889, "step": 980 }, { "epoch": 1.02, "learning_rate": 7.325937260902831e-06, "logits/chosen": 2.0046603679656982, "logits/rejected": 1.471187710762024, "logps/chosen": -382.95550537109375, "logps/rejected": -297.2669677734375, "loss": 0.5261, "rewards/accuracies": 0.734375, "rewards/chosen": 0.1799861192703247, "rewards/margins": 0.6370871663093567, "rewards/rejected": -0.457101047039032, "step": 990 }, { "epoch": 1.03, "learning_rate": 7.287681713848509e-06, "logits/chosen": 1.9740931987762451, "logits/rejected": 1.435901403427124, "logps/chosen": -385.82843017578125, "logps/rejected": -301.95513916015625, "loss": 0.552, "rewards/accuracies": 0.729687511920929, "rewards/chosen": 0.1762188971042633, "rewards/margins": 0.5995662212371826, "rewards/rejected": -0.4233472943305969, "step": 1000 }, { "epoch": 1.03, "eval_logits/chosen": 2.025008201599121, "eval_logits/rejected": 1.6075959205627441, "eval_logps/chosen": -391.2901306152344, "eval_logps/rejected": -310.08160400390625, "eval_loss": 0.5569751262664795, "eval_rewards/accuracies": 0.7095000147819519, "eval_rewards/chosen": 0.127613827586174, "eval_rewards/margins": 0.5597450733184814, "eval_rewards/rejected": -0.43213126063346863, "eval_runtime": 785.2561, "eval_samples_per_second": 2.547, "eval_steps_per_second": 0.637, "step": 1000 }, { "epoch": 1.04, "learning_rate": 7.2494261667941855e-06, "logits/chosen": 1.959423303604126, "logits/rejected": 1.334148645401001, "logps/chosen": -405.4046630859375, "logps/rejected": -296.9268798828125, "loss": 0.5238, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.17236432433128357, "rewards/margins": 0.6547759771347046, "rewards/rejected": -0.48241162300109863, "step": 1010 }, { "epoch": 1.05, "learning_rate": 7.211170619739862e-06, "logits/chosen": 1.9886548519134521, "logits/rejected": 1.4431886672973633, "logps/chosen": -395.4007568359375, "logps/rejected": -317.0209045410156, "loss": 0.5589, "rewards/accuracies": 0.698437511920929, "rewards/chosen": 0.13654819130897522, "rewards/margins": 0.5796588063240051, "rewards/rejected": -0.4431106150150299, "step": 1020 }, { "epoch": 1.06, "learning_rate": 7.17291507268554e-06, "logits/chosen": 1.9506893157958984, "logits/rejected": 1.4356205463409424, "logps/chosen": -391.89697265625, "logps/rejected": -325.9364013671875, "loss": 0.5735, "rewards/accuracies": 0.707812488079071, "rewards/chosen": 0.18971514701843262, "rewards/margins": 0.5603563189506531, "rewards/rejected": -0.37064120173454285, "step": 1030 }, { "epoch": 1.07, "learning_rate": 7.134659525631217e-06, "logits/chosen": 1.9769697189331055, "logits/rejected": 1.35940420627594, "logps/chosen": -392.7068786621094, "logps/rejected": -298.1733093261719, "loss": 0.5672, "rewards/accuracies": 0.698437511920929, "rewards/chosen": 0.13843858242034912, "rewards/margins": 0.5539274215698242, "rewards/rejected": -0.4154888093471527, "step": 1040 }, { "epoch": 1.08, "learning_rate": 7.096403978576895e-06, "logits/chosen": 1.9737446308135986, "logits/rejected": 1.307278037071228, "logps/chosen": -405.0069580078125, "logps/rejected": -297.667236328125, "loss": 0.5619, "rewards/accuracies": 0.715624988079071, "rewards/chosen": 0.17121900618076324, "rewards/margins": 0.5970702767372131, "rewards/rejected": -0.4258512556552887, "step": 1050 }, { "epoch": 1.09, "learning_rate": 7.058148431522571e-06, "logits/chosen": 2.0491771697998047, "logits/rejected": 1.4548033475875854, "logps/chosen": -409.97540283203125, "logps/rejected": -323.37506103515625, "loss": 0.5484, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.21986207365989685, "rewards/margins": 0.6308740377426147, "rewards/rejected": -0.4110119938850403, "step": 1060 }, { "epoch": 1.11, "learning_rate": 7.019892884468249e-06, "logits/chosen": 2.0015454292297363, "logits/rejected": 1.3827497959136963, "logps/chosen": -389.93255615234375, "logps/rejected": -310.8482971191406, "loss": 0.5713, "rewards/accuracies": 0.684374988079071, "rewards/chosen": 0.1651327908039093, "rewards/margins": 0.521917998790741, "rewards/rejected": -0.35678520798683167, "step": 1070 }, { "epoch": 1.12, "learning_rate": 6.981637337413926e-06, "logits/chosen": 1.952951192855835, "logits/rejected": 1.3774534463882446, "logps/chosen": -389.974853515625, "logps/rejected": -301.3324279785156, "loss": 0.5619, "rewards/accuracies": 0.714062511920929, "rewards/chosen": 0.17954735457897186, "rewards/margins": 0.5666349530220032, "rewards/rejected": -0.3870876431465149, "step": 1080 }, { "epoch": 1.13, "learning_rate": 6.9433817903596024e-06, "logits/chosen": 1.9664720296859741, "logits/rejected": 1.426381230354309, "logps/chosen": -391.7522888183594, "logps/rejected": -309.78131103515625, "loss": 0.5616, "rewards/accuracies": 0.715624988079071, "rewards/chosen": 0.19263026118278503, "rewards/margins": 0.5670469999313354, "rewards/rejected": -0.37441685795783997, "step": 1090 }, { "epoch": 1.14, "learning_rate": 6.90512624330528e-06, "logits/chosen": 1.9100536108016968, "logits/rejected": 1.4144351482391357, "logps/chosen": -399.7219543457031, "logps/rejected": -324.71435546875, "loss": 0.5507, "rewards/accuracies": 0.7015625238418579, "rewards/chosen": 0.19064724445343018, "rewards/margins": 0.5907467603683472, "rewards/rejected": -0.4000994563102722, "step": 1100 }, { "epoch": 1.14, "eval_logits/chosen": 2.0176031589508057, "eval_logits/rejected": 1.599656581878662, "eval_logps/chosen": -391.1213684082031, "eval_logps/rejected": -309.881591796875, "eval_loss": 0.5565231442451477, "eval_rewards/accuracies": 0.7129999995231628, "eval_rewards/chosen": 0.14448867738246918, "eval_rewards/margins": 0.5566179156303406, "eval_rewards/rejected": -0.41212931275367737, "eval_runtime": 805.8141, "eval_samples_per_second": 2.482, "eval_steps_per_second": 0.62, "step": 1100 }, { "epoch": 1.15, "learning_rate": 6.866870696250957e-06, "logits/chosen": 1.8571895360946655, "logits/rejected": 1.3931454420089722, "logps/chosen": -387.07366943359375, "logps/rejected": -316.1067810058594, "loss": 0.5621, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": 0.12436342239379883, "rewards/margins": 0.5537547469139099, "rewards/rejected": -0.42939138412475586, "step": 1110 }, { "epoch": 1.16, "learning_rate": 6.828615149196634e-06, "logits/chosen": 1.9544684886932373, "logits/rejected": 1.5813820362091064, "logps/chosen": -411.29693603515625, "logps/rejected": -332.6891174316406, "loss": 0.5514, "rewards/accuracies": 0.71875, "rewards/chosen": 0.12947118282318115, "rewards/margins": 0.5757249593734741, "rewards/rejected": -0.4462536871433258, "step": 1120 }, { "epoch": 1.17, "learning_rate": 6.790359602142311e-06, "logits/chosen": 2.047799825668335, "logits/rejected": 1.3800785541534424, "logps/chosen": -402.60382080078125, "logps/rejected": -300.91534423828125, "loss": 0.5273, "rewards/accuracies": 0.7171875238418579, "rewards/chosen": 0.18002963066101074, "rewards/margins": 0.6796966791152954, "rewards/rejected": -0.4996670186519623, "step": 1130 }, { "epoch": 1.18, "learning_rate": 6.752104055087988e-06, "logits/chosen": 1.951654076576233, "logits/rejected": 1.5494517087936401, "logps/chosen": -372.7135314941406, "logps/rejected": -309.16357421875, "loss": 0.5791, "rewards/accuracies": 0.690625011920929, "rewards/chosen": 0.07283169031143188, "rewards/margins": 0.5024065971374512, "rewards/rejected": -0.4295749068260193, "step": 1140 }, { "epoch": 1.19, "learning_rate": 6.7138485080336655e-06, "logits/chosen": 2.0237233638763428, "logits/rejected": 1.3377307653427124, "logps/chosen": -401.9421691894531, "logps/rejected": -308.84063720703125, "loss": 0.5341, "rewards/accuracies": 0.7515624761581421, "rewards/chosen": 0.15094538033008575, "rewards/margins": 0.6542151570320129, "rewards/rejected": -0.5032697916030884, "step": 1150 }, { "epoch": 1.2, "learning_rate": 6.675592960979342e-06, "logits/chosen": 1.9867032766342163, "logits/rejected": 1.4300332069396973, "logps/chosen": -375.734130859375, "logps/rejected": -293.6662292480469, "loss": 0.5551, "rewards/accuracies": 0.7015625238418579, "rewards/chosen": 0.04818694666028023, "rewards/margins": 0.5770074129104614, "rewards/rejected": -0.5288205146789551, "step": 1160 }, { "epoch": 1.21, "learning_rate": 6.63733741392502e-06, "logits/chosen": 1.9565074443817139, "logits/rejected": 1.38991379737854, "logps/chosen": -382.67791748046875, "logps/rejected": -292.191650390625, "loss": 0.5367, "rewards/accuracies": 0.7171875238418579, "rewards/chosen": 0.09852553904056549, "rewards/margins": 0.6386170387268066, "rewards/rejected": -0.5400915145874023, "step": 1170 }, { "epoch": 1.22, "learning_rate": 6.599081866870697e-06, "logits/chosen": 1.909627914428711, "logits/rejected": 1.482661485671997, "logps/chosen": -387.24102783203125, "logps/rejected": -323.66094970703125, "loss": 0.5758, "rewards/accuracies": 0.684374988079071, "rewards/chosen": 0.06888823211193085, "rewards/margins": 0.559230625629425, "rewards/rejected": -0.490342378616333, "step": 1180 }, { "epoch": 1.23, "learning_rate": 6.560826319816373e-06, "logits/chosen": 1.994439721107483, "logits/rejected": 1.355993628501892, "logps/chosen": -414.5179138183594, "logps/rejected": -320.8077392578125, "loss": 0.5245, "rewards/accuracies": 0.7265625, "rewards/chosen": 0.18673229217529297, "rewards/margins": 0.6650137901306152, "rewards/rejected": -0.47828155755996704, "step": 1190 }, { "epoch": 1.24, "learning_rate": 6.522570772762051e-06, "logits/chosen": 1.9116106033325195, "logits/rejected": 1.4166170358657837, "logps/chosen": -379.8743591308594, "logps/rejected": -305.5546875, "loss": 0.5564, "rewards/accuracies": 0.7203124761581421, "rewards/chosen": 0.07706007361412048, "rewards/margins": 0.5815634727478027, "rewards/rejected": -0.5045033693313599, "step": 1200 }, { "epoch": 1.24, "eval_logits/chosen": 2.0026917457580566, "eval_logits/rejected": 1.584486484527588, "eval_logps/chosen": -391.0874328613281, "eval_logps/rejected": -310.2327575683594, "eval_loss": 0.5531337261199951, "eval_rewards/accuracies": 0.7129999995231628, "eval_rewards/chosen": 0.14788569509983063, "eval_rewards/margins": 0.5951315760612488, "eval_rewards/rejected": -0.4472459852695465, "eval_runtime": 814.0304, "eval_samples_per_second": 2.457, "eval_steps_per_second": 0.614, "step": 1200 }, { "epoch": 1.25, "learning_rate": 6.484315225707728e-06, "logits/chosen": 1.9260963201522827, "logits/rejected": 1.4190881252288818, "logps/chosen": -391.89044189453125, "logps/rejected": -322.5896911621094, "loss": 0.532, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": 0.1751636564731598, "rewards/margins": 0.6663321256637573, "rewards/rejected": -0.4911685585975647, "step": 1210 }, { "epoch": 1.26, "learning_rate": 6.446059678653406e-06, "logits/chosen": 1.964051604270935, "logits/rejected": 1.4130991697311401, "logps/chosen": -401.72607421875, "logps/rejected": -313.59490966796875, "loss": 0.5357, "rewards/accuracies": 0.7109375, "rewards/chosen": 0.17210204899311066, "rewards/margins": 0.700817883014679, "rewards/rejected": -0.5287158489227295, "step": 1220 }, { "epoch": 1.27, "learning_rate": 6.407804131599082e-06, "logits/chosen": 2.022641658782959, "logits/rejected": 1.4481173753738403, "logps/chosen": -393.9057312011719, "logps/rejected": -310.4652404785156, "loss": 0.5642, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.148196741938591, "rewards/margins": 0.5735715627670288, "rewards/rejected": -0.4253748059272766, "step": 1230 }, { "epoch": 1.28, "learning_rate": 6.369548584544759e-06, "logits/chosen": 1.9537503719329834, "logits/rejected": 1.4066534042358398, "logps/chosen": -401.105712890625, "logps/rejected": -324.2802429199219, "loss": 0.5749, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": 0.21560892462730408, "rewards/margins": 0.570895791053772, "rewards/rejected": -0.35528677701950073, "step": 1240 }, { "epoch": 1.29, "learning_rate": 6.331293037490437e-06, "logits/chosen": 1.9738185405731201, "logits/rejected": 1.5079987049102783, "logps/chosen": -386.6836853027344, "logps/rejected": -321.1151428222656, "loss": 0.557, "rewards/accuracies": 0.714062511920929, "rewards/chosen": 0.22610990703105927, "rewards/margins": 0.5788172483444214, "rewards/rejected": -0.3527073264122009, "step": 1250 }, { "epoch": 1.3, "learning_rate": 6.2930374904361135e-06, "logits/chosen": 1.9609134197235107, "logits/rejected": 1.465968370437622, "logps/chosen": -377.80023193359375, "logps/rejected": -312.888671875, "loss": 0.5835, "rewards/accuracies": 0.698437511920929, "rewards/chosen": 0.1519862860441208, "rewards/margins": 0.5421239137649536, "rewards/rejected": -0.390137642621994, "step": 1260 }, { "epoch": 1.31, "learning_rate": 6.254781943381792e-06, "logits/chosen": 1.9608758687973022, "logits/rejected": 1.3631694316864014, "logps/chosen": -412.5445861816406, "logps/rejected": -314.4869079589844, "loss": 0.5554, "rewards/accuracies": 0.7171875238418579, "rewards/chosen": 0.2576819658279419, "rewards/margins": 0.6070693135261536, "rewards/rejected": -0.34938734769821167, "step": 1270 }, { "epoch": 1.32, "learning_rate": 6.216526396327468e-06, "logits/chosen": 1.964524507522583, "logits/rejected": 1.4231841564178467, "logps/chosen": -385.58587646484375, "logps/rejected": -317.8310852050781, "loss": 0.5566, "rewards/accuracies": 0.714062511920929, "rewards/chosen": 0.1964552104473114, "rewards/margins": 0.5943530201911926, "rewards/rejected": -0.3978978097438812, "step": 1280 }, { "epoch": 1.33, "learning_rate": 6.178270849273145e-06, "logits/chosen": 2.0061771869659424, "logits/rejected": 1.448512077331543, "logps/chosen": -380.1380615234375, "logps/rejected": -312.3320007324219, "loss": 0.5526, "rewards/accuracies": 0.721875011920929, "rewards/chosen": 0.16521182656288147, "rewards/margins": 0.5941886901855469, "rewards/rejected": -0.4289769232273102, "step": 1290 }, { "epoch": 1.34, "learning_rate": 6.140015302218823e-06, "logits/chosen": 1.952558159828186, "logits/rejected": 1.4093446731567383, "logps/chosen": -391.9740295410156, "logps/rejected": -312.6495361328125, "loss": 0.5619, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": 0.14877018332481384, "rewards/margins": 0.5804526209831238, "rewards/rejected": -0.43168243765830994, "step": 1300 }, { "epoch": 1.34, "eval_logits/chosen": 1.9939156770706177, "eval_logits/rejected": 1.580398440361023, "eval_logps/chosen": -390.8511962890625, "eval_logps/rejected": -309.8240661621094, "eval_loss": 0.5531826019287109, "eval_rewards/accuracies": 0.715499997138977, "eval_rewards/chosen": 0.1715077906847, "eval_rewards/margins": 0.5778877139091492, "eval_rewards/rejected": -0.40637990832328796, "eval_runtime": 798.7699, "eval_samples_per_second": 2.504, "eval_steps_per_second": 0.626, "step": 1300 }, { "epoch": 1.35, "learning_rate": 6.101759755164499e-06, "logits/chosen": 2.0415663719177246, "logits/rejected": 1.4603914022445679, "logps/chosen": -408.11395263671875, "logps/rejected": -305.8910827636719, "loss": 0.5484, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.21981962025165558, "rewards/margins": 0.6258476376533508, "rewards/rejected": -0.4060280919075012, "step": 1310 }, { "epoch": 1.36, "learning_rate": 6.063504208110177e-06, "logits/chosen": 1.9391075372695923, "logits/rejected": 1.3968732357025146, "logps/chosen": -400.03570556640625, "logps/rejected": -311.88043212890625, "loss": 0.5723, "rewards/accuracies": 0.707812488079071, "rewards/chosen": 0.1595560610294342, "rewards/margins": 0.5328544974327087, "rewards/rejected": -0.37329840660095215, "step": 1320 }, { "epoch": 1.37, "learning_rate": 6.025248661055854e-06, "logits/chosen": 1.8722625970840454, "logits/rejected": 1.4146499633789062, "logps/chosen": -383.120849609375, "logps/rejected": -314.26263427734375, "loss": 0.5613, "rewards/accuracies": 0.7015625238418579, "rewards/chosen": 0.21696746349334717, "rewards/margins": 0.5736743211746216, "rewards/rejected": -0.3567068874835968, "step": 1330 }, { "epoch": 1.38, "learning_rate": 5.98699311400153e-06, "logits/chosen": 1.8491770029067993, "logits/rejected": 1.4446467161178589, "logps/chosen": -383.78863525390625, "logps/rejected": -309.2484436035156, "loss": 0.5913, "rewards/accuracies": 0.6796875, "rewards/chosen": 0.15157422423362732, "rewards/margins": 0.5313254594802856, "rewards/rejected": -0.3797512352466583, "step": 1340 }, { "epoch": 1.39, "learning_rate": 5.948737566947208e-06, "logits/chosen": 1.8374077081680298, "logits/rejected": 1.3561475276947021, "logps/chosen": -378.27252197265625, "logps/rejected": -293.6542053222656, "loss": 0.5405, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": 0.15810391306877136, "rewards/margins": 0.6269534230232239, "rewards/rejected": -0.4688494801521301, "step": 1350 }, { "epoch": 1.4, "learning_rate": 5.910482019892885e-06, "logits/chosen": 1.9720348119735718, "logits/rejected": 1.4627792835235596, "logps/chosen": -371.9326477050781, "logps/rejected": -306.34954833984375, "loss": 0.5669, "rewards/accuracies": 0.7046874761581421, "rewards/chosen": 0.19215206801891327, "rewards/margins": 0.5904697179794312, "rewards/rejected": -0.39831772446632385, "step": 1360 }, { "epoch": 1.41, "learning_rate": 5.872226472838562e-06, "logits/chosen": 2.028496742248535, "logits/rejected": 1.4983336925506592, "logps/chosen": -396.7496643066406, "logps/rejected": -321.01434326171875, "loss": 0.6035, "rewards/accuracies": 0.667187511920929, "rewards/chosen": 0.189684197306633, "rewards/margins": 0.49408969283103943, "rewards/rejected": -0.3044055104255676, "step": 1370 }, { "epoch": 1.43, "learning_rate": 5.833970925784239e-06, "logits/chosen": 1.9712852239608765, "logits/rejected": 1.4186289310455322, "logps/chosen": -399.1224670410156, "logps/rejected": -306.6077575683594, "loss": 0.5586, "rewards/accuracies": 0.723437488079071, "rewards/chosen": 0.23837897181510925, "rewards/margins": 0.5666823983192444, "rewards/rejected": -0.3283034861087799, "step": 1380 }, { "epoch": 1.44, "learning_rate": 5.795715378729916e-06, "logits/chosen": 1.926466941833496, "logits/rejected": 1.3504817485809326, "logps/chosen": -400.1119384765625, "logps/rejected": -326.3760681152344, "loss": 0.5511, "rewards/accuracies": 0.714062511920929, "rewards/chosen": 0.27727603912353516, "rewards/margins": 0.617030918598175, "rewards/rejected": -0.3397548794746399, "step": 1390 }, { "epoch": 1.45, "learning_rate": 5.7574598316755935e-06, "logits/chosen": 2.026695966720581, "logits/rejected": 1.4384924173355103, "logps/chosen": -397.0252990722656, "logps/rejected": -305.0713806152344, "loss": 0.5621, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.22499485313892365, "rewards/margins": 0.520427405834198, "rewards/rejected": -0.29543253779411316, "step": 1400 }, { "epoch": 1.45, "eval_logits/chosen": 2.007225513458252, "eval_logits/rejected": 1.5934194326400757, "eval_logps/chosen": -390.5641784667969, "eval_logps/rejected": -309.43115234375, "eval_loss": 0.5522844195365906, "eval_rewards/accuracies": 0.7149999737739563, "eval_rewards/chosen": 0.2002071589231491, "eval_rewards/margins": 0.5672940015792847, "eval_rewards/rejected": -0.36708685755729675, "eval_runtime": 792.8122, "eval_samples_per_second": 2.523, "eval_steps_per_second": 0.631, "step": 1400 }, { "epoch": 1.46, "learning_rate": 5.71920428462127e-06, "logits/chosen": 2.0195109844207764, "logits/rejected": 1.5116552114486694, "logps/chosen": -388.59503173828125, "logps/rejected": -308.09100341796875, "loss": 0.555, "rewards/accuracies": 0.715624988079071, "rewards/chosen": 0.17492921650409698, "rewards/margins": 0.5775406360626221, "rewards/rejected": -0.4026114344596863, "step": 1410 }, { "epoch": 1.47, "learning_rate": 5.680948737566948e-06, "logits/chosen": 1.8798792362213135, "logits/rejected": 1.3930842876434326, "logps/chosen": -374.4825134277344, "logps/rejected": -300.51593017578125, "loss": 0.5693, "rewards/accuracies": 0.698437511920929, "rewards/chosen": 0.192795068025589, "rewards/margins": 0.5325729250907898, "rewards/rejected": -0.3397778570652008, "step": 1420 }, { "epoch": 1.48, "learning_rate": 5.642693190512625e-06, "logits/chosen": 1.8837686777114868, "logits/rejected": 1.401320457458496, "logps/chosen": -382.8334045410156, "logps/rejected": -316.2846984863281, "loss": 0.5397, "rewards/accuracies": 0.707812488079071, "rewards/chosen": 0.21436075866222382, "rewards/margins": 0.6173911094665527, "rewards/rejected": -0.4030303359031677, "step": 1430 }, { "epoch": 1.49, "learning_rate": 5.604437643458301e-06, "logits/chosen": 1.8869644403457642, "logits/rejected": 1.450655221939087, "logps/chosen": -383.06512451171875, "logps/rejected": -304.9476318359375, "loss": 0.5361, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.1894446164369583, "rewards/margins": 0.6348510980606079, "rewards/rejected": -0.4454064965248108, "step": 1440 }, { "epoch": 1.5, "learning_rate": 5.566182096403979e-06, "logits/chosen": 1.8538520336151123, "logits/rejected": 1.3565696477890015, "logps/chosen": -425.103271484375, "logps/rejected": -315.42340087890625, "loss": 0.528, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.2426043301820755, "rewards/margins": 0.6733630895614624, "rewards/rejected": -0.4307587742805481, "step": 1450 }, { "epoch": 1.51, "learning_rate": 5.527926549349656e-06, "logits/chosen": 1.9136693477630615, "logits/rejected": 1.3390287160873413, "logps/chosen": -388.55267333984375, "logps/rejected": -307.3643798828125, "loss": 0.5427, "rewards/accuracies": 0.745312511920929, "rewards/chosen": 0.24117830395698547, "rewards/margins": 0.6111575365066528, "rewards/rejected": -0.36997920274734497, "step": 1460 }, { "epoch": 1.52, "learning_rate": 5.489671002295334e-06, "logits/chosen": 1.895220160484314, "logits/rejected": 1.4189845323562622, "logps/chosen": -388.977783203125, "logps/rejected": -317.02630615234375, "loss": 0.553, "rewards/accuracies": 0.729687511920929, "rewards/chosen": 0.22869646549224854, "rewards/margins": 0.6072007417678833, "rewards/rejected": -0.37850421667099, "step": 1470 }, { "epoch": 1.53, "learning_rate": 5.45141545524101e-06, "logits/chosen": 1.9026139974594116, "logits/rejected": 1.416741132736206, "logps/chosen": -388.9039001464844, "logps/rejected": -319.28778076171875, "loss": 0.5249, "rewards/accuracies": 0.734375, "rewards/chosen": 0.20672424137592316, "rewards/margins": 0.6609183549880981, "rewards/rejected": -0.4541941285133362, "step": 1480 }, { "epoch": 1.54, "learning_rate": 5.413159908186687e-06, "logits/chosen": 1.8846362829208374, "logits/rejected": 1.4321391582489014, "logps/chosen": -386.5036315917969, "logps/rejected": -317.74517822265625, "loss": 0.5671, "rewards/accuracies": 0.6796875, "rewards/chosen": 0.15309680998325348, "rewards/margins": 0.5837612748146057, "rewards/rejected": -0.4306644797325134, "step": 1490 }, { "epoch": 1.55, "learning_rate": 5.374904361132365e-06, "logits/chosen": 1.9596405029296875, "logits/rejected": 1.3423256874084473, "logps/chosen": -387.34295654296875, "logps/rejected": -304.25054931640625, "loss": 0.56, "rewards/accuracies": 0.707812488079071, "rewards/chosen": 0.14706505835056305, "rewards/margins": 0.5944403409957886, "rewards/rejected": -0.4473752975463867, "step": 1500 }, { "epoch": 1.55, "eval_logits/chosen": 1.9811891317367554, "eval_logits/rejected": 1.5742919445037842, "eval_logps/chosen": -390.88580322265625, "eval_logps/rejected": -310.0704345703125, "eval_loss": 0.5498195290565491, "eval_rewards/accuracies": 0.7204999923706055, "eval_rewards/chosen": 0.1680394560098648, "eval_rewards/margins": 0.5990530848503113, "eval_rewards/rejected": -0.43101364374160767, "eval_runtime": 929.67, "eval_samples_per_second": 2.151, "eval_steps_per_second": 0.538, "step": 1500 }, { "epoch": 1.56, "learning_rate": 5.3366488140780415e-06, "logits/chosen": 1.9913889169692993, "logits/rejected": 1.2893562316894531, "logps/chosen": -376.86651611328125, "logps/rejected": -272.89495849609375, "loss": 0.534, "rewards/accuracies": 0.715624988079071, "rewards/chosen": 0.16582927107810974, "rewards/margins": 0.6355129480361938, "rewards/rejected": -0.4696837067604065, "step": 1510 }, { "epoch": 1.57, "learning_rate": 5.29839326702372e-06, "logits/chosen": 1.9447301626205444, "logits/rejected": 1.3859245777130127, "logps/chosen": -388.61834716796875, "logps/rejected": -315.6685485839844, "loss": 0.5567, "rewards/accuracies": 0.707812488079071, "rewards/chosen": 0.1793958842754364, "rewards/margins": 0.6203678250312805, "rewards/rejected": -0.4409719407558441, "step": 1520 }, { "epoch": 1.58, "learning_rate": 5.260137719969396e-06, "logits/chosen": 1.8826446533203125, "logits/rejected": 1.3331654071807861, "logps/chosen": -383.728515625, "logps/rejected": -300.64703369140625, "loss": 0.5195, "rewards/accuracies": 0.7484375238418579, "rewards/chosen": 0.2130293846130371, "rewards/margins": 0.704422652721405, "rewards/rejected": -0.4913933277130127, "step": 1530 }, { "epoch": 1.59, "learning_rate": 5.221882172915073e-06, "logits/chosen": 1.8909591436386108, "logits/rejected": 1.3999977111816406, "logps/chosen": -399.75885009765625, "logps/rejected": -313.6823425292969, "loss": 0.5376, "rewards/accuracies": 0.734375, "rewards/chosen": 0.15901023149490356, "rewards/margins": 0.6692485809326172, "rewards/rejected": -0.5102383494377136, "step": 1540 }, { "epoch": 1.6, "learning_rate": 5.183626625860751e-06, "logits/chosen": 1.8340473175048828, "logits/rejected": 1.4051573276519775, "logps/chosen": -374.83782958984375, "logps/rejected": -301.93182373046875, "loss": 0.555, "rewards/accuracies": 0.7046874761581421, "rewards/chosen": 0.14348945021629333, "rewards/margins": 0.6144425868988037, "rewards/rejected": -0.4709531366825104, "step": 1550 }, { "epoch": 1.61, "learning_rate": 5.145371078806427e-06, "logits/chosen": 1.8588403463363647, "logits/rejected": 1.3841490745544434, "logps/chosen": -402.9275207519531, "logps/rejected": -322.46026611328125, "loss": 0.5344, "rewards/accuracies": 0.745312511920929, "rewards/chosen": 0.17771649360656738, "rewards/margins": 0.6865950226783752, "rewards/rejected": -0.5088784694671631, "step": 1560 }, { "epoch": 1.62, "learning_rate": 5.107115531752105e-06, "logits/chosen": 1.9124072790145874, "logits/rejected": 1.2741097211837769, "logps/chosen": -395.0220642089844, "logps/rejected": -310.54217529296875, "loss": 0.5376, "rewards/accuracies": 0.7265625, "rewards/chosen": 0.16719238460063934, "rewards/margins": 0.6582823991775513, "rewards/rejected": -0.49108999967575073, "step": 1570 }, { "epoch": 1.63, "learning_rate": 5.068859984697782e-06, "logits/chosen": 1.8537428379058838, "logits/rejected": 1.3776872158050537, "logps/chosen": -382.8225402832031, "logps/rejected": -312.5972595214844, "loss": 0.5356, "rewards/accuracies": 0.7328125238418579, "rewards/chosen": 0.1402355134487152, "rewards/margins": 0.6729972958564758, "rewards/rejected": -0.5327617526054382, "step": 1580 }, { "epoch": 1.64, "learning_rate": 5.030604437643458e-06, "logits/chosen": 1.9892429113388062, "logits/rejected": 1.48006272315979, "logps/chosen": -393.6377868652344, "logps/rejected": -317.9789733886719, "loss": 0.5448, "rewards/accuracies": 0.707812488079071, "rewards/chosen": 0.18385156989097595, "rewards/margins": 0.6433760523796082, "rewards/rejected": -0.459524542093277, "step": 1590 }, { "epoch": 1.65, "learning_rate": 4.992348890589136e-06, "logits/chosen": 1.8467633724212646, "logits/rejected": 1.4027966260910034, "logps/chosen": -375.00225830078125, "logps/rejected": -304.69970703125, "loss": 0.5531, "rewards/accuracies": 0.71875, "rewards/chosen": 0.16209235787391663, "rewards/margins": 0.6290103197097778, "rewards/rejected": -0.4669179916381836, "step": 1600 }, { "epoch": 1.65, "eval_logits/chosen": 1.9653117656707764, "eval_logits/rejected": 1.5567891597747803, "eval_logps/chosen": -391.0749206542969, "eval_logps/rejected": -310.5533752441406, "eval_loss": 0.5468813180923462, "eval_rewards/accuracies": 0.718500018119812, "eval_rewards/chosen": 0.14913547039031982, "eval_rewards/margins": 0.6284475326538086, "eval_rewards/rejected": -0.47931212186813354, "eval_runtime": 826.787, "eval_samples_per_second": 2.419, "eval_steps_per_second": 0.605, "step": 1600 }, { "epoch": 1.66, "learning_rate": 4.954093343534813e-06, "logits/chosen": 1.9594428539276123, "logits/rejected": 1.3632463216781616, "logps/chosen": -380.3455810546875, "logps/rejected": -307.2145080566406, "loss": 0.5563, "rewards/accuracies": 0.7171875238418579, "rewards/chosen": 0.1433544009923935, "rewards/margins": 0.6070438623428345, "rewards/rejected": -0.4636894762516022, "step": 1610 }, { "epoch": 1.67, "learning_rate": 4.9158377964804895e-06, "logits/chosen": 1.8713620901107788, "logits/rejected": 1.3161896467208862, "logps/chosen": -372.5840759277344, "logps/rejected": -296.98565673828125, "loss": 0.5515, "rewards/accuracies": 0.729687511920929, "rewards/chosen": 0.13692577183246613, "rewards/margins": 0.6046603918075562, "rewards/rejected": -0.4677346646785736, "step": 1620 }, { "epoch": 1.68, "learning_rate": 4.877582249426167e-06, "logits/chosen": 1.9094302654266357, "logits/rejected": 1.4339892864227295, "logps/chosen": -402.25067138671875, "logps/rejected": -327.61761474609375, "loss": 0.5701, "rewards/accuracies": 0.684374988079071, "rewards/chosen": 0.16984933614730835, "rewards/margins": 0.5965026021003723, "rewards/rejected": -0.4266532361507416, "step": 1630 }, { "epoch": 1.69, "learning_rate": 4.839326702371844e-06, "logits/chosen": 1.9412224292755127, "logits/rejected": 1.3085362911224365, "logps/chosen": -392.1555480957031, "logps/rejected": -318.64007568359375, "loss": 0.5632, "rewards/accuracies": 0.6953125, "rewards/chosen": 0.1391909420490265, "rewards/margins": 0.6007251143455505, "rewards/rejected": -0.46153420209884644, "step": 1640 }, { "epoch": 1.7, "learning_rate": 4.8010711553175215e-06, "logits/chosen": 1.9966071844100952, "logits/rejected": 1.3831865787506104, "logps/chosen": -413.2572326660156, "logps/rejected": -326.82000732421875, "loss": 0.5311, "rewards/accuracies": 0.7265625, "rewards/chosen": 0.2764299511909485, "rewards/margins": 0.7140663266181946, "rewards/rejected": -0.4376363754272461, "step": 1650 }, { "epoch": 1.71, "learning_rate": 4.762815608263199e-06, "logits/chosen": 1.87734055519104, "logits/rejected": 1.4385781288146973, "logps/chosen": -397.2535400390625, "logps/rejected": -334.5222473144531, "loss": 0.5894, "rewards/accuracies": 0.659375011920929, "rewards/chosen": 0.14717647433280945, "rewards/margins": 0.578982412815094, "rewards/rejected": -0.43180593848228455, "step": 1660 }, { "epoch": 1.72, "learning_rate": 4.724560061208875e-06, "logits/chosen": 1.9491714239120483, "logits/rejected": 1.4346935749053955, "logps/chosen": -382.19952392578125, "logps/rejected": -310.0763854980469, "loss": 0.5556, "rewards/accuracies": 0.739062488079071, "rewards/chosen": 0.2093806266784668, "rewards/margins": 0.6028395891189575, "rewards/rejected": -0.39345893263816833, "step": 1670 }, { "epoch": 1.74, "learning_rate": 4.686304514154553e-06, "logits/chosen": 1.962201714515686, "logits/rejected": 1.3550127744674683, "logps/chosen": -389.6959228515625, "logps/rejected": -305.4635314941406, "loss": 0.5573, "rewards/accuracies": 0.7109375, "rewards/chosen": 0.18966779112815857, "rewards/margins": 0.5900794267654419, "rewards/rejected": -0.4004116952419281, "step": 1680 }, { "epoch": 1.75, "learning_rate": 4.64804896710023e-06, "logits/chosen": 1.842717170715332, "logits/rejected": 1.3413383960723877, "logps/chosen": -384.57440185546875, "logps/rejected": -315.2056579589844, "loss": 0.565, "rewards/accuracies": 0.7046874761581421, "rewards/chosen": 0.1576361507177353, "rewards/margins": 0.5914202928543091, "rewards/rejected": -0.433784157037735, "step": 1690 }, { "epoch": 1.76, "learning_rate": 4.609793420045907e-06, "logits/chosen": 2.056537628173828, "logits/rejected": 1.3731919527053833, "logps/chosen": -389.4626159667969, "logps/rejected": -296.30615234375, "loss": 0.5218, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": 0.22705134749412537, "rewards/margins": 0.7206255197525024, "rewards/rejected": -0.4935741424560547, "step": 1700 }, { "epoch": 1.76, "eval_logits/chosen": 1.978134274482727, "eval_logits/rejected": 1.5730693340301514, "eval_logps/chosen": -390.732177734375, "eval_logps/rejected": -310.0751647949219, "eval_loss": 0.5466998815536499, "eval_rewards/accuracies": 0.7204999923706055, "eval_rewards/chosen": 0.18340666592121124, "eval_rewards/margins": 0.6148949265480042, "eval_rewards/rejected": -0.4314882457256317, "eval_runtime": 936.4251, "eval_samples_per_second": 2.136, "eval_steps_per_second": 0.534, "step": 1700 }, { "epoch": 1.77, "learning_rate": 4.5715378729915846e-06, "logits/chosen": 1.8325378894805908, "logits/rejected": 1.458996057510376, "logps/chosen": -408.39617919921875, "logps/rejected": -320.1488952636719, "loss": 0.5524, "rewards/accuracies": 0.7203124761581421, "rewards/chosen": 0.20333287119865417, "rewards/margins": 0.6326154470443726, "rewards/rejected": -0.4292825758457184, "step": 1710 }, { "epoch": 1.78, "learning_rate": 4.533282325937261e-06, "logits/chosen": 1.9904448986053467, "logits/rejected": 1.2993494272232056, "logps/chosen": -387.580810546875, "logps/rejected": -300.5718078613281, "loss": 0.5183, "rewards/accuracies": 0.729687511920929, "rewards/chosen": 0.17427341639995575, "rewards/margins": 0.7210553288459778, "rewards/rejected": -0.5467818975448608, "step": 1720 }, { "epoch": 1.79, "learning_rate": 4.495026778882938e-06, "logits/chosen": 1.952392578125, "logits/rejected": 1.2962398529052734, "logps/chosen": -382.77618408203125, "logps/rejected": -298.95220947265625, "loss": 0.544, "rewards/accuracies": 0.7265625, "rewards/chosen": 0.16867509484291077, "rewards/margins": 0.6715518832206726, "rewards/rejected": -0.502876877784729, "step": 1730 }, { "epoch": 1.8, "learning_rate": 4.456771231828616e-06, "logits/chosen": 1.9417846202850342, "logits/rejected": 1.3025237321853638, "logps/chosen": -381.9095764160156, "logps/rejected": -297.0923767089844, "loss": 0.5457, "rewards/accuracies": 0.7046874761581421, "rewards/chosen": 0.18698151409626007, "rewards/margins": 0.6296481490135193, "rewards/rejected": -0.4426666796207428, "step": 1740 }, { "epoch": 1.81, "learning_rate": 4.418515684774293e-06, "logits/chosen": 1.927423119544983, "logits/rejected": 1.2898788452148438, "logps/chosen": -367.6332702636719, "logps/rejected": -297.900390625, "loss": 0.5409, "rewards/accuracies": 0.721875011920929, "rewards/chosen": 0.1265488564968109, "rewards/margins": 0.660210132598877, "rewards/rejected": -0.5336612462997437, "step": 1750 }, { "epoch": 1.82, "learning_rate": 4.38026013771997e-06, "logits/chosen": 2.0257441997528076, "logits/rejected": 1.470089077949524, "logps/chosen": -384.87005615234375, "logps/rejected": -309.94793701171875, "loss": 0.5163, "rewards/accuracies": 0.7265625, "rewards/chosen": 0.1944606453180313, "rewards/margins": 0.7155885696411133, "rewards/rejected": -0.5211279988288879, "step": 1760 }, { "epoch": 1.83, "learning_rate": 4.342004590665647e-06, "logits/chosen": 1.9760411977767944, "logits/rejected": 1.4933598041534424, "logps/chosen": -377.43341064453125, "logps/rejected": -311.15313720703125, "loss": 0.4971, "rewards/accuracies": 0.7578125, "rewards/chosen": 0.16314932703971863, "rewards/margins": 0.7901633381843567, "rewards/rejected": -0.6270139813423157, "step": 1770 }, { "epoch": 1.84, "learning_rate": 4.303749043611324e-06, "logits/chosen": 1.809069037437439, "logits/rejected": 1.406779170036316, "logps/chosen": -367.8652038574219, "logps/rejected": -306.23004150390625, "loss": 0.5599, "rewards/accuracies": 0.6859375238418579, "rewards/chosen": 0.09964510053396225, "rewards/margins": 0.6223139762878418, "rewards/rejected": -0.5226688981056213, "step": 1780 }, { "epoch": 1.85, "learning_rate": 4.2654934965570014e-06, "logits/chosen": 1.946282982826233, "logits/rejected": 1.353463888168335, "logps/chosen": -398.5594482421875, "logps/rejected": -312.0063171386719, "loss": 0.5416, "rewards/accuracies": 0.729687511920929, "rewards/chosen": 0.16393008828163147, "rewards/margins": 0.681474506855011, "rewards/rejected": -0.5175443887710571, "step": 1790 }, { "epoch": 1.86, "learning_rate": 4.227237949502678e-06, "logits/chosen": 1.860234022140503, "logits/rejected": 1.3306976556777954, "logps/chosen": -393.6831359863281, "logps/rejected": -309.77728271484375, "loss": 0.543, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.16483023762702942, "rewards/margins": 0.6842068433761597, "rewards/rejected": -0.5193766355514526, "step": 1800 }, { "epoch": 1.86, "eval_logits/chosen": 1.9750703573226929, "eval_logits/rejected": 1.5700032711029053, "eval_logps/chosen": -391.13616943359375, "eval_logps/rejected": -310.76885986328125, "eval_loss": 0.5447162985801697, "eval_rewards/accuracies": 0.7214999794960022, "eval_rewards/chosen": 0.14301449060440063, "eval_rewards/margins": 0.6438723206520081, "eval_rewards/rejected": -0.5008578300476074, "eval_runtime": 855.1326, "eval_samples_per_second": 2.339, "eval_steps_per_second": 0.585, "step": 1800 }, { "epoch": 1.87, "learning_rate": 4.188982402448355e-06, "logits/chosen": 1.8849036693572998, "logits/rejected": 1.482762098312378, "logps/chosen": -400.40765380859375, "logps/rejected": -318.3489074707031, "loss": 0.5176, "rewards/accuracies": 0.7265625, "rewards/chosen": 0.1822209358215332, "rewards/margins": 0.7383443117141724, "rewards/rejected": -0.5561233758926392, "step": 1810 }, { "epoch": 1.88, "learning_rate": 4.1507268553940326e-06, "logits/chosen": 2.008263111114502, "logits/rejected": 1.4915977716445923, "logps/chosen": -398.11676025390625, "logps/rejected": -307.8688049316406, "loss": 0.5478, "rewards/accuracies": 0.7203124761581421, "rewards/chosen": 0.18287725746631622, "rewards/margins": 0.669849157333374, "rewards/rejected": -0.4869718551635742, "step": 1820 }, { "epoch": 1.89, "learning_rate": 4.112471308339709e-06, "logits/chosen": 1.8399749994277954, "logits/rejected": 1.3244879245758057, "logps/chosen": -386.895751953125, "logps/rejected": -304.82794189453125, "loss": 0.5302, "rewards/accuracies": 0.703125, "rewards/chosen": 0.11319240182638168, "rewards/margins": 0.6828613877296448, "rewards/rejected": -0.5696690082550049, "step": 1830 }, { "epoch": 1.9, "learning_rate": 4.074215761285386e-06, "logits/chosen": 1.8389393091201782, "logits/rejected": 1.4442493915557861, "logps/chosen": -389.1490173339844, "logps/rejected": -317.86260986328125, "loss": 0.5655, "rewards/accuracies": 0.721875011920929, "rewards/chosen": 0.1321045607328415, "rewards/margins": 0.6073146462440491, "rewards/rejected": -0.4752101004123688, "step": 1840 }, { "epoch": 1.91, "learning_rate": 4.035960214231064e-06, "logits/chosen": 1.8804800510406494, "logits/rejected": 1.3740012645721436, "logps/chosen": -394.7861022949219, "logps/rejected": -310.35955810546875, "loss": 0.5199, "rewards/accuracies": 0.75, "rewards/chosen": 0.17971648275852203, "rewards/margins": 0.7268356680870056, "rewards/rejected": -0.5471192002296448, "step": 1850 }, { "epoch": 1.92, "learning_rate": 3.997704667176741e-06, "logits/chosen": 1.835761308670044, "logits/rejected": 1.408271074295044, "logps/chosen": -386.2900390625, "logps/rejected": -308.59967041015625, "loss": 0.5562, "rewards/accuracies": 0.7015625238418579, "rewards/chosen": 0.14548413455486298, "rewards/margins": 0.6259700655937195, "rewards/rejected": -0.4804859757423401, "step": 1860 }, { "epoch": 1.93, "learning_rate": 3.959449120122418e-06, "logits/chosen": 1.9499410390853882, "logits/rejected": 1.3671890497207642, "logps/chosen": -408.0618896484375, "logps/rejected": -298.3442687988281, "loss": 0.5258, "rewards/accuracies": 0.7203124761581421, "rewards/chosen": 0.1869424283504486, "rewards/margins": 0.7301429510116577, "rewards/rejected": -0.5432006120681763, "step": 1870 }, { "epoch": 1.94, "learning_rate": 3.921193573068095e-06, "logits/chosen": 1.8840137720108032, "logits/rejected": 1.360190749168396, "logps/chosen": -387.3899841308594, "logps/rejected": -297.8107604980469, "loss": 0.5766, "rewards/accuracies": 0.6953125, "rewards/chosen": 0.09462201595306396, "rewards/margins": 0.5949129462242126, "rewards/rejected": -0.5002909302711487, "step": 1880 }, { "epoch": 1.95, "learning_rate": 3.882938026013772e-06, "logits/chosen": 2.041311264038086, "logits/rejected": 1.3203338384628296, "logps/chosen": -391.720458984375, "logps/rejected": -300.0113830566406, "loss": 0.5466, "rewards/accuracies": 0.714062511920929, "rewards/chosen": 0.1572737842798233, "rewards/margins": 0.705586314201355, "rewards/rejected": -0.5483125448226929, "step": 1890 }, { "epoch": 1.96, "learning_rate": 3.8446824789594494e-06, "logits/chosen": 1.8803116083145142, "logits/rejected": 1.4347630739212036, "logps/chosen": -399.1441955566406, "logps/rejected": -325.72552490234375, "loss": 0.529, "rewards/accuracies": 0.7359374761581421, "rewards/chosen": 0.21699848771095276, "rewards/margins": 0.7015321850776672, "rewards/rejected": -0.4845336079597473, "step": 1900 }, { "epoch": 1.96, "eval_logits/chosen": 1.9625109434127808, "eval_logits/rejected": 1.55705726146698, "eval_logps/chosen": -390.899658203125, "eval_logps/rejected": -310.4549865722656, "eval_loss": 0.5444718599319458, "eval_rewards/accuracies": 0.7214999794960022, "eval_rewards/chosen": 0.16666516661643982, "eval_rewards/margins": 0.6361386179924011, "eval_rewards/rejected": -0.4694734215736389, "eval_runtime": 839.9174, "eval_samples_per_second": 2.381, "eval_steps_per_second": 0.595, "step": 1900 }, { "epoch": 1.97, "learning_rate": 3.8064269319051268e-06, "logits/chosen": 1.9793459177017212, "logits/rejected": 1.2376145124435425, "logps/chosen": -414.79290771484375, "logps/rejected": -318.006103515625, "loss": 0.5195, "rewards/accuracies": 0.754687488079071, "rewards/chosen": 0.2078409641981125, "rewards/margins": 0.7472761869430542, "rewards/rejected": -0.5394352078437805, "step": 1910 }, { "epoch": 1.98, "learning_rate": 3.768171384850804e-06, "logits/chosen": 1.893792748451233, "logits/rejected": 1.3338686227798462, "logps/chosen": -399.4332275390625, "logps/rejected": -318.19464111328125, "loss": 0.5537, "rewards/accuracies": 0.707812488079071, "rewards/chosen": 0.15611618757247925, "rewards/margins": 0.6536720991134644, "rewards/rejected": -0.49755582213401794, "step": 1920 }, { "epoch": 1.99, "learning_rate": 3.7299158377964806e-06, "logits/chosen": 2.039228677749634, "logits/rejected": 1.4236376285552979, "logps/chosen": -395.6639404296875, "logps/rejected": -296.0408935546875, "loss": 0.5262, "rewards/accuracies": 0.754687488079071, "rewards/chosen": 0.2058977633714676, "rewards/margins": 0.6912787556648254, "rewards/rejected": -0.48538097739219666, "step": 1930 }, { "epoch": 2.0, "learning_rate": 3.691660290742158e-06, "logits/chosen": 1.9960769414901733, "logits/rejected": 1.380825161933899, "logps/chosen": -411.43963623046875, "logps/rejected": -323.51800537109375, "loss": 0.518, "rewards/accuracies": 0.721875011920929, "rewards/chosen": 0.21606655418872833, "rewards/margins": 0.74088054895401, "rewards/rejected": -0.5248139500617981, "step": 1940 }, { "epoch": 2.01, "learning_rate": 3.653404743687835e-06, "logits/chosen": 1.9273459911346436, "logits/rejected": 1.3717738389968872, "logps/chosen": -400.0301818847656, "logps/rejected": -303.9085388183594, "loss": 0.5197, "rewards/accuracies": 0.7328125238418579, "rewards/chosen": 0.18468081951141357, "rewards/margins": 0.7331782579421997, "rewards/rejected": -0.5484973788261414, "step": 1950 }, { "epoch": 2.02, "learning_rate": 3.615149196633512e-06, "logits/chosen": 1.9463794231414795, "logits/rejected": 1.468846321105957, "logps/chosen": -389.9013977050781, "logps/rejected": -310.2885437011719, "loss": 0.5438, "rewards/accuracies": 0.7203124761581421, "rewards/chosen": 0.1666269302368164, "rewards/margins": 0.656773030757904, "rewards/rejected": -0.49014607071876526, "step": 1960 }, { "epoch": 2.03, "learning_rate": 3.5768936495791894e-06, "logits/chosen": 1.8418022394180298, "logits/rejected": 1.5075290203094482, "logps/chosen": -382.7862854003906, "logps/rejected": -323.84918212890625, "loss": 0.5467, "rewards/accuracies": 0.7265625, "rewards/chosen": 0.13486254215240479, "rewards/margins": 0.6457086205482483, "rewards/rejected": -0.5108460783958435, "step": 1970 }, { "epoch": 2.04, "learning_rate": 3.5386381025248663e-06, "logits/chosen": 1.9035232067108154, "logits/rejected": 1.3666303157806396, "logps/chosen": -395.26690673828125, "logps/rejected": -304.6217956542969, "loss": 0.5115, "rewards/accuracies": 0.7593749761581421, "rewards/chosen": 0.2111072987318039, "rewards/margins": 0.714438796043396, "rewards/rejected": -0.5033314824104309, "step": 1980 }, { "epoch": 2.06, "learning_rate": 3.5003825554705432e-06, "logits/chosen": 1.8348839282989502, "logits/rejected": 1.3579442501068115, "logps/chosen": -398.62542724609375, "logps/rejected": -315.0461730957031, "loss": 0.5306, "rewards/accuracies": 0.7484375238418579, "rewards/chosen": 0.1667102873325348, "rewards/margins": 0.7245245575904846, "rewards/rejected": -0.5578143000602722, "step": 1990 }, { "epoch": 2.07, "learning_rate": 3.4621270084162205e-06, "logits/chosen": 1.898446798324585, "logits/rejected": 1.3372671604156494, "logps/chosen": -383.80828857421875, "logps/rejected": -300.1930236816406, "loss": 0.5312, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.17795772850513458, "rewards/margins": 0.6998143196105957, "rewards/rejected": -0.5218566060066223, "step": 2000 }, { "epoch": 2.07, "eval_logits/chosen": 1.9503859281539917, "eval_logits/rejected": 1.548615574836731, "eval_logps/chosen": -391.12811279296875, "eval_logps/rejected": -310.9095458984375, "eval_loss": 0.5446658730506897, "eval_rewards/accuracies": 0.7200000286102295, "eval_rewards/chosen": 0.14381767809391022, "eval_rewards/margins": 0.6587409377098083, "eval_rewards/rejected": -0.5149232745170593, "eval_runtime": 815.2559, "eval_samples_per_second": 2.453, "eval_steps_per_second": 0.613, "step": 2000 }, { "epoch": 2.08, "learning_rate": 3.423871461361898e-06, "logits/chosen": 1.9202572107315063, "logits/rejected": 1.3474085330963135, "logps/chosen": -382.2535705566406, "logps/rejected": -304.4725036621094, "loss": 0.5306, "rewards/accuracies": 0.714062511920929, "rewards/chosen": 0.15759733319282532, "rewards/margins": 0.694337010383606, "rewards/rejected": -0.536739706993103, "step": 2010 }, { "epoch": 2.09, "learning_rate": 3.385615914307575e-06, "logits/chosen": 1.932586669921875, "logits/rejected": 1.403100848197937, "logps/chosen": -374.32843017578125, "logps/rejected": -289.51776123046875, "loss": 0.5393, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": 0.14734536409378052, "rewards/margins": 0.6604071855545044, "rewards/rejected": -0.5130618810653687, "step": 2020 }, { "epoch": 2.1, "learning_rate": 3.3473603672532517e-06, "logits/chosen": 1.9338274002075195, "logits/rejected": 1.379889726638794, "logps/chosen": -408.8347473144531, "logps/rejected": -316.31512451171875, "loss": 0.5253, "rewards/accuracies": 0.734375, "rewards/chosen": 0.20510144531726837, "rewards/margins": 0.7322179079055786, "rewards/rejected": -0.5271164178848267, "step": 2030 }, { "epoch": 2.11, "learning_rate": 3.309104820198929e-06, "logits/chosen": 1.8762325048446655, "logits/rejected": 1.3051378726959229, "logps/chosen": -417.5376892089844, "logps/rejected": -325.60687255859375, "loss": 0.5242, "rewards/accuracies": 0.7265625, "rewards/chosen": 0.17885348200798035, "rewards/margins": 0.7652937769889832, "rewards/rejected": -0.5864403247833252, "step": 2040 }, { "epoch": 2.12, "learning_rate": 3.2708492731446063e-06, "logits/chosen": 1.9480432271957397, "logits/rejected": 1.3318156003952026, "logps/chosen": -388.45062255859375, "logps/rejected": -308.0003967285156, "loss": 0.504, "rewards/accuracies": 0.746874988079071, "rewards/chosen": 0.18859431147575378, "rewards/margins": 0.7672872543334961, "rewards/rejected": -0.5786929130554199, "step": 2050 }, { "epoch": 2.13, "learning_rate": 3.2325937260902836e-06, "logits/chosen": 1.8998448848724365, "logits/rejected": 1.3768129348754883, "logps/chosen": -378.49072265625, "logps/rejected": -307.764892578125, "loss": 0.5531, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.11820445209741592, "rewards/margins": 0.6751288771629333, "rewards/rejected": -0.5569244623184204, "step": 2060 }, { "epoch": 2.14, "learning_rate": 3.1943381790359605e-06, "logits/chosen": 1.9828697443008423, "logits/rejected": 1.284719705581665, "logps/chosen": -399.6075439453125, "logps/rejected": -312.29058837890625, "loss": 0.4985, "rewards/accuracies": 0.746874988079071, "rewards/chosen": 0.16622154414653778, "rewards/margins": 0.7923561930656433, "rewards/rejected": -0.6261346936225891, "step": 2070 }, { "epoch": 2.15, "learning_rate": 3.1560826319816374e-06, "logits/chosen": 1.9396564960479736, "logits/rejected": 1.3735202550888062, "logps/chosen": -391.2259521484375, "logps/rejected": -309.46728515625, "loss": 0.5184, "rewards/accuracies": 0.729687511920929, "rewards/chosen": 0.18753013014793396, "rewards/margins": 0.7682276964187622, "rewards/rejected": -0.5806975364685059, "step": 2080 }, { "epoch": 2.16, "learning_rate": 3.1178270849273148e-06, "logits/chosen": 1.816192626953125, "logits/rejected": 1.3793643712997437, "logps/chosen": -375.1627197265625, "logps/rejected": -302.96990966796875, "loss": 0.5599, "rewards/accuracies": 0.692187488079071, "rewards/chosen": 0.07474367320537567, "rewards/margins": 0.5998262166976929, "rewards/rejected": -0.5250825881958008, "step": 2090 }, { "epoch": 2.17, "learning_rate": 3.0795715378729917e-06, "logits/chosen": 1.9707539081573486, "logits/rejected": 1.2945826053619385, "logps/chosen": -397.34368896484375, "logps/rejected": -309.4389953613281, "loss": 0.557, "rewards/accuracies": 0.7171875238418579, "rewards/chosen": 0.14388945698738098, "rewards/margins": 0.6636291742324829, "rewards/rejected": -0.5197397470474243, "step": 2100 }, { "epoch": 2.17, "eval_logits/chosen": 1.9431500434875488, "eval_logits/rejected": 1.5426315069198608, "eval_logps/chosen": -391.1111145019531, "eval_logps/rejected": -310.9047546386719, "eval_loss": 0.5443159937858582, "eval_rewards/accuracies": 0.7195000052452087, "eval_rewards/chosen": 0.14551454782485962, "eval_rewards/margins": 0.6599627733230591, "eval_rewards/rejected": -0.5144481658935547, "eval_runtime": 796.8247, "eval_samples_per_second": 2.51, "eval_steps_per_second": 0.627, "step": 2100 }, { "epoch": 2.18, "learning_rate": 3.041315990818669e-06, "logits/chosen": 1.8863273859024048, "logits/rejected": 1.3362462520599365, "logps/chosen": -384.15069580078125, "logps/rejected": -305.6822204589844, "loss": 0.548, "rewards/accuracies": 0.723437488079071, "rewards/chosen": 0.1467057466506958, "rewards/margins": 0.6738990545272827, "rewards/rejected": -0.5271933674812317, "step": 2110 }, { "epoch": 2.19, "learning_rate": 3.0030604437643463e-06, "logits/chosen": 1.886776328086853, "logits/rejected": 1.2770754098892212, "logps/chosen": -404.9661560058594, "logps/rejected": -316.3112487792969, "loss": 0.5223, "rewards/accuracies": 0.7265625, "rewards/chosen": 0.18509329855442047, "rewards/margins": 0.7196551561355591, "rewards/rejected": -0.534561812877655, "step": 2120 }, { "epoch": 2.2, "learning_rate": 2.9648048967100236e-06, "logits/chosen": 1.8622076511383057, "logits/rejected": 1.4184415340423584, "logps/chosen": -393.64117431640625, "logps/rejected": -316.99212646484375, "loss": 0.5506, "rewards/accuracies": 0.734375, "rewards/chosen": 0.1718263030052185, "rewards/margins": 0.6905843615531921, "rewards/rejected": -0.5187580585479736, "step": 2130 }, { "epoch": 2.21, "learning_rate": 2.9265493496557e-06, "logits/chosen": 1.8317846059799194, "logits/rejected": 1.4165518283843994, "logps/chosen": -392.5058288574219, "logps/rejected": -309.49029541015625, "loss": 0.5355, "rewards/accuracies": 0.734375, "rewards/chosen": 0.13338004052639008, "rewards/margins": 0.6904716491699219, "rewards/rejected": -0.5570915937423706, "step": 2140 }, { "epoch": 2.22, "learning_rate": 2.8882938026013774e-06, "logits/chosen": 1.8177263736724854, "logits/rejected": 1.272984266281128, "logps/chosen": -390.7951965332031, "logps/rejected": -300.60955810546875, "loss": 0.517, "rewards/accuracies": 0.734375, "rewards/chosen": 0.19476288557052612, "rewards/margins": 0.7465412020683289, "rewards/rejected": -0.551778256893158, "step": 2150 }, { "epoch": 2.23, "learning_rate": 2.8500382555470547e-06, "logits/chosen": 1.8210207223892212, "logits/rejected": 1.2449976205825806, "logps/chosen": -394.6636657714844, "logps/rejected": -312.54046630859375, "loss": 0.5039, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.21618905663490295, "rewards/margins": 0.8034523129463196, "rewards/rejected": -0.587263286113739, "step": 2160 }, { "epoch": 2.24, "learning_rate": 2.8117827084927316e-06, "logits/chosen": 1.8524186611175537, "logits/rejected": 1.3088740110397339, "logps/chosen": -386.0102844238281, "logps/rejected": -314.1142578125, "loss": 0.518, "rewards/accuracies": 0.745312511920929, "rewards/chosen": 0.16865886747837067, "rewards/margins": 0.7468781471252441, "rewards/rejected": -0.5782192945480347, "step": 2170 }, { "epoch": 2.25, "learning_rate": 2.773527161438409e-06, "logits/chosen": 1.942042589187622, "logits/rejected": 1.322804570198059, "logps/chosen": -381.44073486328125, "logps/rejected": -296.1006774902344, "loss": 0.5216, "rewards/accuracies": 0.7203124761581421, "rewards/chosen": 0.1690443605184555, "rewards/margins": 0.7338646650314331, "rewards/rejected": -0.5648202896118164, "step": 2180 }, { "epoch": 2.26, "learning_rate": 2.735271614384086e-06, "logits/chosen": 1.8513023853302002, "logits/rejected": 1.3959568738937378, "logps/chosen": -379.71612548828125, "logps/rejected": -323.3500671386719, "loss": 0.5377, "rewards/accuracies": 0.7328125238418579, "rewards/chosen": 0.16209326684474945, "rewards/margins": 0.6926944255828857, "rewards/rejected": -0.5306012034416199, "step": 2190 }, { "epoch": 2.27, "learning_rate": 2.6970160673297628e-06, "logits/chosen": 1.9198487997055054, "logits/rejected": 1.4289544820785522, "logps/chosen": -389.8049621582031, "logps/rejected": -311.67327880859375, "loss": 0.5161, "rewards/accuracies": 0.7593749761581421, "rewards/chosen": 0.19835540652275085, "rewards/margins": 0.7266281843185425, "rewards/rejected": -0.5282727479934692, "step": 2200 }, { "epoch": 2.27, "eval_logits/chosen": 1.9349169731140137, "eval_logits/rejected": 1.5365551710128784, "eval_logps/chosen": -391.19598388671875, "eval_logps/rejected": -311.17047119140625, "eval_loss": 0.5435791015625, "eval_rewards/accuracies": 0.7210000157356262, "eval_rewards/chosen": 0.13703039288520813, "eval_rewards/margins": 0.6780468821525574, "eval_rewards/rejected": -0.5410164594650269, "eval_runtime": 795.8698, "eval_samples_per_second": 2.513, "eval_steps_per_second": 0.628, "step": 2200 }, { "epoch": 2.28, "learning_rate": 2.65876052027544e-06, "logits/chosen": 1.7786388397216797, "logits/rejected": 1.3866907358169556, "logps/chosen": -365.8920593261719, "logps/rejected": -298.8044738769531, "loss": 0.5743, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.04651176184415817, "rewards/margins": 0.6249829530715942, "rewards/rejected": -0.5784710645675659, "step": 2210 }, { "epoch": 2.29, "learning_rate": 2.6205049732211174e-06, "logits/chosen": 1.9371654987335205, "logits/rejected": 1.500967025756836, "logps/chosen": -412.75555419921875, "logps/rejected": -320.0480041503906, "loss": 0.5343, "rewards/accuracies": 0.721875011920929, "rewards/chosen": 0.21757829189300537, "rewards/margins": 0.7169729471206665, "rewards/rejected": -0.49939459562301636, "step": 2220 }, { "epoch": 2.3, "learning_rate": 2.5822494261667947e-06, "logits/chosen": 1.8165029287338257, "logits/rejected": 1.2959017753601074, "logps/chosen": -376.7248229980469, "logps/rejected": -301.94903564453125, "loss": 0.558, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.09275922924280167, "rewards/margins": 0.684673011302948, "rewards/rejected": -0.5919138193130493, "step": 2230 }, { "epoch": 2.31, "learning_rate": 2.543993879112471e-06, "logits/chosen": 1.85550856590271, "logits/rejected": 1.333299160003662, "logps/chosen": -409.77276611328125, "logps/rejected": -309.10919189453125, "loss": 0.5173, "rewards/accuracies": 0.760937511920929, "rewards/chosen": 0.22029216587543488, "rewards/margins": 0.793341338634491, "rewards/rejected": -0.5730491876602173, "step": 2240 }, { "epoch": 2.32, "learning_rate": 2.5057383320581485e-06, "logits/chosen": 1.8908500671386719, "logits/rejected": 1.4216768741607666, "logps/chosen": -392.8185729980469, "logps/rejected": -320.29254150390625, "loss": 0.5491, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.14646360278129578, "rewards/margins": 0.6358945965766907, "rewards/rejected": -0.4894309937953949, "step": 2250 }, { "epoch": 2.33, "learning_rate": 2.467482785003826e-06, "logits/chosen": 1.8548777103424072, "logits/rejected": 1.3177944421768188, "logps/chosen": -389.3084716796875, "logps/rejected": -314.4157409667969, "loss": 0.5541, "rewards/accuracies": 0.71875, "rewards/chosen": 0.14970391988754272, "rewards/margins": 0.6609498858451843, "rewards/rejected": -0.5112460255622864, "step": 2260 }, { "epoch": 2.34, "learning_rate": 2.429227237949503e-06, "logits/chosen": 1.8962656259536743, "logits/rejected": 1.2882287502288818, "logps/chosen": -390.0376892089844, "logps/rejected": -311.2901306152344, "loss": 0.5403, "rewards/accuracies": 0.7203124761581421, "rewards/chosen": 0.20038647949695587, "rewards/margins": 0.6643846035003662, "rewards/rejected": -0.46399813890457153, "step": 2270 }, { "epoch": 2.35, "learning_rate": 2.39097169089518e-06, "logits/chosen": 1.8536241054534912, "logits/rejected": 1.3984171152114868, "logps/chosen": -393.1831359863281, "logps/rejected": -314.572021484375, "loss": 0.5685, "rewards/accuracies": 0.692187488079071, "rewards/chosen": 0.2075819969177246, "rewards/margins": 0.6353307962417603, "rewards/rejected": -0.42774876952171326, "step": 2280 }, { "epoch": 2.37, "learning_rate": 2.352716143840857e-06, "logits/chosen": 1.866075873374939, "logits/rejected": 1.3068325519561768, "logps/chosen": -393.79925537109375, "logps/rejected": -303.6260070800781, "loss": 0.5431, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.21125376224517822, "rewards/margins": 0.679940402507782, "rewards/rejected": -0.46868667006492615, "step": 2290 }, { "epoch": 2.38, "learning_rate": 2.3144605967865343e-06, "logits/chosen": 1.8734241724014282, "logits/rejected": 1.350146770477295, "logps/chosen": -402.53924560546875, "logps/rejected": -333.2471008300781, "loss": 0.5543, "rewards/accuracies": 0.715624988079071, "rewards/chosen": 0.1887063980102539, "rewards/margins": 0.6754096746444702, "rewards/rejected": -0.4867033064365387, "step": 2300 }, { "epoch": 2.38, "eval_logits/chosen": 1.924289584159851, "eval_logits/rejected": 1.5231640338897705, "eval_logps/chosen": -390.42486572265625, "eval_logps/rejected": -310.1583557128906, "eval_loss": 0.5435594916343689, "eval_rewards/accuracies": 0.7225000262260437, "eval_rewards/chosen": 0.2141418159008026, "eval_rewards/margins": 0.6539459824562073, "eval_rewards/rejected": -0.43980422616004944, "eval_runtime": 795.9021, "eval_samples_per_second": 2.513, "eval_steps_per_second": 0.628, "step": 2300 }, { "epoch": 2.39, "learning_rate": 2.276205049732211e-06, "logits/chosen": 1.8453460931777954, "logits/rejected": 1.3969268798828125, "logps/chosen": -384.0627136230469, "logps/rejected": -302.47454833984375, "loss": 0.5222, "rewards/accuracies": 0.754687488079071, "rewards/chosen": 0.21984462440013885, "rewards/margins": 0.7229960560798645, "rewards/rejected": -0.5031514167785645, "step": 2310 }, { "epoch": 2.4, "learning_rate": 2.2379495026778885e-06, "logits/chosen": 1.8691091537475586, "logits/rejected": 1.3912206888198853, "logps/chosen": -398.4359130859375, "logps/rejected": -309.218994140625, "loss": 0.5575, "rewards/accuracies": 0.71875, "rewards/chosen": 0.18310242891311646, "rewards/margins": 0.6079894304275513, "rewards/rejected": -0.4248870015144348, "step": 2320 }, { "epoch": 2.41, "learning_rate": 2.1996939556235654e-06, "logits/chosen": 1.9119606018066406, "logits/rejected": 1.2945181131362915, "logps/chosen": -413.5851135253906, "logps/rejected": -309.5904846191406, "loss": 0.5253, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.25865715742111206, "rewards/margins": 0.7330378890037537, "rewards/rejected": -0.4743807911872864, "step": 2330 }, { "epoch": 2.42, "learning_rate": 2.1614384085692427e-06, "logits/chosen": 1.9302775859832764, "logits/rejected": 1.4838932752609253, "logps/chosen": -390.00164794921875, "logps/rejected": -304.65118408203125, "loss": 0.551, "rewards/accuracies": 0.7171875238418579, "rewards/chosen": 0.18264324963092804, "rewards/margins": 0.6449249982833862, "rewards/rejected": -0.4622817039489746, "step": 2340 }, { "epoch": 2.43, "learning_rate": 2.1231828615149196e-06, "logits/chosen": 1.8314825296401978, "logits/rejected": 1.431232213973999, "logps/chosen": -389.6313781738281, "logps/rejected": -321.3496398925781, "loss": 0.537, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.24707123637199402, "rewards/margins": 0.6673041582107544, "rewards/rejected": -0.4202328324317932, "step": 2350 }, { "epoch": 2.44, "learning_rate": 2.084927314460597e-06, "logits/chosen": 1.9019018411636353, "logits/rejected": 1.4433711767196655, "logps/chosen": -386.1368408203125, "logps/rejected": -305.8728942871094, "loss": 0.5394, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": 0.22535470128059387, "rewards/margins": 0.6739178895950317, "rewards/rejected": -0.44856318831443787, "step": 2360 }, { "epoch": 2.45, "learning_rate": 2.0466717674062743e-06, "logits/chosen": 1.8650842905044556, "logits/rejected": 1.256481409072876, "logps/chosen": -388.1630859375, "logps/rejected": -296.2594299316406, "loss": 0.5701, "rewards/accuracies": 0.715624988079071, "rewards/chosen": 0.16668713092803955, "rewards/margins": 0.6169299483299255, "rewards/rejected": -0.4502428472042084, "step": 2370 }, { "epoch": 2.46, "learning_rate": 2.008416220351951e-06, "logits/chosen": 1.9502109289169312, "logits/rejected": 1.4297467470169067, "logps/chosen": -379.29815673828125, "logps/rejected": -312.89910888671875, "loss": 0.5471, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": 0.22775688767433167, "rewards/margins": 0.6174046397209167, "rewards/rejected": -0.38964781165122986, "step": 2380 }, { "epoch": 2.47, "learning_rate": 1.9701606732976285e-06, "logits/chosen": 1.7859550714492798, "logits/rejected": 1.280915379524231, "logps/chosen": -387.32220458984375, "logps/rejected": -308.6187438964844, "loss": 0.5354, "rewards/accuracies": 0.7203124761581421, "rewards/chosen": 0.22848454117774963, "rewards/margins": 0.7262995839118958, "rewards/rejected": -0.4978150427341461, "step": 2390 }, { "epoch": 2.48, "learning_rate": 1.9319051262433054e-06, "logits/chosen": 1.8666341304779053, "logits/rejected": 1.340829610824585, "logps/chosen": -372.02099609375, "logps/rejected": -291.7071228027344, "loss": 0.5393, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.16351720690727234, "rewards/margins": 0.6243674159049988, "rewards/rejected": -0.46085023880004883, "step": 2400 }, { "epoch": 2.48, "eval_logits/chosen": 1.9255341291427612, "eval_logits/rejected": 1.5261861085891724, "eval_logps/chosen": -390.41448974609375, "eval_logps/rejected": -310.10858154296875, "eval_loss": 0.5431024432182312, "eval_rewards/accuracies": 0.7214999794960022, "eval_rewards/chosen": 0.21517375111579895, "eval_rewards/margins": 0.6500040292739868, "eval_rewards/rejected": -0.43483030796051025, "eval_runtime": 795.9475, "eval_samples_per_second": 2.513, "eval_steps_per_second": 0.628, "step": 2400 }, { "epoch": 2.49, "learning_rate": 1.8936495791889825e-06, "logits/chosen": 1.838332176208496, "logits/rejected": 1.378769040107727, "logps/chosen": -403.71209716796875, "logps/rejected": -318.07440185546875, "loss": 0.5287, "rewards/accuracies": 0.7421875, "rewards/chosen": 0.2571999430656433, "rewards/margins": 0.7231813669204712, "rewards/rejected": -0.4659813940525055, "step": 2410 }, { "epoch": 2.5, "learning_rate": 1.8553940321346598e-06, "logits/chosen": 1.9624725580215454, "logits/rejected": 1.4516370296478271, "logps/chosen": -408.7811584472656, "logps/rejected": -315.2079162597656, "loss": 0.5287, "rewards/accuracies": 0.721875011920929, "rewards/chosen": 0.2522364854812622, "rewards/margins": 0.7050968408584595, "rewards/rejected": -0.45286035537719727, "step": 2420 }, { "epoch": 2.51, "learning_rate": 1.8171384850803367e-06, "logits/chosen": 1.8794381618499756, "logits/rejected": 1.3900786638259888, "logps/chosen": -369.208251953125, "logps/rejected": -284.5470275878906, "loss": 0.5529, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.18967363238334656, "rewards/margins": 0.6356136202812195, "rewards/rejected": -0.44593995809555054, "step": 2430 }, { "epoch": 2.52, "learning_rate": 1.7788829380260138e-06, "logits/chosen": 1.8972177505493164, "logits/rejected": 1.3639757633209229, "logps/chosen": -396.42706298828125, "logps/rejected": -312.1378173828125, "loss": 0.5383, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": 0.1813143938779831, "rewards/margins": 0.693839430809021, "rewards/rejected": -0.5125250220298767, "step": 2440 }, { "epoch": 2.53, "learning_rate": 1.7406273909716912e-06, "logits/chosen": 1.9779412746429443, "logits/rejected": 1.3862651586532593, "logps/chosen": -384.77325439453125, "logps/rejected": -308.67352294921875, "loss": 0.5321, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": 0.223913311958313, "rewards/margins": 0.6955308318138123, "rewards/rejected": -0.4716174602508545, "step": 2450 }, { "epoch": 2.54, "learning_rate": 1.702371843917368e-06, "logits/chosen": 1.9245936870574951, "logits/rejected": 1.2373225688934326, "logps/chosen": -408.88604736328125, "logps/rejected": -289.3724365234375, "loss": 0.51, "rewards/accuracies": 0.729687511920929, "rewards/chosen": 0.3024453818798065, "rewards/margins": 0.8122583627700806, "rewards/rejected": -0.5098131895065308, "step": 2460 }, { "epoch": 2.55, "learning_rate": 1.6641162968630454e-06, "logits/chosen": 1.9536139965057373, "logits/rejected": 1.412117838859558, "logps/chosen": -408.3688659667969, "logps/rejected": -329.55694580078125, "loss": 0.5188, "rewards/accuracies": 0.7328125238418579, "rewards/chosen": 0.271922767162323, "rewards/margins": 0.7405332326889038, "rewards/rejected": -0.4686105251312256, "step": 2470 }, { "epoch": 2.56, "learning_rate": 1.6258607498087223e-06, "logits/chosen": 1.9300897121429443, "logits/rejected": 1.3482757806777954, "logps/chosen": -404.9444274902344, "logps/rejected": -326.31072998046875, "loss": 0.5264, "rewards/accuracies": 0.7265625, "rewards/chosen": 0.20008817315101624, "rewards/margins": 0.7382076978683472, "rewards/rejected": -0.5381194949150085, "step": 2480 }, { "epoch": 2.57, "learning_rate": 1.5876052027543996e-06, "logits/chosen": 1.9504390954971313, "logits/rejected": 1.2894113063812256, "logps/chosen": -403.97906494140625, "logps/rejected": -305.0173034667969, "loss": 0.4885, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.2770701050758362, "rewards/margins": 0.8133407831192017, "rewards/rejected": -0.5362707376480103, "step": 2490 }, { "epoch": 2.58, "learning_rate": 1.5493496557000767e-06, "logits/chosen": 1.8714988231658936, "logits/rejected": 1.3067795038223267, "logps/chosen": -407.8332214355469, "logps/rejected": -316.13116455078125, "loss": 0.55, "rewards/accuracies": 0.7109375, "rewards/chosen": 0.19110047817230225, "rewards/margins": 0.6722851395606995, "rewards/rejected": -0.4811846613883972, "step": 2500 }, { "epoch": 2.58, "eval_logits/chosen": 1.9283673763275146, "eval_logits/rejected": 1.5277034044265747, "eval_logps/chosen": -390.8309326171875, "eval_logps/rejected": -310.63330078125, "eval_loss": 0.5426416993141174, "eval_rewards/accuracies": 0.7254999876022339, "eval_rewards/chosen": 0.17353439331054688, "eval_rewards/margins": 0.6608353853225708, "eval_rewards/rejected": -0.4873010218143463, "eval_runtime": 797.2966, "eval_samples_per_second": 2.508, "eval_steps_per_second": 0.627, "step": 2500 }, { "epoch": 2.59, "learning_rate": 1.5110941086457536e-06, "logits/chosen": 1.9851598739624023, "logits/rejected": 1.316880464553833, "logps/chosen": -413.71527099609375, "logps/rejected": -315.7725524902344, "loss": 0.5301, "rewards/accuracies": 0.7265625, "rewards/chosen": 0.2565341591835022, "rewards/margins": 0.7113798260688782, "rewards/rejected": -0.454845666885376, "step": 2510 }, { "epoch": 2.6, "learning_rate": 1.472838561591431e-06, "logits/chosen": 1.9016873836517334, "logits/rejected": 1.305879831314087, "logps/chosen": -393.092529296875, "logps/rejected": -314.27618408203125, "loss": 0.5402, "rewards/accuracies": 0.7328125238418579, "rewards/chosen": 0.23168377578258514, "rewards/margins": 0.7217429876327515, "rewards/rejected": -0.4900591969490051, "step": 2520 }, { "epoch": 2.61, "learning_rate": 1.434583014537108e-06, "logits/chosen": 1.7998332977294922, "logits/rejected": 1.4078634977340698, "logps/chosen": -374.17230224609375, "logps/rejected": -311.94415283203125, "loss": 0.5536, "rewards/accuracies": 0.7171875238418579, "rewards/chosen": 0.11247539520263672, "rewards/margins": 0.6445512771606445, "rewards/rejected": -0.532075822353363, "step": 2530 }, { "epoch": 2.62, "learning_rate": 1.3963274674827851e-06, "logits/chosen": 1.7847728729248047, "logits/rejected": 1.404350996017456, "logps/chosen": -377.0040588378906, "logps/rejected": -316.1141662597656, "loss": 0.5461, "rewards/accuracies": 0.721875011920929, "rewards/chosen": 0.1496005356311798, "rewards/margins": 0.6688281297683716, "rewards/rejected": -0.5192276239395142, "step": 2540 }, { "epoch": 2.63, "learning_rate": 1.3580719204284623e-06, "logits/chosen": 1.796651840209961, "logits/rejected": 1.3197319507598877, "logps/chosen": -387.6080627441406, "logps/rejected": -319.40032958984375, "loss": 0.5211, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.19702936708927155, "rewards/margins": 0.7392258048057556, "rewards/rejected": -0.5421965718269348, "step": 2550 }, { "epoch": 2.64, "learning_rate": 1.3198163733741392e-06, "logits/chosen": 1.9106481075286865, "logits/rejected": 1.3078062534332275, "logps/chosen": -416.9351501464844, "logps/rejected": -317.9123840332031, "loss": 0.5106, "rewards/accuracies": 0.739062488079071, "rewards/chosen": 0.22059306502342224, "rewards/margins": 0.7541698217391968, "rewards/rejected": -0.5335767269134521, "step": 2560 }, { "epoch": 2.65, "learning_rate": 1.2815608263198165e-06, "logits/chosen": 1.8229938745498657, "logits/rejected": 1.4329159259796143, "logps/chosen": -378.3718566894531, "logps/rejected": -306.49798583984375, "loss": 0.5425, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.18917712569236755, "rewards/margins": 0.6734928488731384, "rewards/rejected": -0.4843156933784485, "step": 2570 }, { "epoch": 2.66, "learning_rate": 1.2433052792654936e-06, "logits/chosen": 1.8129692077636719, "logits/rejected": 1.2704432010650635, "logps/chosen": -386.3455505371094, "logps/rejected": -302.18756103515625, "loss": 0.5196, "rewards/accuracies": 0.7328125238418579, "rewards/chosen": 0.1656632125377655, "rewards/margins": 0.697827935218811, "rewards/rejected": -0.5321647524833679, "step": 2580 }, { "epoch": 2.68, "learning_rate": 1.2050497322111707e-06, "logits/chosen": 1.9466886520385742, "logits/rejected": 1.3993752002716064, "logps/chosen": -384.9145812988281, "logps/rejected": -310.48419189453125, "loss": 0.5482, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.17689287662506104, "rewards/margins": 0.6814517974853516, "rewards/rejected": -0.5045589208602905, "step": 2590 }, { "epoch": 2.69, "learning_rate": 1.1667941851568478e-06, "logits/chosen": 1.8453514575958252, "logits/rejected": 1.3914258480072021, "logps/chosen": -399.472412109375, "logps/rejected": -315.1606140136719, "loss": 0.516, "rewards/accuracies": 0.75, "rewards/chosen": 0.23511438071727753, "rewards/margins": 0.729377031326294, "rewards/rejected": -0.49426260590553284, "step": 2600 }, { "epoch": 2.69, "eval_logits/chosen": 1.923756718635559, "eval_logits/rejected": 1.523386001586914, "eval_logps/chosen": -390.8656921386719, "eval_logps/rejected": -310.6644287109375, "eval_loss": 0.5422862768173218, "eval_rewards/accuracies": 0.7275000214576721, "eval_rewards/chosen": 0.17005755007266998, "eval_rewards/margins": 0.6604735851287842, "eval_rewards/rejected": -0.4904159903526306, "eval_runtime": 798.1778, "eval_samples_per_second": 2.506, "eval_steps_per_second": 0.626, "step": 2600 }, { "epoch": 2.7, "learning_rate": 1.128538638102525e-06, "logits/chosen": 1.8115577697753906, "logits/rejected": 1.3338521718978882, "logps/chosen": -382.3858337402344, "logps/rejected": -315.61346435546875, "loss": 0.5451, "rewards/accuracies": 0.723437488079071, "rewards/chosen": 0.17404262721538544, "rewards/margins": 0.6530550718307495, "rewards/rejected": -0.4790124297142029, "step": 2610 }, { "epoch": 2.71, "learning_rate": 1.090283091048202e-06, "logits/chosen": 1.8975880146026611, "logits/rejected": 1.3036696910858154, "logps/chosen": -384.90728759765625, "logps/rejected": -313.3616638183594, "loss": 0.5163, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": 0.19925454258918762, "rewards/margins": 0.7436668276786804, "rewards/rejected": -0.5444123148918152, "step": 2620 }, { "epoch": 2.72, "learning_rate": 1.0520275439938791e-06, "logits/chosen": 1.7692668437957764, "logits/rejected": 1.4176040887832642, "logps/chosen": -384.73370361328125, "logps/rejected": -315.3553466796875, "loss": 0.5131, "rewards/accuracies": 0.7328125238418579, "rewards/chosen": 0.21966759860515594, "rewards/margins": 0.7312489748001099, "rewards/rejected": -0.511581301689148, "step": 2630 }, { "epoch": 2.73, "learning_rate": 1.0137719969395565e-06, "logits/chosen": 1.849914789199829, "logits/rejected": 1.3912808895111084, "logps/chosen": -362.81011962890625, "logps/rejected": -306.07708740234375, "loss": 0.5481, "rewards/accuracies": 0.734375, "rewards/chosen": 0.15157254040241241, "rewards/margins": 0.6434847712516785, "rewards/rejected": -0.49191227555274963, "step": 2640 }, { "epoch": 2.74, "learning_rate": 9.755164498852336e-07, "logits/chosen": 1.806741714477539, "logits/rejected": 1.4580366611480713, "logps/chosen": -384.57171630859375, "logps/rejected": -330.3133850097656, "loss": 0.5663, "rewards/accuracies": 0.703125, "rewards/chosen": 0.18018949031829834, "rewards/margins": 0.6130504608154297, "rewards/rejected": -0.43286100029945374, "step": 2650 }, { "epoch": 2.75, "learning_rate": 9.372609028309106e-07, "logits/chosen": 1.8819621801376343, "logits/rejected": 1.319496989250183, "logps/chosen": -396.9870910644531, "logps/rejected": -300.1016540527344, "loss": 0.5194, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": 0.2204170525074005, "rewards/margins": 0.7351399064064026, "rewards/rejected": -0.5147228837013245, "step": 2660 }, { "epoch": 2.76, "learning_rate": 8.990053557765877e-07, "logits/chosen": 1.841212511062622, "logits/rejected": 1.2556275129318237, "logps/chosen": -378.9607849121094, "logps/rejected": -294.499267578125, "loss": 0.5174, "rewards/accuracies": 0.760937511920929, "rewards/chosen": 0.19511821866035461, "rewards/margins": 0.7270212173461914, "rewards/rejected": -0.5319029092788696, "step": 2670 }, { "epoch": 2.77, "learning_rate": 8.607498087222647e-07, "logits/chosen": 1.8978729248046875, "logits/rejected": 1.347278356552124, "logps/chosen": -392.6346435546875, "logps/rejected": -305.59539794921875, "loss": 0.5023, "rewards/accuracies": 0.745312511920929, "rewards/chosen": 0.2255987823009491, "rewards/margins": 0.7454218864440918, "rewards/rejected": -0.5198231339454651, "step": 2680 }, { "epoch": 2.78, "learning_rate": 8.22494261667942e-07, "logits/chosen": 1.8556360006332397, "logits/rejected": 1.379469871520996, "logps/chosen": -393.61419677734375, "logps/rejected": -321.4953918457031, "loss": 0.5611, "rewards/accuracies": 0.703125, "rewards/chosen": 0.13316483795642853, "rewards/margins": 0.6840446591377258, "rewards/rejected": -0.5508798360824585, "step": 2690 }, { "epoch": 2.79, "learning_rate": 7.84238714613619e-07, "logits/chosen": 1.8189160823822021, "logits/rejected": 1.3523896932601929, "logps/chosen": -385.1004333496094, "logps/rejected": -320.13348388671875, "loss": 0.5132, "rewards/accuracies": 0.746874988079071, "rewards/chosen": 0.21642343699932098, "rewards/margins": 0.7652528285980225, "rewards/rejected": -0.5488293766975403, "step": 2700 }, { "epoch": 2.79, "eval_logits/chosen": 1.9201858043670654, "eval_logits/rejected": 1.5212676525115967, "eval_logps/chosen": -390.87872314453125, "eval_logps/rejected": -310.7050476074219, "eval_loss": 0.5420036911964417, "eval_rewards/accuracies": 0.7245000004768372, "eval_rewards/chosen": 0.1687532663345337, "eval_rewards/margins": 0.663231372833252, "eval_rewards/rejected": -0.4944780766963959, "eval_runtime": 798.2763, "eval_samples_per_second": 2.505, "eval_steps_per_second": 0.626, "step": 2700 }, { "epoch": 2.8, "learning_rate": 7.459831675592961e-07, "logits/chosen": 1.8809839487075806, "logits/rejected": 1.320902705192566, "logps/chosen": -377.96392822265625, "logps/rejected": -307.28900146484375, "loss": 0.5383, "rewards/accuracies": 0.734375, "rewards/chosen": 0.1433677226305008, "rewards/margins": 0.6681622862815857, "rewards/rejected": -0.5247945785522461, "step": 2710 }, { "epoch": 2.81, "learning_rate": 7.077276205049732e-07, "logits/chosen": 1.8229610919952393, "logits/rejected": 1.3789646625518799, "logps/chosen": -384.7790832519531, "logps/rejected": -330.26544189453125, "loss": 0.5569, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.15766794979572296, "rewards/margins": 0.6201016902923584, "rewards/rejected": -0.46243375539779663, "step": 2720 }, { "epoch": 2.82, "learning_rate": 6.694720734506505e-07, "logits/chosen": 1.8970234394073486, "logits/rejected": 1.2469165325164795, "logps/chosen": -398.9933776855469, "logps/rejected": -297.5898132324219, "loss": 0.4922, "rewards/accuracies": 0.7578125, "rewards/chosen": 0.2657439112663269, "rewards/margins": 0.8046365976333618, "rewards/rejected": -0.5388926267623901, "step": 2730 }, { "epoch": 2.83, "learning_rate": 6.312165263963276e-07, "logits/chosen": 1.8662869930267334, "logits/rejected": 1.2493693828582764, "logps/chosen": -421.33502197265625, "logps/rejected": -315.5128479003906, "loss": 0.5421, "rewards/accuracies": 0.7328125238418579, "rewards/chosen": 0.20604190230369568, "rewards/margins": 0.7129246592521667, "rewards/rejected": -0.5068827867507935, "step": 2740 }, { "epoch": 2.84, "learning_rate": 5.929609793420047e-07, "logits/chosen": 1.889392614364624, "logits/rejected": 1.2931582927703857, "logps/chosen": -390.3883972167969, "logps/rejected": -299.5200500488281, "loss": 0.5369, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.21739423274993896, "rewards/margins": 0.7245230078697205, "rewards/rejected": -0.5071288347244263, "step": 2750 }, { "epoch": 2.85, "learning_rate": 5.547054322876817e-07, "logits/chosen": 1.836212396621704, "logits/rejected": 1.2998713254928589, "logps/chosen": -380.26239013671875, "logps/rejected": -298.9835205078125, "loss": 0.5241, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.20903977751731873, "rewards/margins": 0.7375310659408569, "rewards/rejected": -0.5284911394119263, "step": 2760 }, { "epoch": 2.86, "learning_rate": 5.164498852333589e-07, "logits/chosen": 1.8360679149627686, "logits/rejected": 1.3816144466400146, "logps/chosen": -388.96405029296875, "logps/rejected": -306.56585693359375, "loss": 0.5466, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.18503810465335846, "rewards/margins": 0.6872378587722778, "rewards/rejected": -0.5021997690200806, "step": 2770 }, { "epoch": 2.87, "learning_rate": 4.78194338179036e-07, "logits/chosen": 1.7588326930999756, "logits/rejected": 1.4671173095703125, "logps/chosen": -373.9554748535156, "logps/rejected": -325.2327575683594, "loss": 0.5576, "rewards/accuracies": 0.707812488079071, "rewards/chosen": 0.14026573300361633, "rewards/margins": 0.6305382251739502, "rewards/rejected": -0.4902724623680115, "step": 2780 }, { "epoch": 2.88, "learning_rate": 4.399387911247131e-07, "logits/chosen": 1.9163461923599243, "logits/rejected": 1.5186926126480103, "logps/chosen": -390.3271484375, "logps/rejected": -317.7460632324219, "loss": 0.5418, "rewards/accuracies": 0.739062488079071, "rewards/chosen": 0.17590954899787903, "rewards/margins": 0.6520856022834778, "rewards/rejected": -0.47617602348327637, "step": 2790 }, { "epoch": 2.89, "learning_rate": 4.0168324407039023e-07, "logits/chosen": 1.9037706851959229, "logits/rejected": 1.3236620426177979, "logps/chosen": -383.06707763671875, "logps/rejected": -308.722412109375, "loss": 0.5412, "rewards/accuracies": 0.734375, "rewards/chosen": 0.16230130195617676, "rewards/margins": 0.6994771957397461, "rewards/rejected": -0.5371758341789246, "step": 2800 }, { "epoch": 2.89, "eval_logits/chosen": 1.9216077327728271, "eval_logits/rejected": 1.5223499536514282, "eval_logps/chosen": -390.8974609375, "eval_logps/rejected": -310.7515869140625, "eval_loss": 0.5421897172927856, "eval_rewards/accuracies": 0.722000002861023, "eval_rewards/chosen": 0.1668809950351715, "eval_rewards/margins": 0.6660127639770508, "eval_rewards/rejected": -0.4991317391395569, "eval_runtime": 797.8098, "eval_samples_per_second": 2.507, "eval_steps_per_second": 0.627, "step": 2800 }, { "epoch": 2.9, "learning_rate": 3.634276970160674e-07, "logits/chosen": 1.9225976467132568, "logits/rejected": 1.3995444774627686, "logps/chosen": -385.5367126464844, "logps/rejected": -304.0980224609375, "loss": 0.531, "rewards/accuracies": 0.7328125238418579, "rewards/chosen": 0.16200684010982513, "rewards/margins": 0.6947375535964966, "rewards/rejected": -0.5327308177947998, "step": 2810 }, { "epoch": 2.91, "learning_rate": 3.2517214996174445e-07, "logits/chosen": 1.9011256694793701, "logits/rejected": 1.422337293624878, "logps/chosen": -395.55865478515625, "logps/rejected": -312.18475341796875, "loss": 0.524, "rewards/accuracies": 0.7328125238418579, "rewards/chosen": 0.16321566700935364, "rewards/margins": 0.721695601940155, "rewards/rejected": -0.558479905128479, "step": 2820 }, { "epoch": 2.92, "learning_rate": 2.8691660290742157e-07, "logits/chosen": 1.835877776145935, "logits/rejected": 1.3597265481948853, "logps/chosen": -384.87982177734375, "logps/rejected": -305.2154541015625, "loss": 0.5408, "rewards/accuracies": 0.721875011920929, "rewards/chosen": 0.17432734370231628, "rewards/margins": 0.6742584109306335, "rewards/rejected": -0.4999311566352844, "step": 2830 }, { "epoch": 2.93, "learning_rate": 2.4866105585309873e-07, "logits/chosen": 1.9166672229766846, "logits/rejected": 1.2945325374603271, "logps/chosen": -392.184326171875, "logps/rejected": -315.0354919433594, "loss": 0.5479, "rewards/accuracies": 0.7203124761581421, "rewards/chosen": 0.1650506854057312, "rewards/margins": 0.6694290637969971, "rewards/rejected": -0.5043784379959106, "step": 2840 }, { "epoch": 2.94, "learning_rate": 2.1040550879877584e-07, "logits/chosen": 1.9222066402435303, "logits/rejected": 1.3432600498199463, "logps/chosen": -394.2955017089844, "logps/rejected": -313.95941162109375, "loss": 0.5356, "rewards/accuracies": 0.734375, "rewards/chosen": 0.18698087334632874, "rewards/margins": 0.7258338928222656, "rewards/rejected": -0.5388529896736145, "step": 2850 }, { "epoch": 2.95, "learning_rate": 1.7214996174445295e-07, "logits/chosen": 1.850600004196167, "logits/rejected": 1.463445782661438, "logps/chosen": -382.59649658203125, "logps/rejected": -319.3958435058594, "loss": 0.5622, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": 0.16887713968753815, "rewards/margins": 0.681231677532196, "rewards/rejected": -0.5123544931411743, "step": 2860 }, { "epoch": 2.96, "learning_rate": 1.338944146901301e-07, "logits/chosen": 1.9172236919403076, "logits/rejected": 1.4463660717010498, "logps/chosen": -411.2205505371094, "logps/rejected": -333.6241149902344, "loss": 0.5105, "rewards/accuracies": 0.7328125238418579, "rewards/chosen": 0.2917831838130951, "rewards/margins": 0.7725859880447388, "rewards/rejected": -0.4808027744293213, "step": 2870 }, { "epoch": 2.97, "learning_rate": 9.56388676358072e-08, "logits/chosen": 1.9173275232315063, "logits/rejected": 1.2591311931610107, "logps/chosen": -408.18963623046875, "logps/rejected": -313.57720947265625, "loss": 0.5173, "rewards/accuracies": 0.753125011920929, "rewards/chosen": 0.22628772258758545, "rewards/margins": 0.7554703950881958, "rewards/rejected": -0.5291827321052551, "step": 2880 }, { "epoch": 2.98, "learning_rate": 5.738332058148432e-08, "logits/chosen": 1.787086844444275, "logits/rejected": 1.30776846408844, "logps/chosen": -376.986572265625, "logps/rejected": -310.27886962890625, "loss": 0.5542, "rewards/accuracies": 0.6953125, "rewards/chosen": 0.18192549049854279, "rewards/margins": 0.6590663194656372, "rewards/rejected": -0.47714075446128845, "step": 2890 }, { "epoch": 3.0, "learning_rate": 1.912777352716144e-08, "logits/chosen": 1.8844960927963257, "logits/rejected": 1.4650356769561768, "logps/chosen": -382.848876953125, "logps/rejected": -318.86083984375, "loss": 0.5387, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1745864599943161, "rewards/margins": 0.7120822668075562, "rewards/rejected": -0.5374957919120789, "step": 2900 }, { "epoch": 3.0, "eval_logits/chosen": 1.9216641187667847, "eval_logits/rejected": 1.524082064628601, "eval_logps/chosen": -390.9371643066406, "eval_logps/rejected": -310.7856140136719, "eval_loss": 0.5420743227005005, "eval_rewards/accuracies": 0.7235000133514404, "eval_rewards/chosen": 0.16290949285030365, "eval_rewards/margins": 0.6654456853866577, "eval_rewards/rejected": -0.5025361180305481, "eval_runtime": 800.7679, "eval_samples_per_second": 2.498, "eval_steps_per_second": 0.624, "step": 2900 }, { "epoch": 3.0, "step": 2905, "total_flos": 0.0, "train_loss": 0.5633580120213061, "train_runtime": 136089.4761, "train_samples_per_second": 1.366, "train_steps_per_second": 0.021 } ], "logging_steps": 10, "max_steps": 2905, "num_train_epochs": 4, "save_steps": 500, "total_flos": 0.0, "trial_name": null, "trial_params": null }